diff --git a/.config/hakari.toml b/.config/hakari.toml
index 9913ecc9c0..b5990d090e 100644
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -23,10 +23,30 @@ platforms = [
 ]
 
 [final-excludes]
-# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
-# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
-# from depending on workspace-hack because most of the dependencies are not used.
-workspace-members = ["vm_monitor"]
+workspace-members = [
+    # vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
+    # it is built primarily in the separate repo neondatabase/autoscaling and thus is excluded
+    # from depending on workspace-hack because most of the dependencies are not used.
+    "vm_monitor",
+    # All of these exist in libs and are not usually built independently.
+    # Putting workspace hack there adds a bottleneck for cargo builds.
+    "compute_api",
+    "consumption_metrics",
+    "desim",
+    "metrics",
+    "pageserver_api",
+    "postgres_backend",
+    "postgres_connection",
+    "postgres_ffi",
+    "pq_proto",
+    "remote_storage",
+    "safekeeper_api",
+    "tenant_size_model",
+    "tracing-utils",
+    "utils",
+    "wal_craft",
+    "walproposer",
+]
 
 # Write out exact versions rather than a semver range. (Defaults to false.)
 # exact-versions = true
diff --git a/.config/nextest.toml b/.config/nextest.toml
index a9398e4ab0..affdc16f31 100644
--- a/.config/nextest.toml
+++ b/.config/nextest.toml
@@ -1,2 +1,2 @@
 [profile.default]
-slow-timeout = { period = "20s", terminate-after = 3 }
+slow-timeout = { period = "60s", terminate-after = 3 }
diff --git a/.dockerignore b/.dockerignore
index ae0ad8fd77..c7a2f78e32 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,27 +1,30 @@
 *
-!rust-toolchain.toml
-!Cargo.toml
+# Files
 !Cargo.lock
+!Cargo.toml
 !Makefile
+!rust-toolchain.toml
+!scripts/combine_control_files.py
+!scripts/ninstall.sh
+!vm-cgconfig.conf
+!docker-compose/run-tests.sh
+# Directories
 !.cargo/
 !.config/
-!control_plane/
 !compute_tools/
+!control_plane/
 !libs/
+!neon_local/
 !pageserver/
+!patches/
 !pgxn/
 !proxy/
+!storage_scrubber/
 !safekeeper/
-!s3_scrubber/
 !storage_broker/
+!storage_controller/
 !trace/
-!vendor/postgres-v14/
-!vendor/postgres-v15/
-!vendor/postgres-v16/
+!vendor/postgres-*/
 !workspace_hack/
-!neon_local/
-!scripts/ninstall.sh
-!scripts/combine_control_files.py
-!vm-cgconfig.conf
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000..6ba6b3c887
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+# allows for nicer hunk headers with git show
+*.rs diff=rust
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000000..c8fd1209de
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,6 @@
+
+blank_issues_enabled: true
+contact_links:
+  - name: Feature request
+    url: https://console.neon.tech/app/projects?modal=feedback
+    about: For feature requests in the Neon product, please submit via the feedback form on `https://console.neon.tech`
diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md
index 019e6e7345..c442f50fde 100644
--- a/.github/ISSUE_TEMPLATE/epic-template.md
+++ b/.github/ISSUE_TEMPLATE/epic-template.md
@@ -16,9 +16,9 @@ assignees: ''
 
 ## Implementation ideas
 
-
+## Tasks
 ```[tasklist]
-### Tasks
+- [ ] Example Task
 ```
 
diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index 362480f256..4ad8a7b460 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -1,12 +1,15 @@
 self-hosted-runner:
   labels:
     - arm64
-    - dev
-    - gen3
     - large
+    - large-arm64
     - small
+    - small-arm64
     - us-east-2
 config-variables:
+  - BENCHMARK_PROJECT_ID_PUB
+  - BENCHMARK_PROJECT_ID_SUB
   - REMOTE_STORAGE_AZURE_CONTAINER
   - REMOTE_STORAGE_AZURE_REGION
   - SLACK_UPCOMING_RELEASE_CHANNEL_ID
+  - DEV_AWS_OIDC_ROLE_ARN
diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml
index abdbba802e..11adc8df86 100644
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -39,7 +39,7 @@ runs:
         PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
         if [ "${PR_NUMBER}" != "null" ]; then
           BRANCH_OR_PR=pr-${PR_NUMBER}
-        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
           # Shortcut for special branches
           BRANCH_OR_PR=${GITHUB_REF_NAME}
         else
@@ -59,7 +59,7 @@ runs:
         BUCKET: neon-github-public-dev
 
     # TODO: We can replace with a special docker image with Java and Allure pre-installed
-    - uses: actions/setup-java@v3
+    - uses: actions/setup-java@v4
      with:
        distribution: 'temurin'
        java-version: '17'
@@ -76,8 +76,8 @@ runs:
          rm -f ${ALLURE_ZIP}
        fi
      env:
-        ALLURE_VERSION: 2.24.0
-        ALLURE_ZIP_SHA256: 60b1d6ce65d9ef24b23cf9c2c19fd736a123487c38e54759f1ed1a7a77353c90
+        ALLURE_VERSION: 2.27.0
+        ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777
 
    # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
    - name: Acquire lock
@@ -150,7 +150,7 @@ runs:
 
        # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
        # and to keep files on the host to upload them to the database
-        time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
+        time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/"
 
        # Generate redirect
        cat < ${WORKDIR}/index.html
@@ -179,22 +179,11 @@ runs:
          aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
        fi
 
-    - name: Store Allure test stat in the DB
-      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
-      shell: bash -euxo pipefail {0}
-      env:
-        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-        REPORT_JSON_URL: ${{ steps.generate-report.outputs.report-json-url }}
-      run: |
-        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR}
-
-        ./scripts/pysync
-
-        poetry run python3 scripts/ingest_regress_test_result.py \
-          --revision ${COMMIT_SHA} \
-          --reference ${GITHUB_REF} \
-          --build-type unified \
-          --ingest ${WORKDIR}/report/data/suites.json
+    - name: Cache poetry deps
+      uses: actions/cache@v4
+      with:
+        path: ~/.cache/pypoetry/virtualenvs
+        key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
 
    - name: Store Allure test stat in the DB (new)
      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
@@ -226,7 +215,7 @@ runs:
        rm -rf ${WORKDIR}
      fi
 
-    - uses: actions/github-script@v6
+    - uses: actions/github-script@v7
      if: always()
      env:
        REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml
index 7ae9937d42..df4a6712ac 100644
--- a/.github/actions/allure-report-store/action.yml
+++ b/.github/actions/allure-report-store/action.yml
@@ -19,7 +19,7 @@ runs:
        PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
        if [ "${PR_NUMBER}" != "null" ]; then
          BRANCH_OR_PR=pr-${PR_NUMBER}
-        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
          # Shortcut for special branches
          BRANCH_OR_PR=${GITHUB_REF_NAME}
        else
diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml
index ce26e7825b..01c216b1ac 100644
--- a/.github/actions/download/action.yml
+++ b/.github/actions/download/action.yml
@@ -26,7 +26,7 @@ runs:
        TARGET: ${{ inputs.path }}
        ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst
        SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }}
-        PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }}
+        PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}/{2}', github.event.pull_request.head.sha || github.sha, github.run_id, github.run_attempt) }}
      run: |
        BUCKET=neon-github-public-dev
        FILENAME=$(basename $ARCHIVE)
diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml
index f1eea34ab9..9f752d5a89 100644
--- a/.github/actions/neon-branch-create/action.yml
+++ b/.github/actions/neon-branch-create/action.yml
@@ -3,14 +3,14 @@ description: 'Create Branch using API'
 
 inputs:
   api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
     required: true
   project_id:
-    desctiption: 'ID of the Project to create Branch in'
+    description: 'ID of the Project to create Branch in'
     required: true
   api_host:
-    desctiption: 'Neon API host'
-    default: console.stage.neon.tech
+    description: 'Neon API host'
+    default: console-stage.neon.build
 
 outputs:
   dsn:
     description: 'Created Branch DSN (for main database)'
diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml
index f8cd351dd9..58141a4a3f 100644
--- a/.github/actions/neon-branch-delete/action.yml
+++ b/.github/actions/neon-branch-delete/action.yml
@@ -3,17 +3,17 @@ description: 'Delete Branch using API'
 
 inputs:
   api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
     required: true
   project_id:
-    desctiption: 'ID of the Project which should be deleted'
+    description: 'ID of the Project which should be deleted'
     required: true
   branch_id:
-    desctiption: 'ID of the branch to delete'
+    description: 'ID of the branch to delete'
     required: true
   api_host:
-    desctiption: 'Neon API host'
-    default: console.stage.neon.tech
+    description: 'Neon API host'
+    default: console-stage.neon.build
 
 runs:
   using: "composite"
diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml
index ae6464990e..f4a194639f 100644
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -3,22 +3,19 @@ description: 'Create Neon Project using API'
 
 inputs:
   api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
     required: true
   region_id:
-    desctiption: 'Region ID, if not set the project will be created in the default region'
+    description: 'Region ID, if not set the project will be created in the default region'
     default: aws-us-east-2
   postgres_version:
-    desctiption: 'Postgres version; default is 15'
-    default: 15
+    description: 'Postgres version; default is 16'
+    default: '16'
   api_host:
-    desctiption: 'Neon API host'
-    default: console.stage.neon.tech
-  provisioner:
-    desctiption: 'k8s-pod or k8s-neonvm'
-    default: 'k8s-pod'
+    description: 'Neon API host'
+    default: console-stage.neon.build
   compute_units:
-    desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
+    description: '[Min, Max] compute units'
     default: '[1, 1]'
 
 outputs:
@@ -37,10 +34,6 @@ runs:
      # A shell without `set -x` to not to expose password/dsn in logs
      shell: bash -euo pipefail {0}
      run: |
-        if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then
-          echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU"
-        fi
-
        project=$(curl \
          "https://${API_HOST}/api/v2/projects" \
          --fail \
@@ -52,7 +45,7 @@ runs:
            \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
            \"pg_version\": ${POSTGRES_VERSION},
            \"region_id\": \"${REGION_ID}\",
-            \"provisioner\": \"${PROVISIONER}\",
+            \"provisioner\": \"k8s-neonvm\",
            \"autoscaling_limit_min_cu\": ${MIN_CU},
            \"autoscaling_limit_max_cu\": ${MAX_CU},
            \"settings\": { }
@@ -75,6 +68,5 @@ runs:
      API_KEY: ${{ inputs.api_key }}
      REGION_ID: ${{ inputs.region_id }}
      POSTGRES_VERSION: ${{ inputs.postgres_version }}
-      PROVISIONER: ${{ inputs.provisioner }}
      MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
      MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}
diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml
index adc8510a34..35e165fd61 100644
--- a/.github/actions/neon-project-delete/action.yml
+++ b/.github/actions/neon-project-delete/action.yml
@@ -3,14 +3,14 @@ description: 'Delete Neon Project using API'
 
 inputs:
   api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
     required: true
   project_id:
-    desctiption: 'ID of the Project to delete'
+    description: 'ID of the Project to delete'
     required: true
   api_host:
-    desctiption: 'Neon API host'
-    default: console.stage.neon.tech
+    description: 'Neon API host'
+    default: console-stage.neon.build
 
 runs:
   using: "composite"
diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 8dfa6c465f..4008cd0d36 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -43,7 +43,11 @@ inputs:
   pg_version:
     description: 'Postgres version to use for tests'
     required: false
-    default: 'v14'
+    default: 'v16'
+  benchmark_durations:
+    description: 'benchmark durations JSON'
+    required: false
+    default: '{}'
 
 runs:
   using: "composite"
@@ -52,14 +56,14 @@ runs:
      if: inputs.build_type != 'remote'
      uses: ./.github/actions/download
      with:
-        name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
+        name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
        path: /tmp/neon
 
    - name: Download Neon binaries for the previous release
      if: inputs.build_type != 'remote'
      uses: ./.github/actions/download
      with:
-        name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
+        name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
        path: /tmp/neon-previous
        prefix: latest
@@ -67,7 +71,7 @@ runs:
      if: inputs.build_type != 'remote'
      uses: ./.github/actions/download
      with:
-        name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
+        name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
        path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
        prefix: latest
        # The lack of compatibility snapshot (for example, for the new Postgres version)
@@ -76,17 +80,15 @@
    - name: Checkout
      if: inputs.needs_postgres_source == 'true'
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
      with:
        submodules: true
-        fetch-depth: 1
 
    - name: Cache poetry deps
-      id: cache_poetry
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      with:
        path: ~/.cache/pypoetry/virtualenvs
-        key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
+        key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
 
    - name: Install Python deps
      shell: bash -euxo pipefail {0}
@@ -111,6 +113,8 @@
        export PLATFORM=${PLATFORM:-github-actions-selfhosted}
        export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
        export DEFAULT_PG_VERSION=${PG_VERSION#v}
+        export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib
+        export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-}
 
        if [ "${BUILD_TYPE}" = "remote" ]; then
          export REMOTE_ENV=1
@@ -126,8 +130,8 @@
          exit 1
        fi
        if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
-          # -n16 uses sixteen processes to run tests via pytest-xdist
-          EXTRA_PARAMS="-n16 $EXTRA_PARAMS"
+          # -n sets the number of parallel processes that pytest-xdist will run
+          EXTRA_PARAMS="-n12 $EXTRA_PARAMS"
 
          # --dist=loadgroup points tests marked with @pytest.mark.xdist_group
          # to the same worker to make @pytest.mark.order work with xdist
@@ -160,28 +164,33 @@
        # We use pytest-split plugin to run benchmarks in parallel on different CI runners
        if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then
          mkdir -p $TEST_OUTPUT
-          poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"
+          echo '${{ inputs.benchmark_durations || '{}' }}' > $TEST_OUTPUT/benchmark_durations.json
 
          EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
        fi
 
-        if [[ "${{ inputs.build_type }}" == "debug" ]]; then
+        if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
-        elif [[ "${{ inputs.build_type }}" == "release" ]]; then
-          cov_prefix=()
        else
          cov_prefix=()
        fi
 
        # Wake up the cluster if we use remote neon instance
        if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then
-          ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();"
+          QUERIES=("SELECT version()")
+          if [[ "${PLATFORM}" = "neon"* ]]; then
+            QUERIES+=("SHOW neon.tenant_id")
+            QUERIES+=("SHOW neon.timeline_id")
+          fi
+
+          for q in "${QUERIES[@]}"; do
+            ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "${q}"
+          done
        fi
 
        # Run the tests.
        #
-        # The junit.xml file allows CI tools to display more fine-grained test information
-        # in its "Tests" tab in the results page.
+        # --alluredir saves test results in Allure format (in a specified directory)
        # --verbose prints name of each test (helpful when there are
        # multiple tests in one file)
        # -rA prints summary in the end
        #
@@ -190,7 +199,6 @@
        mkdir -p $TEST_OUTPUT/allure/results
        "${cov_prefix[@]}" ./scripts/pytest \
-          --junitxml=$TEST_OUTPUT/junit.xml \
          --alluredir=$TEST_OUTPUT/allure/results \
          --tb=short \
          --verbose \
@@ -203,13 +211,13 @@
        fi
 
    - name: Upload compatibility snapshot
-      if: github.ref_name == 'release'
+      # Note that we use `github.base_ref`, which is the target branch for a PR
+      if: github.event_name == 'pull_request' && github.base_ref == 'release'
      uses: ./.github/actions/upload
      with:
-        name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}-${{ github.run_id }}
+        name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
        # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
        path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/
-        prefix: latest
 
    - name: Upload test results
      if: ${{ !cancelled() }}
diff --git a/.github/actions/set-docker-config-dir/action.yml b/.github/actions/set-docker-config-dir/action.yml
new file mode 100644
index 0000000000..3ee8bec8c6
--- /dev/null
+++ b/.github/actions/set-docker-config-dir/action.yml
@@ -0,0 +1,36 @@
+name: "Set custom docker config directory"
+description: "Create a directory for docker config and set DOCKER_CONFIG"
+
+# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+runs:
+  using: "composite"
+  steps:
+    - name: Show warning on GitHub-hosted runners
+      if: runner.environment == 'github-hosted'
+      shell: bash -euo pipefail {0}
+      run: |
+        # Using the following environment variables to find a path to the workflow file
+        # ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch
+        # ${GITHUB_REPOSITORY} - octocat/hello-world
+        # ${GITHUB_REF} - refs/heads/my_branch
+        # From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables
+
+        filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"}
+        filename=${filename_with_ref%"@$GITHUB_REF"}
+
+        # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message
+        title='Unnecessary usage of `.github/actions/set-docker-config-dir`'
+        message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners'
+        echo "::warning file=${filename},title=${title}::${message}"
+
+    - uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # v1.0.7
+      env:
+        DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }}
+      with:
+        main: |
+          mkdir -p "${DOCKER_CONFIG}"
+          echo DOCKER_CONFIG=${DOCKER_CONFIG} | tee -a $GITHUB_ENV
+        post: |
+          if [ -d "${DOCKER_CONFIG}" ]; then
+            rm -r "${DOCKER_CONFIG}"
+          fi
diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml
index 63973dfbe7..edcece7d2b 100644
--- a/.github/actions/upload/action.yml
+++ b/.github/actions/upload/action.yml
@@ -8,7 +8,7 @@ inputs:
     description: "A directory or file to upload"
     required: true
   prefix:
-    description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
+    description: "S3 prefix. Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
     required: false
 
 runs:
@@ -45,7 +45,7 @@ runs:
      env:
        SOURCE: ${{ inputs.path }}
        ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst
-        PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }}
+        PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}/{2}', github.event.pull_request.head.sha || github.sha, github.run_id, github.run_attempt) }}
      run: |
        BUCKET=neon-github-public-dev
        FILENAME=$(basename $ARCHIVE)
diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml
new file mode 100644
index 0000000000..a52e43b4da
--- /dev/null
+++ b/.github/workflows/_benchmarking_preparation.yml
@@ -0,0 +1,154 @@
+name: Prepare benchmarking databases by restoring dumps
+
+on:
+  workflow_call:
+    # no inputs needed
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+jobs:
+  setup-databases:
+    strategy:
+      fail-fast: false
+      matrix:
+        platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ]
+        database: [ clickbench, tpch, userexample ]
+
+    env:
+      LD_LIBRARY_PATH: /tmp/neon/pg_install/v16/lib
+      PLATFORM: ${{ matrix.platform }}
+      PG_BINARIES: /tmp/neon/pg_install/v16/bin
+
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      options: --init
+
+    steps:
+      - name: Set up Connection String
+        id: set-up-prep-connstr
+        run: |
+          case "${PLATFORM}" in
+            neon)
+              CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
+              ;;
+            aws-rds-postgres)
+              CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
+              ;;
+            aws-aurora-serverless-v2-postgres)
+              CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }}
+              ;;
+            *)
+              echo >&2 "Unknown PLATFORM=${PLATFORM}"
+              exit 1
+              ;;
+          esac
+
+          echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
+
+      - uses: actions/checkout@v4
+
+      - name: Download Neon artifact
+        uses: ./.github/actions/download
+        with:
+          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+          path: /tmp/neon/
+          prefix: latest
+
+      # we create a table that has one row for each database that we want to restore, with a flag indicating whether the restore is done
+      - name: Create benchmark_restore_status table if it does not exist
+        env:
+          BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
+          DATABASE_NAME: ${{ matrix.database }}
+        # to avoid a race condition of multiple jobs trying to create the table at the same time,
+        # we use an advisory lock
+        run: |
+          ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "
+            SELECT pg_advisory_lock(4711);
+            CREATE TABLE IF NOT EXISTS benchmark_restore_status (
+              databasename text primary key,
+              restore_done boolean
+            );
+            SELECT pg_advisory_unlock(4711);
+          "
+
+      - name: Check if restore is already done
+        id: check-restore-done
+        env:
+          BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
+          DATABASE_NAME: ${{ matrix.database }}
+        run: |
+          skip=false
+          if ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM benchmark_restore_status WHERE databasename='${{ env.DATABASE_NAME }}' AND restore_done=true;" | grep -q 1; then
+            echo "Restore already done for database ${{ env.DATABASE_NAME }} on platform ${{ env.PLATFORM }}. Skipping this database."
+            skip=true
+          fi
+          echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
+
+      - name: Check and create database if it does not exist
+        if: steps.check-restore-done.outputs.skip != 'true'
+        env:
+          BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
+          DATABASE_NAME: ${{ matrix.database }}
+        run: |
+          DB_EXISTS=$(${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM pg_database WHERE datname='${{ env.DATABASE_NAME }}'")
+          if [ "$DB_EXISTS" != "1" ]; then
+            echo "Database ${{ env.DATABASE_NAME }} does not exist. Creating it..."
+            ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "CREATE DATABASE \"${{ env.DATABASE_NAME }}\";"
+          else
+            echo "Database ${{ env.DATABASE_NAME }} already exists."
+          fi
+
+      - name: Download dump from S3 to /tmp/dumps
+        if: steps.check-restore-done.outputs.skip != 'true'
+        env:
+          DATABASE_NAME: ${{ matrix.database }}
+        run: |
+          mkdir -p /tmp/dumps
+          aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/
+
+      - name: Replace database name in connection string
+        if: steps.check-restore-done.outputs.skip != 'true'
+        id: replace-dbname
+        env:
+          DATABASE_NAME: ${{ matrix.database }}
+          BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
+        run: |
+          # Extract the part before the database name
+          base_connstr="${BENCHMARK_CONNSTR%/*}"
+          # Extract the query parameters (if any) after the database name
+          query_params="${BENCHMARK_CONNSTR#*\?}"
+          # Reconstruct the new connection string
+          if [ "$query_params" != "$BENCHMARK_CONNSTR" ]; then
+            new_connstr="${base_connstr}/${DATABASE_NAME}?${query_params}"
+          else
+            new_connstr="${base_connstr}/${DATABASE_NAME}"
+          fi
+          echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT
+
+      - name: Restore dump
+        if: steps.check-restore-done.outputs.skip != 'true'
+        env:
+          DATABASE_NAME: ${{ matrix.database }}
+          DATABASE_CONNSTR: ${{ steps.replace-dbname.outputs.database_connstr }}
+        # the following works only with larger computes:
+        # PGOPTIONS: "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7"
+        # we add the || true because:
+        # the dumps were created with Neon and contain neon extensions that are not
+        # available in RDS, so we will always report an error, but we can ignore it
+        run: |
+          ${PG_BINARIES}/pg_restore --clean --if-exists --no-owner --jobs=4 \
+            -d "${DATABASE_CONNSTR}" /tmp/dumps/${DATABASE_NAME}.pg_dump || true
+
+      - name: Update benchmark_restore_status table
+        if: steps.check-restore-done.outputs.skip != 'true'
+        env:
+          BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
+          DATABASE_NAME: ${{ matrix.database }}
+        run: |
+          ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "
+            INSERT INTO benchmark_restore_status (databasename, restore_done) VALUES ('${{ env.DATABASE_NAME }}', true)
+            ON CONFLICT (databasename) DO UPDATE SET restore_done = true;
+          "
diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
new file mode 100644
index 0000000000..e18e6a1201
--- /dev/null
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -0,0 +1,303 @@
+name: Build and Test Locally
+
+on:
+  workflow_call:
+    inputs:
+      arch:
+        description: 'x64 or arm64'
+        required: true
+        type: string
+      build-tag:
+        description: 'build tag'
+        required: true
+        type: string
+      build-tools-image:
+        description: 'build-tools image'
+        required: true
+        type: string
+      build-type:
+        description: 'debug or release'
+        required: true
+        type: string
+      pg-versions:
+        description: 'a JSON array of postgres versions to run regression tests on'
+        required: true
+        type: string
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+env:
+  RUST_BACKTRACE: 1
+  COPT: '-Werror'
+  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+jobs:
+  build-neon:
+    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
+    container:
+      image: ${{ inputs.build-tools-image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      # Raise locked memory limit for tokio-epoll-uring.
+      # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
+      # io_uring will account the memory of the CQ and SQ as locked.
+      # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
+    env:
+      BUILD_TYPE: ${{ inputs.build-type }}
+      GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
+      BUILD_TAG: ${{ inputs.build-tag }}
+
+    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done
+
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+
+      - name: Set pg 14 revision for caching
+        id: pg_v14_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
+
+      - name: Set pg 15 revision for caching
+        id: pg_v15_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
+
+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
+      # Set some environment variables used by all the steps.
+      #
+      # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
+      # It also includes --features, if any
+      #
+      # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
+      # because "cargo metadata" doesn't accept --release or --debug options
+      #
+      # We run tests with additional features that are turned off by default (e.g. in release builds), see
+      # corresponding Cargo.toml files for their descriptions.
+      - name: Set env variables
+        env:
+          ARCH: ${{ inputs.arch }}
+        run: |
+          CARGO_FEATURES="--features testing"
+          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
+            cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
+            CARGO_FLAGS="--locked"
+          elif [[ $BUILD_TYPE == "debug" ]]; then
+            cov_prefix=""
+            CARGO_FLAGS="--locked"
+          elif [[ $BUILD_TYPE == "release" ]]; then
+            cov_prefix=""
+            CARGO_FLAGS="--locked --release"
+          fi
+          {
+            echo "cov_prefix=${cov_prefix}"
+            echo "CARGO_FEATURES=${CARGO_FEATURES}"
+            echo "CARGO_FLAGS=${CARGO_FLAGS}"
+            echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
+          } >> $GITHUB_ENV
+
+      - name: Cache postgres v14 build
+        id: cache_pg_14
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v14
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+
+      - name: Cache postgres v15 build
+        id: cache_pg_15
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v15
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+
+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+
+      - name: Build postgres v14
+        if: steps.cache_pg_14.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v14 -j$(nproc)
+
+      - name: Build postgres v15
+        if: steps.cache_pg_15.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v15 -j$(nproc)
+
+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v16 -j$(nproc)
+
+      - name: Build neon extensions
+        run: mold -run make neon-pg-ext -j$(nproc)
+
+      - name: Build walproposer-lib
+        run: mold -run make walproposer-lib -j$(nproc)
+
+      - name: Run cargo build
+        run: |
+          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+          export PQ_LIB_DIR
+          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+
+      # Do install *before* running rust tests because they might recompile the
+      # binaries with different features/flags.
+      - name: Install rust binaries
+        env:
+          ARCH: ${{ inputs.arch }}
+        run: |
+          # Install target binaries
+          mkdir -p /tmp/neon/bin/
+          binaries=$(
+            ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps |
+            jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
+          )
+          for bin in $binaries; do
+            SRC=target/$BUILD_TYPE/$bin
+            DST=/tmp/neon/bin/$bin
+            cp "$SRC" "$DST"
+          done
+
+          # Install test executables and write list of all binaries (for code coverage)
+          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
+            # Keep bloated coverage data files away from the rest of the artifact
+            mkdir -p /tmp/coverage/
+
+            mkdir -p /tmp/neon/test_bin/
+
+            test_exe_paths=$(
+              ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run |
+              jq -r '.executable | select(. != null)'
+            )
+            for bin in $test_exe_paths; do
+              SRC=$bin
+              DST=/tmp/neon/test_bin/$(basename $bin)
+
+              # We don't need debug symbols for code coverage, so strip them out to make
+              # the artifact smaller.
+              strip "$SRC" -o "$DST"
+              echo "$DST" >> /tmp/coverage/binaries.list
+            done
+
+            for bin in $binaries; do
+              echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
+            done
+          fi
+
+      - name: Run rust tests
+        env:
+          NEXTEST_RETRIES: 3
+        run: |
+          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+          export PQ_LIB_DIR
+          LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
+          export LD_LIBRARY_PATH
+
+          # nextest does not yet support running doctests
+          ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
+
+          # run all non-pageserver tests
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)'
+
+          # run pageserver tests with different settings
+          for io_engine in std-fs tokio-epoll-uring ; do
+            for io_buffer_alignment in 0 1 512 ; do
+              NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
+            done
+          done
+
+          # Run separate tests for real S3
+          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
+          export REMOTE_STORAGE_S3_REGION=eu-central-1
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)'
+
+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'
+
+      - name: Install postgres binaries
+        run: cp -a pg_install /tmp/neon/pg_install
+
+      - name: Upload Neon artifact
+        uses: ./.github/actions/upload
+        with:
+          name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
+          path: /tmp/neon
+
+      # XXX: keep this after the binaries.list is formed, so the coverage can properly work later
+      - name: Merge and upload coverage data
+        if: inputs.build-type == 'debug'
+        uses: ./.github/actions/save-coverage-data
+
+  regress-tests:
+    # Don't run regression tests on debug arm64 builds
+    if: inputs.build-type != 'debug' || inputs.arch != 'arm64'
+    needs: [ build-neon ]
+    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
+    container:
+      image: ${{ inputs.build-tools-image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      # for changed limits, see comments on `options:` earlier in this file
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
+    strategy:
+      fail-fast: false
+      matrix:
+        pg_version: ${{ fromJson(inputs.pg-versions) }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+
+      - name: Pytest regression tests
+        uses: ./.github/actions/run-python-test-set
+        timeout-minutes: 60
+        with:
+          build_type: ${{ inputs.build-type }}
+          test_selection: regress
+          needs_postgres_source: true
+          run_with_real_s3: true
+          real_s3_bucket: neon-github-ci-tests
+          real_s3_region: eu-central-1
+          rerun_flaky: true
+          pg_version: ${{ matrix.pg_version }}
+        env:
+          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
+          BUILD_TAG: ${{ inputs.build-tag }}
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+
+      # Temporarily disable this step until we figure out why it's so flaky
+      # Ref https://github.com/neondatabase/neon/issues/4540
+      - name: Merge and upload coverage data
+        if: |
+          false &&
+          inputs.build-type == 'debug' && matrix.pg_version == 'v16'
+        uses: ./.github/actions/save-coverage-data
diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
index 584828c1d0..85cfe7446e 100644
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -16,8 +16,15 @@ concurrency:
   cancel-in-progress: ${{ github.event_name == 'pull_request' }}
 
 jobs:
+  check-permissions:
+    if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
+    uses: ./.github/workflows/check-permissions.yml
+    with:
+      github-event-name: ${{ github.event_name }}
+
   actionlint:
-    runs-on: ubuntu-latest
+    needs: [ check-permissions ]
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v4
       - uses: reviewdog/action-actionlint@v1
@@ -29,3 +36,16 @@ jobs:
          fail_on_error: true
          filter_mode: nofilter
          level: error
+
+      - name: Disallow 'ubuntu-latest' runners
+        run: |
+          PAT='^\s*runs-on:.*-latest'
+          if grep -ERq $PAT .github/workflows; then
+            grep -ERl $PAT .github/workflows |\
+            while read -r f
+            do
+              l=$(grep -nE $PAT $f | awk -F: '{print $1}' | head -1)
+              echo "::error file=$f,line=$l::Please use 'ubuntu-22.04' instead of 'ubuntu-latest'"
+            done
+            exit 1
+          fi
diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml
index 5b21011b83..0a0898d30c 100644
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -18,6 +18,7 @@ on:
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
+  cancel-in-progress: false
 
 env:
   GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -43,7 +44,7 @@ jobs:
      contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
 
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
 
    steps:
      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
@@ -59,24 +60,50 @@ jobs:
      github.event.action == 'labeled' &&
      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
 
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
 
    steps:
      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
 
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
        with:
          ref: main
          token: ${{ secrets.CI_ACCESS_TOKEN }}
+
+      - name: Look for existing PR
+        id: get-pr
+        env:
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+        run: |
+          ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
+          echo "ALREADY_CREATED=${ALREADY_CREATED}" >> ${GITHUB_OUTPUT}
+
+      - name: Get changed labels
+        id: get-labels
+        if: steps.get-pr.outputs.ALREADY_CREATED != ''
+        env:
+          ALREADY_CREATED: ${{ steps.get-pr.outputs.ALREADY_CREATED }}
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+        run: |
+          LABELS_TO_REMOVE=$(comm -23 <(gh pr --repo ${GITHUB_REPOSITORY} view ${ALREADY_CREATED} --json labels --jq '.labels.[].name'| ( grep -E '^run' || true ) | sort) \
+                                      <(gh pr --repo ${GITHUB_REPOSITORY} view ${PR_NUMBER} --json labels --jq '.labels.[].name' | ( grep -E '^run' || true ) | sort ) |\
+                                      ( grep -v run-e2e-tests-in-draft || true ) | paste -sd , -)
+          LABELS_TO_ADD=$(comm -13 <(gh pr --repo ${GITHUB_REPOSITORY} view ${ALREADY_CREATED} --json labels --jq '.labels.[].name'| ( grep -E '^run' || true ) |sort) \
+                                   <(gh pr --repo ${GITHUB_REPOSITORY} view ${PR_NUMBER} --json labels --jq '.labels.[].name' | ( grep -E '^run' || true ) | sort ) |\
+                                   paste -sd , -)
+          echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT}
+          echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT}
 
      - run: gh pr checkout "${PR_NUMBER}"
 
      - run: git checkout -b "${BRANCH}"
 
      - run: git push --force origin "${BRANCH}"
+        if: steps.get-pr.outputs.ALREADY_CREATED == ''
 
      - name: Create a Pull Request for CI run (if required)
-        env:
+        if: steps.get-pr.outputs.ALREADY_CREATED == ''
+        env:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
          cat << EOF > body.md
@@ -87,15 +114,33 @@ jobs:
          Feel free to review/comment/discuss the original PR #${PR_NUMBER}.
          EOF
 
-          ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
-          if [ -z "${ALREADY_CREATED}" ]; then
-            gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
+          LABELS=$( (gh pr --repo "${GITHUB_REPOSITORY}" view ${PR_NUMBER} --json labels --jq '.labels.[].name'; echo run-e2e-tests-in-draft )| \
+                    grep -E '^run' | paste -sd , -)
+          gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
              --body-file "body.md" \
              --head "${BRANCH}" \
              --base "main" \
+              --label ${LABELS} \
              --draft
 
+      - name: Modify the existing pull request (if required)
+        if: steps.get-pr.outputs.ALREADY_CREATED != ''
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          LABELS_TO_ADD: ${{ steps.get-labels.outputs.LABELS_TO_ADD }}
+          LABELS_TO_REMOVE: ${{ steps.get-labels.outputs.LABELS_TO_REMOVE }}
+          ALREADY_CREATED: ${{ steps.get-pr.outputs.ALREADY_CREATED }}
+        run: |
+          ADD_CMD=
+          REMOVE_CMD=
+          [ -z "${LABELS_TO_ADD}" ] || ADD_CMD="--add-label ${LABELS_TO_ADD}"
+          [ -z "${LABELS_TO_REMOVE}" ] || REMOVE_CMD="--remove-label ${LABELS_TO_REMOVE}"
+          if [ -n "${ADD_CMD}" ] || [ -n "${REMOVE_CMD}" ]; then
+            gh pr --repo "${GITHUB_REPOSITORY}" edit ${ALREADY_CREATED} ${ADD_CMD} ${REMOVE_CMD}
          fi
 
+      - run: git push --force origin "${BRANCH}"
+        if: steps.get-pr.outputs.ALREADY_CREATED != ''
+
  cleanup:
    # Close PRs and delete branchs if the original PR is closed.
@@ -107,7 +152,7 @@ jobs:
      github.event.action == 'closed' &&
      github.event.pull_request.head.repo.full_name != github.repository
 
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
 
    steps:
      - name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch
diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index 8bf12c31b1..a4a597acde 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -38,6 +38,11 @@ on:
        description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch'
        required: false
        default: false
+      run_only_pgvector_tests:
+        type: boolean
+        description: 'Run pgvector tests but no other tests. If not set, all tests including pgvector tests will be run'
+        required: false
+        default: false
 
 defaults:
   run:
     shell: bash -euxo pipefail {0}
@@ -50,28 +55,54 @@ concurrency:
 
 jobs:
   bench:
+    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # Required for OIDC authentication in azure runners
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - DEFAULT_PG_VERSION: 16
+            PLATFORM: "neon-staging"
+            region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+            RUNNER: [ self-hosted, us-east-2, x64 ]
+            IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+          - DEFAULT_PG_VERSION: 16
+            PLATFORM: "azure-staging"
+            region_id: 'azure-eastus2'
+            RUNNER: [ self-hosted, eastus2, x64 ]
+            IMAGE: neondatabase/build-tools:pinned
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "300"
      TEST_PG_BENCH_SCALES_MATRIX: "10,100"
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 14
+      DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }}
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
-      PLATFORM: "neon-staging"
+      PLATFORM: ${{ matrix.PLATFORM }}
 
-    runs-on: [ self-hosted, us-east-2, x64 ]
+    runs-on: ${{ matrix.RUNNER }}
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: ${{ matrix.IMAGE }}
      options: --init
 
    steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
+
+      - name: Configure AWS credentials # necessary on Azure runners
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 18000 # 5 hours
 
      - name: Download Neon artifact
        uses: ./.github/actions/download
        with:
-          name: neon-${{ runner.os }}-release-artifact
+          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
          path: /tmp/neon/
          prefix: latest
 
@@ -79,7 +110,7 @@ jobs:
        id: create-neon-project
        uses: ./.github/actions/neon-project-create
        with:
-          region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+          region_id: ${{ matrix.region_id }}
          postgres_version: ${{ env.DEFAULT_PG_VERSION }}
          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
 
@@ -90,10 +121,18 @@ jobs:
          test_selection: performance
          run_in_parallel: false
          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
          # Set --sparse-ordering option of pytest-order plugin
          # to ensure tests are running in order of appears in the file.
          # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
-          extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py
+          extra_params:
+            -m remote_cluster
+            --sparse-ordering
+            --timeout 14400
+            --ignore test_runner/performance/test_perf_olap.py
+            --ignore test_runner/performance/test_perf_pgvector_queries.py
+            --ignore test_runner/performance/test_logical_replication.py
+            --ignore test_runner/performance/test_physical_replication.py
        env:
          BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -107,6 +146,7 @@ jobs:
          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
 
      - name: Create Allure report
+        id: create-allure-report
        if: ${{ !cancelled() }}
        uses: ./.github/actions/allure-report-generate
 
@@ -115,22 +155,106 @@ jobs:
        uses: slackapi/slack-github-action@v1
        with:
          channel-id: "C033QLM5P7D" # dev-staging-stream
-          slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+          slack-message: |
+            Periodic perf testing: ${{ job.status }}
+            <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+            <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
+  replication-tests:
+    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
+    env:
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      DEFAULT_PG_VERSION: 16
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
+      PLATFORM: "neon-staging"
+
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      options: --init
+
+    steps:
+      - uses: actions/checkout@v4
+
+
+      - name: Download Neon artifact
+        uses: ./.github/actions/download
+        with:
+          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+          path: /tmp/neon/
+          prefix: latest
+
+      - name: Run Logical Replication benchmarks
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ env.BUILD_TYPE }}
+          test_selection: performance/test_logical_replication.py
+          run_in_parallel: false
+          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+          extra_params: -m remote_cluster --timeout 5400
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        env:
+          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+          NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+          BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }}
+          BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }}
+
+      - name: Run Physical Replication benchmarks
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ env.BUILD_TYPE }}
+          test_selection: performance/test_physical_replication.py
+          run_in_parallel: false
+          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+          extra_params: -m remote_cluster --timeout 5400
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        env:
+          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+          NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+
+      - name: Create Allure report
+        id: create-allure-report
+        if: ${{ !cancelled() }}
+        uses: ./.github/actions/allure-report-generate
+        with:
+          store-test-results-into-db: true
+        env:
+          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+
+      - name: Post to a Slack channel
+        if: ${{ github.event.schedule && failure() }}
+        uses: slackapi/slack-github-action@v1
+        with:
+          channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream
+          slack-message: |
+            Periodic replication testing: ${{ job.status }}
+            <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+            <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
  generate-matrices:
+    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
    #
    # Available platforms:
-    # - neon-captest-new: Freshly created project (1 CU)
-    # - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
-    # - neon-captest-reuse: Reusing existing project
+    # - neonvm-captest-new: Freshly created project (1 CU)
+    # - neonvm-captest-freetier: Use freetier-sized compute (0.25 CU)
+    # - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
+    # - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
+    # - neonvm-captest-reuse: Reusing existing project
    # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
    # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
    env:
      RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
-    runs-on: ubuntu-latest
+      DEFAULT_REGION_ID: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+    runs-on: ubuntu-22.04
    outputs:
      pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
      olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
@@ -140,22 +264,37 @@ jobs:
      - name: Generate matrix for pgbench benchmark
        id: pgbench-compare-matrix
        run: |
+          region_id_default=${{ env.DEFAULT_REGION_ID }}
+          runner_default='["self-hosted", "us-east-2", "x64"]'
+          runner_azure='["self-hosted", "eastus2", "x64"]'
+          image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned"
          matrix='{
+            "pg_version" : [
+              16
+            ],
+            "region_id" : [
+              "'"$region_id_default"'"
+            ],
            "platform": [
-              "neon-captest-new",
-              "neon-captest-reuse",
+              "neonvm-captest-new",
+              "neonvm-captest-reuse",
              "neonvm-captest-new"
            ],
            "db_size": [ "10gb" ],
-            "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
-                        { "platform": "neon-captest-new", "db_size": "50gb" },
-                        { "platform": "neonvm-captest-freetier", "db_size": "3gb" },
-                        { "platform": "neonvm-captest-new", "db_size": "50gb" }]
+            "runner": ['"$runner_default"'],
+            "image": [ "'"$image_default"'" ],
+            "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" },
+                        { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
+                        { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
+                        { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
+                        { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
+                        { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
+                        { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
          }'
-          if [ "$(date +%A)" = "Saturday" ]; then
-            matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
-                                                       { "platform": "rds-aurora", "db_size": "50gb"}]')
+          if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
+            matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
+                                                       { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]')
          fi
 
          echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -165,13 +304,13 @@ jobs:
        run: |
          matrix='{
            "platform": [
-              "neon-captest-reuse"
+              "neonvm-captest-reuse"
            ]
          }'
 
          if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
            matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
-                                                      { "platform": "rds-aurora" }]')
+                                                       { "platform": "rds-aurora" }]')
          fi
 
          echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -181,7 +320,7 @@ jobs:
        run: |
          matrix='{
            "platform": [
-              "neon-captest-reuse"
+              "neonvm-captest-reuse"
            ],
            "scale": [
              "10"
            ]
          }'
 
          if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
            matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
-                                                      { "platform": "rds-aurora", "scale": "10" }]')
+                                                       { "platform": "rds-aurora", "scale": "10" }]')
          fi
 
          echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
 
+  prepare_AWS_RDS_databases:
+    uses: ./.github/workflows/_benchmarking_preparation.yml
+    secrets: inherit
+
  pgbench-compare:
-    needs: [ generate-matrices ]
+    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
+    needs: [ generate-matrices, prepare_AWS_RDS_databases ]
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # Required for OIDC authentication in azure runners
 
    strategy:
      fail-fast: false
@@ -206,54 +354,58 @@ jobs:
      TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
      TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }}
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 14
+      DEFAULT_PG_VERSION: ${{ matrix.pg_version }}
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: ${{ matrix.platform }}
 
-    runs-on: [ self-hosted, us-east-2, x64 ]
+    runs-on: ${{ matrix.runner }}
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: ${{ matrix.image }}
      options: --init
 
    # Increase timeout to 8h, default timeout is 6h
    timeout-minutes: 480
 
    steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
+
+      - name: Configure AWS credentials # necessary on Azure runners
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 18000 # 5 hours
 
      - name: Download Neon artifact
        uses: ./.github/actions/download
        with:
-          name: neon-${{ runner.os }}-release-artifact
+          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
          path: /tmp/neon/
          prefix: latest
 
-      - name: Add Postgres binaries to PATH
-        run: |
-          ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
-          echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
-
      - name: Create Neon Project
-        if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
+        if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
        id: create-neon-project
        uses: ./.github/actions/neon-project-create
        with:
-          region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+          region_id: ${{ matrix.region_id }}
          postgres_version: ${{ env.DEFAULT_PG_VERSION }}
          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-          compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
-          provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}
+          compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
 
      - name: Set up Connection String
        id: set-up-connstr
        run: |
          case "${PLATFORM}" in
-            neon-captest-reuse)
+            neonvm-captest-reuse)
              CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
              ;;
+            neonvm-captest-sharding-reuse)
+              CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
+              ;;
-            neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
+            neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
              CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
              ;;
            rds-aurora)
@@ -270,12 +422,6 @@ jobs:
 
          echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
 
-          QUERY="SELECT version();"
-          if [[ "${PLATFORM}" = "neon"* ]]; then
-            QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
-          fi
-          psql ${CONNSTR} -c "${QUERY}"
-
      - name: Benchmark init
        uses: ./.github/actions/run-python-test-set
        with:
@@ -284,6 +430,7 @@ jobs:
          run_in_parallel: false
          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
          extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
        env:
          BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -297,6 +444,7 @@ jobs:
          run_in_parallel: false
          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
          extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
        env:
          BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -310,6 +458,7 @@ jobs:
          run_in_parallel: false
          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
          extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
        env:
          BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -323,6 +472,7 @@ jobs:
          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
 
      - name: Create Allure report
+        id: create-allure-report
        if: ${{ !cancelled() }}
        uses: ./.github/actions/allure-report-generate
 
@@ -331,7 +481,133 @@ jobs:
        uses: slackapi/slack-github-action@v1
        with:
          channel-id: "C033QLM5P7D" # dev-staging-stream
-          slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+          slack-message: |
+            Periodic perf testing on ${{ matrix.platform }}: ${{ job.status }}
+            <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+            <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
+  pgbench-pgvector:
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # Required for OIDC authentication in azure runners
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - PLATFORM: "neonvm-captest-pgvector"
+            RUNNER: [ self-hosted, us-east-2, x64 ]
+            IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+          - PLATFORM: "azure-captest-pgvector"
+            RUNNER: [ self-hosted, eastus2, x64 ]
+            IMAGE: neondatabase/build-tools:pinned
+
+    env:
+      TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
+      TEST_PG_BENCH_SCALES_MATRIX: "1"
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      DEFAULT_PG_VERSION: 16
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+      LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
+      PLATFORM: ${{ matrix.PLATFORM }}
+
+    runs-on: ${{ matrix.RUNNER }}
+    container:
+      image: ${{ matrix.IMAGE }}
+      options: --init
+
+    steps:
+      - uses: actions/checkout@v4
+
+      # until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16
+      # instead of using Neon artifacts containing pgbench
+      - name: Install postgresql-16 where pytest expects it
+        run: |
+          cd /home/nonroot
+          wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.4-1.pgdg110%2B1_amd64.deb
+          wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110%2B1_amd64.deb
+          wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110%2B1_amd64.deb
+          dpkg -x libpq5_16.4-1.pgdg110+1_amd64.deb pg
+          dpkg -x postgresql-client-16_16.4-1.pgdg110+1_amd64.deb pg
+          dpkg -x postgresql-16_16.4-1.pgdg110+1_amd64.deb pg
+          mkdir -p /tmp/neon/pg_install/v16/bin
+          ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
+          ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql
+          ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib
+          /tmp/neon/pg_install/v16/bin/pgbench --version
+          /tmp/neon/pg_install/v16/bin/psql --version
+
+      - name: Set up Connection String
+        id: set-up-connstr
+        run: |
+          case "${PLATFORM}" in
+            neonvm-captest-pgvector)
+              CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
+              ;;
+            azure-captest-pgvector)
+              CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }}
+              ;;
+            *)
+              echo >&2 "Unknown PLATFORM=${PLATFORM}"
+              exit 1
+              ;;
+          esac
+
+          echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
+
+      - name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-region: eu-central-1
+          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          role-duration-seconds: 18000 # 5 hours
+
+      - name: Benchmark pgvector hnsw indexing
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ env.BUILD_TYPE }}
+          test_selection: performance/test_perf_olap.py
+          run_in_parallel: false
+          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+          extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        env:
+          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+          BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+
+      - name: Benchmark pgvector queries
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ env.BUILD_TYPE }}
+          test_selection: performance/test_perf_pgvector_queries.py
+          run_in_parallel: false
+          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+          extra_params: -m remote_cluster --timeout 21600
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        env:
+          BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+
+      - name: Create Allure report
+        id: create-allure-report
+        if: ${{ !cancelled() }}
+        uses: ./.github/actions/allure-report-generate
+
+      - name: Post to a Slack channel
+        if: ${{ github.event.schedule && failure() }}
+        uses: slackapi/slack-github-action@v1
+        with:
+          channel-id: "C033QLM5P7D" # dev-staging-stream
+          slack-message: |
+            Periodic perf testing on ${{ env.PLATFORM }}: ${{ job.status }}
+            <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+            <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
@@ -343,8 +619,8 @@ jobs:
  clickbench-compare:
    # ClickBench DB for rds-aurora and rds-Postgres deployed to the same clusters
    # we use for performance testing in pgbench-compare.
    # Run this job only when pgbench-compare is finished to avoid the intersection.
    # We might change it after https://github.com/neondatabase/neon/issues/2900.
    #
    # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
    # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
-    if: ${{ !cancelled() }}
-    needs: [ generate-matrices, pgbench-compare ]
+    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
+    needs: [ generate-matrices, pgbench-compare, prepare_AWS_RDS_databases ]
 
    strategy:
      fail-fast: false
@@ -352,7 +628,7 @@ jobs:
 
    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 14
+      DEFAULT_PG_VERSION: 16
      TEST_OUTPUT: /tmp/test_output
      TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }}
      TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }}
@@ -362,29 +638,24 @@ jobs:
 
    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init
 
    steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
      - name: Download Neon artifact
        uses: ./.github/actions/download
        with:
-          name: neon-${{ runner.os }}-release-artifact
+          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
          path: /tmp/neon/
          prefix: latest
 
-      - name: Add Postgres binaries to PATH
-        run: |
-          ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
-          echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
-
      - name: Set up Connection String
        id: set-up-connstr
        run: |
          case "${PLATFORM}" in
-            neon-captest-reuse)
+            neonvm-captest-reuse)
              CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
              ;;
            rds-aurora)
              CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CLICKBENCH_10M_CONNSTR }}
              ;;
            rds-postgres)
              CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
              ;;
            *)
-              echo >&2
"Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" - fi - psql ${CONNSTR} -c "${QUERY}" - - name: ClickBench benchmark uses: ./.github/actions/run-python-test-set with: @@ -415,6 +680,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_clickbench + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -424,6 +690,7 @@ jobs: TEST_OLAP_SCALE: 10 - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -432,7 +699,10 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic OLAP perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic OLAP perf testing on ${{ matrix.platform }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} @@ -443,8 +713,8 @@ jobs: # We might change it after https://github.com/neondatabase/neon/issues/2900. # # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) - if: ${{ !cancelled() }} - needs: [ generate-matrices, clickbench-compare ] + if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + needs: [ generate-matrices, clickbench-compare, prepare_AWS_RDS_databases ] strategy: fail-fast: false @@ -452,7 +722,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -461,38 +731,33 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned options: --init steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Get Connstring Secret Name run: | case "${PLATFORM}" in - neon-captest-reuse) + neonvm-captest-reuse) ENV_PLATFORM=CAPTEST_TPCH ;; rds-aurora) ENV_PLATFORM=RDS_AURORA_TPCH ;; rds-postgres) - ENV_PLATFORM=RDS_AURORA_TPCH + ENV_PLATFORM=RDS_POSTGRES_TPCH ;; *) - echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -507,12 +772,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" - fi - psql ${CONNSTR} -c "${QUERY}" - - name: Run TPC-H benchmark uses: ./.github/actions/run-python-test-set with: @@ -521,6 +780,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_tpch + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -528,6 +788,7 @@ jobs: TEST_OLAP_SCALE: ${{ matrix.scale }} - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -536,13 +797,16 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} user-examples-compare: - if: ${{ !cancelled() }} - needs: [ generate-matrices, tpch-compare ] + if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + needs: [ generate-matrices, tpch-compare, prepare_AWS_RDS_databases ] strategy: fail-fast: false @@ -550,7 +814,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -558,29 +822,24 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned options: --init steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Set up Connection String id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-reuse) + neonvm-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} ;; rds-aurora) @@ -590,19 +849,13 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }} ;; *) - echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
              exit 1
              ;;
          esac

          echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-          QUERY="SELECT version();"
-          if [[ "${PLATFORM}" = "neon"* ]]; then
-            QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
-          fi
-          psql ${CONNSTR} -c "${QUERY}"
-
      - name: Run user examples
        uses: ./.github/actions/run-python-test-set
        with:
@@ -611,12 +864,14 @@ jobs:
          run_in_parallel: false
          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
          extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}

      - name: Create Allure report
+        id: create-allure-report
        if: ${{ !cancelled() }}
        uses: ./.github/actions/allure-report-generate
@@ -625,6 +880,10 @@ jobs:
        uses: slackapi/slack-github-action@v1
        with:
          channel-id: "C033QLM5P7D" # dev-staging-stream
-          slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+          slack-message: |
+            Periodic user examples perf testing on ${{ matrix.platform }}: ${{ job.status }}
+            <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+            <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
+
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml
new file mode 100644
index 0000000000..ca5ff573e1
--- /dev/null
+++ b/.github/workflows/build-build-tools-image.yml
@@ -0,0 +1,103 @@
+name: Build build-tools image
+
+on:
+  workflow_call:
+    inputs:
+      image-tag:
+        description: "build-tools image tag"
+        required: true
+        type: string
+    outputs:
+      image-tag:
+        description: "build-tools tag"
+        value: ${{ inputs.image-tag }}
+      image:
+        description: "build-tools image"
+        value: neondatabase/build-tools:${{ inputs.image-tag }}
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+concurrency:
+  group: build-build-tools-image-${{ inputs.image-tag }}
+  cancel-in-progress: false
+
+# No permissions for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
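As background for the line that follows: an empty top-level `permissions: {}` strips every scope from `GITHUB_TOKEN`, and each job must then re-grant exactly what it needs. A minimal sketch of the pattern, with a hypothetical job name that is not part of this workflow:

    permissions: {}          # workflow level: GITHUB_TOKEN starts with no scopes

    jobs:
      example-job:           # hypothetical job, for illustration only
        permissions:
          contents: read     # enough for actions/checkout
          id-token: write    # only if the job authenticates via OIDC, as the Azure benchmark jobs above do
        runs-on: ubuntu-22.04
        steps:
          - uses: actions/checkout@v4

The `permissions: {}` line below applies this empty default to the new workflow.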
+permissions: {}
+
+jobs:
+  check-image:
+    uses: ./.github/workflows/check-build-tools-image.yml
+
+  build-image:
+    needs: [ check-image ]
+    if: needs.check-image.outputs.found == 'false'
+
+    strategy:
+      matrix:
+        arch: [ x64, arm64 ]
+
+    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
+
+    env:
+      IMAGE_TAG: ${{ inputs.image-tag }}
+
+    steps:
+      - name: Check `inputs.image-tag` is correct
+        env:
+          INPUTS_IMAGE_TAG: ${{ inputs.image-tag }}
+          CHECK_IMAGE_TAG: ${{ needs.check-image.outputs.image-tag }}
+        run: |
+          if [ "${INPUTS_IMAGE_TAG}" != "${CHECK_IMAGE_TAG}" ]; then
+            echo "'inputs.image-tag' (${INPUTS_IMAGE_TAG}) does not match the expected tag of the latest build-tools image (${CHECK_IMAGE_TAG})"
+            exit 1
+          fi
+
+      - uses: actions/checkout@v4
+
+      - uses: ./.github/actions/set-docker-config-dir
+      - uses: docker/setup-buildx-action@v3
+        with:
+          cache-binary: false
+
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - uses: docker/login-action@v3
+        with:
+          registry: cache.neon.build
+          username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }}
+          password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }}
+
+      - uses: docker/build-push-action@v6
+        with:
+          context: .
+          provenance: false
+          push: true
+          pull: true
+          file: Dockerfile.build-tools
+          cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }}
+          tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
+
+  merge-images:
+    needs: [ build-image ]
+    runs-on: ubuntu-22.04
+
+    env:
+      IMAGE_TAG: ${{ inputs.image-tag }}
+
+    steps:
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - name: Create multi-arch image
+        run: |
+          docker buildx imagetools create -t neondatabase/build-tools:${IMAGE_TAG} \
+            neondatabase/build-tools:${IMAGE_TAG}-x64 \
+            neondatabase/build-tools:${IMAGE_TAG}-arm64
diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml
deleted file mode 100644
index e401b2f418..0000000000
--- a/.github/workflows/build_and_push_docker_image.yml
+++ /dev/null
@@ -1,105 +0,0 @@
-name: Build and Push Docker Image
-
-on:
-  workflow_call:
-    inputs:
-      dockerfile-path:
-        required: true
-        type: string
-      image-name:
-        required: true
-        type: string
-    outputs:
-      build-tools-tag:
-        description: "tag generated for build tools"
-        value: ${{ jobs.tag.outputs.build-tools-tag }}
-
-jobs:
-  check-if-build-tools-dockerfile-changed:
-    runs-on: ubuntu-latest
-    outputs:
-      docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
-    steps:
-      - name: Check if Dockerfile.buildtools has changed
-        id: dockerfile
-        run: |
-          if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
-            echo "docker_file_changed=false" >> $GITHUB_OUTPUT
-            exit
-          fi
-          updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
-          if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
-            echo "docker_file_changed=true" >> $GITHUB_OUTPUT
-          fi
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-  tag:
-    runs-on: ubuntu-latest
-    needs: [ check-if-build-tools-dockerfile-changed ]
-    outputs:
-      build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
-
-
steps: - - name: Get buildtools tag - env: - DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }} - run: | - if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then - IMAGE_TAG=$GITHUB_RUN_ID - else - IMAGE_TAG=pinned - fi - - echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT - shell: bash - id: buildtools-tag - - kaniko: - if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' - needs: [ tag, check-if-build-tools-dockerfile-changed ] - runs-on: [ self-hosted, dev, x64 ] - container: gcr.io/kaniko-project/executor:v1.7.0-debug - - steps: - - name: Checkout - uses: actions/checkout@v1 - - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - - name: Kaniko build - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 - - kaniko-arm: - if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' - needs: [ tag, check-if-build-tools-dockerfile-changed ] - runs-on: [ self-hosted, dev, arm64 ] - container: gcr.io/kaniko-project/executor:v1.7.0-debug - - steps: - - name: Checkout - uses: actions/checkout@v1 - - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - - name: Kaniko build - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 - - manifest: - if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' - name: 'manifest' - runs-on: [ self-hosted, dev, x64 ] - needs: - - tag - - kaniko - - kaniko-arm - - check-if-build-tools-dockerfile-changed - - steps: - - name: Create manifest - run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 - - - name: Push manifest - run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 643d24696d..ee5fd1b0c6 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -5,6 +5,7 @@ on: branches: - main - release + - release-proxy pull_request: defaults: @@ -21,36 +22,20 @@ env: COPT: '-Werror' AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} - NEXTEST_RETRIES: 3 # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix - E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && 
github.sha || 'anysha' }} + E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} jobs: check-permissions: - runs-on: ubuntu-latest - - steps: - - name: Disallow PRs from forks - if: | - github.event_name == 'pull_request' && - github.event.pull_request.head.repo.full_name != github.repository - - run: | - if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then - MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork" - else - MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run" - fi - - echo >&2 "We don't run CI for PRs from forks" - echo >&2 "${MESSAGE}" - - exit 1 + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name }} cancel-previous-e2e-tests: needs: [ check-permissions ] if: github.event_name == 'pull_request' - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Cancel previous e2e-tests runs for this PR @@ -63,14 +48,14 @@ jobs: tag: needs: [ check-permissions ] - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -83,6 +68,8 @@ jobs: echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release" ]]; then echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT @@ -90,34 +77,36 @@ jobs: shell: bash id: build-tag - build-buildtools-image: + check-build-tools-image: needs: [ check-permissions ] - uses: ./.github/workflows/build_and_push_docker_image.yml + uses: ./.github/workflows/check-build-tools-image.yml + + build-build-tools-image: + needs: [ check-build-tools-image ] + uses: ./.github/workflows/build-build-tools-image.yml with: - dockerfile-path: Dockerfile.buildtools - image-name: build-tools + image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} secrets: inherit check-codestyle-python: - needs: [ check-permissions, build-buildtools-image ] - runs-on: [ self-hosted, gen3, small ] + needs: [ check-permissions, build-build-tools-image ] + runs-on: [ self-hosted, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: - name: Checkout - uses: actions/checkout@v3 - with: - submodules: false - fetch-depth: 1 + uses: actions/checkout@v4 - name: Cache poetry deps - id: cache_poetry - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: 
./scripts/pysync @@ -132,29 +121,35 @@ jobs: run: poetry run mypy . check-codestyle-rust: - needs: [ check-permissions, build-buildtools-image ] - runs-on: [ self-hosted, gen3, large ] + needs: [ check-permissions, build-build-tools-image ] + strategy: + matrix: + arch: [ x64, arm64 ] + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} + container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true - fetch-depth: 1 # Disabled for now # - name: Restore cargo deps cache # id: cache_cargo -# uses: actions/cache@v3 +# uses: actions/cache@v4 # with: # path: | # !~/.cargo/registry/src # ~/.cargo/git/ # target/ -# key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} # Some of our rust modules use FFI and need those to be checked - name: Get postgres headers @@ -198,294 +193,83 @@ jobs: if: ${{ !cancelled() }} run: cargo deny check --hide-inclusion-graph - build-neon: - needs: [ check-permissions, tag, build-buildtools-image ] - runs-on: [ self-hosted, gen3, large ] + build-and-test-locally: + needs: [ tag, build-build-tools-image ] + strategy: + fail-fast: false + matrix: + arch: [ x64, arm64 ] + # Do not build or run tests in debug for release branches + build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }} + include: + - build-type: release + arch: arm64 + uses: ./.github/workflows/_build-and-test-locally.yml + with: + arch: ${{ matrix.arch }} + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }} + build-tag: ${{ needs.tag.outputs.build-tag }} + build-type: ${{ matrix.build-type }} + # Run tests on all Postgres versions in release builds and only on the latest version in debug builds + pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }} + secrets: inherit + + # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking + get-benchmarks-durations: + if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') + outputs: + json: ${{ steps.get-benchmark-durations.outputs.json }} + needs: [ check-permissions, build-build-tools-image ] + runs-on: [ self-hosted, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init - strategy: - fail-fast: false - matrix: - build_type: [ debug, release ] - env: - BUILD_TYPE: ${{ matrix.build_type }} - GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }} - BUILD_TAG: ${{ needs.tag.outputs.build-tag }} - - steps: - - name: Fix git ownership - run: | - # Workaround for `fatal: detected dubious ownership in repository at ...` - # - # Use 
both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers - # Ref https://github.com/actions/checkout/issues/785 - # - git config --global --add safe.directory ${{ github.workspace }} - git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do - git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" - git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" - done - - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 1 - - - name: Check Postgres submodules revision - shell: bash -euo pipefail {0} - run: | - # This is a temporary solution to ensure that the Postgres submodules revision is correct (i.e. the updated intentionally). - # Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603 - - FAILED=false - for postgres in postgres-v14 postgres-v15 postgres-v16; do - expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"') - actual=$(git rev-parse "HEAD:vendor/${postgres}") - if [ "${expected}" != "${actual}" ]; then - echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'" - FAILED=true - fi - done - - if [ "${FAILED}" = "true" ]; then - echo >&2 "Please update vendors/revisions.json if these changes are intentional" - exit 1 - fi - - - name: Set pg 14 revision for caching - id: pg_v14_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT - - - name: Set pg 15 revision for caching - id: pg_v15_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT - - - name: Set pg 16 revision for caching - id: pg_v16_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT - - # Set some environment variables used by all the steps. - # - # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. - # It also includes --features, if any - # - # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS, - # because "cargo metadata" doesn't accept --release or --debug options - # - # We run tests with addtional features, that are turned off by default (e.g. in release builds), see - # corresponding Cargo.toml files for their descriptions. - - name: Set env variables - run: | - CARGO_FEATURES="--features testing" - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" - CARGO_FLAGS="--locked" - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix="" - CARGO_FLAGS="--locked --release" - fi - { - echo "cov_prefix=${cov_prefix}" - echo "CARGO_FEATURES=${CARGO_FEATURES}" - echo "CARGO_FLAGS=${CARGO_FLAGS}" - echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" - } >> $GITHUB_ENV - - # Disabled for now - # Don't include the ~/.cargo/registry/src directory. It contains just - # uncompressed versions of the crates in ~/.cargo/registry/cache - # directory, and it's faster to let 'cargo' to rebuild it from the - # compressed crates. 
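As an aside on the exclusion syntax referenced above: a `!`-prefixed entry in the `path` list of `actions/cache` subtracts a subtree from what gets cached. A minimal sketch of the idea behind the commented-out step that follows (the step name and cache key here are illustrative):

    - name: Cache cargo deps (sketch)
      uses: actions/cache@v4
      with:
        path: |
          ~/.cargo/registry/
          !~/.cargo/registry/src   # drop uncompressed sources; cargo re-extracts them from the cached .crate files
          ~/.cargo/git/
          target/
        key: v1-${{ runner.os }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}

Keying on `rust-toolchain.toml` and `Cargo.lock` invalidates the cache whenever the toolchain or the dependency graph changes, which is why the disabled step keyed on exactly those files.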
-# - name: Cache cargo deps -# id: cache_cargo -# uses: actions/cache@v3 -# with: -# path: | -# ~/.cargo/registry/ -# !~/.cargo/registry/src -# ~/.cargo/git/ -# target/ -# # Fall back to older versions of the key, if no cache for current Cargo.lock was found -# key: | -# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} -# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- - - - name: Cache postgres v14 build - id: cache_pg_14 - uses: actions/cache@v3 - with: - path: pg_install/v14 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v15 build - id: cache_pg_15 - uses: actions/cache@v3 - with: - path: pg_install/v15 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v16 build - id: cache_pg_16 - uses: actions/cache@v3 - with: - path: pg_install/v16 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Build postgres v14 - if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: mold -run make postgres-v14 -j$(nproc) - - - name: Build postgres v15 - if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: mold -run make postgres-v15 -j$(nproc) - - - name: Build postgres v16 - if: steps.cache_pg_16.outputs.cache-hit != 'true' - run: mold -run make postgres-v16 -j$(nproc) - - - name: Build neon extensions - run: mold -run make neon-pg-ext -j$(nproc) - - - name: Build walproposer-lib - run: mold -run make walproposer-lib -j$(nproc) - - - name: Run cargo build - run: | - ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - - - name: Run rust tests - run: | - ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES - - # Run separate tests for real S3 - export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests - export REMOTE_STORAGE_S3_REGION=eu-central-1 - # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)' - - # Run separate tests for real Azure Blob Storage - # XXX: replace region with `eu-central-1`-like region - export ENABLE_REAL_AZURE_REMOTE_STORAGE=y - export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" - export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" - export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" - export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" - # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)' - - - name: Install rust binaries - run: | - # Install target binaries - mkdir -p /tmp/neon/bin/ - binaries=$( - ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps | - jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' - ) - for bin in $binaries; do - SRC=target/$BUILD_TYPE/$bin - DST=/tmp/neon/bin/$bin - cp "$SRC" "$DST" - done - - # Install test executables and write list of all binaries (for code coverage) - if [[ $BUILD_TYPE == "debug" ]]; then - # Keep bloated coverage data files away from the rest of the artifact - 
mkdir -p /tmp/coverage/ - - mkdir -p /tmp/neon/test_bin/ - - test_exe_paths=$( - ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run | - jq -r '.executable | select(. != null)' - ) - for bin in $test_exe_paths; do - SRC=$bin - DST=/tmp/neon/test_bin/$(basename $bin) - - # We don't need debug symbols for code coverage, so strip them out to make - # the artifact smaller. - strip "$SRC" -o "$DST" - echo "$DST" >> /tmp/coverage/binaries.list - done - - for bin in $binaries; do - echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list - done - fi - - - name: Install postgres binaries - run: cp -a pg_install /tmp/neon/pg_install - - - name: Upload Neon artifact - uses: ./.github/actions/upload - with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact - path: /tmp/neon - - # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - - name: Merge and upload coverage data - if: matrix.build_type == 'debug' - uses: ./.github/actions/save-coverage-data - - regress-tests: - needs: [ check-permissions, build-neon, build-buildtools-image, tag ] - runs-on: [ self-hosted, gen3, large ] - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} - # Default shared memory is 64mb - options: --init --shm-size=512mb - strategy: - fail-fast: false - matrix: - build_type: [ debug, release ] - pg_version: [ v14, v15, v16 ] steps: - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 1 + uses: actions/checkout@v4 - - name: Pytest regression tests - uses: ./.github/actions/run-python-test-set + - name: Cache poetry deps + uses: actions/cache@v4 with: - build_type: ${{ matrix.build_type }} - test_selection: regress - needs_postgres_source: true - run_with_real_s3: true - real_s3_bucket: neon-github-ci-tests - real_s3_region: eu-central-1 - rerun_flaky: true - pg_version: ${{ matrix.pg_version }} + path: ~/.cache/pypoetry/virtualenvs + key: v1-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + run: ./scripts/pysync + + - name: get benchmark durations + id: get-benchmark-durations env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} - CHECK_ONDISK_DATA_COMPATIBILITY: nonempty - BUILD_TAG: ${{ needs.tag.outputs.build-tag }} - - - name: Merge and upload coverage data - if: matrix.build_type == 'debug' && matrix.pg_version == 'v14' - uses: ./.github/actions/save-coverage-data + run: | + poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" \ + --days 10 \ + --output /tmp/benchmark_durations.json + echo "json=$(jq --compact-output '.' 
/tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT benchmarks: - needs: [ check-permissions, build-neon, build-buildtools-image ] - runs-on: [ self-hosted, gen3, small ] - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} - # Default shared memory is 64mb - options: --init --shm-size=512mb if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') + needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ] + runs-on: [ self-hosted, small ] + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + # for changed limits, see comments on `options:` earlier in this file + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 strategy: fail-fast: false matrix: - pytest_split_group: [ 1, 2, 3, 4 ] + # the amount of groups (N) should be reflected in `extra_params: --splits N ...` + pytest_split_group: [ 1, 2, 3, 4, 5 ] build_type: [ release ] steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Pytest benchmarks uses: ./.github/actions/run-python-test-set @@ -494,25 +278,49 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: ${{ github.ref_name == 'main' }} - extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }} + extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} + benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }} + pg_version: v16 env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" + PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + SYNC_AFTER_EACH_TEST: true # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones - create-test-report: - needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ] - if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} + report-benchmarks-failures: + needs: [ benchmarks, create-test-report ] + if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure' + runs-on: ubuntu-22.04 - runs-on: [ self-hosted, gen3, small ] + steps: + - uses: slackapi/slack-github-action@v1 + with: + channel-id: C060CNA47S9 # on-call-staging-storage-stream + slack-message: | + Benchmarks failed on main <${{ github.event.head_commit.url }}|${{ github.sha }}> + <${{ needs.create-test-report.outputs.report-url }}|Allure report> + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + + create-test-report: + needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ] + if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} + outputs: + report-url: ${{ steps.create-allure-report.outputs.report-url }} + + runs-on: [ self-hosted, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + 
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Create Allure report if: ${{ !cancelled() }} @@ -521,10 +329,9 @@ jobs: with: store-test-results-into-db: true env: - REGRESS_TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} - - uses: actions/github-script@v6 + - uses: actions/github-script@v7 if: ${{ !cancelled() }} with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries @@ -550,10 +357,13 @@ jobs: }) coverage-report: - needs: [ check-permissions, regress-tests, build-buildtools-image ] - runs-on: [ self-hosted, gen3, small ] + needs: [ check-permissions, build-build-tools-image, build-and-test-locally ] + runs-on: [ self-hosted, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init strategy: fail-fast: false @@ -564,7 +374,7 @@ jobs: coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }} steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 0 @@ -572,7 +382,7 @@ jobs: - name: Get Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact path: /tmp/neon - name: Get coverage artifact @@ -599,17 +409,6 @@ jobs: --input-objects=/tmp/coverage/binaries.list \ --format=lcov - - name: Upload coverage report - id: upload-coverage-report - env: - BUCKET: neon-github-public-dev - COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} - run: | - aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA} - - REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html - echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT - - name: Build coverage report NEW id: upload-coverage-report-new env: @@ -644,23 +443,13 @@ jobs: REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT - - uses: actions/github-script@v6 + - uses: actions/github-script@v7 env: - REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }} REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: script: | - const { REPORT_URL, REPORT_URL_NEW, COMMIT_SHA } = process.env - - await github.rest.repos.createCommitStatus({ - owner: context.repo.owner, - repo: context.repo.repo, - sha: `${COMMIT_SHA}`, - state: 'success', - target_url: `${REPORT_URL}`, - context: 'Code coverage report', - }) + const { REPORT_URL_NEW, COMMIT_SHA } = process.env await github.rest.repos.createCommitStatus({ owner: context.repo.owner, @@ -672,223 +461,252 @@ jobs: }) trigger-e2e-tests: + if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }} needs: [ check-permissions, promote-images, tag ] - runs-on: [ 
self-hosted, gen3, small ] - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - options: --init - steps: - - name: Set PR's status to pending and request a remote CI test - run: | - # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit - # but we need to use a real sha of a latest commit in the PR's branch for the e2e job, - # to place a job run status update later. - COMMIT_SHA=${{ github.event.pull_request.head.sha }} - # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those - COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} + uses: ./.github/workflows/trigger-e2e-tests.yml + secrets: inherit - REMOTE_REPO="${{ github.repository_owner }}/cloud" + neon-image-arch: + needs: [ check-permissions, build-build-tools-image, tag ] + strategy: + matrix: + arch: [ x64, arm64 ] - curl -f -X POST \ - https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \ - -H "Accept: application/vnd.github.v3+json" \ - --user "${{ secrets.CI_ACCESS_TOKEN }}" \ - --data \ - "{ - \"state\": \"pending\", - \"context\": \"neon-cloud-e2e\", - \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\" - }" - - curl -f -X POST \ - https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \ - -H "Accept: application/vnd.github.v3+json" \ - --user "${{ secrets.CI_ACCESS_TOKEN }}" \ - --data \ - "{ - \"ref\": \"main\", - \"inputs\": { - \"ci_job_name\": \"neon-cloud-e2e\", - \"commit_hash\": \"$COMMIT_SHA\", - \"remote_repo\": \"${{ github.repository }}\", - \"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\", - \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\", - \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\" - } - }" - - neon-image: - needs: [ check-permissions, build-buildtools-image, tag ] - runs-on: [ self-hosted, gen3, large ] - container: gcr.io/kaniko-project/executor:v1.9.2-debug - defaults: - run: - shell: sh -eu {0} + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - name: Checkout - uses: actions/checkout@v1 # v3 won't work with kaniko + uses: actions/checkout@v4 with: submodules: true fetch-depth: 0 - - name: Configure ECR and Docker Hub login - run: | - DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) - echo "::add-mask::${DOCKERHUB_AUTH}" + - uses: ./.github/actions/set-docker-config-dir + - uses: docker/setup-buildx-action@v3 + with: + cache-binary: false - cat <<-EOF > /kaniko/.docker/config.json - { - "auths": { - "https://index.docker.io/v1/": { - "auth": "${DOCKERHUB_AUTH}" - } - }, - "credHelpers": { - "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" - } - } - EOF + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Kaniko build neon - run: - /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true - --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache - --context . 
- --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }} - --build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }} - --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com - --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} - --destination neondatabase/neon:${{needs.tag.outputs.build-tag}} + - uses: docker/login-action@v3 + with: + registry: cache.neon.build + username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }} + password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }} - # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - - name: Cleanup ECR folder - run: rm -rf ~/.ecr + - uses: docker/build-push-action@v6 + with: + context: . + # ARM-specific flags are recommended for Graviton ≥ 2, these flags are also supported by Ampere Altra (Azure) + # https://github.com/aws/aws-graviton-getting-started/blob/57dc813626d0266f1cc12ef83474745bb1f31fb4/rust.md + build-args: | + ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }} + GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} + BUILD_TAG=${{ needs.tag.outputs.build-tag }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + provenance: false + push: true + pull: true + file: Dockerfile + cache-from: type=registry,ref=cache.neon.build/neon:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0},mode=max', matrix.arch) || '' }} + tags: | + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - compute-tools-image: - runs-on: [ self-hosted, gen3, large ] - needs: [ check-permissions, build-buildtools-image, tag ] - container: gcr.io/kaniko-project/executor:v1.9.2-debug - defaults: - run: - shell: sh -eu {0} + neon-image: + needs: [ neon-image-arch, tag ] + runs-on: ubuntu-22.04 steps: - - name: Checkout - uses: actions/checkout@v1 # v3 won't work with kaniko + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Configure ECR and Docker Hub login + - name: Create multi-arch image run: | - DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) - echo "::add-mask::${DOCKERHUB_AUTH}" + docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64 - cat <<-EOF > /kaniko/.docker/config.json - { - "auths": { - "https://index.docker.io/v1/": { - "auth": "${DOCKERHUB_AUTH}" - } - }, - "credHelpers": { - "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" - } - } - EOF + - uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - name: Kaniko build compute tools - run: - /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true - --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache - --context . 
-      --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
-      --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
-      --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
-      --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
-      --dockerfile Dockerfile.compute-tools
-      --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
-      --destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
-
-    # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
-    - name: Cleanup ECR folder
-      run: rm -rf ~/.ecr
-
-  compute-node-image:
-    needs: [ check-permissions, build-buildtools-image, tag ]
-    runs-on: [ self-hosted, gen3, large ]
-    container:
-      image: gcr.io/kaniko-project/executor:v1.9.2-debug
-      # Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
-      # Should be prevented by https://github.com/neondatabase/neon/issues/4281
-      options: --add-host=download.osgeo.org:140.211.15.30
+  compute-node-image-arch:
+    needs: [ check-permissions, build-build-tools-image, tag ]
    strategy:
      fail-fast: false
      matrix:
        version: [ v14, v15, v16 ]
-    defaults:
-      run:
-        shell: sh -eu {0}
+        arch: [ x64, arm64 ]
+
+    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}

    steps:
      - name: Checkout
-        uses: actions/checkout@v1 # v3 won't work with kaniko
+        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 0

-      - name: Configure ECR and Docker Hub login
+      - uses: ./.github/actions/set-docker-config-dir
+      - uses: docker/setup-buildx-action@v3
+        with:
+          cache-binary: false
+          # Disable parallelism for docker buildkit.
+          # Since we already build everything with `make -j$(nproc)`, running it at an additional level of parallelism overloads the runner.
+          buildkitd-config-inline: |
+            [worker.oci]
+            max-parallelism = 1
+
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+      - uses: docker/login-action@v3
+        with:
+          registry: cache.neon.build
+          username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }}
+          password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }}
+
+      - name: Build compute-node image
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          build-args: |
+            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+            PG_VERSION=${{ matrix.version }}
+            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
+          provenance: false
+          push: true
+          pull: true
+          file: Dockerfile.compute-node
+          cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
+          tags: |
+            neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
+
+      - name: Build neon extensions test image
+        if: matrix.version == 'v16'
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          build-args: |
+            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+            PG_VERSION=${{ matrix.version }}
+            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
+          provenance: false
+          push: true
+          pull: true
+          file: Dockerfile.compute-node
+          target: neon-pg-ext-test
+          cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
+          tags: |
+            neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}
+
+      - name: Build compute-tools image
+        # compute-tools is Postgres-independent, so build it only once
+        if: matrix.version == 'v16'
+        uses: docker/build-push-action@v6
+        with:
+          target: compute-tools-image
+          context: .
+ build-args: | + GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} + BUILD_TAG=${{ needs.tag.outputs.build-tag }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + provenance: false + push: true + pull: true + file: Dockerfile.compute-node + tags: | + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + + compute-node-image: + needs: [ compute-node-image-arch, tag ] + runs-on: ubuntu-22.04 + + strategy: + matrix: + version: [ v14, v15, v16 ] + + steps: + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Create multi-arch compute-node image run: | - DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) - echo "::add-mask::${DOCKERHUB_AUTH}" + docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ + neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 - cat <<-EOF > /kaniko/.docker/config.json - { - "auths": { - "https://index.docker.io/v1/": { - "auth": "${DOCKERHUB_AUTH}" - } - }, - "credHelpers": { - "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" - } - } - EOF + - name: Create multi-arch neon-test-extensions image + if: matrix.version == 'v16' + run: | + docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ + neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 - - name: Kaniko build compute node with extensions - run: - /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true - --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache - --context . 
- --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - --build-arg PG_VERSION=${{ matrix.version }} - --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} - --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} - --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com - --dockerfile Dockerfile.compute-node - --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - --cleanup + - name: Create multi-arch compute-tools image + if: matrix.version == 'v16' + run: | + docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64 - # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - - name: Cleanup ECR folder - run: rm -rf ~/.ecr + - uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} + + - name: Push multi-arch compute-node-${{ matrix.version }} image to ECR + run: | + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + + - name: Push multi-arch compute-tools image to ECR + if: matrix.version == 'v16' + run: | + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} vm-compute-node-image: needs: [ check-permissions, tag, compute-node-image ] - runs-on: [ self-hosted, gen3, large ] + runs-on: [ self-hosted, large ] strategy: fail-fast: false matrix: version: [ v14, v15, v16 ] - defaults: - run: - shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.21.0 + VM_BUILDER_VERSION: v0.29.3 steps: - name: Checkout - uses: actions/checkout@v1 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -897,33 +715,50 @@ jobs: curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder chmod +x vm-builder + - uses: ./.github/actions/set-docker-config-dir + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + # Note: we need a separate pull step here because otherwise vm-builder will try to pull, and # it won't have the proper authentication (written at v0.6.0) - name: Pulling compute-node image run: | - docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} - name: Build vm image run: | ./vm-builder \ -spec=vm-image-spec.yaml \ - -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \ - -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ + 
-dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} - name: Pushing vm-compute-node image run: | - docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} test-images: - needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ] - runs-on: [ self-hosted, gen3, small ] + needs: [ check-permissions, tag, neon-image, compute-node-image ] + strategy: + fail-fast: false + matrix: + arch: [ x64, arm64 ] + + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 + - uses: ./.github/actions/set-docker-config-dir + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library. # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify. # Regular pageserver version string looks like @@ -934,7 +769,7 @@ jobs: - name: Verify image versions shell: bash # ensure no set -e for better error messages run: | - pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") + pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") echo "Pageserver version string: $pageserver_version" @@ -948,8 +783,9 @@ jobs: exit 1 fi - - name: Verify docker-compose example - run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh + - name: Verify docker-compose example and test extensions + timeout-minutes: 20 + run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh - name: Print logs and clean up if: always() @@ -958,89 +794,97 @@ jobs: docker compose -f ./docker-compose/docker-compose.yml down promote-images: + permissions: + contents: read # This is required for actions/checkout + id-token: write # This is required for Azure Login to work. needs: [ check-permissions, tag, test-images, vm-compute-node-image ] - runs-on: [ self-hosted, gen3, small ] - container: golang:1.19-bullseye - # Don't add if-condition here. 
- # The job should always be run because we have dependant other jobs that shouldn't be skipped + runs-on: ubuntu-22.04 + + env: + VERSIONS: v14 v15 v16 steps: - - name: Install Crane & ECR helper - run: | - go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 - go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Configure ECR login - run: | - mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + - name: Login to dev ECR + uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - name: Copy vm-compute-node images to Docker Hub + - name: Copy vm-compute-node images to ECR run: | - crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14 - crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15 - crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16 + for version in ${VERSIONS}; do + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} + done + + - name: Azure login + if: github.ref_name == 'main' + uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 + with: + client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} + + - name: Login to ACR + if: github.ref_name == 'main' + run: | + az acr login --name=neoneastus2 + + - name: Copy docker images to ACR-dev + if: github.ref_name == 'main' + run: | + for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do + docker buildx imagetools create \ + -t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/${image}:${{ needs.tag.outputs.build-tag }} + done - name: Add latest tag to images - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + if: github.ref_name == 'main' run: | - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest - crane tag 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest + for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do + docker buildx imagetools create -t $repo/neon:latest \ + $repo/neon:${{ needs.tag.outputs.build-tag }} - - name: Push images to production ECR - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + docker buildx imagetools create -t $repo/compute-tools:latest \ + $repo/compute-tools:${{ needs.tag.outputs.build-tag }} + + for version in ${VERSIONS}; do + docker buildx imagetools create -t $repo/compute-node-${version}:latest \ + $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }} + + docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \ + $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} + done + done + docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \ + neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} + + - name: Login to prod ECR + uses: docker/login-action@v3 + if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + with: + registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_ACCESS_KEY_ID }} + password: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_SECRET_ACCESS_KEY }} + + - name: Copy all images to prod ECR + if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' run: | - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest - - - name: Configure Docker Hub login - run: | - # ECR Credential Helper & Docker Hub don't work together in config, hence reset - echo "" > /github/home/.docker/config.json - crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io - - - name: Push vm-compute-node to Docker Hub - run: | - crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} - crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} - crane push 
vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} - - - name: Push latest tags to Docker Hub - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - run: | - crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest - - - name: Cleanup ECR folder - run: rm -rf ~/.ecr + for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do + docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ + 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} + done trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Set PR's status to pending and request a remote CI test run: | @@ -1113,10 +957,10 @@ jobs: exit 1 deploy: - needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ] - if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' + needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ] + if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest steps: - name: Fix git ownership @@ -1134,9 +978,8 @@ jobs: done - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: - submodules: false fetch-depth: 0 - name: Trigger deploy workflow @@ -1144,20 +987,49 @@ jobs: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then - gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false - - # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions - gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true + gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false + gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} + gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ + -f deployPgSniRouter=false \ + -f deployProxy=false \ + -f deployStorage=true \ + -f deployStorageBroker=true \ + -f 
deployStorageController=true \
+            -f branch=main \
+            -f dockerTag=${{needs.tag.outputs.build-tag}} \
+            -f deployPreprodRegion=true
+
+          gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
+            -f deployStorage=true \
+            -f deployStorageBroker=true \
+            -f deployStorageController=true \
+            -f branch=main \
+            -f dockerTag=${{needs.tag.outputs.build-tag}}
+        elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
+          gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
+            -f deployPgSniRouter=true \
+            -f deployProxy=true \
+            -f deployStorage=false \
+            -f deployStorageBroker=false \
+            -f deployStorageController=false \
+            -f branch=main \
+            -f dockerTag=${{needs.tag.outputs.build-tag}} \
+            -f deployPreprodRegion=true
+
+          gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
+            -f deployPgSniRouter=true \
+            -f deployProxy=true \
+            -f branch=main \
+            -f dockerTag=${{needs.tag.outputs.build-tag}}
         else
           echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
           exit 1
         fi

       - name: Create git tag
-        if: github.ref_name == 'release'
-        uses: actions/github-script@v6
+        if: github.ref_name == 'release' || github.ref_name == 'release-proxy'
+        uses: actions/github-script@v7
         with:
           # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
           retries: 5
@@ -1169,9 +1041,10 @@ jobs:
             sha: context.sha,
           })

+      # TODO: check how GitHub releases look for proxy releases and enable it if it's ok
       - name: Create GitHub release
         if: github.ref_name == 'release'
-        uses: actions/github-script@v6
+        uses: actions/github-script@v7
         with:
           # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
           retries: 5
@@ -1183,40 +1056,124 @@ jobs:
             generate_release_notes: true,
           })

+  # The job runs on the `release` branch and copies the compatibility data and the Neon artifact from the last *release PR* to the `latest` directory
   promote-compatibility-data:
-    needs: [ check-permissions, promote-images, tag, regress-tests ]
+    needs: [ deploy ]
     if: github.ref_name == 'release'
-    runs-on: [ self-hosted, gen3, small ]
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
-      options: --init
+    runs-on: ubuntu-22.04

     steps:
-      - name: Promote compatibility snapshot for the release
+      - name: Fetch GITHUB_RUN_ID and COMMIT_SHA for the last merged release PR
+        id: fetch-last-release-pr-info
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          branch_name_and_pr_number=$(gh pr list \
+            --repo "${GITHUB_REPOSITORY}" \
+            --base release \
+            --state merged \
+            --limit 10 \
+            --json mergeCommit,headRefName,number \
+            --jq ".[] | select(.mergeCommit.oid==\"${GITHUB_SHA}\") | { branch_name: .headRefName, pr_number: .number }")
+          branch_name=$(echo "${branch_name_and_pr_number}" | jq -r '.branch_name')
+          pr_number=$(echo "${branch_name_and_pr_number}" | jq -r '.pr_number')
+
+          run_id=$(gh run list \
+            --repo "${GITHUB_REPOSITORY}" \
+            --workflow build_and_test.yml \
+            --branch "${branch_name}" \
+            --json databaseId \
+            --limit 1 \
+            --jq '.[].databaseId')
+
+          last_commit_sha=$(gh pr view "${pr_number}" \
+            --repo "${GITHUB_REPOSITORY}" \
+            --json commits \
+            --jq '.commits[-1].oid')
+
+          echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT}
+          echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT}
+
+      - name: Promote compatibility snapshot and Neon artifact
         env:
           BUCKET: neon-github-public-dev
-          PREFIX: artifacts/latest
+          AWS_REGION: eu-central-1
+          COMMIT_SHA: ${{ steps.fetch-last-release-pr-info.outputs.commit-sha }}
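+          # COMMIT_SHA and RUN_ID (below) together form the S3 prefix
+          # (artifacts/<commit-sha>/<run-id>) under which the release PR's
+          # artifacts were uploaded.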
+          RUN_ID: ${{ steps.fetch-last-release-pr-info.outputs.run-id }}
         run: |
-          # Update compatibility snapshot for the release
-          for pg_version in v14 v15 v16; do
-            for build_type in debug release; do
-              OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst
-              NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst
+          old_prefix="artifacts/${COMMIT_SHA}/${RUN_ID}"
+          new_prefix="artifacts/latest"

-              time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
+          files_to_promote=()
+          files_on_s3=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${old_prefix} | jq -r '.Contents[]?.Key' || true)
+
+          for arch in X64 ARM64; do
+            for build_type in debug release; do
+              neon_artifact_filename="neon-Linux-${arch}-${build_type}-artifact.tar.zst"
+              s3_key=$(echo "${files_on_s3}" | grep ${neon_artifact_filename} | sort --version-sort | tail -1 || true)
+              if [ -z "${s3_key}" ]; then
+                echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${neon_artifact_filename} nor its version from previous attempts exists"
+                exit 1
+              fi
+
+              files_to_promote+=("s3://${BUCKET}/${s3_key}")
+
+              for pg_version in v14 v15 v16; do
+                # We run fewer tests for debug builds, so we don't need to promote them
+                if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then
+                  continue
+                fi
+
+                compatibility_data_filename="compatibility-snapshot-${arch}-${build_type}-pg${pg_version}.tar.zst"
+                s3_key=$(echo "${files_on_s3}" | grep ${compatibility_data_filename} | sort --version-sort | tail -1 || true)
+                if [ -z "${s3_key}" ]; then
+                  echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${compatibility_data_filename} nor its version from previous attempts exists"
+                  exit 1
+                fi
+
+                files_to_promote+=("s3://${BUCKET}/${s3_key}")
+              done
+            done
           done

-          # Update Neon artifact for the release (reuse already uploaded artifact)
-          for build_type in debug release; do
-            OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
-            FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
-
-            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
-            if [ -z "${S3_KEY}" ]; then
-              echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
-              exit 1
-            fi
-
-            time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
+          for f in "${files_to_promote[@]}"; do
+            time aws s3 cp --only-show-errors ${f} s3://${BUCKET}/${new_prefix}/
           done
+
+  pin-build-tools-image:
+    needs: [ build-build-tools-image, promote-images, build-and-test-locally ]
+    if: github.ref_name == 'main'
+    uses: ./.github/workflows/pin-build-tools-image.yml
+    with:
+      from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }}
+    secrets: inherit
+
+  # This job simplifies setting branch protection rules (in GitHub UI)
+  # by allowing us to require only this job instead of listing many others.
+  # It also makes it easier to rename or parametrise jobs (using a matrix),
+  # which would otherwise require changes in the branch protection rules.
+  #
+  # Note that we can't add an external check (like `neon-cloud-e2e`) here; we still need to use the GitHub UI for that.
+  #
+  # https://github.com/neondatabase/neon/settings/branch_protection_rules
  conclusion:
+    if: always()
+    # Format `needs` differently to make the list more readable.
+    # Usually we do `needs: [...]`
+    needs:
+      - build-and-test-locally
+      - check-codestyle-python
+      - check-codestyle-rust
+      - promote-images
+      - test-images
+      - trigger-custom-extensions-build-and-wait
+    runs-on: ubuntu-22.04
+    steps:
+      # The list of possible results:
+      # https://docs.github.com/en/actions/learn-github-actions/contexts#needs-context
+      - name: Fail the job if any of the dependencies do not succeed
+        run: exit 1
+        if: |
+          contains(needs.*.result, 'failure')
+          || contains(needs.*.result, 'cancelled')
+          || contains(needs.*.result, 'skipped')
diff --git a/.github/workflows/check-build-tools-image.yml b/.github/workflows/check-build-tools-image.yml
new file mode 100644
index 0000000000..807a9ef3bd
--- /dev/null
+++ b/.github/workflows/check-build-tools-image.yml
@@ -0,0 +1,51 @@
+name: Check build-tools image
+
+on:
+  workflow_call:
+    outputs:
+      image-tag:
+        description: "build-tools image tag"
+        value: ${{ jobs.check-image.outputs.tag }}
+      found:
+        description: "Whether the image is found in the registry"
+        value: ${{ jobs.check-image.outputs.found }}
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
+jobs:
+  check-image:
+    runs-on: ubuntu-22.04
+    outputs:
+      tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
+      found: ${{ steps.check-image.outputs.found }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Get build-tools image tag for the current commit
+        id: get-build-tools-tag
+        env:
+          IMAGE_TAG: |
+            ${{ hashFiles('Dockerfile.build-tools',
+                          '.github/workflows/check-build-tools-image.yml',
+                          '.github/workflows/build-build-tools-image.yml') }}
+        run: |
+          echo "image-tag=${IMAGE_TAG}" | tee -a $GITHUB_OUTPUT
+
+      - name: Check if such a tag is found in the registry
+        id: check-image
+        env:
+          IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }}
+        run: |
+          if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then
+            found=true
+          else
+            found=false
+          fi
+
+          echo "found=${found}" | tee -a $GITHUB_OUTPUT
diff --git a/.github/workflows/check-permissions.yml b/.github/workflows/check-permissions.yml
new file mode 100644
index 0000000000..9c42794797
--- /dev/null
+++ b/.github/workflows/check-permissions.yml
@@ -0,0 +1,36 @@
+name: Check Permissions
+
+on:
+  workflow_call:
+    inputs:
+      github-event-name:
+        required: true
+        type: string
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
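+# For example, a job that needed to comment on PRs would re-grant just:
+#   permissions:
+#     pull-requests: write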
+permissions: {}
+
+jobs:
+  check-permissions:
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Disallow CI runs on PRs from forks
+        if: |
+          inputs.github-event-name == 'pull_request' &&
+          github.event.pull_request.head.repo.full_name != github.repository
+        run: |
+          if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
+            MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
+          else
+            MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
+          fi
+
+          # TODO: use actions/github-script to post this message as a PR comment
+          echo >&2 "We don't run CI for PRs from forks"
+          echo >&2 "${MESSAGE}"
+
+          exit 1
diff --git a/.github/workflows/cleanup-caches-by-a-branch.yml b/.github/workflows/cleanup-caches-by-a-branch.yml
new file mode 100644
index 0000000000..0c074e36dc
--- /dev/null
+++ b/.github/workflows/cleanup-caches-by-a-branch.yml
@@ -0,0 +1,32 @@
+# A workflow from
+# https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#force-deleting-cache-entries
+
+name: cleanup caches by a branch
+on:
+  pull_request:
+    types:
+      - closed
+
+jobs:
+  cleanup:
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Cleanup
+        run: |
+          gh extension install actions/gh-actions-cache
+
+          echo "Fetching list of cache keys"
+          cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH -L 100 | cut -f 1 )
+
+          ## Setting this to not fail the workflow while deleting cache keys.
+          set +e
+          echo "Deleting caches..."
+          for cacheKey in $cacheKeysForPR
+          do
+            gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm
+          done
+          echo "Done"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO: ${{ github.repository }}
+          BRANCH: refs/pull/${{ github.event.pull_request.number }}/merge
diff --git a/.github/workflows/label-for-external-users.yml b/.github/workflows/label-for-external-users.yml
new file mode 100644
index 0000000000..585d118dfb
--- /dev/null
+++ b/.github/workflows/label-for-external-users.yml
@@ -0,0 +1,54 @@
+name: Add `external` label to issues and PRs created by external users
+
+on:
+  issues:
+    types:
+      - opened
+  pull_request_target:
+    types:
+      - opened
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
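+# (The add-label job below re-grants only `issues: write` and `pull-requests: write`.)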
+permissions: {} + +env: + LABEL: external + +jobs: + check-user: + runs-on: ubuntu-22.04 + + outputs: + is-member: ${{ steps.check-user.outputs.is-member }} + + steps: + - name: Check whether `${{ github.actor }}` is a member of `${{ github.repository_owner }}` + id: check-user + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then + is_member=true + else + is_member=false + fi + + echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT} + + add-label: + if: needs.check-user.outputs.is-member == 'false' + needs: [ check-user ] + + runs-on: ubuntu-22.04 + permissions: + pull-requests: write # for `gh pr edit` + issues: write # for `gh issue edit` + + steps: + - name: Add `${{ env.LABEL }}` label + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request_target' && 'pull_request' || 'issue'].number }} + GH_CLI_COMMAND: ${{ github.event_name == 'pull_request_target' && 'pr' || 'issue' }} + run: | + gh ${GH_CLI_COMMAND} --repo ${GITHUB_REPOSITORY} edit --add-label=${LABEL} ${ITEM_NUMBER} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index c6c2b7386a..7fecdbde8c 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -20,13 +20,31 @@ env: COPT: '-Werror' jobs: + check-permissions: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name}} + + check-build-tools-image: + needs: [ check-permissions ] + uses: ./.github/workflows/check-build-tools-image.yml + + build-build-tools-image: + needs: [ check-build-tools-image ] + uses: ./.github/workflows/build-build-tools-image.yml + with: + image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} + secrets: inherit + check-macos-build: + needs: [ check-permissions ] if: | contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' timeout-minutes: 90 - runs-on: macos-latest + runs-on: macos-14 env: # Use release build only, to have less debug info around @@ -38,7 +56,6 @@ jobs: uses: actions/checkout@v4 with: submodules: true - fetch-depth: 1 - name: Install macOS postgres dependencies run: brew install flex bison openssl protobuf icu4c pkg-config @@ -57,24 +74,24 @@ jobs: - name: Cache postgres v14 build id: cache_pg_14 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v15 build id: cache_pg_15 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v16 build id: cache_pg_16 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ 
steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Set extra env for macOS run: | @@ -82,14 +99,14 @@ jobs: echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - name: Cache cargo deps - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: | ~/.cargo/registry !~/.cargo/registry/src ~/.cargo/git target - key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust + key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' @@ -110,214 +127,27 @@ jobs: run: make walproposer-lib -j$(sysctl -n hw.ncpu) - name: Run cargo build - run: cargo build --all --release + run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release - name: Check that no warnings are produced run: ./run_clippy.sh - check-linux-arm-build: - timeout-minutes: 90 - runs-on: [ self-hosted, dev, arm64 ] - - env: - # Use release build only, to have less debug info around - # Hence keeping target/ (and general cache size) smaller - BUILD_TYPE: release - CARGO_FEATURES: --features testing - CARGO_FLAGS: --locked --release - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} - - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned - options: --init - - steps: - - name: Fix git ownership - run: | - # Workaround for `fatal: detected dubious ownership in repository at ...` - # - # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers - # Ref https://github.com/actions/checkout/issues/785 - # - git config --global --add safe.directory ${{ github.workspace }} - git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do - git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" - git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" - done - - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 1 - - - name: Set pg 14 revision for caching - id: pg_v14_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT - - - name: Set pg 15 revision for caching - id: pg_v15_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT - - - name: Set pg 16 revision for caching - id: pg_v16_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT - - - name: Set env variables - run: | - echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV - - - name: Cache postgres v14 build - id: cache_pg_14 - uses: actions/cache@v3 - with: - path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v15 build - id: cache_pg_15 - uses: actions/cache@v3 - with: - path: pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v16 build - id: cache_pg_16 - uses: actions/cache@v3 - with: - path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ 
steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Build postgres v14 - if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: mold -run make postgres-v14 -j$(nproc) - - - name: Build postgres v15 - if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: mold -run make postgres-v15 -j$(nproc) - - - name: Build postgres v16 - if: steps.cache_pg_16.outputs.cache-hit != 'true' - run: mold -run make postgres-v16 -j$(nproc) - - - name: Build neon extensions - run: mold -run make neon-pg-ext -j$(nproc) - - - name: Build walproposer-lib - run: mold -run make walproposer-lib -j$(nproc) - - - name: Run cargo build - run: | - mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - - - name: Run cargo test - run: | - cargo test $CARGO_FLAGS $CARGO_FEATURES - - # Run separate tests for real S3 - export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests - export REMOTE_STORAGE_S3_REGION=eu-central-1 - # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3 - - # Run separate tests for real Azure Blob Storage - # XXX: replace region with `eu-central-1`-like region - export ENABLE_REAL_AZURE_REMOTE_STORAGE=y - export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" - export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" - export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" - export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" - # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure - - check-codestyle-rust-arm: - timeout-minutes: 90 - runs-on: [ self-hosted, dev, arm64 ] - - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned - options: --init - - steps: - - name: Fix git ownership - run: | - # Workaround for `fatal: detected dubious ownership in repository at ...` - # - # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers - # Ref https://github.com/actions/checkout/issues/785 - # - git config --global --add safe.directory ${{ github.workspace }} - git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do - git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" - git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" - done - - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 1 - - # Some of our rust modules use FFI and need those to be checked - - name: Get postgres headers - run: make postgres-headers -j$(nproc) - - # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. - # This will catch compiler & clippy warnings in all feature combinations. - # TODO: use cargo hack for build and test as well, but, that's quite expensive. 
- # NB: keep clippy args in sync with ./run_clippy.sh - - run: | - CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" - if [ "$CLIPPY_COMMON_ARGS" = "" ]; then - echo "No clippy args found in .neon_clippy_args" - exit 1 - fi - echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV - - name: Run cargo clippy (debug) - run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS - - name: Run cargo clippy (release) - run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS - - - name: Check documentation generation - run: cargo doc --workspace --no-deps --document-private-items - env: - RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" - - # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run - - name: Check formatting - if: ${{ !cancelled() }} - run: cargo fmt --all -- --check - - # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - - name: Check rust dependencies - if: ${{ !cancelled() }} - run: | - cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date - cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack - - # https://github.com/EmbarkStudios/cargo-deny - - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() }} - run: cargo deny check - gather-rust-build-stats: + needs: [ check-permissions, build-build-tools-image ] if: | contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' - runs-on: [ self-hosted, gen3, large ] + runs-on: [ self-hosted, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init env: BUILD_TYPE: release - # remove the cachepot wrapper and build without crate caches - RUSTC_WRAPPER: "" # build with incremental compilation produce partial results # so do not attempt to cache this build, also disable the incremental compilation CARGO_INCREMENTAL: 0 @@ -327,7 +157,6 @@ jobs: uses: actions/checkout@v4 with: submodules: true - fetch-depth: 1 # Some of our rust modules use FFI and need those to be checked - name: Get postgres headers @@ -337,7 +166,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: cargo build --all --release --timings + run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release --timings -j$(nproc) - name: Upload the build stats id: upload-stats @@ -352,7 +181,7 @@ jobs: echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT - name: Publish build stats report - uses: actions/github-script@v6 + uses: actions/github-script@v7 env: REPORT_URL: ${{ steps.upload-stats.outputs.report-url }} SHA: ${{ github.event.pull_request.head.sha || github.sha }} diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml new file mode 100644 index 0000000000..615937b5a1 --- /dev/null +++ b/.github/workflows/periodic_pagebench.yml @@ -0,0 +1,155 @@ +name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ 
┌───────────── day of the month (1 - 31)
+  #         │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+  #         │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron: '0 18 * * *' # Runs at 6 PM UTC every day
+  workflow_dispatch: # Allows manual triggering of the workflow
+    inputs:
+      commit_hash:
+        type: string
+        description: 'The long (full) neon repo commit hash for the system under test (pageserver).'
+        required: false
+        default: ''
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+concurrency:
+  group: ${{ github.workflow }}
+  cancel-in-progress: false
+
+jobs:
+  trigger_bench_on_ec2_machine_in_eu_central_1:
+    runs-on: [ self-hosted, small ]
+    container:
+      image: neondatabase/build-tools:pinned
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      options: --init
+    timeout-minutes: 360 # Set the timeout to 6 hours
+    env:
+      API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
+      RUN_ID: ${{ github.run_id }}
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }}
+      AWS_DEFAULT_REGION: "eu-central-1"
+      AWS_INSTANCE_ID: "i-02a59a3bf86bc7e74"
+    steps:
+      # we don't need the neon source code because we run everything remotely
+      # however we still need the local github actions to run the allure step below
+      - uses: actions/checkout@v4
+
+      - name: Show my own (github runner) external IP address - useful for IP allowlisting
+        run: curl https://ifconfig.me
+
+      - name: Start EC2 instance and wait for the instance to boot up
+        run: |
+          aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID
+          aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID
+          sleep 60 # sleep some time to allow cloudinit and our API server to start up
+
+      - name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US
+        run: |
+          public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text)
+          echo "Public IP of the EC2 instance: $public_ip"
+          echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV
+
+      - name: Determine commit hash
+        env:
+          INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
+        run: |
+          if [ -z "$INPUT_COMMIT_HASH" ]; then
+            echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
+          else
+            echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
+          fi
+
+      - name: Start Bench with run_id
+        run: |
+          curl -k -X 'POST' \
+            "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \
+            -H 'accept: application/json' \
+            -H 'Content-Type: application/json' \
+            -H "Authorization: Bearer $API_KEY" \
+            -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"
+
+      - name: Poll Test Status
+        id: poll_step
+        run: |
+          status=""
+          while [[ "$status" != "failure" && "$status" != "success" ]]; do
+            response=$(curl -k -X 'GET' \
+              "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \
+              -H 'accept: application/json' \
+              -H "Authorization: Bearer $API_KEY")
+            echo "Response: $response"
+            set +x
+            status=$(echo $response | jq -r '.status')
+            echo "Test status: $status"
+            if [[ "$status" == "failure" ]]; then
+              echo "Test failed"
+              exit 1 # Fail the job step if status is failure
+            elif [[ "$status" == "success" || "$status" == "null" ]]; then
+              break
+            elif [[ "$status" == "too_many_runs" ]]; then
+              echo "Too many runs already running"
+              echo "too_many_runs=true" >> "$GITHUB_OUTPUT"
+
exit 1 + fi + + sleep 60 # Poll every 60 seconds + done + + - name: Retrieve Test Logs + if: always() && steps.poll_step.outputs.too_many_runs != 'true' + run: | + curl -k -X 'GET' \ + "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \ + -H 'accept: application/gzip' \ + -H "Authorization: Bearer $API_KEY" \ + --output "test_log_${GITHUB_RUN_ID}.gz" + + - name: Unzip Test Log and Print it into this job's log + if: always() && steps.poll_step.outputs.too_many_runs != 'true' + run: | + gzip -d "test_log_${GITHUB_RUN_ID}.gz" + cat "test_log_${GITHUB_RUN_ID}" + + - name: Create Allure report + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + + - name: Cleanup Test Resources + if: always() + run: | + curl -k -X 'POST' \ + "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer $API_KEY" \ + -d '' + + - name: Stop EC2 instance and wait for the instance to be stopped + if: always() && steps.poll_step.outputs.too_many_runs != 'true' + run: | + aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID + aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml new file mode 100644 index 0000000000..23a2e3876c --- /dev/null +++ b/.github/workflows/pg-clients.yml @@ -0,0 +1,211 @@ +name: Test Postgres client libraries + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '23 02 * * *' # run once a day, timezone is utc + pull_request: + paths: + - '.github/workflows/pg-clients.yml' + - 'test_runner/pg_clients/**' + - 'test_runner/logical_repl/**' + - 'poetry.lock' + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref_name }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +defaults: + run: + shell: bash -euxo pipefail {0} + +env: + DEFAULT_PG_VERSION: 16 + PLATFORM: neon-captest-new + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + AWS_DEFAULT_REGION: eu-central-1 + +jobs: + check-permissions: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name }} + + check-build-tools-image: + needs: [ check-permissions ] + uses: ./.github/workflows/check-build-tools-image.yml + + build-build-tools-image: + needs: [ check-build-tools-image ] + uses: ./.github/workflows/build-build-tools-image.yml + with: + image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} + secrets: inherit + + test-logical-replication: + needs: [ build-build-tools-image ] + runs-on: ubuntu-22.04 + + container: + image: ${{ 
needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init --user root + services: + clickhouse: + image: clickhouse/clickhouse-server:24.6.3.64 + ports: + - 9000:9000 + - 8123:8123 + zookeeper: + image: quay.io/debezium/zookeeper:2.7 + ports: + - 2181:2181 + kafka: + image: quay.io/debezium/kafka:2.7 + env: + ZOOKEEPER_CONNECT: "zookeeper:2181" + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 + KAFKA_BROKER_ID: 1 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_JMX_PORT: 9991 + ports: + - 9092:9092 + debezium: + image: quay.io/debezium/connect:2.7 + env: + BOOTSTRAP_SERVERS: kafka:9092 + GROUP_ID: 1 + CONFIG_STORAGE_TOPIC: debezium-config + OFFSET_STORAGE_TOPIC: debezium-offset + STATUS_STORAGE_TOPIC: debezium-status + DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector + ports: + - 8083:8083 + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Create Neon Project + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + postgres_version: ${{ env.DEFAULT_PG_VERSION }} + + - name: Run tests + uses: ./.github/actions/run-python-test-set + with: + build_type: remote + test_selection: logical_repl + run_in_parallel: false + extra_params: -m remote_cluster + pg_version: ${{ env.DEFAULT_PG_VERSION }} + env: + BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} + + - name: Delete Neon Project + if: always() + uses: ./.github/actions/neon-project-delete + with: + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + if: ${{ !cancelled() }} + id: create-allure-report + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + + - name: Post to a Slack channel + if: github.event.schedule && failure() + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream + slack-message: | + Testing the logical replication: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>) + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + + test-postgres-client-libs: + needs: [ build-build-tools-image ] + runs-on: ubuntu-22.04 + + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init --user root + + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Create Neon Project + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + postgres_version: ${{ env.DEFAULT_PG_VERSION }} + + - name: Run tests + uses: ./.github/actions/run-python-test-set + with: + build_type: remote + test_selection: pg_clients + run_in_parallel: false + 
extra_params: -m remote_cluster + pg_version: ${{ env.DEFAULT_PG_VERSION }} + env: + BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} + + - name: Delete Neon Project + if: always() + uses: ./.github/actions/neon-project-delete + with: + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + if: ${{ !cancelled() }} + id: create-allure-report + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + + - name: Post to a Slack channel + if: github.event.schedule && failure() + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream + slack-message: | + Testing Postgres clients: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>) + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml deleted file mode 100644 index 224b7b4a6d..0000000000 --- a/.github/workflows/pg_clients.yml +++ /dev/null @@ -1,99 +0,0 @@ -name: Test Postgres client libraries - -on: - schedule: - # * is a special character in YAML so you have to quote this string - # ┌───────────── minute (0 - 59) - # │ ┌───────────── hour (0 - 23) - # │ │ ┌───────────── day of the month (1 - 31) - # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) - # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '23 02 * * *' # run once a day, timezone is utc - - workflow_dispatch: - -concurrency: - # Allow only one workflow per any non-`main` branch. 
- group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} - cancel-in-progress: true - -jobs: - test-postgres-client-libs: - # TODO: switch to gen2 runner, requires docker - runs-on: [ ubuntu-latest ] - - env: - DEFAULT_PG_VERSION: 14 - TEST_OUTPUT: /tmp/test_output - - steps: - - name: Checkout - uses: actions/checkout@v3 - - - uses: actions/setup-python@v4 - with: - python-version: 3.9 - - - name: Install Poetry - uses: snok/install-poetry@v1 - - - name: Cache poetry deps - id: cache_poetry - uses: actions/cache@v3 - with: - path: ~/.cache/pypoetry/virtualenvs - key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} - - - name: Install Python deps - shell: bash -euxo pipefail {0} - run: ./scripts/pysync - - - name: Create Neon Project - id: create-neon-project - uses: ./.github/actions/neon-project-create - with: - api_key: ${{ secrets.NEON_STAGING_API_KEY }} - postgres_version: ${{ env.DEFAULT_PG_VERSION }} - - - name: Run pytest - env: - REMOTE_ENV: 1 - BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} - POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - shell: bash -euxo pipefail {0} - run: | - # Test framework expects we have psql binary; - # but since we don't really need it in this test, let's mock it - mkdir -p "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin" && touch "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin/psql"; - ./scripts/pytest \ - --junitxml=$TEST_OUTPUT/junit.xml \ - --tb=short \ - --verbose \ - -m "remote_cluster" \ - -rA "test_runner/pg_clients" - - - name: Delete Neon Project - if: ${{ always() }} - uses: ./.github/actions/neon-project-delete - with: - project_id: ${{ steps.create-neon-project.outputs.project_id }} - api_key: ${{ secrets.NEON_STAGING_API_KEY }} - - # We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI. - # It will be fixed after switching to gen2 runner - - name: Upload python test logs - if: always() - uses: actions/upload-artifact@v3 - with: - retention-days: 7 - name: python-test-pg_clients-${{ runner.os }}-stage-logs - path: ${{ env.TEST_OUTPUT }} - - - name: Post to a Slack channel - if: ${{ github.event.schedule && failure() }} - uses: slackapi/slack-github-action@v1 - with: - channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - env: - SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml new file mode 100644 index 0000000000..2e79498fc4 --- /dev/null +++ b/.github/workflows/pin-build-tools-image.yml @@ -0,0 +1,101 @@ +name: 'Pin build-tools image' + +on: + workflow_dispatch: + inputs: + from-tag: + description: 'Source tag' + required: true + type: string + force: + description: 'Force the image to be pinned' + default: false + type: boolean + workflow_call: + inputs: + from-tag: + description: 'Source tag' + required: true + type: string + force: + description: 'Force the image to be pinned' + default: false + type: boolean + +defaults: + run: + shell: bash -euo pipefail {0} + +concurrency: + group: pin-build-tools-image-${{ inputs.from-tag }} + cancel-in-progress: false + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
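+# (Of the jobs below, `check-manifests` needs no token scopes at all, and
+# `tag-image` re-grants only `id-token: write` for the Azure OIDC login.)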
+permissions: {}
+
+env:
+  FROM_TAG: ${{ inputs.from-tag }}
+  TO_TAG: pinned
+
+jobs:
+  check-manifests:
+    runs-on: ubuntu-22.04
+    outputs:
+      skip: ${{ steps.check-manifests.outputs.skip }}
+
+    steps:
+      - name: Check if we really need to pin the image
+        id: check-manifests
+        run: |
+          docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json
+          docker manifest inspect neondatabase/build-tools:${TO_TAG} > ${TO_TAG}.json
+
+          if diff ${FROM_TAG}.json ${TO_TAG}.json; then
+            skip=true
+          else
+            skip=false
+          fi
+
+          echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
+
+  tag-image:
+    needs: check-manifests
+
+    # use format(..) to catch both inputs.force = true AND inputs.force = 'true'
+    if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true'
+
+    runs-on: ubuntu-22.04
+
+    permissions:
+      id-token: write # for `azure/login`
+
+    steps:
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+      - name: Azure login
+        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
+        with:
+          client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
+          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
+
+      - name: Login to ACR
+        run: |
+          az acr login --name=neoneastus2
+
+      - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR
+        run: |
+          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
+                                          -t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \
+                                          -t neondatabase/build-tools:${TO_TAG} \
+                                          neondatabase/build-tools:${FROM_TAG}
diff --git a/.github/workflows/release-notify.yml b/.github/workflows/release-notify.yml
index ba396dba74..8bd10e993c 100644
--- a/.github/workflows/release-notify.yml
+++ b/.github/workflows/release-notify.yml
@@ -19,7 +19,7 @@ on:
 jobs:
   notify:
-    runs-on: [ ubuntu-latest ]
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: neondatabase/dev-actions/release-pr-notify@main
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index ba37c5827a..56ef6f4bbb 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -2,12 +2,31 @@ name: Create Release Branch
 
 on:
   schedule:
-    - cron: '0 6 * * 1'
+    # It should be kept in sync with the if-conditions in the jobs below
+    - cron: '0 6 * * MON' # Storage release
+    - cron: '0 6 * * THU' # Proxy release
   workflow_dispatch:
+    inputs:
+      create-storage-release-branch:
+        type: boolean
+        description: 'Create Storage release PR'
+        required: false
+      create-proxy-release-branch:
+        type: boolean
+        description: 'Create Proxy release PR'
+        required: false
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
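+# (Both release-branch jobs below re-grant only `contents: write`, which `git push` needs.)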
+permissions: {}
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
 
 jobs:
-  create_release_branch:
-    runs-on: [ ubuntu-latest ]
+  create-storage-release-branch:
+    if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
+    runs-on: ubuntu-22.04
 
     permissions:
       contents: write # for `git push`
@@ -18,27 +37,71 @@ jobs:
         with:
           ref: main
 
-      - name: Get current date
-        id: date
-        run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
+      - name: Set environment variables
+        run: |
+          echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
+          echo "RELEASE_BRANCH=rc/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
 
       - name: Create release branch
-        run: git checkout -b releases/${{ steps.date.outputs.date }}
+        run: git checkout -b $RELEASE_BRANCH
 
       - name: Push new branch
-        run: git push origin releases/${{ steps.date.outputs.date }}
+        run: git push origin $RELEASE_BRANCH
 
       - name: Create pull request into release
         env:
           GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
         run: |
-          cat << EOF > body.md
-          ## Release ${{ steps.date.outputs.date }}
+          TITLE="Storage & Compute release ${RELEASE_DATE}"
 
-          **Please merge this PR using 'Create a merge commit'!**
+          cat << EOF > body.md
+          ## ${TITLE}
+
+          **Please merge this Pull Request using the 'Create a merge commit' button**
           EOF
 
-          gh pr create --title "Release ${{ steps.date.outputs.date }}" \
+          gh pr create --title "${TITLE}" \
             --body-file "body.md" \
-            --head "releases/${{ steps.date.outputs.date }}" \
+            --head "${RELEASE_BRANCH}" \
             --base "release"
+
+  create-proxy-release-branch:
+    if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
+    runs-on: ubuntu-22.04
+
+    permissions:
+      contents: write # for `git push`
+
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v4
+        with:
+          ref: main
+
+      - name: Set environment variables
+        run: |
+          echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
+          echo "RELEASE_BRANCH=rc/proxy/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
+
+      - name: Create release branch
+        run: git checkout -b $RELEASE_BRANCH
+
+      - name: Push new branch
+        run: git push origin $RELEASE_BRANCH
+
+      - name: Create pull request into release
+        env:
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+        run: |
+          TITLE="Proxy release ${RELEASE_DATE}"
+
+          cat << EOF > body.md
+          ## ${TITLE}
+
+          **Please merge this Pull Request using the 'Create a merge commit' button**
+          EOF
+
+          gh pr create --title "${TITLE}" \
+            --body-file "body.md" \
+            --head "${RELEASE_BRANCH}" \
+            --base "release-proxy"
diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml
new file mode 100644
index 0000000000..6fbe785c56
--- /dev/null
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -0,0 +1,147 @@
+name: Trigger E2E Tests
+
+on:
+  pull_request:
+    types:
+      - ready_for_review
+  workflow_call:
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+env:
+  # A concurrency group that we use for e2e-tests runs; it matches `concurrency.group` above, with `github.repository` as a prefix
+  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+
+jobs:
+  cancel-previous-e2e-tests:
+    if: github.event_name == 'pull_request'
+    runs-on: ubuntu-22.04
+
+    steps:
+      - name: Cancel previous e2e-tests runs for this PR
+        env:
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+        run: |
+          gh workflow --repo neondatabase/cloud \
+            run cancel-previous-in-concurrency-group.yml \
+            --field concurrency_group="${{ 
env.E2E_CONCURRENCY_GROUP }}" + + tag: + runs-on: ubuntu-22.04 + outputs: + build-tag: ${{ steps.build-tag.outputs.tag }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Get build tag + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }} + CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + run: | + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId') + echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT + fi + id: build-tag + + trigger-e2e-tests: + needs: [ tag ] + runs-on: ubuntu-22.04 + env: + EVENT_ACTION: ${{ github.event.action }} + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + TAG: ${{ needs.tag.outputs.build-tag }} + steps: + - name: Wait for `promote-images` job to finish + # It's important to have a timeout here, the script in the step can run infinitely + timeout-minutes: 60 + run: | + if [ "${GITHUB_EVENT_NAME}" != "pull_request" ] || [ "${EVENT_ACTION}" != "ready_for_review" ]; then + exit 0 + fi + + # For PRs we use the run id as the tag + BUILD_AND_TEST_RUN_ID=${TAG} + while true; do + conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion') + case "$conclusion" in + success) + break + ;; + failure | cancelled | skipped) + echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..." + exit 1 + ;; + *) + echo "The 'promote-images' hasn't succeed yet. Waiting..." + sleep 60 + ;; + esac + done + + - name: Set e2e-platforms + id: e2e-platforms + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + # Default set of platforms to run e2e tests on + platforms='["docker", "k8s"]' + + # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms. + # If the workflow run is not a pull request, add k8s-neonvm to the list. + if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then + for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do + case "$f" in + vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node) + platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique') + ;; + *) + # no-op + ;; + esac + done + else + platforms=$(echo "${platforms}" | jq --compact-output '. 
+= ["k8s-neonvm"] | unique') + fi + + echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT + + - name: Set PR's status to pending and request a remote CI test + env: + E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }} + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud" + + gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \ + --method POST \ + --raw-field "state=pending" \ + --raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \ + --raw-field "context=neon-cloud-e2e" + + gh workflow --repo ${REMOTE_REPO} \ + run testing.yml \ + --ref "main" \ + --raw-field "ci_job_name=neon-cloud-e2e" \ + --raw-field "commit_hash=$COMMIT_SHA" \ + --raw-field "remote_repo=${GITHUB_REPOSITORY}" \ + --raw-field "storage_image_tag=${TAG}" \ + --raw-field "compute_image_tag=${TAG}" \ + --raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \ + --raw-field "e2e-platforms=${E2E_PLATFORMS}" diff --git a/.github/workflows/update_build_tools_image.yml b/.github/workflows/update_build_tools_image.yml deleted file mode 100644 index 88bab797b7..0000000000 --- a/.github/workflows/update_build_tools_image.yml +++ /dev/null @@ -1,130 +0,0 @@ -name: 'Update build tools image tag' - -# This workflow it used to update tag of build tools in ECR. -# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image. - -on: - workflow_dispatch: - inputs: - from-tag: - description: 'Source tag' - required: true - type: string - to-tag: - description: 'Destination tag' - required: true - type: string - default: 'pinned' - -defaults: - run: - shell: bash -euo pipefail {0} - -env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} - -permissions: {} - -jobs: - tag-image: - runs-on: [ self-hosted, gen3, small ] - container: golang:1.19-bullseye - - env: - IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: ${{ inputs.to-tag }} - outputs: - next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }} - prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }} - - steps: - - name: Install Crane & ECR helper - run: | - go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 - go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 - - - name: Configure ECR login - run: | - mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json - - - name: Get source image digest - id: next-digest - run: | - NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true) - if [ -z "${NEXT_DIGEST}" ]; then - echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist" - exit 1 - fi - - echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}" - echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT - - - name: Get destination image digest (if already exists) - id: prev-digest - run: | - PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true) - if [ -z "${PREV_DIGEST}" ]; then - echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)" - else - echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}" - - echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT - fi - - - name: Tag image - run: | 
- crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}" - - rollback-tag-image: - needs: tag-image - if: ${{ !success() }} - - runs-on: [ self-hosted, gen3, small ] - container: golang:1.19-bullseye - - env: - IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: ${{ inputs.to-tag }} - - steps: - - name: Install Crane & ECR helper - run: | - go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 - go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 - - - name: Configure ECR login - run: | - mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json - - - name: Restore previous tag if needed - run: | - NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}" - PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}" - - if [ -z "${NEXT_DIGEST}" ]; then - echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback" - exit 0 - fi - - if [ -z "${PREV_DIGEST}" ]; then - # I guess we should delete the tag here/untag the image, but crane does not support it - # - https://github.com/google/go-containerregistry/issues/999 - - echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback" - - exit 0 - fi - - CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}") - if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then - crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}" - - echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}" - else - echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored" - fi diff --git a/.gitignore b/.gitignore index 3f4495c9e7..2c38cdcc59 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ test_output/ neon.iml /.neon /integration_tests/.neon +compaction-suite-results.* # Coverage *.profraw diff --git a/.neon_clippy_args b/.neon_clippy_args index 25e09c61a6..4db32cf35c 100644 --- a/.neon_clippy_args +++ b/.neon_clippy_args @@ -1,4 +1,5 @@ # * `-A unknown_lints` – do not warn about unknown lint suppressions # that people with newer toolchains might use # * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) -export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings" +# * `-D clippy::todo` - don't let `todo!()` slip into `main` +export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings -D clippy::todo" diff --git a/CODEOWNERS b/CODEOWNERS index e384dc39f1..606dbb4e22 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,12 +1,13 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute -/control_plane/ @neondatabase/compute @neondatabase/storage -/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage -/libs/postgres_ffi/ @neondatabase/compute +/storage_controller @neondatabase/storage +/libs/pageserver_api/ @neondatabase/storage +/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage /libs/remote_storage/ @neondatabase/storage -/libs/safekeeper_api/ @neondatabase/safekeepers -/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute +/libs/safekeeper_api/ @neondatabase/storage +/libs/vm_monitor/ @neondatabase/autoscaling /pageserver/ @neondatabase/storage /pgxn/ @neondatabase/compute +/pgxn/neon/ @neondatabase/compute @neondatabase/storage /proxy/ 
@neondatabase/proxy
-/safekeeper/          @neondatabase/safekeepers
+/safekeeper/          @neondatabase/storage
 /vendor/              @neondatabase/compute
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index b318c295a3..164eb77f58 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -20,7 +20,7 @@ ln -s ../../pre-commit.py .git/hooks/pre-commit
 This will run following checks on staged files before each commit:
 - `rustfmt`
-- checks for python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
+- checks for Python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
 
 There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project
 and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date.
@@ -54,6 +54,9 @@ _An instruction for maintainers_
 - If and only if it looks **safe** (i.e. it doesn't contain any malicious code which could expose secrets or harm the CI), then:
   - Press the "Approve and run" button in GitHub UI
   - Add the `approved-for-ci-run` label to the PR
+  - Currently, a draft PR skips the e2e tests (internal contributors only); once the PR is marked 'Ready for review', CI will trigger them
+  - Add the `run-e2e-tests-in-draft` label to run the e2e tests on a draft PR (this overrides the behaviour above)
+  - The `approved-for-ci-run` workflow adds `run-e2e-tests-in-draft` automatically, so the e2e tests do run for external contributors
 
   Repeat all steps after any change to the PR.
 - When the changes are ready to get merged — merge the original PR (not the internal one)
@@ -71,16 +74,11 @@ We're using the following approach to make it work:
 
 For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
 
-## How do I add the "pinned" tag to an buildtools image?
-We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation.
+## How do I make the build-tools image "pinned"?
 
-You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
-or using GitHub CLI:
+It's possible to update the `pinned` tag of the `build-tools` image using the `pin-build-tools-image.yml` workflow.
 
 ```bash
-gh workflow -R neondatabase/neon run update_build_tools_image.yml \
-  -f from-tag=6254913013 \
-  -f to-tag=pinned \
-
-# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
-``` \ No newline at end of file +gh workflow -R neondatabase/neon run pin-build-tools-image.yml \ + -f from-tag=cc98d9b00d670f182c507ae3783342bd7e64c31e +``` diff --git a/Cargo.lock b/Cargo.lock index f0e8b6a0ed..634af67198 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.8.5" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "const-random", @@ -241,7 +241,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -252,7 +252,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -262,40 +262,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba" [[package]] -name = "atomic-polyfill" -version = "1.0.2" +name = "atomic-take" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c314e70d181aa6053b26e3f7fbf86d1dfff84f816a6175b967666b3506ef7289" -dependencies = [ - "critical-section", -] - -[[package]] -name = "attachment_service" -version = "0.1.0" -dependencies = [ - "anyhow", - "camino", - "clap", - "control_plane", - "futures", - "git-version", - "hyper", - "metrics", - "pageserver_api", - "pageserver_client", - "postgres_backend", - "postgres_connection", - "scopeguard", - "serde", - "serde_json", - "thiserror", - "tokio", - "tokio-util", - "tracing", - "utils", - "workspace_hack", -] +checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3" [[package]] name = "autocfg" @@ -305,12 +275,11 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "1.0.1" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80c950a809d39bc9480207cb1cfc879ace88ea7e3a4392a8e9999e45d6e5692e" +checksum = "baaa0be6ee7d90b775ae6ccb6d2ba182b91219ec2001f92338773a094246af1d" dependencies = [ "aws-credential-types", - "aws-http", "aws-runtime", "aws-sdk-sso", "aws-sdk-ssooidc", @@ -325,20 +294,21 @@ dependencies = [ "bytes", "fastrand 2.0.0", "hex", - "http", - "hyper", + "http 0.2.9", + "hyper 0.14.26", "ring 0.17.6", "time", "tokio", "tracing", + "url", "zeroize", ] [[package]] name = "aws-credential-types" -version = "1.0.1" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c1317e1a3514b103cf7d5828bbab3b4d30f56bd22d684f8568bc51b6cfbbb1c" +checksum = "e16838e6c9e12125face1c1eff1343c75e3ff540de98ff7ebd61874a89bcfeb9" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -346,30 +316,13 @@ dependencies = [ "zeroize", ] -[[package]] -name = "aws-http" -version = "0.60.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "361c4310fdce94328cc2d1ca0c8a48c13f43009c61d3367585685a50ca8c66b6" -dependencies = [ - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "http", - "http-body", - "pin-project-lite", - "tracing", -] - [[package]] name = "aws-runtime" -version = "1.0.1" 
+version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ed7ef604a15fd0d4d9e43701295161ea6b504b63c44990ead352afea2bc15e9" +checksum = "785da4a15e7b166b505fd577e4560c7a7cd8fbdf842eb1336cbcbf8944ce56f1" dependencies = [ "aws-credential-types", - "aws-http", "aws-sigv4", "aws-smithy-async", "aws-smithy-eventstream", @@ -377,21 +330,47 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", + "bytes", "fastrand 2.0.0", - "http", + "http 0.2.9", + "http-body 0.4.5", "percent-encoding", + "pin-project-lite", "tracing", "uuid", ] [[package]] -name = "aws-sdk-s3" -version = "1.4.0" +name = "aws-sdk-iam" +version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dcafc2fe52cc30b2d56685e2fa6a879ba50d79704594852112337a472ddbd24" +checksum = "b8ae76026bfb1b80a6aed0bb400c1139cd9c0563e26bce1986cd021c6a968c7b" dependencies = [ "aws-credential-types", - "aws-http", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-query", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "http 0.2.9", + "once_cell", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-s3" +version = "1.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bc5ce518d4b8d16e0408de7bdf1b3097cec61a7daa979750a208f8d9934386d" +dependencies = [ + "ahash", + "aws-credential-types", "aws-runtime", "aws-sigv4", "aws-smithy-async", @@ -405,23 +384,27 @@ dependencies = [ "aws-smithy-xml", "aws-types", "bytes", - "http", - "http-body", + "fastrand 2.0.0", + "hex", + "hmac", + "http 0.2.9", + "http-body 0.4.5", + "lru", "once_cell", "percent-encoding", - "regex", + "regex-lite", + "sha2", "tracing", "url", ] [[package]] name = "aws-sdk-sso" -version = "1.3.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0619ab97a5ca8982e7de073cdc66f93e5f6a1b05afc09e696bec1cb3607cd4df" +checksum = "ca3d6c4cba4e009391b72b0fcf12aff04ea3c9c3aa2ecaafa330326a8bd7e601" dependencies = [ "aws-credential-types", - "aws-http", "aws-runtime", "aws-smithy-async", "aws-smithy-http", @@ -431,19 +414,19 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "http", - "regex", + "http 0.2.9", + "once_cell", + "regex-lite", "tracing", ] [[package]] name = "aws-sdk-ssooidc" -version = "1.3.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f04b9f5474cc0f35d829510b2ec8c21e352309b46bf9633c5a81fb9321e9b1c7" +checksum = "73400dc239d14f63d932f4ca7b55af5e9ef1f857f7d70655249ccc287adb2570" dependencies = [ "aws-credential-types", - "aws-http", "aws-runtime", "aws-smithy-async", "aws-smithy-http", @@ -453,19 +436,19 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "http", - "regex", + "http 0.2.9", + "once_cell", + "regex-lite", "tracing", ] [[package]] name = "aws-sdk-sts" -version = "1.3.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5700da387716ccfc30b27f44b008f457e1baca5b0f05b6b95455778005e3432a" +checksum = "10f8858308af76fba3e5ffcf1bb56af5471574d2bdfaf0159470c25bc2f760e5" dependencies = [ "aws-credential-types", - "aws-http", "aws-runtime", "aws-smithy-async", "aws-smithy-http", @@ -476,16 +459,17 @@ dependencies = [ "aws-smithy-types", "aws-smithy-xml", "aws-types", - "http", - "regex", + "http 0.2.9", + "once_cell", + "regex-lite", "tracing", ] [[package]] name = 
"aws-sigv4" -version = "1.0.1" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "380adcc8134ad8bbdfeb2ace7626a869914ee266322965276cbc54066186d236" +checksum = "58b56f1cbe6fd4d0c2573df72868f20ab1c125ca9c9dbce17927a463433a2e57" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -497,11 +481,11 @@ dependencies = [ "form_urlencoded", "hex", "hmac", - "http", + "http 0.2.9", + "http 1.1.0", "once_cell", - "p256", + "p256 0.11.1", "percent-encoding", - "regex", "ring 0.17.6", "sha2", "subtle", @@ -512,9 +496,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.0.2" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e37ca17d25fe1e210b6d4bdf59b81caebfe99f986201a1228cb5061233b4b13" +checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" dependencies = [ "futures-util", "pin-project-lite", @@ -523,9 +507,9 @@ dependencies = [ [[package]] name = "aws-smithy-checksums" -version = "0.60.0" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5a373ec01aede3dd066ec018c1bc4e8f5dd11b2c11c59c8eef1a5c68101f397" +checksum = "83fa43bc04a6b2441968faeab56e68da3812f978a670a5db32accbdcafddd12f" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -533,8 +517,8 @@ dependencies = [ "crc32c", "crc32fast", "hex", - "http", - "http-body", + "http 0.2.9", + "http-body 0.4.5", "md-5", "pin-project-lite", "sha1", @@ -544,9 +528,9 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.60.0" +version = "0.60.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c669e1e5fc0d79561bf7a122b118bd50c898758354fe2c53eb8f2d31507cbc3" +checksum = "e6363078f927f612b970edf9d1903ef5cef9a64d1e8423525ebb1f0a1633c858" dependencies = [ "aws-smithy-types", "bytes", @@ -555,9 +539,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.0" +version = "0.60.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b1de8aee22f67de467b2e3d0dd0fb30859dc53f579a63bd5381766b987db644" +checksum = "4a7de001a1b9a25601016d8057ea16e31a45fdca3751304c8edf4ad72e706c08" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -565,8 +549,8 @@ dependencies = [ "bytes", "bytes-utils", "futures-core", - "http", - "http-body", + "http 0.2.9", + "http-body 0.4.5", "once_cell", "percent-encoding", "pin-project-lite", @@ -576,18 +560,18 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.60.0" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a46dd338dc9576d6a6a5b5a19bd678dcad018ececee11cf28ecd7588bd1a55c" +checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" dependencies = [ "aws-smithy-types", ] [[package]] name = "aws-smithy-query" -version = "0.60.0" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "feb5b8c7a86d4b6399169670723b7e6f21a39fc833a30f5c5a2f997608178129" +checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb" dependencies = [ "aws-smithy-types", "urlencoding", @@ -595,9 +579,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.0.2" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "273479291efc55e7b0bce985b139d86b6031adb8e50f65c1f712f20ba38f6388" +checksum = 
"c9ac79e9f3a4d576f3cd4a470a0275b138d9e7b11b1cd514a6858ae0a79dd5bb" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -605,29 +589,31 @@ dependencies = [ "aws-smithy-types", "bytes", "fastrand 2.0.0", - "h2", - "http", - "http-body", - "hyper", - "hyper-rustls", + "h2 0.3.26", + "http 0.2.9", + "http-body 0.4.5", + "http-body 1.0.0", + "hyper 0.14.26", + "hyper-rustls 0.24.0", "once_cell", "pin-project-lite", "pin-utils", - "rustls", + "rustls 0.21.11", "tokio", "tracing", ] [[package]] name = "aws-smithy-runtime-api" -version = "1.0.2" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6cebff0d977b6b6feed2fd07db52aac58ba3ccaf26cdd49f1af4add5061bef9" +checksum = "04ec42c2f5c0e7796a2848dde4d9f3bf8ce12ccbb3d5aa40c52fa0cdd61a1c47" dependencies = [ "aws-smithy-async", "aws-smithy-types", "bytes", - "http", + "http 0.2.9", + "http 1.1.0", "pin-project-lite", "tokio", "tracing", @@ -636,16 +622,19 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.0.2" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7f48b3f27ddb40ab19892a5abda331f403e3cb877965e4e51171447807104af" +checksum = "baf98d97bba6ddaba180f1b1147e202d8fe04940403a95a3f826c790f931bbd1" dependencies = [ "base64-simd", "bytes", "bytes-utils", "futures-core", - "http", - "http-body", + "http 0.2.9", + "http 1.1.0", + "http-body 0.4.5", + "http-body 1.0.0", + "http-body-util", "itoa", "num-integer", "pin-project-lite", @@ -659,24 +648,24 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.60.0" +version = "0.60.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ec40d74a67fd395bc3f6b4ccbdf1543672622d905ef3f979689aea5b730cb95" +checksum = "d123fbc2a4adc3c301652ba8e149bf4bc1d1725affb9784eb20c953ace06bf55" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "1.0.1" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8403fc56b1f3761e8efe45771ddc1165e47ec3417c68e68a4519b5cb030159ca" +checksum = "5a43b56df2c529fe44cb4d92bd64d0479883fb9608ff62daede4df5405381814" dependencies = [ "aws-credential-types", "aws-smithy-async", "aws-smithy-runtime-api", "aws-smithy-types", - "http", + "http 0.2.9", "rustc_version", "tracing", ] @@ -693,11 +682,11 @@ dependencies = [ "bitflags 1.3.2", "bytes", "futures-util", - "http", - "http-body", - "hyper", + "http 0.2.9", + "http-body 0.4.5", + "hyper 0.14.26", "itoa", - "matchit", + "matchit 0.7.0", "memchr", "mime", "percent-encoding", @@ -725,8 +714,8 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http", - "http-body", + "http 0.2.9", + "http-body 0.4.5", "mime", "rustversion", "tower-layer", @@ -735,9 +724,9 @@ dependencies = [ [[package]] name = "azure_core" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6218987c374650fdad0b476bfc675729762c28dfb35f58608a38a2b1ea337dd" +checksum = "70fd680c0d0424a518229b1150922f92653ba2ac933aa000abc8bf1ca08105f7" dependencies = [ "async-trait", "base64 0.21.1", @@ -753,7 +742,7 @@ dependencies = [ "pin-project", "quick-xml", "rand 0.8.5", - "reqwest", + "reqwest 0.11.19", "rustc_version", "serde", "serde_json", @@ -765,9 +754,9 @@ dependencies = [ [[package]] name = "azure_identity" -version = "0.18.1" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9e1eacc4f7fb2a73d57c39139d0fc3aed78435606055779ddaef4b43cdf919a8" +checksum = "a6d2060f5b2e1c664026ca4edd561306c473be887c1f7a81f10bf06f9b71c63f" dependencies = [ "async-lock", "async-trait", @@ -778,16 +767,15 @@ dependencies = [ "pin-project", "serde", "time", - "tz-rs", "url", "uuid", ] [[package]] name = "azure_storage" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade8f2653e408de88b9eafec9f48c3c26b94026375e88adbd34523a7dd9795a1" +checksum = "15d3da73bfa09350e1bd6ae2a260806fcf90048c7e78cd2d8f88be60b19a7266" dependencies = [ "RustyXML", "async-lock", @@ -804,9 +792,9 @@ dependencies = [ [[package]] name = "azure_storage_blobs" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "025701c7cc5b523100f0f3b2b01723564ec5a86c03236521c06826337047e872" +checksum = "149c21834a4105d761e3dd33d91c2a3064acc05a3c978848ea8089102ae45c94" dependencies = [ "RustyXML", "azure_core", @@ -825,9 +813,9 @@ dependencies = [ [[package]] name = "azure_svc_blobstorage" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76051e5bb67cea1055abe5e530a0878feac7e0ab4cbbcb4a6adc953a58993389" +checksum = "88c888b7bf522d5405218b8613bf0fae7ddaae6ef3bf4ad42ae005993c96ab8b" dependencies = [ "azure_core", "bytes", @@ -860,6 +848,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce" +[[package]] +name = "base16ct" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf" + [[package]] name = "base64" version = "0.13.1" @@ -878,6 +872,12 @@ version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105" +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + [[package]] name = "base64-simd" version = "0.8.0" @@ -894,6 +894,16 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "bcder" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c627747a6774aab38beb35990d88309481378558875a41da1a4b2e373c906ef0" +dependencies = [ + "bytes", + "smallvec", +] + [[package]] name = "bincode" version = "1.3.3" @@ -905,27 +915,30 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.65.1" +version = "0.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5" +checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.4.1", "cexpr", "clang-sys", - "lazy_static", - "lazycell", + "itertools 0.12.1", "log", - "peeking_take_while", - "prettyplease 0.2.6", + "prettyplease 0.2.17", "proc-macro2", "quote", "regex", "rustc-hash", "shlex", - "syn 2.0.32", - "which", + "syn 2.0.52", ] +[[package]] +name = "bit_field" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61" + [[package]] name = "bitflags" version = "1.3.2" @@ -965,6 +978,12 @@ version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" +[[package]] +name = "bytemuck" +version = "1.16.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83" + [[package]] name = "byteorder" version = "1.4.3" @@ -973,9 +992,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" dependencies = [ "serde", ] @@ -995,6 +1014,9 @@ name = "camino" version = "1.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +dependencies = [ + "serde", +] [[package]] name = "camino-tempfile" @@ -1052,9 +1074,9 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.31" +version = "0.4.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" +checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" dependencies = [ "android-tzdata", "iana-time-zone", @@ -1062,7 +1084,7 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-targets 0.48.0", + "windows-targets 0.52.4", ] [[package]] @@ -1089,7 +1111,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" dependencies = [ "ciborium-io", - "half", + "half 1.8.2", ] [[package]] @@ -1133,10 +1155,10 @@ version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1145,16 +1167,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" -[[package]] -name = "close_fds" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bc416f33de9d59e79e57560f450d21ff8393adcf1cdfc3e6d8fb93d5f88a2ed" -dependencies = [ - "cfg-if", - "libc", -] - [[package]] name = "colorchoice" version = "1.0.0" @@ -1199,7 +1211,6 @@ dependencies = [ "serde_json", "serde_with", "utils", - "workspace_hack", ] [[package]] @@ -1215,7 +1226,7 @@ dependencies = [ "compute_api", "flate2", "futures", - "hyper", + "hyper 0.14.26", "nix 0.27.1", "notify", "num_cpus", @@ -1223,16 +1234,19 @@ dependencies = [ "postgres", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", + "rlimit", "rust-ini", "serde", "serde_json", "signal-hook", "tar", + "thiserror", "tokio", "tokio-postgres", + "tokio-stream", "tokio-util", - "toml_edit", + "toml_edit 0.19.10", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -1279,12 +1293,6 @@ dependencies = [ "tiny-keccak", ] -[[package]] -name = "const_fn" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "fbdcdcb6d86f71c5e97409ad45898af11cbc995b4ee8112d59095a28d376c935" - [[package]] name = "const_format" version = "0.2.30" @@ -1315,7 +1323,6 @@ dependencies = [ "serde", "serde_with", "utils", - "workspace_hack", ] [[package]] @@ -1323,7 +1330,6 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "async-trait", "camino", "clap", "comfy-table", @@ -1331,7 +1337,9 @@ dependencies = [ "futures", "git-version", "hex", - "hyper", + "humantime", + "humantime-serde", + "hyper 0.14.26", "nix 0.27.1", "once_cell", "pageserver_api", @@ -1340,7 +1348,7 @@ dependencies = [ "postgres_backend", "postgres_connection", "regex", - "reqwest", + "reqwest 0.12.4", "safekeeper_api", "scopeguard", "serde", @@ -1352,10 +1360,12 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-util", - "toml", + "toml 0.7.4", + "toml_edit 0.19.10", "tracing", "url", "utils", + "whoami", "workspace_hack", ] @@ -1386,9 +1396,9 @@ dependencies = [ [[package]] name = "crc32c" -version = "0.6.3" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e" +checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47" dependencies = [ "rustc_version", ] @@ -1414,7 +1424,7 @@ dependencies = [ "clap", "criterion-plot", "is-terminal", - "itertools", + "itertools 0.10.5", "num-traits", "once_cell", "oorandom", @@ -1435,15 +1445,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" dependencies = [ "cast", - "itertools", + "itertools 0.10.5", ] -[[package]] -name = "critical-section" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52" - [[package]] name = "crossbeam-channel" version = "0.5.8" @@ -1456,36 +1460,28 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ - "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" -version = "0.9.14" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "autocfg", - "cfg-if", "crossbeam-utils", - "memoffset 0.8.0", - "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.15" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" -dependencies = [ - "cfg-if", -] +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "crossterm" @@ -1536,8 +1532,10 @@ version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" dependencies = [ + "generic-array", "rand_core 0.6.4", "subtle", + "zeroize", ] [[package]] @@ -1571,7 +1569,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1582,7 +1580,7 @@ checksum = 
"29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1592,7 +1590,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d" dependencies = [ "cfg-if", - "hashbrown 0.14.0", + "hashbrown 0.14.5", "lock_api", "once_cell", "parking_lot_core 0.9.8", @@ -1624,6 +1622,17 @@ dependencies = [ "zeroize", ] +[[package]] +name = "der" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" +dependencies = [ + "const-oid", + "pem-rfc7468", + "zeroize", +] + [[package]] name = "der-parser" version = "8.2.0" @@ -1638,6 +1647,80 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "deranged" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" +dependencies = [ + "powerfmt", + "serde", +] + +[[package]] +name = "desim" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes", + "hex", + "parking_lot 0.12.1", + "rand 0.8.5", + "scopeguard", + "smallvec", + "tracing", + "utils", +] + +[[package]] +name = "diesel" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65e13bab2796f412722112327f3e575601a3e9cdcbe426f0d30dbf43f3f5dc71" +dependencies = [ + "bitflags 2.4.1", + "byteorder", + "chrono", + "diesel_derives", + "itoa", + "pq-sys", + "r2d2", + "serde_json", +] + +[[package]] +name = "diesel_derives" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59de76a222c2b8059f789cbe07afbfd8deb8c31dd0bc2a21f85e256c1def8259" +dependencies = [ + "diesel_table_macro_syntax", + "dsl_auto_type", + "proc-macro2", + "quote", + "syn 2.0.52", +] + +[[package]] +name = "diesel_migrations" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a73ce704bad4231f001bff3314d91dce4aba0770cee8b233991859abc15c1f6" +dependencies = [ + "diesel", + "migrations_internals", + "migrations_macros", +] + +[[package]] +name = "diesel_table_macro_syntax" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25" +dependencies = [ + "syn 2.0.52", +] + [[package]] name = "digest" version = "0.10.7" @@ -1645,6 +1728,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", + "const-oid", "crypto-common", "subtle", ] @@ -1657,7 +1741,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1669,6 +1753,20 @@ dependencies = [ "const-random", ] +[[package]] +name = "dsl_auto_type" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc" +dependencies = [ + "darling", + "either", + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.52", +] + [[package]] name = "dyn-clone" version = "1.0.14" @@ -1681,10 +1779,24 @@ version = "0.14.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" dependencies = [ - "der", - "elliptic-curve", - "rfc6979", - "signature", + "der 0.6.1", + "elliptic-curve 0.12.3", + "rfc6979 0.3.1", + "signature 1.6.4", +] + +[[package]] +name = "ecdsa" +version = "0.16.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca" +dependencies = [ + "der 0.7.8", + "digest", + "elliptic-curve 0.13.8", + "rfc6979 0.4.0", + "signature 2.2.0", + "spki 0.7.3", ] [[package]] @@ -1699,16 +1811,36 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" dependencies = [ - "base16ct", + "base16ct 0.1.1", "crypto-bigint 0.4.9", - "der", + "der 0.6.1", "digest", - "ff", + "ff 0.12.1", "generic-array", - "group", - "pkcs8", + "group 0.12.1", + "pkcs8 0.9.0", "rand_core 0.6.4", - "sec1", + "sec1 0.3.0", + "subtle", + "zeroize", +] + +[[package]] +name = "elliptic-curve" +version = "0.13.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47" +dependencies = [ + "base16ct 0.2.0", + "crypto-bigint 0.5.5", + "digest", + "ff 0.13.0", + "generic-array", + "group 0.13.0", + "pem-rfc7468", + "pkcs8 0.10.2", + "rand_core 0.6.4", + "sec1 0.7.3", "subtle", "zeroize", ] @@ -1749,6 +1881,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e875f1719c16de097dee81ed675e2d9bb63096823ed3f0ca827b7dea3028bbbb" dependencies = [ "enumset_derive", + "serde", ] [[package]] @@ -1760,7 +1893,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1784,23 +1917,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.1" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys 0.48.0", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", "libc", + "windows-sys 0.52.0", ] [[package]] @@ -1872,6 +1994,16 @@ dependencies = [ "subtle", ] +[[package]] +name = "ff" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ded41244b729663b1e574f1b4fb731469f69f79c17667b5d776b16cda0479449" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] + [[package]] name = "filetime" version = "0.2.22" @@ -1906,21 +2038,6 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" -[[package]] -name = "foreign-types" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" -dependencies = [ - "foreign-types-shared", -] - -[[package]] -name = "foreign-types-shared" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" - [[package]] name = 
"form_urlencoded" version = "1.1.0" @@ -1931,13 +2048,24 @@ dependencies = [ ] [[package]] -name = "fs2" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" +name = "framed-websockets" +version = "0.1.0" +source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127" dependencies = [ - "libc", - "winapi", + "base64 0.21.1", + "bytemuck", + "bytes", + "futures-core", + "futures-sink", + "http-body-util", + "hyper 1.2.0", + "hyper-util", + "pin-project", + "rand 0.8.5", + "sha1", + "thiserror", + "tokio", + "tokio-util", ] [[package]] @@ -1966,9 +2094,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" dependencies = [ "futures-core", "futures-sink", @@ -1976,9 +2104,9 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" [[package]] name = "futures-executor" @@ -1993,9 +2121,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" [[package]] name = "futures-lite" @@ -2014,26 +2142,26 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] name = "futures-sink" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" [[package]] name = "futures-task" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" [[package]] name = "futures-timer" @@ -2043,9 +2171,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" [[package]] name = "futures-util" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" dependencies = [ "futures-channel", "futures-core", @@ -2059,6 +2187,12 @@ dependencies = [ "slab", ] +[[package]] +name = "gen_ops" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a" + [[package]] name = "generic-array" version = "0.14.7" @@ -2067,6 +2201,7 @@ checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", + "zeroize", ] [[package]] @@ -2133,23 +2268,53 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7" dependencies = [ - "ff", + "ff 0.12.1", + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "group" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63" +dependencies = [ + "ff 0.13.0", "rand_core 0.6.4", "subtle", ] [[package]] name = "h2" -version = "0.3.24" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9" +checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" dependencies = [ "bytes", "fnv", "futures-core", "futures-sink", "futures-util", - "http", + "http 0.2.9", + "indexmap 2.0.1", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "h2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "816ec7294445779408f36fe57bc5b7fc1cf59664059096c65f905c1c61f58069" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http 1.1.0", "indexmap 2.0.1", "slab", "tokio", @@ -2164,12 +2329,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" [[package]] -name = "hash32" -version = "0.3.1" +name = "half" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47d60b12902ba28e2730cd37e95b8c9223af2808df9e902d4df49588d1470606" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" dependencies = [ - "byteorder", + "cfg-if", + "crunchy", + "num-traits", ] [[package]] @@ -2189,9 +2356,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.0" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" dependencies = [ "ahash", "allocator-api2", @@ -2199,11 +2366,11 @@ dependencies = [ [[package]] name = "hashlink" -version = "0.8.2" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0761a1b9491c4f2e3d66aa0f62d0fba0af9a0e2852e4d48ea506632a4b56e6aa" +checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af" dependencies = [ - "hashbrown 0.13.2", + "hashbrown 0.14.5", ] [[package]] @@ -2220,24 +2387,18 @@ dependencies = [ "num-traits", ] -[[package]] -name = "heapless" -version = "0.8.0" -source = "git+https://github.com/japaric/heapless.git?rev=644653bf3b831c6bb4963be2de24804acf5e5001#644653bf3b831c6bb4963be2de24804acf5e5001" -dependencies = [ - "atomic-polyfill", - "hash32", - "rustc_version", - "spin 0.9.8", - "stable_deref_trait", -] - [[package]] name = "heck" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "heck" +version = 
"0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.3.3" @@ -2259,16 +2420,6 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" -[[package]] -name = "histogram" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e673d137229619d5c2c8903b6ed5852b43636c0017ff2e66b1aafb8ccf04b80b" -dependencies = [ - "serde", - "thiserror", -] - [[package]] name = "hmac" version = "0.12.1" @@ -2289,6 +2440,17 @@ dependencies = [ "winapi", ] +[[package]] +name = "hostname" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9c7c7c8ac16c798734b8a24560c1362120597c40d5e1459f09498f8f6c8f2ba" +dependencies = [ + "cfg-if", + "libc", + "windows 0.52.0", +] + [[package]] name = "http" version = "0.2.9" @@ -2300,6 +2462,17 @@ dependencies = [ "itoa", ] +[[package]] +name = "http" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + [[package]] name = "http-body" version = "0.4.5" @@ -2307,7 +2480,30 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" dependencies = [ "bytes", - "http", + "http 0.2.9", + "pin-project-lite", +] + +[[package]] +name = "http-body" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +dependencies = [ + "bytes", + "http 1.1.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41cb79eb393015dadd30fc252023adb0b2400a0caee0fa2a077e6e21a551e840" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", "pin-project-lite", ] @@ -2369,9 +2565,9 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "h2", - "http", - "http-body", + "h2 0.3.26", + "http 0.2.9", + "http-body 0.4.5", "httparse", "httpdate", "itoa", @@ -2383,19 +2579,57 @@ dependencies = [ "want", ] +[[package]] +name = "hyper" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186548d73ac615b32a73aafe38fb4f56c0d340e110e5a200bcadbaf2e199263a" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "h2 0.4.4", + "http 1.1.0", + "http-body 1.0.0", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", + "want", +] + [[package]] name = "hyper-rustls" version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" dependencies = [ - "http", - "hyper", + "http 0.2.9", + "hyper 0.14.26", "log", - "rustls", - "rustls-native-certs", + "rustls 0.21.11", + "rustls-native-certs 0.6.2", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", +] + +[[package]] +name = "hyper-rustls" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c" +dependencies = [ + "futures-util", + "http 1.1.0", + "hyper 1.2.0", + 
"hyper-util", + "rustls 0.22.4", + "rustls-pki-types", + "tokio", + "tokio-rustls 0.25.0", + "tower-service", ] [[package]] @@ -2404,36 +2638,30 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper", + "hyper 0.14.26", "pin-project-lite", "tokio", "tokio-io-timeout", ] [[package]] -name = "hyper-tls" -version = "0.5.0" +name = "hyper-util" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" +checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" dependencies = [ "bytes", - "hyper", - "native-tls", - "tokio", - "tokio-native-tls", -] - -[[package]] -name = "hyper-tungstenite" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9" -dependencies = [ - "hyper", + "futures-channel", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "hyper 1.2.0", "pin-project-lite", + "socket2 0.5.5", "tokio", - "tokio-tungstenite", - "tungstenite", + "tower", + "tower-service", + "tracing", ] [[package]] @@ -2447,7 +2675,7 @@ dependencies = [ "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows", + "windows 0.48.0", ] [[package]] @@ -2493,9 +2721,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e" dependencies = [ "equivalent", - "hashbrown 0.14.0", + "hashbrown 0.14.5", ] +[[package]] +name = "indoc" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" + [[package]] name = "infer" version = "0.2.3" @@ -2554,14 +2788,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] -name = "io-lifetimes" -version = "1.0.11" +name = "io-uring" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" +checksum = "460648e47a07a43110fbfa2e0b14afb2be920093c31e5dccc50e49568e099762" dependencies = [ - "hermit-abi", + "bitflags 1.3.2", "libc", - "windows-sys 0.48.0", ] [[package]] @@ -2572,14 +2805,13 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" [[package]] name = "is-terminal" -version = "0.4.7" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" +checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" dependencies = [ "hermit-abi", - "io-lifetimes", - "rustix 0.37.25", - "windows-sys 0.48.0", + "libc", + "windows-sys 0.52.0", ] [[package]] @@ -2591,6 +2823,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.6" @@ -2607,10 +2848,46 @@ dependencies = [ ] [[package]] -name = "js-sys" -version = "0.3.63" +name = "jose-b64" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790" +checksum = "bec69375368709666b21c76965ce67549f2d2db7605f1f8707d17c9656801b56" +dependencies = [ + "base64ct", + "serde", + "subtle", + "zeroize", +] + +[[package]] +name = "jose-jwa" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ab78e053fe886a351d67cf0d194c000f9d0dcb92906eb34d853d7e758a4b3a7" +dependencies = [ + "serde", +] + +[[package]] +name = "jose-jwk" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "280fa263807fe0782ecb6f2baadc28dffc04e00558a58e33bfdb801d11fd58e7" +dependencies = [ + "jose-b64", + "jose-jwa", + "p256 0.13.2", + "p384", + "rsa", + "serde", + "zeroize", +] + +[[package]] +name = "js-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" dependencies = [ "wasm-bindgen", ] @@ -2623,7 +2900,7 @@ checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4" dependencies = [ "base64 0.21.1", "js-sys", - "pem 3.0.3", + "pem", "ring 0.17.6", "serde", "serde_json", @@ -2650,17 +2927,24 @@ dependencies = [ "libc", ] +[[package]] +name = "lasso" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2" +dependencies = [ + "dashmap", + "hashbrown 0.13.2", +] + [[package]] name = "lazy_static" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - -[[package]] -name = "lazycell" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" +dependencies = [ + "spin 0.5.2", +] [[package]] name = "libc" @@ -2679,16 +2963,22 @@ dependencies = [ ] [[package]] -name = "linux-raw-sys" -version = "0.1.4" +name = "libm" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "linux-raw-sys" -version = "0.3.8" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" + +[[package]] +name = "linux-raw-sys" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0b5399f6804fbab912acbd8878ed3532d506b7c951b8f9f164ef90fef39e3f4" [[package]] name = "lock_api" @@ -2706,6 +2996,15 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +[[package]] +name = "lru" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc" +dependencies = [ + "hashbrown 0.14.5", +] + [[package]] name = "match_cfg" version = "0.1.0" @@ -2727,6 +3026,12 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40" +[[package]] +name = "matchit" +version = "0.8.2" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "540f1c43aed89909c0cc0cc604e3bb2f7e7a341a3728a9e6cfe760e733cd11ed" + [[package]] name = "md-5" version = "0.10.5" @@ -2742,12 +3047,62 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" +[[package]] +name = "measured" +version = "0.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3051f3a030d55d680cdef6ca50e80abd1182f8da29f2344a7c9cb575721138f0" +dependencies = [ + "bytes", + "crossbeam-utils", + "hashbrown 0.14.5", + "itoa", + "lasso", + "measured-derive", + "memchr", + "parking_lot 0.12.1", + "rustc-hash", + "ryu", +] + +[[package]] +name = "measured-derive" +version = "0.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.52", +] + +[[package]] +name = "measured-process" +version = "0.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec" +dependencies = [ + "libc", + "measured", + "procfs", +] + [[package]] name = "memchr" version = "2.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" +[[package]] +name = "memoffset" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +dependencies = [ + "autocfg", +] + [[package]] name = "memoffset" version = "0.8.0" @@ -2772,9 +3127,35 @@ version = "0.1.0" dependencies = [ "chrono", "libc", + "measured", + "measured-process", "once_cell", + "procfs", "prometheus", - "workspace_hack", + "rand 0.8.5", + "rand_distr", + "twox-hash", +] + +[[package]] +name = "migrations_internals" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd01039851e82f8799046eabbb354056283fb265c8ec0996af940f4e85a380ff" +dependencies = [ + "serde", + "toml 0.8.14", +] + +[[package]] +name = "migrations_macros" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffb161cc72176cb37aa47f1fc520d3ef02263d67d661f44f05d05a079e1237fd" +dependencies = [ + "migrations_internals", + "proc-macro2", + "quote", ] [[package]] @@ -2783,16 +3164,6 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" -[[package]] -name = "mime_guess" -version = "2.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" -dependencies = [ - "mime", - "unicase", -] - [[package]] name = "minimal-lexical" version = "0.2.1" @@ -2810,9 +3181,9 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.10" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "log", @@ -2826,24 +3197,6 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" -[[package]] -name = "native-tls" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" -dependencies = [ - "lazy_static", - "libc", - "log", - "openssl", - "openssl-probe", - "openssl-sys", - "schannel", - "security-framework", - "security-framework-sys", - "tempfile", -] - [[package]] name = "nix" version = "0.25.1" @@ -2856,6 +3209,19 @@ dependencies = [ "libc", ] +[[package]] +name = "nix" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b" +dependencies = [ + "bitflags 1.3.2", + "cfg-if", + "libc", + "memoffset 0.7.1", + "pin-utils", +] + [[package]] name = "nix" version = "0.27.1" @@ -2930,6 +3296,23 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-bigint-dig" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151" +dependencies = [ + "byteorder", + "lazy_static", + "libm", + "num-integer", + "num-iter", + "num-traits", + "rand 0.8.5", + "smallvec", + "zeroize", +] + [[package]] name = "num-complex" version = "0.4.4" @@ -2939,6 +3322,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + [[package]] name = "num-integer" version = "0.1.45" @@ -2978,6 +3367,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -2990,15 +3380,6 @@ dependencies = [ "libc", ] -[[package]] -name = "num_threads" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" -dependencies = [ - "libc", -] - [[package]] name = "oauth2" version = "4.4.2" @@ -3008,7 +3389,7 @@ dependencies = [ "base64 0.13.1", "chrono", "getrandom 0.2.11", - "http", + "http 0.2.9", "rand 0.8.5", "serde", "serde_json", @@ -3048,50 +3429,12 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" -[[package]] -name = "openssl" -version = "0.10.60" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800" -dependencies = [ - "bitflags 2.4.1", - "cfg-if", - "foreign-types", - "libc", - "once_cell", - "openssl-macros", - "openssl-sys", -] - -[[package]] -name = "openssl-macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.32", -] - [[package]] name = "openssl-probe" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" -[[package]] -name = "openssl-sys" -version = "0.9.96" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f" -dependencies = [ - 
"cc", - "libc", - "pkg-config", - "vcpkg", -] - [[package]] name = "opentelemetry" version = "0.20.0" @@ -3110,9 +3453,9 @@ checksum = "c7594ec0e11d8e33faf03530a4c49af7064ebba81c1480e01be67d90b356508b" dependencies = [ "async-trait", "bytes", - "http", + "http 0.2.9", "opentelemetry_api", - "reqwest", + "reqwest 0.11.19", ] [[package]] @@ -3123,14 +3466,14 @@ checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275" dependencies = [ "async-trait", "futures-core", - "http", + "http 0.2.9", "opentelemetry-http", "opentelemetry-proto", "opentelemetry-semantic-conventions", "opentelemetry_api", "opentelemetry_sdk", "prost", - "reqwest", + "reqwest 0.11.19", "thiserror", "tokio", "tonic", @@ -3216,12 +3559,12 @@ dependencies = [ [[package]] name = "ordered-multimap" -version = "0.7.1" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f" +checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79" dependencies = [ "dlv-list", - "hashbrown 0.14.0", + "hashbrown 0.14.5", ] [[package]] @@ -3247,11 +3590,33 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594" dependencies = [ - "ecdsa", - "elliptic-curve", + "ecdsa 0.14.8", + "elliptic-curve 0.12.3", "sha2", ] +[[package]] +name = "p256" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b" +dependencies = [ + "ecdsa 0.16.9", + "elliptic-curve 0.13.8", + "primeorder", + "sha2", +] + +[[package]] +name = "p384" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70786f51bcc69f6a4c0360e063a4cac5419ef7c5cd5b3c99ad70f3be5ba79209" +dependencies = [ + "elliptic-curve 0.13.8", + "primeorder", +] + [[package]] name = "pagebench" version = "0.1.0" @@ -3284,12 +3649,18 @@ dependencies = [ "camino", "clap", "git-version", + "humantime", "pageserver", + "pageserver_api", "postgres_ffi", + "remote_storage", "serde", "serde_json", "svg_fmt", + "thiserror", "tokio", + "tokio-util", + "toml_edit 0.19.10", "utils", "workspace_hack", ] @@ -3299,16 +3670,17 @@ name = "pageserver" version = "0.1.0" dependencies = [ "anyhow", + "arc-swap", "async-compression", "async-stream", "async-trait", + "bit_field", "byteorder", "bytes", "camino", "camino-tempfile", "chrono", "clap", - "close_fds", "const_format", "consumption_metrics", "crc32c", @@ -3325,8 +3697,9 @@ dependencies = [ "hex-literal", "humantime", "humantime-serde", - "hyper", - "itertools", + "hyper 0.14.26", + "indoc", + "itertools 0.10.5", "md5", "metrics", "nix 0.27.1", @@ -3334,6 +3707,7 @@ dependencies = [ "num_cpus", "once_cell", "pageserver_api", + "pageserver_compaction", "pin-project-lite", "postgres", "postgres-protocol", @@ -3342,12 +3716,15 @@ dependencies = [ "postgres_connection", "postgres_ffi", "pq_proto", + "procfs", "rand 0.8.5", + "range-set-blaze", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "rpds", "scopeguard", + "send-future", "serde", "serde_json", "serde_path_to_error", @@ -3359,16 +3736,20 @@ dependencies = [ "strum_macros", "svg_fmt", "sync_wrapper", + "sysinfo", "tenant_size_model", "thiserror", + "tikv-jemallocator", "tokio", + "tokio-epoll-uring", "tokio-io-timeout", "tokio-postgres", "tokio-stream", "tokio-tar", "tokio-util", - "toml_edit", + "toml_edit 
0.19.10", "tracing", + "twox-hash", "url", "utils", "walkdir", @@ -3383,20 +3764,28 @@ dependencies = [ "bincode", "byteorder", "bytes", + "camino", + "chrono", "const_format", "enum-map", "hex", + "humantime", "humantime-serde", + "itertools 0.10.5", + "nix 0.27.1", + "postgres_backend", "postgres_ffi", "rand 0.8.5", + "remote_storage", + "reqwest 0.12.4", "serde", "serde_json", "serde_with", + "storage_broker", "strum", "strum_macros", "thiserror", "utils", - "workspace_hack", ] [[package]] @@ -3404,12 +3793,11 @@ name = "pageserver_client" version = "0.1.0" dependencies = [ "anyhow", - "async-trait", "bytes", "futures", "pageserver_api", "postgres", - "reqwest", + "reqwest 0.12.4", "serde", "thiserror", "tokio", @@ -3420,6 +3808,52 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_compaction" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-compression", + "async-stream", + "byteorder", + "bytes", + "chrono", + "clap", + "const_format", + "consumption_metrics", + "criterion", + "crossbeam-utils", + "either", + "fail", + "flate2", + "futures", + "git-version", + "hex", + "hex-literal", + "humantime", + "humantime-serde", + "itertools 0.10.5", + "metrics", + "once_cell", + "pageserver_api", + "pin-project-lite", + "rand 0.8.5", + "smallvec", + "svg_fmt", + "sync_wrapper", + "thiserror", + "tokio", + "tokio-io-timeout", + "tokio-util", + "tracing", + "tracing-error", + "tracing-subscriber", + "url", + "utils", + "walkdir", + "workspace_hack", +] + [[package]] name = "parking" version = "2.1.1" @@ -3476,13 +3910,14 @@ dependencies = [ [[package]] name = "parquet" -version = "49.0.0" -source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9" +version = "51.0.0" +source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829" dependencies = [ "ahash", "bytes", "chrono", - "hashbrown 0.14.0", + "half 2.4.1", + "hashbrown 0.14.5", "num", "num-bigint", "paste", @@ -3490,17 +3925,18 @@ dependencies = [ "thrift", "twox-hash", "zstd", + "zstd-sys", ] [[package]] name = "parquet_derive" -version = "49.0.0" -source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9" +version = "51.0.0" +source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829" dependencies = [ "parquet", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -3522,9 +3958,9 @@ checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" [[package]] name = "pbkdf2" -version = "0.12.1" +version = "0.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31" +checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2" dependencies = [ "digest", "hmac", @@ -3532,22 +3968,6 @@ dependencies = [ "sha2", ] -[[package]] -name = "peeking_take_while" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" - -[[package]] -name = "pem" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a" -dependencies = [ - "base64 0.21.1", - "serde", -] - [[package]] name = "pem" version = "3.0.3" @@ -3558,6 +3978,15 @@ dependencies = [ "serde", ] +[[package]] +name = 
"pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + [[package]] name = "percent-encoding" version = "2.2.0" @@ -3609,7 +4038,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -3624,14 +4053,35 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der 0.7.8", + "pkcs8 0.10.2", + "spki 0.7.3", +] + [[package]] name = "pkcs8" version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba" dependencies = [ - "der", - "spki", + "der 0.6.1", + "spki 0.6.0", +] + +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der 0.7.8", + "spki 0.7.3", ] [[package]] @@ -3671,7 +4121,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -3681,21 +4131,10 @@ dependencies = [ "tokio-postgres", ] -[[package]] -name = "postgres-native-tls" -version = "0.5.0" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" -dependencies = [ - "native-tls", - "tokio", - "tokio-native-tls", - "tokio-postgres", -] - [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "base64 0.20.0", "byteorder", @@ -3708,12 +4147,13 @@ dependencies = [ "rand 0.8.5", "sha2", "stringprep", + "tokio", ] [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -3730,16 +4170,16 @@ dependencies = [ "futures", "once_cell", "pq_proto", - "rustls", - "rustls-pemfile", + "rustls 0.22.4", + "rustls-pemfile 2.1.1", "serde", "thiserror", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls", + "tokio-rustls 0.25.0", + "tokio-util", "tracing", - "workspace_hack", ] [[package]] @@ -3747,12 +4187,11 @@ name = "postgres_connection" version = "0.1.0" dependencies = [ "anyhow", - "itertools", + "itertools 0.10.5", "once_cell", "postgres", "tokio-postgres", "url", - "workspace_hack", ] [[package]] @@ -3775,28 +4214,43 @@ dependencies = [ "serde", "thiserror", "utils", - "workspace_hack", ] 
+[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "ppv-lite86" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "pq-sys" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31c0052426df997c0cbd30789eb44ca097e3541717a7b8fa36b1c464ee7edebd" +dependencies = [ + "vcpkg", +] + [[package]] name = "pq_proto" version = "0.1.0" dependencies = [ "byteorder", "bytes", + "itertools 0.10.5", "pin-project-lite", "postgres-protocol", "rand 0.8.5", + "serde", "thiserror", "tokio", "tracing", - "workspace_hack", ] [[package]] @@ -3811,12 +4265,21 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.6" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" +checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" dependencies = [ "proc-macro2", - "syn 2.0.32", + "syn 2.0.52", +] + +[[package]] +name = "primeorder" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "353e1ca18966c16d9deb1c69278edbc5f194139612772bd9537af60ac231e1e6" +dependencies = [ + "elliptic-curve 0.13.8", ] [[package]] @@ -3827,31 +4290,44 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.66" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] [[package]] name = "procfs" -version = "0.14.2" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" dependencies = [ - "bitflags 1.3.2", - "byteorder", + "bitflags 2.4.1", + "chrono", + "flate2", "hex", "lazy_static", - "rustix 0.36.16", + "procfs-core", + "rustix", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.4.1", + "chrono", + "hex", ] [[package]] name = "prometheus" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c" +checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1" dependencies = [ "cfg-if", "fnv", @@ -3880,8 +4356,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ "bytes", - "heck", - "itertools", + "heck 0.4.1", + "itertools 0.10.5", "lazy_static", "log", "multimap", @@ -3902,7 +4378,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4" dependencies = [ "anyhow", - "itertools", + "itertools 0.10.5", "proc-macro2", "quote", 
"syn 1.0.109", @@ -3921,8 +4397,16 @@ dependencies = [ name = "proxy" version = "0.1.0" dependencies = [ + "ahash", "anyhow", + "arc-swap", + "async-compression", "async-trait", + "atomic-take", + "aws-config", + "aws-sdk-iam", + "aws-sigv4", + "aws-types", "base64 0.13.1", "bstr", "bytes", @@ -3931,72 +4415,95 @@ dependencies = [ "chrono", "clap", "consumption_metrics", + "crossbeam-deque", "dashmap", + "ecdsa 0.16.9", + "env_logger", + "fallible-iterator", + "framed-websockets", "futures", "git-version", - "hashbrown 0.13.2", + "hashbrown 0.14.5", "hashlink", "hex", "hmac", - "hostname", + "hostname 0.3.1", + "http 1.1.0", + "http-body-util", "humantime", - "hyper", - "hyper-tungstenite", + "humantime-serde", + "hyper 0.14.26", + "hyper 1.2.0", + "hyper-util", + "indexmap 2.0.1", "ipnet", - "itertools", + "itertools 0.10.5", + "jose-jwa", + "jose-jwk", + "lasso", "md5", + "measured", "metrics", - "native-tls", "once_cell", "opentelemetry", + "p256 0.13.2", "parking_lot 0.12.1", "parquet", "parquet_derive", "pbkdf2", "pin-project-lite", - "postgres-native-tls", "postgres-protocol", "postgres_backend", "pq_proto", "prometheus", "rand 0.8.5", + "rand_distr", "rcgen", "redis", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", "reqwest-retry", "reqwest-tracing", "routerify", + "rsa", "rstest", "rustc-hash", - "rustls", - "rustls-pemfile", + "rustls 0.22.4", + "rustls-native-certs 0.7.0", + "rustls-pemfile 2.1.1", "scopeguard", "serde", "serde_json", "sha2", + "signature 2.2.0", + "smallvec", "smol_str", "socket2 0.5.5", - "sync_wrapper", + "subtle", "task-local-extensions", "thiserror", - "tls-listener", + "tikv-jemalloc-ctl", + "tikv-jemallocator", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls", + "tokio-rustls 0.25.0", + "tokio-tungstenite", "tokio-util", + "tower-service", "tracing", "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", + "try-lock", + "typed-json", "url", + "urlencoding", "utils", "uuid", "walkdir", - "webpki-roots 0.25.2", "workspace_hack", "x509-parser", ] @@ -4013,13 +4520,24 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.32" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] +[[package]] +name = "r2d2" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93" +dependencies = [ + "log", + "parking_lot 0.12.1", + "scheduled-thread-pool", +] + [[package]] name = "rand" version = "0.7.3" @@ -4082,6 +4600,16 @@ dependencies = [ "getrandom 0.2.11", ] +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand 0.8.5", +] + [[package]] name = "rand_hc" version = "0.2.0" @@ -4091,6 +4619,18 @@ dependencies = [ "rand_core 0.5.1", ] +[[package]] +name = "range-set-blaze" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8421b5d459262eabbe49048d362897ff3e3830b44eac6cfe341d6acb2f0f13d2" +dependencies = [ + "gen_ops", + "itertools 0.12.1", + "num-integer", + "num-traits", +] + [[package]] name = "rayon" version = "1.7.0" @@ -4115,21 +4655,21 @@ 
dependencies = [ [[package]] name = "rcgen" -version = "0.11.1" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4954fbc00dcd4d8282c987710e50ba513d351400dbdd00e803a05172a90d8976" +checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1" dependencies = [ - "pem 2.0.1", - "ring 0.16.20", + "pem", + "ring 0.17.6", "time", "yasna", ] [[package]] name = "redis" -version = "0.24.0" +version = "0.25.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd" +checksum = "71d64e978fd98a0e6b105d066ba4889a7301fca65aeac850a877d8797343feeb" dependencies = [ "async-trait", "bytes", @@ -4138,15 +4678,15 @@ dependencies = [ "itoa", "percent-encoding", "pin-project-lite", - "rustls", - "rustls-native-certs", - "rustls-pemfile", - "rustls-webpki 0.101.7", + "rustls 0.22.4", + "rustls-native-certs 0.7.0", + "rustls-pemfile 2.1.1", + "rustls-pki-types", "ryu", "sha1_smol", - "socket2 0.4.9", + "socket2 0.5.5", "tokio", - "tokio-rustls", + "tokio-rustls 0.25.0", "tokio-util", "url", ] @@ -4169,6 +4709,15 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "regex" version = "1.10.2" @@ -4201,6 +4750,12 @@ dependencies = [ "regex-syntax 0.8.2", ] +[[package]] +name = "regex-lite" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b661b2f27137bdbc16f00eda72866a92bb28af1753ffbd56744fb6e2e9cd8e" + [[package]] name = "regex-syntax" version = "0.6.29" @@ -4224,6 +4779,7 @@ name = "remote_storage" version = "0.1.0" dependencies = [ "anyhow", + "async-stream", "async-trait", "aws-config", "aws-credential-types", @@ -4240,8 +4796,10 @@ dependencies = [ "futures", "futures-util", "http-types", - "hyper", - "itertools", + "humantime", + "humantime-serde", + "hyper 0.14.26", + "itertools 0.10.5", "metrics", "once_cell", "pin-project-lite", @@ -4249,13 +4807,14 @@ dependencies = [ "scopeguard", "serde", "serde_json", + "sync_wrapper", "test-context", "tokio", + "tokio-stream", "tokio-util", - "toml_edit", + "toml_edit 0.19.10", "tracing", "utils", - "workspace_hack", ] [[package]] @@ -4269,73 +4828,112 @@ dependencies = [ "encoding_rs", "futures-core", "futures-util", - "h2", - "http", - "http-body", - "hyper", - "hyper-rustls", - "hyper-tls", + "h2 0.3.26", + "http 0.2.9", + "http-body 0.4.5", + "hyper 0.14.26", + "hyper-rustls 0.24.0", "ipnet", "js-sys", "log", "mime", - "mime_guess", - "native-tls", "once_cell", "percent-encoding", "pin-project-lite", - "rustls", - "rustls-pemfile", + "rustls 0.21.11", + "rustls-pemfile 1.0.2", "serde", "serde_json", "serde_urlencoded", "tokio", - "tokio-native-tls", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-util", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", - "wasm-streams", + "wasm-streams 0.3.0", "web-sys", "webpki-roots 0.25.2", - "winreg", + "winreg 0.50.0", +] + +[[package]] +name = "reqwest" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "566cafdd92868e0939d3fb961bd0dc25fcfaaed179291093b3d43e6b3150ea10" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + 
"http-body-util", + "hyper 1.2.0", + "hyper-rustls 0.26.0", + "hyper-util", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls 0.22.4", + "rustls-pemfile 2.1.1", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-rustls 0.25.0", + "tokio-util", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams 0.4.0", + "web-sys", + "webpki-roots 0.26.1", + "winreg 0.52.0", ] [[package]] name = "reqwest-middleware" -version = "0.2.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4531c89d50effe1fac90d095c8b133c20c5c714204feee0bfc3fd158e784209d" +checksum = "0209efb52486ad88136190094ee214759ef7507068b27992256ed6610eb71a01" dependencies = [ "anyhow", "async-trait", - "http", - "reqwest", + "http 1.1.0", + "reqwest 0.12.4", "serde", - "task-local-extensions", "thiserror", + "tower-service", ] [[package]] name = "reqwest-retry" -version = "0.2.2" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48d0fd6ef4c6d23790399fe15efc8d12cd9f3d4133958f9bd7801ee5cbaec6c4" +checksum = "40f342894422862af74c50e1e9601cf0931accc9c6981e5eb413c46603b616b5" dependencies = [ "anyhow", "async-trait", "chrono", "futures", "getrandom 0.2.11", - "http", - "hyper", + "http 1.1.0", + "hyper 1.2.0", "parking_lot 0.11.2", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", "retry-policies", - "task-local-extensions", "tokio", "tracing", "wasm-timer", @@ -4343,27 +4941,27 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.4.7" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a0152176687dd5cfe7f507ac1cb1a491c679cfe483afd133a7db7aaea818bb3" +checksum = "b253954a1979e02eabccd7e9c3d61d8f86576108baa160775e7f160bb4e800a3" dependencies = [ "anyhow", "async-trait", "getrandom 0.2.11", - "matchit", + "http 1.1.0", + "matchit 0.8.2", "opentelemetry", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", - "task-local-extensions", "tracing", "tracing-opentelemetry", ] [[package]] name = "retry-policies" -version = "0.1.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e09bbcb5003282bcb688f0bae741b278e9c7e8f378f561522c9806c58e075d9b" +checksum = "493b4243e32d6eedd29f9a398896e35c6943a123b55eec97dcaee98310d25810" dependencies = [ "anyhow", "chrono", @@ -4381,6 +4979,16 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rfc6979" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2" +dependencies = [ + "hmac", + "subtle", +] + [[package]] name = "ring" version = "0.16.20" @@ -4410,14 +5018,23 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "rlimit" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3560f70f30a0f16d11d01ed078a07740fe6b489667abc7c7b029155d9f21c3d8" +dependencies = [ + "libc", +] + [[package]] name = "routerify" version = "3.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945" dependencies = [ - "http", - "hyper", + "http 0.2.9", + "hyper 0.14.26", "lazy_static", "percent-encoding", "regex", @@ -4432,6 +5049,26 @@ dependencies = [ "archery", ] +[[package]] +name = "rsa" +version = "0.9.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e5124fcb30e76a7e79bfee683a2746db83784b86289f6251b54b7950a0dfc" +dependencies = [ + "const-oid", + "digest", + "num-bigint-dig", + "num-integer", + "num-traits", + "pkcs1", + "pkcs8 0.10.2", + "rand_core 0.6.4", + "signature 2.2.0", + "spki 0.7.3", + "subtle", + "zeroize", +] + [[package]] name = "rstest" version = "0.18.2" @@ -4457,7 +5094,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.32", + "syn 2.0.52", "unicode-ident", ] @@ -4503,37 +5140,22 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.16" +version = "0.38.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab" +checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.4.1", "errno", - "io-lifetimes", "libc", - "linux-raw-sys 0.1.4", - "windows-sys 0.45.0", -] - -[[package]] -name = "rustix" -version = "0.37.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035" -dependencies = [ - "bitflags 1.3.2", - "errno", - "io-lifetimes", - "libc", - "linux-raw-sys 0.3.8", - "windows-sys 0.48.0", + "linux-raw-sys 0.4.13", + "windows-sys 0.52.0", ] [[package]] name = "rustls" -version = "0.21.9" +version = "0.21.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "629648aced5775d558af50b2b4c7b02983a04b312126d45eeead26e7caa498b9" +checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4" dependencies = [ "log", "ring 0.17.6", @@ -4541,6 +5163,20 @@ dependencies = [ "sct", ] +[[package]] +name = "rustls" +version = "0.22.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432" +dependencies = [ + "log", + "ring 0.17.6", + "rustls-pki-types", + "rustls-webpki 0.102.2", + "subtle", + "zeroize", +] + [[package]] name = "rustls-native-certs" version = "0.6.2" @@ -4548,7 +5184,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0167bac7a9f490495f3c33013e7722b53cb087ecbe082fb0c6387c96f634ea50" dependencies = [ "openssl-probe", - "rustls-pemfile", + "rustls-pemfile 1.0.2", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-native-certs" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1fb85efa936c42c6d5fc28d2629bb51e4b2f4b8a5211e297d599cc5a093792" +dependencies = [ + "openssl-probe", + "rustls-pemfile 2.1.1", + "rustls-pki-types", "schannel", "security-framework", ] @@ -4562,6 +5211,22 @@ dependencies = [ "base64 0.21.1", ] +[[package]] +name = "rustls-pemfile" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f48172685e6ff52a556baa527774f61fcaa884f59daf3375c62a3f1cd2549dab" +dependencies = [ + "base64 0.21.1", + "rustls-pki-types", +] + +[[package]] +name = "rustls-pki-types" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8" + [[package]] name = "rustls-webpki" version = "0.100.2" @@ -4582,6 +5247,17 @@ dependencies = [ "untrusted 0.9.0", ] +[[package]] +name = "rustls-webpki" +version = "0.102.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610" +dependencies = [ + "ring 0.17.6", + "rustls-pki-types", + "untrusted 0.9.0", +] + [[package]] name = "rustversion" version = "1.0.12" @@ -4594,45 +5270,6 @@ version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" -[[package]] -name = "s3_scrubber" -version = "0.1.0" -dependencies = [ - "anyhow", - "async-stream", - "aws-config", - "aws-sdk-s3", - "aws-smithy-async", - "bincode", - "bytes", - "chrono", - "clap", - "crc32c", - "either", - "futures", - "futures-util", - "hex", - "histogram", - "itertools", - "pageserver", - "pageserver_api", - "rand 0.8.5", - "remote_storage", - "reqwest", - "serde", - "serde_json", - "serde_with", - "thiserror", - "tokio", - "tokio-rustls", - "tokio-stream", - "tracing", - "tracing-appender", - "tracing-subscriber", - "utils", - "workspace_hack", -] - [[package]] name = "safekeeper" version = "0.1.0" @@ -4648,13 +5285,13 @@ dependencies = [ "clap", "const_format", "crc32c", + "desim", "fail", - "fs2", "futures", "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "metrics", "once_cell", "parking_lot 0.12.1", @@ -4663,9 +5300,10 @@ dependencies = [ "postgres_backend", "postgres_ffi", "pq_proto", + "rand 0.8.5", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "safekeeper_api", "scopeguard", "sd-notify", @@ -4675,16 +5313,21 @@ dependencies = [ "sha2", "signal-hook", "storage_broker", + "strum", + "strum_macros", "thiserror", "tokio", "tokio-io-timeout", "tokio-postgres", "tokio-stream", + "tokio-tar", "tokio-util", - "toml_edit", + "toml_edit 0.19.10", "tracing", + "tracing-subscriber", "url", "utils", + "walproposer", "workspace_hack", ] @@ -4696,7 +5339,6 @@ dependencies = [ "serde", "serde_with", "utils", - "workspace_hack", ] [[package]] @@ -4717,6 +5359,15 @@ dependencies = [ "windows-sys 0.42.0", ] +[[package]] +name = "scheduled-thread-pool" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19" +dependencies = [ + "parking_lot 0.12.1", +] + [[package]] name = "scopeguard" version = "1.1.0" @@ -4745,10 +5396,24 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" dependencies = [ - "base16ct", - "der", + "base16ct 0.1.1", + "der 0.6.1", "generic-array", - "pkcs8", + "pkcs8 0.9.0", + "subtle", + "zeroize", +] + +[[package]] +name = "sec1" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc" +dependencies = [ + "base16ct 0.2.0", + "der 0.7.8", + "generic-array", + "pkcs8 0.10.2", "subtle", "zeroize", ] @@ -4783,14 +5448,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" [[package]] -name = "sentry" -version = "0.31.6" +name = "send-future" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b" +checksum = "224e328af6e080cddbab3c770b1cf50f0351ba0577091ef2410c3951d835ff87" + +[[package]] +name = "sentry" +version = "0.32.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02" dependencies = [ "httpdate", - "reqwest", - "rustls", + "reqwest 0.12.4", + "rustls 0.21.11", "sentry-backtrace", "sentry-contexts", "sentry-core", @@ -4803,9 +5474,9 @@ dependencies = [ [[package]] name = "sentry-backtrace" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac2bac6f310c4c4c4bb094d1541d32ae497f8c5c23405e85492cefdfe0971a9" +checksum = "a79194074f34b0cbe5dd33896e5928bbc6ab63a889bd9df2264af5acb186921e" dependencies = [ "backtrace", "once_cell", @@ -4815,11 +5486,11 @@ dependencies = [ [[package]] name = "sentry-contexts" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c3e17295cecdbacf66c5bd38d6e1147e09e1e9d824d2d5341f76638eda02a3a" +checksum = "eba8870c5dba2bfd9db25c75574a11429f6b95957b0a78ac02e2970dd7a5249a" dependencies = [ - "hostname", + "hostname 0.4.0", "libc", "os_info", "rustc_version", @@ -4829,9 +5500,9 @@ dependencies = [ [[package]] name = "sentry-core" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8339474f587f36cb110fa1ed1b64229eea6d47b0b886375579297b7e47aeb055" +checksum = "46a75011ea1c0d5c46e9e57df03ce81f5c7f0a9e199086334a1f9c0a541e0826" dependencies = [ "once_cell", "rand 0.8.5", @@ -4842,9 +5513,9 @@ dependencies = [ [[package]] name = "sentry-panic" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "875b69f506da75bd664029eafb05f8934297d2990192896d17325f066bd665b7" +checksum = "2eaa3ecfa3c8750c78dcfd4637cfa2598b95b52897ed184b4dc77fcf7d95060d" dependencies = [ "sentry-backtrace", "sentry-core", @@ -4852,9 +5523,9 @@ dependencies = [ [[package]] name = "sentry-tracing" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89feead9bdd116f8035e89567651340fc382db29240b6c55ef412078b08d1aa3" +checksum = "f715932bf369a61b7256687c6f0554141b7ce097287e30e3f7ed6e9de82498fe" dependencies = [ "sentry-backtrace", "sentry-core", @@ -4864,13 +5535,13 @@ dependencies = [ [[package]] name = "sentry-types" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99dc599bd6646884fc403d593cdcb9816dd67c50cff3271c01ff123617908dcd" +checksum = "4519c900ce734f7a0eb7aba0869dfb225a7af8820634a7dd51449e3b093cfb7c" dependencies = [ "debugid", - "getrandom 0.2.11", "hex", + "rand 0.8.5", "serde", "serde_json", "thiserror", @@ -4887,9 +5558,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.183" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c" +checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" dependencies = [ "serde_derive", ] @@ -4906,22 +5577,23 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.183" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" +checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] name = "serde_json" -version = "1.0.96" +version = "1.0.125" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1" +checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed" dependencies = [ "itoa", + "memchr", "ryu", "serde", ] @@ -4949,9 +5621,9 @@ dependencies = [ [[package]] name = "serde_spanned" -version = "0.6.2" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93107647184f6027e3b7dcb2e11034cf95ffa1e3a682c67951963ac69c1c007d" +checksum = "79e674e01f999af37c49f70a6ede167a8a60b2503e56c5599532a65baa5969a0" dependencies = [ "serde", ] @@ -4993,7 +5665,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5015,13 +5687,23 @@ checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012" [[package]] name = "sha2" -version = "0.10.6" +version = "0.10.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" dependencies = [ "cfg-if", "cpufeatures", "digest", + "sha2-asm", +] + +[[package]] +name = "sha2-asm" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f27ba7066011e3fb30d808b51affff34f0a66d3a03a58edd787c6e420e40e44e" +dependencies = [ + "cc", ] [[package]] @@ -5079,6 +5761,16 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "digest", + "rand_core 0.6.4", +] + [[package]] name = "simple_asn1" version = "0.6.2" @@ -5108,9 +5800,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "smol_str" @@ -5152,9 +5844,6 @@ name = "spin" version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" -dependencies = [ - "lock_api", -] [[package]] name = "spki" @@ -5163,14 +5852,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b" dependencies = [ "base64ct", - "der", + "der 0.6.1", ] [[package]] -name = "stable_deref_trait" -version = "1.2.0" +name = "spki" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der 0.7.8", +] [[package]] name = "static_assertions" @@ -5192,7 +5885,7 @@ dependencies = [ "futures-util", "git-version", "humantime", - "hyper", + "hyper 0.14.26", "metrics", "once_cell", "parking_lot 0.12.1", @@ -5206,6 +5899,143 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "storage_controller" +version = "0.1.0" +dependencies = [ + "anyhow", + "aws-config", + "bytes", + "camino", + "chrono", + "clap", + "control_plane", + "diesel", + "diesel_migrations", + "fail", + "futures", + "git-version", + "hex", + "humantime", + 
"hyper 0.14.26", + "itertools 0.10.5", + "lasso", + "measured", + "metrics", + "once_cell", + "pageserver_api", + "pageserver_client", + "postgres_connection", + "r2d2", + "rand 0.8.5", + "reqwest 0.12.4", + "routerify", + "scopeguard", + "serde", + "serde_json", + "strum", + "strum_macros", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "utils", + "workspace_hack", +] + +[[package]] +name = "storage_controller_client" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes", + "futures", + "pageserver_api", + "pageserver_client", + "postgres", + "reqwest 0.12.4", + "serde", + "thiserror", + "tokio", + "tokio-postgres", + "tokio-stream", + "tokio-util", + "utils", + "workspace_hack", +] + +[[package]] +name = "storage_scrubber" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-stream", + "aws-config", + "aws-sdk-s3", + "aws-smithy-async", + "bincode", + "bytes", + "camino", + "chrono", + "clap", + "crc32c", + "either", + "futures", + "futures-util", + "git-version", + "hex", + "humantime", + "itertools 0.10.5", + "once_cell", + "pageserver", + "pageserver_api", + "postgres_ffi", + "rand 0.8.5", + "remote_storage", + "reqwest 0.12.4", + "rustls 0.22.4", + "rustls-native-certs 0.7.0", + "serde", + "serde_json", + "serde_with", + "storage_controller_client", + "thiserror", + "tokio", + "tokio-postgres", + "tokio-postgres-rustls", + "tokio-rustls 0.25.0", + "tokio-stream", + "tokio-util", + "tracing", + "tracing-appender", + "tracing-subscriber", + "utils", + "workspace_hack", +] + +[[package]] +name = "storcon_cli" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "comfy-table", + "futures", + "humantime", + "hyper 0.14.26", + "pageserver_api", + "pageserver_client", + "reqwest 0.12.4", + "serde", + "serde_json", + "storage_controller_client", + "thiserror", + "tokio", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "stringprep" version = "0.1.2" @@ -5234,7 +6064,7 @@ version = "0.24.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "rustversion", @@ -5249,9 +6079,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" -version = "0.4.1" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2" +checksum = "20e16a0f46cf5fd675563ef54f26e83e20f2366bcf027bcb3cc3ed2b98aaf2ca" [[package]] name = "syn" @@ -5266,9 +6096,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.32" +version = "2.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" +checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" dependencies = [ "proc-macro2", "quote", @@ -5280,6 +6110,9 @@ name = "sync_wrapper" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +dependencies = [ + "futures-core", +] [[package]] name = "synstructure" @@ -5330,15 +6163,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.5.0" +version = "3.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998" +checksum = 
"01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" dependencies = [ "cfg-if", - "fastrand 1.9.0", - "redox_syscall 0.3.5", - "rustix 0.37.25", - "windows-sys 0.45.0", + "fastrand 2.0.0", + "redox_syscall 0.4.1", + "rustix", + "windows-sys 0.52.0", ] [[package]] @@ -5348,7 +6181,6 @@ dependencies = [ "anyhow", "serde", "serde_json", - "workspace_hack", ] [[package]] @@ -5362,43 +6194,43 @@ dependencies = [ [[package]] name = "test-context" -version = "0.1.4" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3" +checksum = "6676ab8513edfd2601a108621103fdb45cac9098305ca25ec93f7023b06b05d9" dependencies = [ - "async-trait", "futures", "test-context-macros", ] [[package]] name = "test-context-macros" -version = "0.1.4" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d" +checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1" dependencies = [ + "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.52", ] [[package]] name = "thiserror" -version = "1.0.40" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" +checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.40" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" +checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5423,15 +6255,47 @@ dependencies = [ ] [[package]] -name = "time" -version = "0.3.21" +name = "tikv-jemalloc-ctl" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc" +checksum = "619bfed27d807b54f7f776b9430d4f8060e66ee138a28632ca898584d462c31c" dependencies = [ + "libc", + "paste", + "tikv-jemalloc-sys", +] + +[[package]] +name = "tikv-jemalloc-sys" +version = "0.5.4+5.3.0-patched" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9402443cb8fd499b6f327e40565234ff34dbda27460c5b47db0db77443dd85d1" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "tikv-jemallocator" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "965fe0c26be5c56c94e38ba547249074803efd52adfb66de62107d95aab3eaca" +dependencies = [ + "libc", + "tikv-jemalloc-sys", +] + +[[package]] +name = "time" +version = "0.3.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" +dependencies = [ + "deranged", "itoa", "js-sys", - "libc", - "num_threads", + "num-conv", + "powerfmt", "serde", "time-core", "time-macros", @@ -5439,16 +6303,17 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.9" 
+version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b" +checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" dependencies = [ + "num-conv", "time-core", ] @@ -5486,25 +6351,11 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" -[[package]] -name = "tls-listener" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81294c017957a1a69794f506723519255879e15a870507faf45dfed288b763dd" -dependencies = [ - "futures-util", - "hyper", - "pin-project-lite", - "thiserror", - "tokio", - "tokio-rustls", -] - [[package]] name = "tokio" -version = "1.34.0" +version = "1.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9" +checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" dependencies = [ "backtrace", "bytes", @@ -5518,6 +6369,22 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "tokio-epoll-uring" +version = "0.1.0" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6" +dependencies = [ + "futures", + "nix 0.26.4", + "once_cell", + "scopeguard", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "uring-common", +] + [[package]] name = "tokio-io-timeout" version = "1.2.0" @@ -5536,23 +6403,13 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", -] - -[[package]] -name = "tokio-native-tls" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" -dependencies = [ - "native-tls", - "tokio", + "syn 2.0.52", ] [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "async-trait", "byteorder", @@ -5574,16 +6431,17 @@ dependencies = [ [[package]] name = "tokio-postgres-rustls" -version = "0.10.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd5831152cb0d3f79ef5523b357319ba154795d64c7078b2daa95a803b54057f" +checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677" dependencies = [ "futures", - "ring 0.16.20", - "rustls", + "ring 0.17.6", + "rustls 0.22.4", "tokio", "tokio-postgres", - "tokio-rustls", + "tokio-rustls 0.25.0", + "x509-certificate", ] [[package]] @@ -5592,7 +6450,18 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" dependencies = [ - "rustls", + "rustls 0.21.11", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f" +dependencies = [ + "rustls 0.22.4", + "rustls-pki-types", "tokio", ] @@ -5645,7 +6514,7 @@ dependencies = [ "futures-io", "futures-sink", "futures-util", - "hashbrown 0.14.0", + "hashbrown 0.14.5", 
"pin-project-lite", "tokio", "tracing", @@ -5660,14 +6529,26 @@ dependencies = [ "serde", "serde_spanned", "toml_datetime", - "toml_edit", + "toml_edit 0.19.10", +] + +[[package]] +name = "toml" +version = "0.8.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f49eb2ab21d2f26bd6db7bf383edc527a7ebaee412d17af4d40fdccd442f335" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit 0.22.14", ] [[package]] name = "toml_datetime" -version = "0.6.2" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a76a9312f5ba4c2dec6b9161fdf25d87ad8a09256ccea5a556fef03c706a10f" +checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf" dependencies = [ "serde", ] @@ -5682,7 +6563,20 @@ dependencies = [ "serde", "serde_spanned", "toml_datetime", - "winnow", + "winnow 0.4.6", +] + +[[package]] +name = "toml_edit" +version = "0.22.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38" +dependencies = [ + "indexmap 2.0.1", + "serde", + "serde_spanned", + "toml_datetime", + "winnow 0.6.13", ] [[package]] @@ -5698,18 +6592,18 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "h2", - "http", - "http-body", - "hyper", + "h2 0.3.26", + "http 0.2.9", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-timeout", "percent-encoding", "pin-project", "prost", - "rustls-native-certs", - "rustls-pemfile", + "rustls-native-certs 0.6.2", + "rustls-pemfile 1.0.2", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-stream", "tower", "tower-layer", @@ -5762,17 +6656,6 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" -[[package]] -name = "trace" -version = "0.1.0" -dependencies = [ - "anyhow", - "clap", - "pageserver_api", - "utils", - "workspace_hack", -] - [[package]] name = "tracing" version = "0.1.37" @@ -5805,7 +6688,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5841,12 +6724,14 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.20.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc09e402904a5261e42cf27aea09ccb7d5318c6717a9eec3d8e2e65c56b18f19" +checksum = "75327c6b667828ddc28f5e3f169036cb793c3f588d83bf0f262a7f062ffed3c8" dependencies = [ "once_cell", "opentelemetry", + "opentelemetry_sdk", + "smallvec", "tracing", "tracing-core", "tracing-log", @@ -5887,23 +6772,22 @@ dependencies = [ name = "tracing-utils" version = "0.1.0" dependencies = [ - "hyper", + "hyper 0.14.26", "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", - "reqwest", + "reqwest 0.12.4", "tokio", "tracing", "tracing-opentelemetry", "tracing-subscriber", - "workspace_hack", ] [[package]] name = "try-lock" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "tungstenite" @@ -5914,7 +6798,7 @@ dependencies = [ "byteorder", "bytes", "data-encoding", - "http", + "http 0.2.9", "httparse", "log", "rand 0.8.5", @@ -5934,21 +6818,22 @@ dependencies = [ "static_assertions", ] 
+[[package]] +name = "typed-json" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6024a8d0025400b3f6b189366e9aa92012cf9c4fe1cd2620848dd61425c49eed" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "typenum" version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" -[[package]] -name = "tz-rs" -version = "0.6.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33851b15c848fad2cf4b105c6bb66eb9512b6f6c44a4b13f57c53c73c707e2b4" -dependencies = [ - "const_fn", -] - [[package]] name = "uname" version = "0.1.1" @@ -5958,15 +6843,6 @@ dependencies = [ "libc", ] -[[package]] -name = "unicase" -version = "2.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" -dependencies = [ - "version_check", -] - [[package]] name = "unicode-bidi" version = "0.3.13" @@ -6021,12 +6897,23 @@ dependencies = [ "base64 0.21.1", "log", "once_cell", - "rustls", + "rustls 0.21.11", "rustls-webpki 0.100.2", "url", "webpki-roots 0.23.1", ] +[[package]] +name = "uring-common" +version = "0.1.0" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6" +dependencies = [ + "bytes", + "io-uring", + "libc", + "linux-raw-sys 0.6.4", +] + [[package]] name = "url" version = "2.3.1" @@ -6063,7 +6950,7 @@ version = "0.1.0" dependencies = [ "anyhow", "arc-swap", - "async-trait", + "async-compression", "bincode", "byteorder", "bytes", @@ -6074,10 +6961,10 @@ dependencies = [ "criterion", "fail", "futures", - "heapless", "hex", "hex-literal", - "hyper", + "humantime", + "hyper 0.14.26", "jsonwebtoken", "metrics", "nix 0.27.1", @@ -6100,13 +6987,15 @@ dependencies = [ "thiserror", "tokio", "tokio-stream", + "tokio-tar", "tokio-util", + "toml_edit 0.19.10", "tracing", "tracing-error", "tracing-subscriber", "url", "uuid", - "workspace_hack", + "walkdir", ] [[package]] @@ -6185,7 +7074,6 @@ dependencies = [ "postgres_ffi", "regex", "utils", - "workspace_hack", ] [[package]] @@ -6206,7 +7094,6 @@ dependencies = [ "bindgen", "postgres_ffi", "utils", - "workspace_hack", ] [[package]] @@ -6232,10 +7119,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] -name = "wasm-bindgen" -version = "0.2.86" +name = "wasite" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73" +checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" + +[[package]] +name = "wasm-bindgen" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -6243,24 +7136,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", "wasm-bindgen-shared", ] [[package]] 
name = "wasm-bindgen-futures" -version = "0.4.36" +version = "0.4.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e" +checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" dependencies = [ "cfg-if", "js-sys", @@ -6270,9 +7163,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -6280,22 +7173,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" [[package]] name = "wasm-streams" @@ -6310,6 +7203,19 @@ dependencies = [ "web-sys", ] +[[package]] +name = "wasm-streams" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "wasm-timer" version = "0.2.5" @@ -6327,9 +7233,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.63" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" dependencies = [ "js-sys", "wasm-bindgen", @@ -6350,6 +7256,15 @@ version = "0.25.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc" +[[package]] +name = "webpki-roots" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3de34ae270483955a94f4b21bdaaeb83d508bb84a01435f393818edb0012009" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "which" version = "4.4.0" @@ -6361,6 +7276,17 @@ dependencies = [ "once_cell", ] +[[package]] +name = "whoami" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9" +dependencies = [ + "redox_syscall 0.4.1", + "wasite", + "web-sys", +] + [[package]] name = "winapi" version = "0.3.9" @@ -6401,6 +7327,25 @@ dependencies = [ "windows-targets 0.48.0", ] +[[package]] +name = "windows" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" +dependencies = [ + "windows-core", + "windows-targets 0.52.4", +] + +[[package]] +name = "windows-core" +version = 
"0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.4", +] + [[package]] name = "windows-sys" version = "0.42.0" @@ -6416,15 +7361,6 @@ dependencies = [ "windows_x86_64_msvc 0.42.2", ] -[[package]] -name = "windows-sys" -version = "0.45.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" -dependencies = [ - "windows-targets 0.42.2", -] - [[package]] name = "windows-sys" version = "0.48.0" @@ -6435,18 +7371,12 @@ dependencies = [ ] [[package]] -name = "windows-targets" -version = "0.42.2" +name = "windows-sys" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", + "windows-targets 0.52.4", ] [[package]] @@ -6464,6 +7394,21 @@ dependencies = [ "windows_x86_64_msvc 0.48.0", ] +[[package]] +name = "windows-targets" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +dependencies = [ + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" @@ -6476,6 +7421,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" + [[package]] name = "windows_aarch64_msvc" version = "0.42.2" @@ -6488,6 +7439,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" + [[package]] name = "windows_i686_gnu" version = "0.42.2" @@ -6500,6 +7457,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +[[package]] +name = "windows_i686_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" + [[package]] name = "windows_i686_msvc" version = "0.42.2" @@ -6512,6 +7475,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +[[package]] +name = "windows_i686_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" + [[package]] name = "windows_x86_64_gnu" version = "0.42.2" @@ -6524,6 +7493,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" @@ -6536,6 +7511,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" + [[package]] name = "windows_x86_64_msvc" version = "0.42.2" @@ -6548,6 +7529,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" + [[package]] name = "winnow" version = "0.4.6" @@ -6557,6 +7544,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "winnow" +version = "0.6.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1" +dependencies = [ + "memchr", +] + [[package]] name = "winreg" version = "0.50.0" @@ -6567,42 +7563,58 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "winreg" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + [[package]] name = "workspace_hack" version = "0.1.0" dependencies = [ + "ahash", "anyhow", "aws-config", "aws-runtime", "aws-sigv4", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-runtime-api", "aws-smithy-types", "axum", "base64 0.21.1", "base64ct", "bytes", + "camino", "cc", "chrono", "clap", "clap_builder", "crossbeam-utils", + "crypto-bigint 0.5.5", + "der 0.7.8", + "deranged", + "digest", "either", "fail", "futures-channel", - "futures-core", "futures-executor", "futures-io", - "futures-sink", "futures-util", + "generic-array", "getrandom 0.2.11", - "hashbrown 0.14.0", + "hashbrown 0.14.5", "hex", "hmac", - "hyper", + "hyper 0.14.26", "indexmap 1.9.3", - "itertools", + "itertools 0.10.5", + "itertools 0.12.1", + "lazy_static", "libc", "log", "memchr", @@ -6612,40 +7624,64 @@ dependencies = [ "num-traits", "once_cell", "parquet", + "proc-macro2", "prost", + "quote", "rand 0.8.5", "regex", "regex-automata 0.4.3", "regex-syntax 0.8.2", - "reqwest", - "ring 0.16.20", - "rustls", + "reqwest 0.11.19", + "reqwest 0.12.4", + "rustls 0.21.11", "scopeguard", "serde", "serde_json", + "sha2", + "signature 2.2.0", "smallvec", + "spki 0.7.3", "subtle", "syn 1.0.109", - "syn 2.0.32", + "syn 2.0.52", + "sync_wrapper", + "tikv-jemalloc-sys", "time", "time-macros", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-util", - "toml_datetime", - "toml_edit", "tonic", "tower", "tracing", 
"tracing-core", - "tungstenite", "url", "uuid", + "zeroize", "zstd", "zstd-safe", "zstd-sys", ] +[[package]] +name = "x509-certificate" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66534846dec7a11d7c50a74b7cdb208b9a581cad890b7866430d438455847c85" +dependencies = [ + "bcder", + "bytes", + "chrono", + "der 0.7.8", + "hex", + "pem", + "ring 0.17.6", + "signature 2.2.0", + "spki 0.7.3", + "thiserror", + "zeroize", +] + [[package]] name = "x509-parser" version = "0.15.0" @@ -6704,14 +7740,29 @@ checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] name = "zeroize" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" +checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" +dependencies = [ + "serde", + "zeroize_derive", +] + +[[package]] +name = "zeroize_derive" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.52", +] [[package]] name = "zstd" diff --git a/Cargo.toml b/Cargo.toml index eefd1cb114..5045ee0d4d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,21 +3,24 @@ resolver = "2" members = [ "compute_tools", "control_plane", - "control_plane/attachment_service", + "control_plane/storcon_cli", "pageserver", + "pageserver/compaction", "pageserver/ctl", "pageserver/client", "pageserver/pagebench", "proxy", "safekeeper", "storage_broker", - "s3_scrubber", + "storage_controller", + "storage_controller/client", + "storage_scrubber", "workspace_hack", - "trace", "libs/compute_api", "libs/pageserver_api", "libs/postgres_ffi", "libs/safekeeper_api", + "libs/desim", "libs/utils", "libs/consumption_metrics", "libs/postgres_backend", @@ -38,25 +41,31 @@ license = "Apache-2.0" ## All dependency versions, used in the project [workspace.dependencies] +ahash = "0.8" anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } -azure_core = "0.18" -azure_identity = "0.18" -azure_storage = "0.18" -azure_storage_blobs = "0.18" +atomic-take = "1.1.0" +azure_core = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] } +azure_identity = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage_blobs = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] } flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" -aws-config = { version = "1.0", default-features = false, features=["rustls"] } -aws-sdk-s3 = "1.0" -aws-smithy-async = { version = "1.0", default-features = false, features=["rt-tokio"] } -aws-smithy-types = "1.0" -aws-credential-types = "1.0" +aws-config = { version = "1.3", default-features = false, features=["rustls"] } +aws-sdk-s3 = "1.26" +aws-sdk-iam = "1.15.0" +aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] } +aws-smithy-types = "1.1.9" +aws-credential-types = "1.2.0" +aws-sigv4 = { version = "1.2.1", features = ["sign-http"] } +aws-types = "1.2.0" axum = { version = 
"0.6.20", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" -bindgen = "0.65" +bindgen = "0.70" +bit_field = "0.10.2" bstr = "1.0" byteorder = "1.4" bytes = "1.0" @@ -64,73 +73,81 @@ camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive"] } -close_fds = "0.3.2" comfy-table = "6.1" const_format = "0.2" crc32c = "0.6" +crossbeam-deque = "0.8.5" crossbeam-utils = "0.8.5" dashmap = { version = "5.5.0", features = ["raw-api"] } either = "1.8" enum-map = "2.4.2" enumset = "1.0.12" fail = "0.5.0" -fs2 = "0.4.3" +fallible-iterator = "0.2" +framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" } futures = "0.3" futures-core = "0.3" futures-util = "0.3" git-version = "0.3" -hashbrown = "0.13" -hashlink = "0.8.1" +hashbrown = "0.14" +hashlink = "0.9.1" hdrhistogram = "7.5.2" hex = "0.4" hex-literal = "0.4" hmac = "0.12.1" hostname = "0.3.1" +http = {version = "1.1.0", features = ["std"]} http-types = { version = "2", default-features = false } humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" -hyper-tungstenite = "0.11" +tokio-tungstenite = "0.20.0" +indexmap = "2" +indoc = "2" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" jsonwebtoken = "9" +lasso = "0.7" libc = "0.2" md5 = "0.7.0" +measured = { version = "0.0.22", features=["lasso"] } +measured-process = { version = "0.0.22" } memoffset = "0.8" -native-tls = "0.2" -nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } +nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] } notify = "6.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" opentelemetry = "0.20.0" -opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions = "0.12.0" parking_lot = "0.12" -parquet = { version = "49.0.0", default-features = false, features = ["zstd"] } -parquet_derive = "49.0.0" +parquet = { version = "51.0.0", default-features = false, features = ["zstd"] } +parquet_derive = "51.0.0" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" -prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency +procfs = "0.16" +prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = "0.11" rand = "0.8" -redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] } +redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" -reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] } -reqwest-middleware = "0.2.0" -reqwest-retry = "0.2.2" +reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } +reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_20"] } +reqwest-middleware = "0.3.0" +reqwest-retry = "0.5" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" -rustls = "0.21" -rustls-pemfile = "1" +rustls = "0.22" +rustls-pemfile = "2" rustls-split = "0.3" scopeguard = "1.1" sysinfo = "0.29.2" sd-notify = "0.4.1" -sentry = { version = "0.31", default-features = false, features = 
["backtrace", "contexts", "panic", "rustls", "reqwest" ] } +send-future = "0.1.0" +sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_path_to_error = "0.1" @@ -143,32 +160,41 @@ smol_str = { version = "0.2.0", features = ["serde"] } socket2 = "0.5" strum = "0.24" strum_macros = "0.24" -svg_fmt = "0.4.1" +"subtle" = "2.5.0" +svg_fmt = "0.4.3" sync_wrapper = "0.1.2" tar = "0.4" task-local-extensions = "0.1.4" -test-context = "0.1" +test-context = "0.3" thiserror = "1.0" -tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] } +tikv-jemallocator = "0.5" +tikv-jemalloc-ctl = "0.5" tokio = { version = "1.17", features = ["macros"] } +tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" -tokio-postgres-rustls = "0.10.0" -tokio-rustls = "0.24" +tokio-postgres-rustls = "0.11.0" +tokio-rustls = "0.25" tokio-stream = "0.1" tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } toml = "0.7" toml_edit = "0.19" tonic = {version = "0.9", features = ["tls", "tls-roots"]} +tower-service = "0.3.2" tracing = "0.1" tracing-error = "0.2.0" -tracing-opentelemetry = "0.20.0" -tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } +tracing-opentelemetry = "0.21.0" +tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } +try-lock = "0.2.5" +twox-hash = { version = "1.6.3", default-features = false } +typed-json = "0.1" url = "2.2" +urlencoding = "2.1" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" -webpki-roots = "0.25" +rustls-native-certs = "0.7" x509-parser = "0.15" +whoami = "1.5.1" ## TODO replace this with tracing env_logger = "0.10" @@ -176,27 +202,26 @@ log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -## Other git libraries -heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending - ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } metrics = { version = "0.1", path = "./libs/metrics/" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } pageserver_client = { path = "./pageserver/client" } +pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" } pq_proto = { version = "0.1", path = "./libs/pq_proto/" } remote_storage = { version = "0.1", path = 
"./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } +desim = { version = "0.1", path = "./libs/desim" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy. +storage_controller_client = { path = "./storage_controller/client" } tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" } tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" } utils = { version = "0.1", path = "./libs/utils/" } @@ -208,20 +233,19 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" } ## Build dependencies criterion = "0.5.1" -rcgen = "0.11" +rcgen = "0.12" rstest = "0.18" camino-tempfile = "1.0.2" tonic-build = "0.9" [patch.crates-io] -# This is only needed for proxy's tests. -# TODO: we should probably fork `tokio-postgres-rustls` instead. +# Needed to get `tokio-postgres-rustls` to depend on our fork. tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } # bug fixes for UUID -parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" } -parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" } +parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" } +parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" } ################# Binary contents sections diff --git a/Dockerfile b/Dockerfile index 5d5fde4f14..1efedfa9bc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh -ENV BUILD_TYPE release +ENV BUILD_TYPE=release RUN set -e \ && mold -run make -j $(nproc) -s neon-pg-ext \ && rm -rf pg_install/build \ @@ -29,34 +29,25 @@ WORKDIR /home/nonroot ARG GIT_VERSION=local ARG BUILD_TAG -# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. -# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. -# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build -ARG RUSTC_WRAPPER=cachepot -ENV AWS_REGION=eu-central-1 -ENV CACHEPOT_S3_KEY_PREFIX=cachepot -ARG CACHEPOT_BUCKET=neon-github-dev -#ARG AWS_ACCESS_KEY_ID -#ARG AWS_SECRET_ACCESS_KEY - COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server +COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib COPY --chown=nonroot . . -# Show build caching stats to check if it was used in the end. -# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. 
+ARG ADDITIONAL_RUSTFLAGS
 RUN set -e \
-    && mold -run cargo build \
+    && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \
       --bin pg_sni_router \
       --bin pageserver \
       --bin pagectl \
       --bin safekeeper \
       --bin storage_broker \
+      --bin storage_controller \
       --bin proxy \
       --bin neon_local \
-      --locked --release \
-    && cachepot -s
+      --bin storage_scrubber \
+      --locked --release

 # Build final image
 #
@@ -68,8 +59,6 @@ RUN set -e \
     && apt install -y \
         libreadline-dev \
         libseccomp-dev \
-        libicu67 \
-        openssl \
         ca-certificates \
     && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
     && useradd -d /data neon \
@@ -80,8 +69,10 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
@@ -90,15 +81,25 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/

 # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
 # Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values.
-RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
-    && /usr/local/bin/pageserver -D /data/.neon/ --init \
-       -c "id=1234" \
-       -c "broker_endpoint='http://storage_broker:50051'" \
-       -c "pg_distrib_dir='/usr/local/'" \
-       -c "listen_pg_addr='0.0.0.0:6400'" \
-       -c "listen_http_addr='0.0.0.0:9898'"
+RUN mkdir -p /data/.neon/ && \
+    echo "id=1234" > "/data/.neon/identity.toml" && \
+    echo "broker_endpoint='http://storage_broker:50051'\n" \
+         "pg_distrib_dir='/usr/local/'\n" \
+         "listen_pg_addr='0.0.0.0:6400'\n" \
+         "listen_http_addr='0.0.0.0:9898'\n" \
+         "availability_zone='local'\n" \
+    > /data/.neon/pageserver.toml && \
+    chown -R neon:neon /data/.neon
+
+# When running a binary that links with libpq, default to using our most recent postgres version. Binaries
+# that want a particular postgres version will select it explicitly: this is just a default.
+ENV LD_LIBRARY_PATH=/usr/local/v16/lib
+
 VOLUME ["/data"]
 USER neon
 EXPOSE 6400
 EXPOSE 9898
+
+CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"]
+
diff --git a/Dockerfile.buildtools b/Dockerfile.build-tools
similarity index 56%
rename from Dockerfile.buildtools
rename to Dockerfile.build-tools
index 213aed1679..a9cbed85fb 100644
--- a/Dockerfile.buildtools
+++ b/Dockerfile.build-tools
@@ -1,5 +1,13 @@
 FROM debian:bullseye-slim

+# Use ARG as a build-time environment variable here.
+# It's not supposed to be set outside.
+# Alternatively it can be obtained using the following command:
+# ```
+# . 
/etc/os-release && echo "${VERSION_CODENAME}" +# ``` +ARG DEBIAN_VERSION_CODENAME=bullseye + # Add nonroot user RUN useradd -ms /bin/bash nonroot -b /home SHELL ["/bin/bash", "-c"] @@ -26,7 +34,6 @@ RUN set -e \ liblzma-dev \ libncurses5-dev \ libncursesw5-dev \ - libpq-dev \ libreadline-dev \ libseccomp-dev \ libsqlite3-dev \ @@ -51,29 +58,40 @@ RUN set -e \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* # protobuf-compiler (protoc) -ENV PROTOC_VERSION 25.1 +ENV PROTOC_VERSION=25.1 RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ && unzip -q protoc.zip -d protoc \ && mv protoc/bin/protoc /usr/local/bin/protoc \ && mv protoc/include/google /usr/local/include/google \ && rm -rf protoc.zip protoc +# s5cmd +ENV S5CMD_VERSION=2.2.2 +RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \ + && chmod +x s5cmd \ + && mv s5cmd /usr/local/bin/s5cmd + # LLVM -ENV LLVM_VERSION=17 +ENV LLVM_VERSION=18 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ - && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ + && echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \ && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* -# PostgreSQL 14 -RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \ - && echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \ +# Install docker +RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \ && apt update \ - && apt install -y postgresql-client-14 \ + && apt install -y docker-ce docker-ce-cli \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +# Configure sudo & docker +RUN usermod -aG sudo nonroot && \ + echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers && \ + usermod -aG docker nonroot + # AWS CLI RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \ && unzip -q awscliv2.zip \ @@ -81,7 +99,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws && rm awscliv2.zip # Mold: A Modern Linker -ENV MOLD_VERSION v2.4.0 +ENV MOLD_VERSION=v2.33.0 RUN set -e \ && git clone https://github.com/rui314/mold.git \ && mkdir mold/build \ @@ -106,12 +124,51 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS && make install \ && rm -rf ../lcov.tar.gz +# Compile and install the static OpenSSL library +ENV OPENSSL_VERSION=1.1.1w +ENV OPENSSL_PREFIX=/usr/local/openssl +RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \ + echo 
"cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \ + cd /tmp && \ + tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ + rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ + cd /tmp/openssl-${OPENSSL_VERSION} && \ + ./config --prefix=${OPENSSL_PREFIX} -static --static no-shared -fPIC && \ + make -j "$(nproc)" && \ + make install && \ + cd /tmp && \ + rm -rf /tmp/openssl-${OPENSSL_VERSION} + +# Use the same version of libicu as the compute nodes so that +# clusters created using inidb on pageserver can be used by computes. +# +# TODO: at this time, Dockerfile.compute-node uses the debian bullseye libicu +# package, which is 67.1. We're duplicating that knowledge here, and also, technically, +# Debian has a few patches on top of 67.1 that we're not adding here. +ENV ICU_VERSION=67.1 +ENV ICU_PREFIX=/usr/local/icu + +# Download and build static ICU +RUN wget -O /tmp/libicu-${ICU_VERSION}.tgz https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION//./-}/icu4c-${ICU_VERSION//./_}-src.tgz && \ + echo "94a80cd6f251a53bd2a997f6f1b5ac6653fe791dfab66e1eb0227740fb86d5dc /tmp/libicu-${ICU_VERSION}.tgz" | sha256sum --check && \ + mkdir /tmp/icu && \ + pushd /tmp/icu && \ + tar -xzf /tmp/libicu-${ICU_VERSION}.tgz && \ + pushd icu/source && \ + ./configure --prefix=${ICU_PREFIX} --enable-static --enable-shared=no CXXFLAGS="-fPIC" CFLAGS="-fPIC" && \ + make -j "$(nproc)" && \ + make install && \ + popd && \ + rm -rf icu && \ + rm -f /tmp/libicu-${ICU_VERSION}.tgz && \ + popd + # Switch to nonroot user USER nonroot:nonroot WORKDIR /home/nonroot # Python -ENV PYTHON_VERSION=3.9.2 \ +ENV PYTHON_VERSION=3.9.19 \ PYENV_ROOT=/home/nonroot/.pyenv \ PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH RUN set -e \ @@ -135,9 +192,14 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.75.0 +ENV RUSTC_VERSION=1.81.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" +ARG RUSTFILT_VERSION=0.2.1 +ARG CARGO_HAKARI_VERSION=0.9.30 +ARG CARGO_DENY_VERSION=0.16.1 +ARG CARGO_HACK_VERSION=0.6.31 +ARG CARGO_NEXTEST_VERSION=0.9.72 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ chmod +x rustup-init && \ ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ @@ -146,15 +208,13 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux . 
"$HOME/.cargo/env" && \ cargo --version && rustup --version && \ rustup component add llvm-tools-preview rustfmt clippy && \ - cargo install --git https://github.com/paritytech/cachepot && \ - cargo install rustfilt && \ - cargo install cargo-hakari && \ - cargo install cargo-deny && \ - cargo install cargo-hack && \ - cargo install cargo-nextest && \ + cargo install rustfilt --version ${RUSTFILT_VERSION} && \ + cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \ + cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ + cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \ + cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \ rm -rf /home/nonroot/.cargo/registry && \ rm -rf /home/nonroot/.cargo/git -ENV RUSTC_WRAPPER=cachepot # Show versions RUN whoami \ @@ -164,3 +224,6 @@ RUN whoami \ && rustup --version --verbose \ && rustc --version --verbose \ && clang --version + +# Set following flag to check in Makefile if its running in Docker +RUN touch /home/nonroot/.docker_build diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index a5c1f3157d..b6c89cd71f 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -52,7 +52,7 @@ RUN cd postgres && \ # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser. # In vanilla postgres this function is limited to Postgres role superuser. # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases. - # We could add the additional grant statements to the postgres repository but it would be hard to maintain, + # We could add the additional grant statements to the postgres repository but it would be hard to maintain, # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork, # so we do it here. old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \ @@ -63,14 +63,14 @@ RUN cd postgres && \ echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \ fi; \ done; \ - # the second loop is for pg_stat_statement extension versions >= 1.7, + # the second loop is for pg_stat_statement extension versions >= 1.7, # where pg_stat_statement_reset() got 3 additional arguments for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ filename=$(basename "$file"); \ if ! echo "$old_list" | grep -q -F "$filename"; then \ echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \ fi; \ - done + done ######################################################################################### # @@ -89,16 +89,16 @@ RUN apt update && \ # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2 RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \ - mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ + mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ cmake -DCMAKE_BUILD_TYPE=Release . 
&& make -j $(getconf _NPROCESSORS_ONLN) && \ DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ make clean && cp -R /sfcgal/* / -ENV PATH "/usr/local/pgsql/bin:$PATH" +ENV PATH="/usr/local/pgsql/bin:$PATH" RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ - mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \ + mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ ./autogen.sh && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ @@ -124,7 +124,7 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postg RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \ - mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \ + mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ cmake -DCMAKE_BUILD_TYPE=Release .. && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -144,30 +144,23 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti FROM build-deps AS plv8-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ARG PG_VERSION RUN apt update && \ apt install -y ninja-build python3-dev libncurses5 binutils clang -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export PLV8_VERSION=3.1.5 \ - export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \ - ;; \ - "v16") \ - export PLV8_VERSION=3.1.8 \ - export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \ - echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \ - mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \ +RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ + echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \ + mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . 
&& \ + # generate and copy upgrade scripts + mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \ + cp upgrade/* /usr/local/pgsql/share/extension/ && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ + # don't break computes with installed old version of plv8 + cd /usr/local/pgsql/lib/ && \ + ln -s plv8-3.1.10.so plv8-3.1.5.so && \ + ln -s plv8-3.1.10.so plv8-3.1.8.so && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control @@ -201,7 +194,7 @@ RUN case "$(uname -m)" in \ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ - mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \ + mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ cmake .. -DCMAKE_BUILD_TYPE=Release && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -211,7 +204,7 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ - mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \ + mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -229,7 +222,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \ echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \ - mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ + mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ # unit extension's "create extension" script relies on absolute install path to fill some reference tables. @@ -248,11 +241,17 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - FROM build-deps AS vector-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \ - echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \ - mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ +COPY patches/pgvector.patch /pgvector.patch + +# By default, pgvector Makefile uses `-march=native`. 
We don't want that, +# because we build the images on different machines than where we run them. +# Pass OPTFLAGS="" to remove it. +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \ + echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \ + mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ + patch -p1 < /pgvector.patch && \ + make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control ######################################################################################### @@ -267,7 +266,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021 RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \ echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \ - mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \ + mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control @@ -282,7 +281,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ - mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \ + mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control @@ -298,7 +297,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ - mkdir pg_hashids-src && cd pg_hashids-src && tar xvzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ + mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control @@ -312,9 +311,12 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz FROM build-deps AS rum-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY patches/rum.patch /rum.patch + RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \ - mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . && \ + mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \ + patch -p1 < /rum.patch && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control @@ -330,7 +332,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \ - mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \ + mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control @@ -346,7 +348,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ - mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \ + mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control @@ -362,7 +364,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ - mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \ + mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control @@ -378,7 +380,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ - mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \ + mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control @@ -394,7 +396,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \ - mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ + mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control @@ -409,7 +411,7 @@ FROM build-deps AS timescaledb-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION -ENV PATH "/usr/local/pgsql/bin:$PATH" +ENV PATH="/usr/local/pgsql/bin:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ @@ -425,7 +427,7 @@ RUN case "${PG_VERSION}" in \ apt-get install -y cmake && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ - mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \ + mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . && \ ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ cd build && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -442,7 +444,7 @@ FROM build-deps AS pg-hint-plan-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION -ENV PATH "/usr/local/pgsql/bin:$PATH" +ENV PATH="/usr/local/pgsql/bin:$PATH" RUN case "${PG_VERSION}" in \ "v14") \ @@ -463,36 +465,11 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \ echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \ - mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xvzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \ + mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control -######################################################################################### -# -# Layer "kq-imcx-pg-build" -# compile kq_imcx extension -# -######################################################################################### -FROM build-deps AS kq-imcx-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - -ENV PATH "/usr/local/pgsql/bin/:$PATH" -RUN apt-get update && \ - apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \ - wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \ - echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \ - mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ - mkdir build && cd build && \ - cmake -DCMAKE_BUILD_TYPE=Release .. && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ - mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \ - sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \ - comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T - ######################################################################################### # @@ -503,10 +480,10 @@ RUN apt-get update && \ FROM build-deps AS pg-cron-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ - mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \ + mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control @@ -527,13 +504,12 @@ RUN apt-get update && \ libboost-regex1.74-dev \ libboost-serialization1.74-dev \ libboost-system1.74-dev \ - libeigen3-dev \ - libfreetype6-dev + libeigen3-dev -ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ - mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \ + mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ -D RDK_BUILD_INCHI_SUPPORT=ON \ @@ -553,6 +529,8 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar. 
-D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \ -D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \ -D RDK_INSTALL_INTREE=OFF \ + -D RDK_INSTALL_COMIC_FONTS=OFF \ + -D RDK_BUILD_FREETYPE_SUPPORT=OFF \ -D CMAKE_BUILD_TYPE=Release \ . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -568,10 +546,10 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar. FROM build-deps AS pg-uuidv7-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ - mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ + mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control @@ -585,10 +563,10 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz FROM build-deps AS pg-roaringbitmap-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ - mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ + mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control @@ -602,10 +580,10 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 FROM build-deps AS pg-semver-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ - mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \ + mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control @@ -620,7 +598,7 @@ FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ @@ -631,7 +609,7 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \ echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \ - mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \ + mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install @@ -644,10 +622,10 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS pg-anon-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" -RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \ - echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \ - mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \ +ENV PATH="/usr/local/pgsql/bin/:$PATH" +RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ + echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ + mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \ @@ -679,7 +657,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ - cargo install --locked --version 0.10.2 cargo-pgrx && \ + cargo install --locked --version 0.11.3 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root @@ -694,10 +672,15 @@ USER root FROM rust-extensions-build AS pg-jsonschema-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \ - echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \ - mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ +RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \ + echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \ + mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . 
&& \ + # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 + # `unsafe-postgres` feature allows to build pgx extensions + # against postgres forks that decided to change their ABI name (like us). + # With that we can build extensions without forking them and using stock + # pgx. As this feature is new few manual version bumps were required. + sed -i 's/pgrx = "0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control @@ -711,10 +694,10 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar. FROM rust-extensions-build AS pg-graphql-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \ - echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \ - mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ +RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \ + echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \ + mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ + sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ # it's needed to enable extension because it uses untrusted C language sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ @@ -733,7 +716,10 @@ ARG PG_VERSION # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023 RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \ - mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ + mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ + # TODO update pgrx version in the pg_tiktoken repo and remove this line + sed -i 's/pgrx = { version = "=0.10.2",/pgrx = { version = "0.11.3",/g' Cargo.toml && \ + sed -i 's/pgrx-tests = "=0.10.2"/pgrx-tests = "0.11.3"/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control @@ -747,14 +733,10 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6 FROM rust-extensions-build AS pg-pgx-ulid-build ARG PG_VERSION -RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \ - echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \ - mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . 
&& \ - echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \ - wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \ - patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \ - echo "********************************************************************************************************" && \ - sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "=0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ +RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ + echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ + mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ + sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control @@ -768,13 +750,47 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz - FROM build-deps AS wal2json-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ - mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ + mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install +######################################################################################### +# +# Layer "pg_ivm" +# compile pg_ivm extension +# +######################################################################################### +FROM build-deps AS pg-ivm-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH="/usr/local/pgsql/bin/:$PATH" +RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ + echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ + mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control + +######################################################################################### +# +# Layer "pg_partman" +# compile pg_partman extension +# +######################################################################################### +FROM build-deps AS pg-partman-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH="/usr/local/pgsql/bin/:$PATH" +RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ + echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ + mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . 
&& \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -806,7 +822,6 @@ COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -815,6 +830,9 @@ COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql +COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -825,6 +843,10 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_utils \ -s install && \ + make -j $(getconf _NPROCESSORS_ONLN) \ + PG_CONFIG=/usr/local/pgsql/bin/pg_config \ + -C pgxn/neon_test_utils \ + -s install && \ make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_rmgr \ @@ -856,7 +878,17 @@ ENV BUILD_TAG=$BUILD_TAG USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . -RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto +RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto + +######################################################################################### +# +# Final compute-tools image +# +######################################################################################### + +FROM debian:bullseye-slim AS compute-tools-image + +COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl ######################################################################################### # @@ -877,6 +909,70 @@ RUN rm -r /usr/local/pgsql/include # if they were to be used by other libraries. 
RUN rm /usr/local/pgsql/lib/lib*.a + +######################################################################################### +# +# Layer neon-pg-ext-test +# +######################################################################################### + +FROM neon-pg-ext-build AS neon-pg-ext-test +ARG PG_VERSION +RUN mkdir /ext-src + +#COPY --from=postgis-build /postgis.tar.gz /ext-src/ +#COPY --from=postgis-build /sfcgal/* /usr +COPY --from=plv8-build /plv8.tar.gz /ext-src/ +COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/ +COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/ +COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/ +COPY --from=vector-pg-build /pgvector.patch /ext-src/ +COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src +#COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src +#COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src +#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src +COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src +COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src +COPY --from=rum-pg-build /rum.tar.gz /ext-src +COPY patches/rum.patch /ext-src +#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src +COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src +COPY --from=prefix-pg-build /prefix.tar.gz /ext-src +COPY --from=hll-pg-build /hll.tar.gz /ext-src +COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src +#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src +COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src +COPY patches/pg_hint_plan.patch /ext-src +COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src +COPY patches/pg_cron.patch /ext-src +#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src +#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src +COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src +COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src +COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src +#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src +#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src +COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src +COPY patches/pg_anon.patch /ext-src +COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src +COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src +RUN cd /ext-src/ && for f in *.tar.gz; \ + do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \ + rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ + || exit 1; rm -f $f; done +RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch +RUN cd /ext-src/rum-src && patch -p1 <../rum.patch +# cmake is required for the h3 test +RUN apt-get update && apt-get install -y cmake +RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch +COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh +RUN patch -p1 target/CACHEDIR.TAG ### PostgreSQL parts # Some rules are duplicated for Postgres v14 and 15. We may want to refactor # to avoid the duplication in the future, but it's tolerable for now. 
# $(POSTGRES_INSTALL_DIR)/build/%/config.status: + + mkdir -p $(POSTGRES_INSTALL_DIR) + test -e $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG + +@echo "Configuring Postgres $* build" @test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \ echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \ echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \ exit 1; } mkdir -p $(POSTGRES_INSTALL_DIR)/build/$* - (cd $(POSTGRES_INSTALL_DIR)/build/$* && \ - env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \ + + VERSION=$*; \ + EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \ + (cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \ + env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \ CFLAGS='$(PG_CFLAGS)' \ - $(PG_CONFIGURE_OPTS) \ - --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log) + $(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \ + --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log) # nicer alias to run 'configure' # Note: I've been unable to use templates for this part of our configuration. @@ -117,6 +148,8 @@ postgres-%: postgres-configure-% \ $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install +@echo "Compiling amcheck $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install + +@echo "Compiling test_decoding $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/test_decoding install .PHONY: postgres-clean-% postgres-clean-%: @@ -157,8 +190,8 @@ neon-pg-ext-%: postgres-% -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install -.PHONY: neon-pg-ext-clean-% -neon-pg-ext-clean-%: +.PHONY: neon-pg-clean-ext-% +neon-pg-clean-ext-%: $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ -C $(POSTGRES_INSTALL_DIR)/build/neon-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean @@ -174,10 +207,10 @@ neon-pg-ext-clean-%: # Build walproposer as a static library. walproposer source code is located # in the pgxn/neon directory. -# +# # We also need to include libpgport.a and libpgcommon.a, because walproposer # uses some functions from those libraries. -# +# # Some object files are removed from libpgport.a and libpgcommon.a because # they depend on openssl and other libraries that are not included in our # Rust build. @@ -214,11 +247,11 @@ neon-pg-ext: \ neon-pg-ext-v15 \ neon-pg-ext-v16 -.PHONY: neon-pg-ext-clean -neon-pg-ext-clean: \ - neon-pg-ext-clean-v14 \ - neon-pg-ext-clean-v15 \ - neon-pg-ext-clean-v16 +.PHONY: neon-pg-clean-ext +neon-pg-clean-ext: \ + neon-pg-clean-ext-v14 \ + neon-pg-clean-ext-v15 \ + neon-pg-clean-ext-v16 # shorthand to build all Postgres versions .PHONY: postgres @@ -247,7 +280,7 @@ postgres-check: \ # This doesn't remove the effects of 'configure'. .PHONY: clean -clean: postgres-clean neon-pg-ext-clean +clean: postgres-clean neon-pg-clean-ext $(CARGO_CMD_PREFIX) cargo clean # This removes everything diff --git a/NOTICE b/NOTICE index c13dc2f0b3..52fc751c41 100644 --- a/NOTICE +++ b/NOTICE @@ -1,5 +1,5 @@ Neon -Copyright 2022 Neon Inc. +Copyright 2022 - 2024 Neon Inc. The PostgreSQL submodules in vendor/ are licensed under the PostgreSQL license. See vendor/postgres-vX/COPYRIGHT for details. 
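Two build-system details in the Makefile hunks above are worth a quick illustration. The `--with-extra-version` switch stamps the vendored Postgres submodule commit into the server version string, and the `CACHEDIR.TAG` files mark generated directories so backup and sync tools can skip them (the tag's first line is a fixed signature from the Cache Directory Tagging specification). A minimal sketch of checking both, assuming the default `POSTGRES_INSTALL_DIR=pg_install` layout; the version number and commit hash shown are placeholders:

```sh
# The extra-version suffix surfaces in the binary's version output.
$ pg_install/v16/bin/postgres --version
postgres (PostgreSQL) 16.x (0123abc...commit of vendor/postgres-v16)

# CACHEDIR.TAG starts with the well-known signature; tools such as
# restic and borg skip any directory that contains such a file.
$ head -1 pg_install/CACHEDIR.TAG
Signature: 8a477f597d28d172789f06886806bc55
```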
diff --git a/README.md b/README.md index 98af1edee6..b54956f773 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,13 @@ -[![Neon](https://user-images.githubusercontent.com/13738772/236813940-dcfdcb5b-69d3-449b-a686-013febe834d4.png)](https://neon.tech) +[![Neon](https://github.com/neondatabase/neon/assets/11527560/f15a17f0-836e-40c5-b35d-030606a6b660)](https://neon.tech) + + # Neon Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes. ## Quick start -Try the [Neon Free Tier](https://neon.tech/docs/introduction/technical-preview-free-tier/) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions. +Try the [Neon Free Tier](https://neon.tech/github) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions. Alternatively, compile and run the project [locally](#running-local-installation). @@ -14,8 +16,8 @@ Alternatively, compile and run the project [locally](#running-local-installation) A Neon installation consists of compute nodes and the Neon storage engine. Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine. The Neon storage engine consists of two major components: -- Pageserver. Scalable storage backend for the compute nodes. -- Safekeepers. The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage. +- Pageserver: Scalable storage backend for the compute nodes. +- Safekeepers: The safekeepers form a redundant WAL service that receives WAL from the compute node and stores it durably until it has been processed by the pageserver and uploaded to cloud storage. See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information. @@ -62,6 +64,12 @@ brew install protobuf openssl flex bison icu4c pkg-config echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc ``` +If you get errors about missing `m4`, you may have to install it manually: +``` +brew install m4 +brew link --force m4 +``` + 2. [Install Rust](https://www.rust-lang.org/tools/install) ``` # recommended approach from https://www.rust-lang.org/tools/install @@ -81,9 +89,9 @@ The project uses [rust toolchain file](./rust-toolchain.toml) to define the vers This file is automatically picked up by [`rustup`](https://rust-lang.github.io/rustup/overrides.html#the-toolchain-file) that installs (if absent) and uses the toolchain version pinned in the file. -rustup users who want to build with another toolchain can use [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory.
+rustup users who want to build with another toolchain can use the [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory. -non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify their toolchain matches the version in the file. +non-rustup users most probably are not getting the same toolchain automatically from the file, so they are responsible for manually verifying that their toolchain matches the version in the file. Newer rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates. #### Building on Linux @@ -124,7 +132,7 @@ make -j`sysctl -n hw.logicalcpu` -s To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively. To run the integration tests or Python scripts (not required to use the code), install -Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory. +Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory. #### Running neon database @@ -166,7 +174,7 @@ Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55432/postgres' 2. Now, it is possible to connect to postgres and run some queries: ```text -> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres +> psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres postgres=# CREATE TABLE t(key int primary key, value text); CREATE TABLE postgres=# insert into t values(1,1); @@ -205,7 +213,7 @@ Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres' # this new postgres instance will have all the data from 'main' postgres, # but all modifications would not affect data in original postgres -> psql -p55434 -h 127.0.0.1 -U cloud_admin postgres +> psql -p 55434 -h 127.0.0.1 -U cloud_admin postgres postgres=# select * from t; key | value -----+------- @@ -216,7 +224,7 @@ postgres=# insert into t values(2,2); INSERT 0 1 # check that the new change doesn't affect the 'main' postgres -> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres +> psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres postgres=# select * from t; key | value -----+------- (1 row) ``` -4. If you want to run tests afterward (see below), you must stop all the running of the pageserver, safekeeper, and postgres instances +4. If you want to run tests afterwards (see below), you must stop all the running pageserver, safekeeper, and postgres instances you have just started. You can terminate them all with one command: ```sh > cargo neon stop ``` +More advanced usage can be found in [Control Plane and Neon Local](./control_plane/README.md). + +#### Handling build failures + +If you encounter errors while setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems, and start the setup again. + ## Running tests +### Rust unit tests + +We are using [`cargo-nextest`](https://nexte.st/) to run the tests in GitHub Workflows. +Some crates do not support running plain `cargo test` anymore; prefer `cargo nextest run` instead. +You can install `cargo-nextest` with `cargo install cargo-nextest`.
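For example, a typical local workflow looks like this; the crate name is only an illustration, any workspace member works the same way:

```sh
# One-time install of the test runner.
cargo install cargo-nextest --locked

# Drop-in replacement for `cargo test`: run the whole suite...
cargo nextest run

# ...or restrict the run to a single crate, as with cargo test.
cargo nextest run -p pageserver
```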
+ +### Integration tests + Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes). ```sh @@ -243,12 +265,28 @@ CARGO_BUILD_FLAGS="--features=testing" make ``` By default, this runs both debug and release modes, and all supported postgres versions. When -testing locally, it is convenient to run just run one set of permutations, like this: +testing locally, it is convenient to run just one set of permutations, like this: ```sh -DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest +DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest ``` +## Flamegraphs + +You may find yourself in need of flamegraphs for software in this repository. +You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or the original [`flamegraph.pl`](https://github.com/brendangregg/FlameGraph). Your choice! + +>[!IMPORTANT] +> If you're using `lld` or `mold`, you need the `--no-rosegment` linker argument. +> It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository. +> See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764). + +## Cleanup + +To clean build artifacts from the source tree, run `make clean` in the source directory. + +To remove every artifact from the build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directory will remove your database, with all data in it. You have been warned! + ## Documentation [docs](/docs) Contains a top-level overview of all available markdown documentation. diff --git a/clippy.toml b/clippy.toml index d788afc84d..4c0c04f9a1 100644 --- a/clippy.toml +++ b/clippy.toml @@ -2,4 +2,13 @@ disallowed-methods = [ "tokio::task::block_in_place", # Allow this for now, to deny it later once we stop using Handle::block_on completely # "tokio::runtime::Handle::block_on", + # use tokio_epoll_uring_ext instead + "tokio_epoll_uring::thread_local_system", +] + +disallowed-macros = [ + # use std::pin::pin + "futures::pin_mut", + # cannot disallow this, because clippy finds uses from tokio macros + #"tokio::pin", ] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 759a117ee9..8af0ed43ce 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -4,6 +4,11 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +default = [] +# Enables test-specific features.
+testing = [] + [dependencies] anyhow.workspace = true async-compression.workspace = true @@ -27,10 +32,12 @@ reqwest = { workspace = true, features = ["json"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tokio-util.workspace = true +tokio-stream.workspace = true tracing.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true +thiserror.workspace = true url.workspace = true compute_api.workspace = true @@ -42,3 +49,4 @@ vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" } zstd = "0.13" bytes = "1.0" rust-ini = "0.20.0" +rlimit = "0.10.1" diff --git a/compute_tools/README.md b/compute_tools/README.md index 22a7de7cb7..8d84031efc 100644 --- a/compute_tools/README.md +++ b/compute_tools/README.md @@ -32,6 +32,29 @@ compute_ctl -D /var/db/postgres/compute \ -b /usr/local/bin/postgres ``` +## State Diagram + +Computes can be in various states. Below is a diagram that details how a +compute moves between states. + +```mermaid +%% https://mermaid.js.org/syntax/stateDiagram.html +stateDiagram-v2 + [*] --> Empty : Compute spawned + Empty --> ConfigurationPending : Waiting for compute spec + ConfigurationPending --> Configuration : Received compute spec + Configuration --> Failed : Failed to configure the compute + Configuration --> Running : Compute has been configured + Empty --> Init : Compute spec is immediately available + Empty --> TerminationPending : Requested termination + Init --> Failed : Failed to start Postgres + Init --> Running : Started Postgres + Running --> TerminationPending : Requested termination + TerminationPending --> Terminated : Terminated compute + Failed --> [*] : Compute exited + Terminated --> [*] : Compute exited +``` + ## Tests Cargo formatter: diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index a7e10d0aee..9499a7186e 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -6,7 +6,7 @@ //! - Every start is a fresh start, so the data directory is removed and //! initialized again on each run. //! - If remote_extension_config is provided, it will be used to fetch extensions list -//! and download `shared_preload_libraries` from the remote storage. +//! and download `shared_preload_libraries` from the remote storage. //! - Next it will put configuration files into the `PGDATA` directory. //! - Sync safekeepers and get commit LSN. //! - Get `basebackup` from pageserver using the returned on the previous step LSN. @@ -33,7 +33,6 @@ //! -b /usr/local/bin/postgres \ //! -r http://pg-ext-s3-gateway \ //! ``` -//! 
use std::collections::HashMap; use std::fs::File; use std::path::Path; @@ -45,15 +44,18 @@ use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; -use nix::sys::signal::{kill, Signal}; +use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; -use tracing::{error, info}; +use tracing::{error, info, warn}; use url::Url; use compute_api::responses::ComputeStatus; +use compute_api::spec::ComputeSpec; -use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID}; +use compute_tools::compute::{ + forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID, +}; use compute_tools::configurator::launch_configurator; use compute_tools::extension_server::get_pg_version; use compute_tools::http::api::launch_http_server; @@ -61,12 +63,45 @@ use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; +use compute_tools::swap::resize_swap; +use rlimit::{setrlimit, Resource}; // this is an arbitrary build tag. Fine as a default / for testing purposes // in-case of not-set environment var const BUILD_TAG_DEFAULT: &str = "latest"; fn main() -> Result<()> { + let (build_tag, clap_args) = init()?; + + // enable core dumping for all child processes + setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; + + let (pg_handle, start_pg_result) = { + // Enter startup tracing context + let _startup_context_guard = startup_context_from_env(); + + let cli_args = process_cli(&clap_args)?; + + let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?; + + let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?; + + start_postgres(&clap_args, wait_spec_result)? + + // Startup is finished, exit the startup tracing span + }; + + // PostgreSQL is now running, if startup was successful. Wait until it exits. 
+ let wait_pg_result = wait_postgres(pg_handle)?; + + let delay_exit = cleanup_after_postgres_exit(start_pg_result)?; + + maybe_delay_exit(delay_exit); + + deinit_and_exit(wait_pg_result); +} + +fn init() -> Result<(String, clap::ArgMatches)> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; @@ -81,9 +116,15 @@ fn main() -> Result<()> { .to_string(); info!("build_tag: {build_tag}"); - let matches = cli().get_matches(); - let pgbin_default = String::from("postgres"); - let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default); + Ok((build_tag, cli().get_matches())) +} + +fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> { + let pgbin_default = "postgres"; + let pgbin = matches + .get_one::<String>("pgbin") + .map(|s| s.as_str()) + .unwrap_or(pgbin_default); let ext_remote_storage = matches .get_one::<String>("remote-ext-config") @@ -109,7 +150,32 @@ fn main() -> Result<()> { .expect("Postgres connection string is required"); let spec_json = matches.get_one::<String>("spec"); let spec_path = matches.get_one::<String>("spec-path"); + let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind"); + Ok(ProcessCliResult { + connstr, + pgdata, + pgbin, + ext_remote_storage, + http_port, + spec_json, + spec_path, + resize_swap_on_bind, + }) +} + +struct ProcessCliResult<'clap> { + connstr: &'clap str, + pgdata: &'clap str, + pgbin: &'clap str, + ext_remote_storage: Option<&'clap str>, + http_port: u16, + spec_json: Option<&'clap String>, + spec_path: Option<&'clap String>, + resize_swap_on_bind: bool, +} + +fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> { // Extract OpenTelemetry context for the startup actions from the // TRACEPARENT and TRACESTATE env variables, and attach it to the current // tracing context. @@ -146,7 +212,7 @@ fn main() -> Result<()> { if let Ok(val) = std::env::var("TRACESTATE") { startup_tracing_carrier.insert("tracestate".to_string(), val); } - let startup_context_guard = if !startup_tracing_carrier.is_empty() { + if !startup_tracing_carrier.is_empty() { use opentelemetry::propagation::TextMapPropagator; use opentelemetry::sdk::propagation::TraceContextPropagator; let guard = TraceContextPropagator::new() @@ -156,8 +222,17 @@ fn main() -> Result<()> { Some(guard) } else { None - }; + } +} +fn try_spec_from_cli( + matches: &clap::ArgMatches, + ProcessCliResult { + spec_json, + spec_path, + .. + }: &ProcessCliResult, +) -> Result<CliSpecParams> { let compute_id = matches.get_one::<String>("compute-id"); let control_plane_uri = matches.get_one::<String>("control-plane-uri"); @@ -198,6 +273,34 @@ fn main() -> Result<()> { } }; + Ok(CliSpecParams { + spec, + live_config_allowed, + }) +} + +struct CliSpecParams { + /// If a spec was provided via CLI or file, the [`ComputeSpec`] + spec: Option<ComputeSpec>, + live_config_allowed: bool, +} + +fn wait_spec( + build_tag: String, + ProcessCliResult { + connstr, + pgdata, + pgbin, + ext_remote_storage, + resize_swap_on_bind, + http_port, + .. + }: ProcessCliResult, + CliSpecParams { + spec, + live_config_allowed, + }: CliSpecParams, +) -> Result<WaitSpecResult> { let mut new_state = ComputeState::new(); let spec_set; @@ -225,19 +328,17 @@ fn main() -> Result<()> { // If this is a pooled VM, prewarm before starting HTTP server and becoming // available for binding. Prewarming helps Postgres start quicker later, - // because QEMU will already have it's memory allocated from the host, and + // because QEMU will already have its memory allocated from the host, and // the necessary binaries will already be cached.
if !spec_set { compute.prewarm_postgres()?; } - // Launch http service first, so we were able to serve control-plane - // requests, while configuration is still in progress. + // Launch http service first, so that we can serve control-plane requests + // while configuration is still in progress. let _http_handle = launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread"); - let extension_server_port: u16 = http_port; - if !spec_set { // No spec provided, hang waiting for it. info!("no compute spec provided, waiting"); @@ -252,21 +353,47 @@ fn main() -> Result<()> { break; } } + + // Record for how long we slept waiting for the spec. + let now = Utc::now(); + state.metrics.wait_for_spec_ms = now + .signed_duration_since(state.start_time) + .to_std() + .unwrap() + .as_millis() as u64; + + // Reset start time, so that the total startup time that is calculated later will + // not include the time that we waited for the spec. + state.start_time = now; } + launch_lsn_lease_bg_task_for_static(&compute); + + Ok(WaitSpecResult { + compute, + http_port, + resize_swap_on_bind, + }) +} + +struct WaitSpecResult { + compute: Arc<ComputeNode>, + // passed through from ProcessCliResult + http_port: u16, + resize_swap_on_bind: bool, +} + +fn start_postgres( + // need to allow unused because `matches` is only used if target_os = "linux" + #[allow(unused_variables)] matches: &clap::ArgMatches, + WaitSpecResult { + compute, + http_port, + resize_swap_on_bind, + }: WaitSpecResult, +) -> Result<(Option<PostgresHandle>, StartPostgresResult)> { // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); - - // Record for how long we slept waiting for the spec. - state.metrics.wait_for_spec_ms = Utc::now() - .signed_duration_since(state.start_time) - .to_std() - .unwrap() - .as_millis() as u64; - // Reset start time to the actual start of the configuration, so that - // total startup time was properly measured at the end. - state.start_time = Utc::now(); - state.status = ComputeStatus::Init; compute.state_changed.notify_all(); @@ -274,33 +401,72 @@ fn main() -> Result<()> { "running compute with features: {:?}", state.pspec.as_ref().unwrap().spec.features ); + // before we release the mutex, fetch the swap size (if any) for later. + let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes; drop(state); // Launch remaining service threads let _monitor_handle = launch_monitor(&compute); let _configurator_handle = launch_configurator(&compute); - // Start Postgres + let mut prestartup_failed = false; let mut delay_exit = false; - let mut exit_code = None; - let pg = match compute.start_compute(extension_server_port) { - Ok(pg) => Some(pg), - Err(err) => { - error!("could not start the compute node: {:#}", err); - let mut state = compute.state.lock().unwrap(); - state.error = Some(format!("{:?}", err)); - state.status = ComputeStatus::Failed; - // Notify others that Postgres failed to start. In case of configuring the - // empty compute, it's likely that API handler is still waiting for compute - // state change. With this we will notify it that compute is in Failed state, - // so control plane will know about it earlier and record proper error instead - // of timeout.
 
+    launch_lsn_lease_bg_task_for_static(&compute);
+
+    Ok(WaitSpecResult {
+        compute,
+        http_port,
+        resize_swap_on_bind,
+    })
+}
+
+struct WaitSpecResult {
+    compute: Arc<ComputeNode>,
+    // passed through from ProcessCliResult
+    http_port: u16,
+    resize_swap_on_bind: bool,
+}
+
+fn start_postgres(
+    // need to allow unused because `matches` is only used if target_os = "linux"
+    #[allow(unused_variables)] matches: &clap::ArgMatches,
+    WaitSpecResult {
+        compute,
+        http_port,
+        resize_swap_on_bind,
+    }: WaitSpecResult,
+) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
     // We got all we need, update the state.
     let mut state = compute.state.lock().unwrap();
-
-    // Record for how long we slept waiting for the spec.
-    state.metrics.wait_for_spec_ms = Utc::now()
-        .signed_duration_since(state.start_time)
-        .to_std()
-        .unwrap()
-        .as_millis() as u64;
-    // Reset start time to the actual start of the configuration, so that
-    // total startup time was properly measured at the end.
-    state.start_time = Utc::now();
-
     state.status = ComputeStatus::Init;
     compute.state_changed.notify_all();
 
@@ -274,33 +401,72 @@ fn main() -> Result<()> {
         "running compute with features: {:?}",
         state.pspec.as_ref().unwrap().spec.features
     );
+    // before we release the mutex, fetch the swap size (if any) for later.
+    let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes;
     drop(state);
 
     // Launch remaining service threads
     let _monitor_handle = launch_monitor(&compute);
     let _configurator_handle = launch_configurator(&compute);
 
-    // Start Postgres
+    let mut prestartup_failed = false;
     let mut delay_exit = false;
-    let mut exit_code = None;
-    let pg = match compute.start_compute(extension_server_port) {
-        Ok(pg) => Some(pg),
-        Err(err) => {
-            error!("could not start the compute node: {:#}", err);
-            let mut state = compute.state.lock().unwrap();
-            state.error = Some(format!("{:?}", err));
-            state.status = ComputeStatus::Failed;
-            // Notify others that Postgres failed to start. In case of configuring the
-            // empty compute, it's likely that API handler is still waiting for compute
-            // state change. With this we will notify it that compute is in Failed state,
-            // so control plane will know about it earlier and record proper error instead
-            // of timeout.
-            compute.state_changed.notify_all();
-            drop(state); // unlock
-            delay_exit = true;
-            None
+
+    // Resize swap to the desired size if the compute spec says so
+    if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) {
+        // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
+        // *before* starting postgres.
+        //
+        // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this
+        // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets
+        // OOM-killed during startup because swap wasn't available yet.
+        match resize_swap(size_bytes) {
+            Ok(()) => {
+                let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
+                info!(%size_bytes, %size_mib, "resized swap");
+            }
+            Err(err) => {
+                let err = err.context("failed to resize swap");
+                error!("{err:#}");
+
+                // Mark compute startup as failed; don't try to start postgres, and report this
+                // error to the control plane when it next asks.
+                prestartup_failed = true;
+                let mut state = compute.state.lock().unwrap();
+                state.error = Some(format!("{err:?}"));
+                state.status = ComputeStatus::Failed;
+                compute.state_changed.notify_all();
+                delay_exit = true;
+            }
+        }
-    };
+    }
+
+    let extension_server_port: u16 = http_port;
+
+    // Start Postgres
+    let mut pg = None;
+    if !prestartup_failed {
+        pg = match compute.start_compute(extension_server_port) {
+            Ok(pg) => Some(pg),
+            Err(err) => {
+                error!("could not start the compute node: {:#}", err);
+                let mut state = compute.state.lock().unwrap();
+                state.error = Some(format!("{:?}", err));
+                state.status = ComputeStatus::Failed;
+                // Notify others that Postgres failed to start. In case of configuring the
+                // empty compute, it's likely that API handler is still waiting for compute
+                // state change. With this we will notify it that compute is in Failed state,
+                // so control plane will know about it earlier and record proper error instead
+                // of timeout.
+                compute.state_changed.notify_all();
+                drop(state); // unlock
+                delay_exit = true;
+                None
+            }
+        };
+    } else {
+        warn!("skipping postgres startup because pre-startup step failed");
+    }
 
     // Start the vm-monitor if directed to. The vm-monitor only runs on linux
     // because it requires cgroups.
@@ -333,7 +499,7 @@ fn main() -> Result<()> {
             // This token is used internally by the monitor to clean up all threads
             let token = CancellationToken::new();
 
-            let vm_monitor = &rt.as_ref().map(|rt| {
+            let vm_monitor = rt.as_ref().map(|rt| {
                 rt.spawn(vm_monitor::start(
                     Box::leak(Box::new(vm_monitor::Args {
                         cgroup: cgroup.cloned(),
@@ -346,12 +512,41 @@ fn main() -> Result<()> {
         }
     }
 
+    Ok((
+        pg,
+        StartPostgresResult {
+            delay_exit,
+            compute,
+            #[cfg(target_os = "linux")]
+            rt,
+            #[cfg(target_os = "linux")]
+            token,
+            #[cfg(target_os = "linux")]
+            vm_monitor,
+        },
+    ))
+}
+
+type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>);
+
+struct StartPostgresResult {
+    delay_exit: bool,
+    // passed through from WaitSpecResult
+    compute: Arc<ComputeNode>,
+
+    #[cfg(target_os = "linux")]
+    rt: Option<tokio::runtime::Runtime>,
+    #[cfg(target_os = "linux")]
+    token: tokio_util::sync::CancellationToken,
+    #[cfg(target_os = "linux")]
+    vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
+}
+
+fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
     // Wait for the child Postgres process forever. In this state Ctrl+C will
     // propagate to Postgres and it will be shut down as well.
+    let mut exit_code = None;
     if let Some((mut pg, logs_handle)) = pg {
-        // Startup is finished, exit the startup tracing span
-        drop(startup_context_guard);
-
         let ecode = pg
             .wait()
             .expect("failed to start waiting on Postgres process");
@@ -366,6 +561,25 @@ fn main() -> Result<()> {
         exit_code = ecode.code()
     }
 
+    Ok(WaitPostgresResult { exit_code })
+}
+
+struct WaitPostgresResult {
+    exit_code: Option<i32>,
+}
+
+fn cleanup_after_postgres_exit(
+    StartPostgresResult {
+        mut delay_exit,
+        compute,
+        #[cfg(target_os = "linux")]
+        vm_monitor,
+        #[cfg(target_os = "linux")]
+        token,
+        #[cfg(target_os = "linux")]
+        rt,
+    }: StartPostgresResult,
+) -> Result<bool> {
     // Terminate the vm_monitor so it releases the file watcher on
     // /sys/fs/cgroup/neon-postgres.
     // Note: the vm-monitor only runs on linux because it requires cgroups.
@@ -394,17 +608,32 @@ fn main() -> Result<()> {
         info!("synced safekeepers at lsn {lsn}");
     }
 
+    let mut state = compute.state.lock().unwrap();
+    if state.status == ComputeStatus::TerminationPending {
+        state.status = ComputeStatus::Terminated;
+        compute.state_changed.notify_all();
+        // we were asked to terminate gracefully, don't exit to avoid restart
+        delay_exit = true
+    }
+    drop(state);
+
     if let Err(err) = compute.check_for_core_dumps() {
         error!("error while checking for core dumps: {err:?}");
     }
 
+    Ok(delay_exit)
+}
+
+fn maybe_delay_exit(delay_exit: bool) {
     // If launch failed, keep serving HTTP requests for a while, so the cloud
     // control plane can get the actual error.
     if delay_exit {
         info!("giving control plane 30s to collect the error before shutdown");
         thread::sleep(Duration::from_secs(30));
     }
+}
 
+fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
     // Shutdown trace pipeline gracefully, so that it has a chance to send any
     // pending traces before we exit. Shutting down OTEL tracing provider may
     // hang for quite some time, see, for example:
@@ -512,10 +741,15 @@ fn cli() -> clap::Command {
             Arg::new("filecache-connstr")
                 .long("filecache-connstr")
                 .default_value(
-                    "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable",
+                    "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor",
                 )
                 .value_name("FILECACHE_CONNSTR"),
         )
+        .arg(
+            Arg::new("resize-swap-on-bind")
+                .long("resize-swap-on-bind")
+                .action(clap::ArgAction::SetTrue),
+        )
 }
 
 /// When compute_ctl is killed, also send a termination signal to sync-safekeepers
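// For reference, a self-contained sketch of how a `SetTrue` flag like the
// newly added `--resize-swap-on-bind` behaves. The demo command name is made
// up, but `ArgAction::SetTrue` and `get_flag` are the real clap 4 APIs used
// above.

use clap::{Arg, ArgAction, Command};

fn flag_demo() -> bool {
    let matches = Command::new("demo")
        .arg(
            Arg::new("resize-swap-on-bind")
                .long("resize-swap-on-bind")
                .action(ArgAction::SetTrue),
        )
        .get_matches_from(["demo", "--resize-swap-on-bind"]);

    // `get_flag` returns false when the flag is absent and true when present,
    // so callers never have to deal with an Option here.
    matches.get_flag("resize-swap-on-bind")
}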
@@ -523,16 +757,7 @@ fn cli() -> clap::Command {
 /// wait for termination which would be easy then.
 fn handle_exit_signal(sig: i32) {
     info!("received {sig} termination signal");
-    let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
-    if ss_pid != 0 {
-        let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
-        kill(ss_pid, Signal::SIGTERM).ok();
-    }
-    let pg_pid = PG_PID.load(Ordering::SeqCst);
-    if pg_pid != 0 {
-        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
-        kill(pg_pid, Signal::SIGTERM).ok();
-    }
+    forward_termination_signal();
     exit(1);
 }
diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs
new file mode 100644
index 0000000000..4fefa831e0
--- /dev/null
+++ b/compute_tools/src/catalog.rs
@@ -0,0 +1,116 @@
+use compute_api::{
+    responses::CatalogObjects,
+    spec::{Database, Role},
+};
+use futures::Stream;
+use postgres::{Client, NoTls};
+use std::{path::Path, process::Stdio, result::Result, sync::Arc};
+use tokio::{
+    io::{AsyncBufReadExt, BufReader},
+    process::Command,
+    task,
+};
+use tokio_stream::{self as stream, StreamExt};
+use tokio_util::codec::{BytesCodec, FramedRead};
+use tracing::warn;
+
+use crate::{
+    compute::ComputeNode,
+    pg_helpers::{get_existing_dbs, get_existing_roles},
+};
+
+pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
+    let connstr = compute.connstr.clone();
+    task::spawn_blocking(move || {
+        let mut client = Client::connect(connstr.as_str(), NoTls)?;
+        let roles: Vec<Role>;
+        {
+            let mut xact = client.transaction()?;
+            roles = get_existing_roles(&mut xact)?;
+        }
+        let databases: Vec<Database> = get_existing_dbs(&mut client)?.values().cloned().collect();
+
+        Ok(CatalogObjects { roles, databases })
+    })
+    .await?
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum SchemaDumpError {
+    #[error("Database does not exist.")]
+    DatabaseDoesNotExist,
+    #[error("Failed to execute pg_dump.")]
+    IO(#[from] std::io::Error),
+}
+
+// Uses the pg_dump utility to dump the schema of the specified database.
+// The output is streamed back to the caller and is meant to be forwarded on via HTTP.
+//
+// Before returning the result, it checks that pg_dump produced any output at all.
+// If it did not, the stderr output is parsed to determine whether the database
+// simply does not exist, in which case a dedicated error is returned.
+//
+// To make sure that the pg_dump process is killed when the caller drops the
+// stream, we use tokio's kill_on_drop feature.
+pub async fn get_database_schema(
+    compute: &Arc<ComputeNode>,
+    dbname: &str,
+) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>>, SchemaDumpError> {
+    let pgbin = &compute.pgbin;
+    let basepath = Path::new(pgbin).parent().unwrap();
+    let pgdump = basepath.join("pg_dump");
+    let mut connstr = compute.connstr.clone();
+    connstr.set_path(dbname);
+    let mut cmd = Command::new(pgdump)
+        .arg("--schema-only")
+        .arg(connstr.as_str())
+        .stdout(Stdio::piped())
+        .stderr(Stdio::piped())
+        .kill_on_drop(true)
+        .spawn()?;
+
+    let stdout = cmd.stdout.take().ok_or_else(|| {
+        std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.")
+    })?;
+
+    let stderr = cmd.stderr.take().ok_or_else(|| {
+        std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.")
+    })?;
+
+    let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new());
+    let stderr_reader = BufReader::new(stderr);
+
+    let first_chunk = match stdout_reader.next().await {
+        Some(Ok(bytes)) if !bytes.is_empty() => bytes,
+        Some(Err(e)) => {
+            return Err(SchemaDumpError::IO(e));
+        }
+        _ => {
+            let mut lines = stderr_reader.lines();
+            if let Some(line) = lines.next_line().await?
{ + if line.contains(&format!("FATAL: database \"{}\" does not exist", dbname)) { + return Err(SchemaDumpError::DatabaseDoesNotExist); + } + warn!("pg_dump stderr: {}", line) + } + tokio::spawn(async move { + while let Ok(Some(line)) = lines.next_line().await { + warn!("pg_dump stderr: {}", line) + } + }); + + return Err(SchemaDumpError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + "failed to start pg_dump", + ))); + } + }; + let initial_stream = stream::once(Ok(first_chunk.freeze())); + // Consume stderr and log warnings + tokio::spawn(async move { + let mut lines = stderr_reader.lines(); + while let Ok(Some(line)) = lines.next_line().await { + warn!("pg_dump stderr: {}", line) + } + }); + Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze())))) +} diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 07e0abe6ff..5bd6897fe3 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use std::env; use std::fs; use std::io::BufRead; -use std::os::unix::fs::PermissionsExt; +use std::os::unix::fs::{symlink, PermissionsExt}; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; @@ -17,9 +17,9 @@ use chrono::{DateTime, Utc}; use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; +use nix::unistd::Pid; +use postgres::error::SqlState; use postgres::{Client, NoTls}; -use tokio; -use tokio_postgres; use tracing::{debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -28,6 +28,8 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus}; use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec}; use utils::measured_stream::MeasuredReader; +use nix::sys::signal::{kill, Signal}; + use remote_storage::{DownloadError, RemotePath}; use crate::checker::create_availability_check_data; @@ -54,6 +56,7 @@ pub struct ComputeNode { /// - we push new spec and it does reconfiguration /// - but then something happens and compute pod / VM is destroyed, /// so k8s controller starts it again with the **old** spec + /// /// and the same for empty computes: /// - we started compute without any spec /// - we push spec and it does configuration @@ -207,6 +210,7 @@ fn maybe_cgexec(cmd: &str) -> Command { /// Create special neon_superuser role, that's a slightly nerfed version of a real superuser /// that we give to customers +#[instrument(skip_all)] fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> { let roles = spec .cluster @@ -319,11 +323,12 @@ impl ComputeNode { // Get basebackup from the libpq connection to pageserver using `connstr` and // unarchive it to `pgdata` directory overriding all its previous content. #[instrument(skip_all, fields(%lsn))] - fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { + fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { let spec = compute_state.pspec.as_ref().expect("spec must be set"); let start_time = Instant::now(); - let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?; + let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap(); + let mut config = postgres::Config::from_str(shard0_connstr)?; // Use the storage auth token from the config file, if given. // Note: this overrides any password set in the connection string. 
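// The `get_basebackup` wrapper added in the next hunk retries with a
// multiplicative backoff (500ms, growing by 1.5x per attempt). The same
// pattern in isolation, as a hypothetical generic helper rather than the
// codebase's own API:

fn retry_with_backoff<T, E: std::fmt::Display>(
    max_attempts: u16,
    mut op: impl FnMut() -> Result<T, E>,
) -> Result<T, E> {
    let mut retry_period_ms = 500.0_f64;
    let mut attempts: u16 = 0;
    loop {
        match op() {
            Ok(v) => return Ok(v),
            Err(e) if attempts < max_attempts => {
                eprintln!("attempt {attempts} failed: {e}; retrying");
                std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
                retry_period_ms *= 1.5;
            }
            // Out of attempts: give up and surface the last error.
            Err(e) => return Err(e),
        }
        attempts += 1;
    }
}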
@@ -390,6 +395,42 @@ impl ComputeNode {
         Ok(())
     }
 
+    // Gets the basebackup in a retry loop
+    #[instrument(skip_all, fields(%lsn))]
+    pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
+        let mut retry_period_ms = 500.0;
+        let mut attempts = 0;
+        const DEFAULT_ATTEMPTS: u16 = 10;
+        #[cfg(feature = "testing")]
+        let max_attempts = if let Ok(v) = env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") {
+            u16::from_str(&v).unwrap()
+        } else {
+            DEFAULT_ATTEMPTS
+        };
+        #[cfg(not(feature = "testing"))]
+        let max_attempts = DEFAULT_ATTEMPTS;
+        loop {
+            let result = self.try_get_basebackup(compute_state, lsn);
+            match result {
+                Ok(_) => {
+                    return result;
+                }
+                Err(ref e) if attempts < max_attempts => {
+                    warn!(
+                        "Failed to get basebackup: {} (attempt {}/{})",
+                        e, attempts, max_attempts
+                    );
+                    std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
+                    retry_period_ms *= 1.5;
+                }
+                Err(_) => {
+                    return result;
+                }
+            }
+            attempts += 1;
+        }
+    }
+
     pub async fn check_safekeepers_synced_async(
         &self,
         compute_state: &ComputeState,
@@ -605,6 +646,48 @@ impl ComputeNode {
         // Update pg_hba.conf received with basebackup.
         update_pg_hba(pgdata_path)?;
 
+        // Place pg_dynshmem under /dev/shm. This allows us to use
+        // 'dynamic_shared_memory_type = mmap' so that the files are placed in
+        // /dev/shm, similar to how 'dynamic_shared_memory_type = posix' works.
+        //
+        // Why on earth don't we just stick to the 'posix' default, you might
+        // ask. It turns out that making large allocations with 'posix' doesn't
+        // work very well with autoscaling. The behavior we want is that:
+        //
+        // 1. You can make large DSM allocations, larger than the current RAM
+        //    size of the VM, without errors
+        //
+        // 2. If the allocated memory is really used, the VM is scaled up
+        //    automatically to accommodate that
+        //
+        // We try to make that possible by having swap in the VM. But with the
+        // default 'posix' DSM implementation, we fail step 1, even when there's
+        // plenty of swap available. PostgreSQL uses posix_fallocate() to create
+        // the shmem segment, which is really just a file in /dev/shm in Linux,
+        // but posix_fallocate() on tmpfs returns ENOMEM if the size is larger
+        // than available RAM.
+        //
+        // Using 'dynamic_shared_memory_type = mmap' works around that, because
+        // the Postgres 'mmap' DSM implementation doesn't use
+        // posix_fallocate(). Instead, it uses repeated calls to write(2) to
+        // fill the file with zeros. It's weird that that differs between
+        // 'posix' and 'mmap', but we take advantage of it. When the file is
+        // filled slowly with write(2), the kernel allows it to grow larger, as
+        // long as there's swap available.
+        //
+        // In short, using 'dynamic_shared_memory_type = mmap' allows one DSM
+        // segment to be larger than currently available RAM. And because we
+        // don't want to store it on a real file, which the kernel would try to
+        // flush to disk, we symlink pg_dynshmem to /dev/shm.
+        //
+        // We don't set 'dynamic_shared_memory_type = mmap' here, we let the
+        // control plane control that option. If 'mmap' is not used, this
+        // symlink doesn't affect anything.
+        //
+        // See https://github.com/neondatabase/autoscaling/issues/800
+        std::fs::remove_dir(pgdata_path.join("pg_dynshmem"))?;
+        symlink("/dev/shm/", pgdata_path.join("pg_dynshmem"))?;
+
         match spec.mode {
             ComputeMode::Primary => {}
             ComputeMode::Replica | ComputeMode::Static(..)
=> { @@ -649,8 +732,12 @@ impl ComputeNode { // Stop it when it's ready info!("waiting for postgres"); wait_for_postgres(&mut pg, Path::new(pgdata))?; - pg.kill()?; - info!("sent kill signal"); + // SIGQUIT orders postgres to exit immediately. We don't want to SIGKILL + // it to avoid orphaned processes prowling around while datadir is + // wiped. + let pm_pid = Pid::from_raw(pg.id() as i32); + kill(pm_pid, Signal::SIGQUIT)?; + info!("sent SIGQUIT signal"); pg.wait()?; info!("done prewarming"); @@ -691,6 +778,26 @@ impl ComputeNode { Ok((pg, logs_handle)) } + /// Do post configuration of the already started Postgres. This function spawns a background thread to + /// configure the database after applying the compute spec. Currently, it upgrades the neon extension + /// version. In the future, it may upgrade all 3rd-party extensions. + #[instrument(skip_all)] + pub fn post_apply_config(&self) -> Result<()> { + let connstr = self.connstr.clone(); + thread::spawn(move || { + let func = || { + let mut client = Client::connect(connstr.as_str(), NoTls)?; + handle_neon_extension_upgrade(&mut client) + .context("handle_neon_extension_upgrade")?; + Ok::<_, anyhow::Error>(()) + }; + if let Err(err) = func() { + error!("error while post_apply_config: {err:#}"); + } + }); + Ok(()) + } + /// Do initial configuration of the already started Postgres. #[instrument(skip_all)] pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { @@ -700,62 +807,92 @@ impl ComputeNode { // In this case we need to connect with old `zenith_admin` name // and create new user. We cannot simply rename connected user, // but we can create a new one and grant it all privileges. - let connstr = self.connstr.clone(); + let mut connstr = self.connstr.clone(); + connstr + .query_pairs_mut() + .append_pair("application_name", "apply_config"); + let mut client = match Client::connect(connstr.as_str(), NoTls) { - Err(e) => { - info!( - "cannot connect to postgres: {}, retrying with `zenith_admin` username", - e - ); - let mut zenith_admin_connstr = connstr.clone(); + Err(e) => match e.code() { + Some(&SqlState::INVALID_PASSWORD) + | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => { + // connect with zenith_admin if cloud_admin could not authenticate + info!( + "cannot connect to postgres: {}, retrying with `zenith_admin` username", + e + ); + let mut zenith_admin_connstr = connstr.clone(); - zenith_admin_connstr - .set_username("zenith_admin") - .map_err(|_| anyhow::anyhow!("invalid connstr"))?; + zenith_admin_connstr + .set_username("zenith_admin") + .map_err(|_| anyhow::anyhow!("invalid connstr"))?; - let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?; - // Disable forwarding so that users don't get a cloud_admin role - client.simple_query("SET neon.forward_ddl = false")?; - client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; - client.simple_query("GRANT zenith_admin TO cloud_admin")?; - drop(client); + let mut client = + Client::connect(zenith_admin_connstr.as_str(), NoTls) + .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; + // Disable forwarding so that users don't get a cloud_admin role - // reconnect with connstring with expected name - Client::connect(connstr.as_str(), NoTls)? 
- } + let mut func = || { + client.simple_query("SET neon.forward_ddl = false")?; + client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; + client.simple_query("GRANT zenith_admin TO cloud_admin")?; + Ok::<_, anyhow::Error>(()) + }; + func().context("apply_config setup cloud_admin")?; + + drop(client); + + // reconnect with connstring with expected name + Client::connect(connstr.as_str(), NoTls)? + } + _ => return Err(e.into()), + }, Ok(client) => client, }; // Disable DDL forwarding because control plane already knows about these roles/databases. - client.simple_query("SET neon.forward_ddl = false")?; + client + .simple_query("SET neon.forward_ddl = false") + .context("apply_config SET neon.forward_ddl = false")?; // Proceed with post-startup configuration. Note, that order of operations is important. let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; - create_neon_superuser(spec, &mut client)?; - cleanup_instance(&mut client)?; - handle_roles(spec, &mut client)?; - handle_databases(spec, &mut client)?; - handle_role_deletions(spec, connstr.as_str(), &mut client)?; - handle_grants(spec, &mut client, connstr.as_str())?; - handle_extensions(spec, &mut client)?; - handle_extension_neon(&mut client)?; - create_availability_check_data(&mut client)?; + create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?; + cleanup_instance(&mut client).context("apply_config cleanup_instance")?; + handle_roles(spec, &mut client).context("apply_config handle_roles")?; + handle_databases(spec, &mut client).context("apply_config handle_databases")?; + handle_role_deletions(spec, connstr.as_str(), &mut client) + .context("apply_config handle_role_deletions")?; + handle_grants( + spec, + &mut client, + connstr.as_str(), + self.has_feature(ComputeFeature::AnonExtension), + ) + .context("apply_config handle_grants")?; + handle_extensions(spec, &mut client).context("apply_config handle_extensions")?; + handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?; + create_availability_check_data(&mut client) + .context("apply_config create_availability_check_data")?; // 'Close' connection drop(client); - if self.has_feature(ComputeFeature::Migrations) { - thread::spawn(move || { - let mut client = Client::connect(connstr.as_str(), NoTls)?; - handle_migrations(&mut client) - }); - } + // Run migrations separately to not hold up cold starts + thread::spawn(move || { + let mut connstr = connstr.clone(); + connstr + .query_pairs_mut() + .append_pair("application_name", "migrations"); + + let mut client = Client::connect(connstr.as_str(), NoTls)?; + handle_migrations(&mut client).context("apply_config handle_migrations") + }); Ok(()) } - // We could've wrapped this around `pg_ctl reload`, but right now we don't use - // `pg_ctl` for start / stop, so this just seems much easier to do as we already - // have opened connection to Postgres and superuser access. + // Wrapped this around `pg_ctl reload`, but right now we don't use + // `pg_ctl` for start / stop. #[instrument(skip_all)] fn pg_reload_conf(&self) -> Result<()> { let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl"); @@ -798,33 +935,39 @@ impl ComputeNode { // temporarily reset max_cluster_size in config // to avoid the possibility of hitting the limit, while we are reconfiguring: // creating new extensions, roles, etc... 
- config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?; - self.pg_reload_conf()?; + config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { + self.pg_reload_conf()?; - let mut client = Client::connect(self.connstr.as_str(), NoTls)?; + let mut client = Client::connect(self.connstr.as_str(), NoTls)?; - // Proceed with post-startup configuration. Note, that order of operations is important. - // Disable DDL forwarding because control plane already knows about these roles/databases. - if spec.mode == ComputeMode::Primary { - client.simple_query("SET neon.forward_ddl = false")?; - cleanup_instance(&mut client)?; - handle_roles(&spec, &mut client)?; - handle_databases(&spec, &mut client)?; - handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; - handle_grants(&spec, &mut client, self.connstr.as_str())?; - handle_extensions(&spec, &mut client)?; - handle_extension_neon(&mut client)?; - // We can skip handle_migrations here because a new migration can only appear - // if we have a new version of the compute_ctl binary, which can only happen - // if compute got restarted, in which case we'll end up inside of apply_config - // instead of reconfigure. - } + // Proceed with post-startup configuration. Note, that order of operations is important. + // Disable DDL forwarding because control plane already knows about these roles/databases. + if spec.mode == ComputeMode::Primary { + client.simple_query("SET neon.forward_ddl = false")?; + cleanup_instance(&mut client)?; + handle_roles(&spec, &mut client)?; + handle_databases(&spec, &mut client)?; + handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; + handle_grants( + &spec, + &mut client, + self.connstr.as_str(), + self.has_feature(ComputeFeature::AnonExtension), + )?; + handle_extensions(&spec, &mut client)?; + handle_extension_neon(&mut client)?; + // We can skip handle_migrations here because a new migration can only appear + // if we have a new version of the compute_ctl binary, which can only happen + // if compute got restarted, in which case we'll end up inside of apply_config + // instead of reconfigure. + } - // 'Close' connection - drop(client); + // 'Close' connection + drop(client); + + Ok(()) + })?; - // reset max_cluster_size in config back to original value and reload config - config::compute_ctl_temp_override_remove(pgdata_path)?; self.pg_reload_conf()?; let unknown_op = "unknown".to_string(); @@ -909,18 +1052,26 @@ impl ComputeNode { let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; let config_time = Utc::now(); - if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates { - let pgdata_path = Path::new(&self.pgdata); - // temporarily reset max_cluster_size in config - // to avoid the possibility of hitting the limit, while we are applying config: - // creating new extensions, roles, etc... - config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?; - self.pg_reload_conf()?; + if pspec.spec.mode == ComputeMode::Primary { + if !pspec.spec.skip_pg_catalog_updates { + let pgdata_path = Path::new(&self.pgdata); + // temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are applying config: + // creating new extensions, roles, etc... 
+                config::with_compute_ctl_tmp_override(
+                    pgdata_path,
+                    "neon.max_cluster_size=-1",
+                    || {
+                        self.pg_reload_conf()?;
 
-            self.apply_config(&compute_state)?;
+                        self.apply_config(&compute_state)?;
 
-            config::compute_ctl_temp_override_remove(pgdata_path)?;
-            self.pg_reload_conf()?;
+                        Ok(())
+                    },
+                )?;
+                self.pg_reload_conf()?;
+            }
+            self.post_apply_config()?;
         }
 
         let startup_end_time = Utc::now();
@@ -974,7 +1125,7 @@ impl ComputeNode {
         // EKS worker nodes have following core dump settings:
         //   /proc/sys/kernel/core_pattern -> core
         //   /proc/sys/kernel/core_uses_pid -> 1
-        //   ulimint -c -> unlimited
+        //   ulimit -c -> unlimited
         // which results in core dumps being written to postgres data directory as core.<pid>.
         //
         // Use that as a default location and pattern, except macos where core dumps are written
@@ -1145,10 +1296,12 @@ LIMIT 100",
             .await
             .map_err(DownloadError::Other);
 
-        self.ext_download_progress
-            .write()
-            .expect("bad lock")
-            .insert(ext_archive_name.to_string(), (download_start, true));
+        if download_size.is_ok() {
+            self.ext_download_progress
+                .write()
+                .expect("bad lock")
+                .insert(ext_archive_name.to_string(), (download_start, true));
+        }
 
         download_size
     }
@@ -1241,3 +1394,19 @@ LIMIT 100",
         Ok(remote_ext_metrics)
     }
 }
+
+pub fn forward_termination_signal() {
+    let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
+    if ss_pid != 0 {
+        let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
+        kill(ss_pid, Signal::SIGTERM).ok();
+    }
+    let pg_pid = PG_PID.load(Ordering::SeqCst);
+    if pg_pid != 0 {
+        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
+        // Use 'fast' shutdown (SIGINT) because it also creates a shutdown checkpoint, which is important for
+        // ROs to get a list of running xacts faster instead of going through the CLOG.
+        // See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals.
+        kill(pg_pid, Signal::SIGINT).ok();
+    }
+}
diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs
index a7ef8cea92..479100eb89 100644
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -6,8 +6,8 @@ use std::path::Path;
 use anyhow::Result;
 
 use crate::pg_helpers::escape_conf_value;
-use crate::pg_helpers::PgOptionsSerialize;
-use compute_api::spec::{ComputeMode, ComputeSpec};
+use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize};
+use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};
 
 /// Check that `line` is inside a text file and put it there if it is not.
 /// Create file if it doesn't exist.
@@ -17,6 +17,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
         .write(true)
         .create(true)
         .append(false)
+        .truncate(false)
         .open(path)?;
     let buf = io::BufReader::new(&file);
     let mut count: usize = 0;
@@ -51,6 +52,9 @@ pub fn write_postgres_conf(
     if let Some(s) = &spec.pageserver_connstring {
         writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
     }
+    if let Some(stripe_size) = spec.shard_stripe_size {
+        writeln!(file, "neon.stripe_size={stripe_size}")?;
+    }
     if !spec.safekeeper_connstrings.is_empty() {
         writeln!(
             file,
@@ -82,6 +86,27 @@
         }
     }
 
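// For illustration only: what the check added in the hunk below boils down
// to. Reading /proc/sys/vm/overcommit_memory yields "0", "1" or "2" (plus a
// trailing newline, hence the trim); "2" means strict accounting, i.e. memory
// overcommit disabled. The helper name is made up for this sketch.

fn overcommit_disabled() -> bool {
    std::fs::read_to_string("/proc/sys/vm/overcommit_memory")
        .map(|contents| contents.trim() == "2")
        .unwrap_or(false)
}

// When this is true, the hunk below appends a line that should render as
//     dynamic_shared_memory_type = 'mmap'
// in postgresql.conf (assuming GenericOptionExt::to_pg_setting quotes enum
// values that way).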
+    if cfg!(target_os = "linux") {
+        // Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is
+        // disabled), then the control plane has enabled swap and we should set
+        // dynamic_shared_memory_type = 'mmap'.
+        //
+        // This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047.
+        let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory")
+            // ignore any errors - they may be expected to occur under certain situations (e.g. when
+            // not running in Linux).
+            .unwrap_or_else(|_| String::new());
+        if overcommit_memory_contents.trim() == "2" {
+            let opt = GenericOption {
+                name: "dynamic_shared_memory_type".to_owned(),
+                value: Some("mmap".to_owned()),
+                vartype: "enum".to_owned(),
+            };
+
+            write!(file, "{}", opt.to_pg_setting())?;
+        }
+    }
+
     // If there are any extra options in the 'settings' field, append those
     if spec.cluster.settings.is_some() {
         writeln!(file, "# Managed by compute_ctl: begin")?;
@@ -100,18 +125,17 @@
     Ok(())
 }
 
-/// create file compute_ctl_temp_override.conf in pgdata_dir
-/// add provided options to this file
-pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> {
+pub fn with_compute_ctl_tmp_override<F>(pgdata_path: &Path, options: &str, exec: F) -> Result<()>
+where
+    F: FnOnce() -> Result<()>,
+{
     let path = pgdata_path.join("compute_ctl_temp_override.conf");
     let mut file = File::create(path)?;
     write!(file, "{}", options)?;
-    Ok(())
-}
 
-/// remove file compute_ctl_temp_override.conf in pgdata_dir
-pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> {
-    let path = pgdata_path.join("compute_ctl_temp_override.conf");
-    std::fs::remove_file(path)?;
-    Ok(())
+    let res = exec();
+
+    file.set_len(0)?;
+
+    res
 }
diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs
index 2cec12119f..ef1db73982 100644
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -71,7 +71,7 @@ More specifically, here is an example ext_index.json
 }
 }
 */
-use anyhow::{self, Result};
+use anyhow::Result;
 use anyhow::{bail, Context};
 use bytes::Bytes;
 use compute_api::spec::RemoteExtSpec;
diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs
index fa2c4cff28..43d29402bc 100644
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -5,18 +5,21 @@ use std::net::SocketAddr;
 use std::sync::Arc;
 use std::thread;
 
+use crate::catalog::SchemaDumpError;
+use crate::catalog::{get_database_schema, get_dbs_and_roles};
+use crate::compute::forward_termination_signal;
 use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_api::requests::ConfigurationRequest;
 use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
 
 use anyhow::Result;
+use hyper::header::CONTENT_TYPE;
 use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Method, Request, Response, Server, StatusCode};
-use num_cpus;
-use serde_json;
 use tokio::task;
-use tracing::{error, info, warn};
+use tracing::{debug, error, info, warn};
 use tracing_utils::http::OtelName;
+use utils::http::request::must_get_query_param;
 
 fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
     ComputeStatusResponse {
@@ -45,7 +48,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
         (&Method::GET, "/status") => {
-            info!("serving /status GET request");
+            debug!("serving /status GET request");
             let state = compute.state.lock().unwrap();
             let status_response = status_response_from_state(&state);
             Response::new(Body::from(serde_json::to_string(&status_response).unwrap()))
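// handle_terminate_request (added in the next hunk) blocks on the same
// Mutex + Condvar pair that the rest of compute_ctl uses for state changes.
// The pattern in isolation, with a simplified stand-in for the real
// ComputeStatus enum:

use std::sync::{Condvar, Mutex};

#[derive(PartialEq)]
enum Status {
    Running,
    TerminationPending,
    Terminated,
}

fn wait_until_terminated(state: &Mutex<Status>, state_changed: &Condvar) {
    let mut guard = state.lock().unwrap();
    while *guard != Status::Terminated {
        // `wait` atomically releases the lock and re-acquires it on wakeup,
        // so checking the status and going to sleep cannot race with the
        // thread that flips the status and calls `notify_all`.
        guard = state_changed.wait(guard).unwrap();
    }
}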
@@ -123,6 +126,45 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
+        (&Method::POST, "/terminate") => {
+            info!("serving /terminate POST request");
+            match handle_terminate_request(compute).await {
+                Ok(()) => Response::new(Body::empty()),
+                Err((msg, code)) => {
+                    error!("error handling /terminate request: {msg}");
+                    render_json_error(&msg, code)
+                }
+            }
+        }
+
+        (&Method::GET, "/dbs_and_roles") => {
+            info!("serving /dbs_and_roles GET request");
+            match get_dbs_and_roles(compute).await {
+                Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
+                Err(_) => {
+                    render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR)
+                }
+            }
+        }
+
+        (&Method::GET, "/database_schema") => {
+            let database = match must_get_query_param(&req, "database") {
+                Err(e) => return e.into_response(),
+                Ok(database) => database,
+            };
+            info!("serving /database_schema GET request with database: {database}");
+            match get_database_schema(compute, &database).await {
+                Ok(res) => render_plain(Body::wrap_stream(res)),
+                Err(SchemaDumpError::DatabaseDoesNotExist) => {
+                    render_json_error("database does not exist", StatusCode::NOT_FOUND)
+                }
+                Err(e) => {
+                    error!("can't get schema dump: {}", e);
+                    render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR)
+                }
+            }
+        }
+
         // download extension files from remote extension storage on demand
         (&Method::POST, route) if route.starts_with("/extension_server/") => {
             info!("serving {:?} POST request", route);
@@ -293,10 +335,68 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
     };
     Response::builder()
         .status(status)
+        .header(CONTENT_TYPE, "application/json")
         .body(Body::from(serde_json::to_string(&error).unwrap()))
         .unwrap()
 }
 
+fn render_json(body: Body) -> Response<Body> {
+    Response::builder()
+        .header(CONTENT_TYPE, "application/json")
+        .body(body)
+        .unwrap()
+}
+
+fn render_plain(body: Body) -> Response<Body> {
+    Response::builder()
+        .header(CONTENT_TYPE, "text/plain")
+        .body(body)
+        .unwrap()
+}
+
+async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
+    {
+        let mut state = compute.state.lock().unwrap();
+        if state.status == ComputeStatus::Terminated {
+            return Ok(());
+        }
+        if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
+            let msg = format!(
+                "invalid compute status for termination request: {:?}",
+                state.status.clone()
+            );
+            return Err((msg, StatusCode::PRECONDITION_FAILED));
+        }
+        state.status = ComputeStatus::TerminationPending;
+        compute.state_changed.notify_all();
+        drop(state);
+    }
+    forward_termination_signal();
+    info!("sent signal and notified waiters");
+
+    // Spawn a blocking thread to wait for compute to become Terminated.
+    // This is needed so that we don't block the main pool of workers and
+    // can still serve other requests while this particular request waits
+    // for the compute to finish configuration.
+    let c = compute.clone();
+    task::spawn_blocking(move || {
+        let mut state = c.state.lock().unwrap();
+        while state.status != ComputeStatus::Terminated {
+            state = c.state_changed.wait(state).unwrap();
+            info!(
+                "waiting for compute to become Terminated, current status: {:?}",
+                state.status
+            );
+        }
+
+        Ok(())
+    })
+    .await
+    .unwrap()?;
+    info!("terminated Postgres");
+    Ok(())
+}
+
 // Main Hyper HTTP server function; it runs the server and blocks waiting on it forever.
 #[tokio::main]
 async fn serve(port: u16, state: Arc<ComputeNode>) {
diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml
index cedc6ece8f..b0ddaeae2b 100644
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -68,6 +68,51 @@ paths:
             schema:
               $ref: "#/components/schemas/Info"
 
+  /dbs_and_roles:
+    get:
+      tags:
+      - Info
+      summary: Get databases and roles in the catalog.
+      description: ""
+      operationId: getDbsAndRoles
+      responses:
+        200:
+          description: Compute schema objects
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/DbsAndRoles"
+
+  /database_schema:
+    get:
+      tags:
+      - Info
+      summary: Get schema dump
+      parameters:
+        - name: database
+          in: query
+          description: Database name to dump.
+          required: true
+          schema:
+            type: string
+          example: "postgres"
+      description: Get schema dump in SQL format.
+      operationId: getDatabaseSchema
+      responses:
+        200:
+          description: Schema dump
+          content:
+            text/plain:
+              schema:
+                type: string
+                description: Schema dump in SQL format.
+        404:
+          description: Non-existent database.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+
   /check_writability:
     post:
       tags:
@@ -168,6 +213,29 @@ paths:
             schema:
               $ref: "#/components/schemas/GenericError"
 
+  /terminate:
+    post:
+      tags:
+      - Terminate
+      summary: Terminate Postgres and wait for it to exit
+      description: ""
+      operationId: terminate
+      responses:
+        200:
+          description: Result
+        412:
+          description: "wrong state"
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+        500:
+          description: "Unexpected error"
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+
 components:
   securitySchemes:
     JWT:
@@ -206,6 +274,73 @@ components:
         num_cpus:
           type: integer
 
+    DbsAndRoles:
+      type: object
+      description: Databases and Roles
+      required:
+        - roles
+        - databases
+      properties:
+        roles:
+          type: array
+          items:
+            $ref: "#/components/schemas/Role"
+        databases:
+          type: array
+          items:
+            $ref: "#/components/schemas/Database"
+
+    Database:
+      type: object
+      description: Database
+      required:
+        - name
+        - owner
+        - restrict_conn
+        - invalid
+      properties:
+        name:
+          type: string
+        owner:
+          type: string
+        options:
+          type: array
+          items:
+            $ref: "#/components/schemas/GenericOption"
+        restrict_conn:
+          type: boolean
+        invalid:
+          type: boolean
+
+    Role:
+      type: object
+      description: Role
+      required:
+        - name
+      properties:
+        name:
+          type: string
+        encrypted_password:
+          type: string
+        options:
+          type: array
+          items:
+            $ref: "#/components/schemas/GenericOption"
+
+    GenericOption:
+      type: object
+      description: Schema Generic option
+      required:
+        - name
+        - vartype
+      properties:
+        name:
+          type: string
+        value:
+          type: string
+        vartype:
+          type: string
+
     ComputeState:
       type: object
       required:
diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs
index 4e01ffd954..c402d63305 100644
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -8,10 +8,14 @@ pub mod configurator;
 pub mod http;
 #[macro_use]
 pub mod logger;
+pub mod catalog;
 pub mod compute;
 pub mod extension_server;
+pub mod lsn_lease;
+mod migration;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
 pub mod spec;
+pub mod swap;
 pub mod sync_sk;
diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs
new file mode 100644
index 0000000000..7e5917c55f
--- /dev/null
+++ b/compute_tools/src/lsn_lease.rs
@@ -0,0 +1,186 @@
+use anyhow::bail;
+use anyhow::Result;
+use postgres::{NoTls, SimpleQueryMessage};
+use std::time::SystemTime;
+use std::{str::FromStr, sync::Arc, thread, time::Duration};
+use utils::id::TenantId;
+use utils::id::TimelineId;
+
+use compute_api::spec::ComputeMode;
+use tracing::{info, warn};
+use utils::{
+    lsn::Lsn,
+    shard::{ShardCount, ShardNumber, TenantShardId},
+};
+
+use crate::compute::ComputeNode;
+
+/// Spawns a background thread to periodically renew LSN leases for static compute.
+/// Does nothing if the compute is not in static mode.
+pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc<ComputeNode>) {
+    let (tenant_id, timeline_id, lsn) = {
+        let state = compute.state.lock().unwrap();
+        let spec = state.pspec.as_ref().expect("Spec must be set");
+        match spec.spec.mode {
+            ComputeMode::Static(lsn) => (spec.tenant_id, spec.timeline_id, lsn),
+            _ => return,
+        }
+    };
+    let compute = compute.clone();
+
+    let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn);
+    thread::spawn(move || {
+        let _entered = span.entered();
+        if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) {
+            // TODO: might need stronger error feedback than logging a warning.
+            warn!("Exited with error: {e}");
+        }
+    });
+}
+
+/// Renews the lsn lease periodically so that static computes are not affected by GC.
+fn lsn_lease_bg_task(
+    compute: Arc<ComputeNode>,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    lsn: Lsn,
+) -> Result<()> {
+    loop {
+        let valid_until = acquire_lsn_lease_with_retry(&compute, tenant_id, timeline_id, lsn)?;
+        let valid_duration = valid_until
+            .duration_since(SystemTime::now())
+            .unwrap_or(Duration::ZERO);
+
+        // Sleep for 60 seconds less than the valid duration, but at least half of the valid duration.
+        let sleep_duration = valid_duration
+            .saturating_sub(Duration::from_secs(60))
+            .max(valid_duration / 2);
+
+        info!(
+            "Succeeded, sleeping for {} seconds",
+            sleep_duration.as_secs()
+        );
+        thread::sleep(sleep_duration);
+    }
+}
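// A worked example of the sleep computation above: max(valid - 60s, valid / 2).
// For a 10-minute lease, 600s - 60s = 540s beats 600s / 2 = 300s, so we sleep
// 540s and renew with a minute to spare; for a short 90s lease the subtraction
// leaves only 30s, so the `max` clamps the sleep up to 45s, i.e. we never
// sleep away more than half of a short lease's validity window.

fn renewal_sleep(valid: std::time::Duration) -> std::time::Duration {
    valid
        .saturating_sub(std::time::Duration::from_secs(60))
        .max(valid / 2)
}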
+
+/// Acquires lsn lease in a retry loop. Returns the expiration time if a lease is granted.
+/// Returns an error if a lease is explicitly not granted. Otherwise, we keep sending requests.
+fn acquire_lsn_lease_with_retry(
+    compute: &Arc<ComputeNode>,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    lsn: Lsn,
+) -> Result<SystemTime> {
+    let mut attempts = 0usize;
+    let mut retry_period_ms: f64 = 500.0;
+    const MAX_RETRY_PERIOD_MS: f64 = 60.0 * 1000.0;
+
+    loop {
+        // Note: List of pageservers is dynamic, need to re-read configs before each attempt.
+        let configs = {
+            let state = compute.state.lock().unwrap();
+
+            let spec = state.pspec.as_ref().expect("spec must be set");
+
+            let conn_strings = spec.pageserver_connstr.split(',');
+
+            conn_strings
+                .map(|connstr| {
+                    let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
+                    if let Some(storage_auth_token) = &spec.storage_auth_token {
+                        info!("Got storage auth token from spec file");
+                        config.password(storage_auth_token.clone());
+                    } else {
+                        info!("Storage auth token not set");
+                    }
+                    config
+                })
+                .collect::<Vec<_>>()
+        };
+
+        let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs);
+        match result {
+            Ok(Some(res)) => {
+                return Ok(res);
+            }
+            Ok(None) => {
+                bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff");
+            }
+            Err(e) => {
+                warn!("Failed to acquire lsn lease: {e} (attempt {attempts})");
+
+                thread::sleep(Duration::from_millis(retry_period_ms as u64));
+                retry_period_ms *= 1.5;
+                retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS);
+            }
+        }
+        attempts += 1;
+    }
+}
+
+/// Tries to acquire an LSN lease through PS page_service API.
+fn try_acquire_lsn_lease(
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    lsn: Lsn,
+    configs: &[postgres::Config],
+) -> Result<Option<SystemTime>> {
+    fn get_valid_until(
+        config: &postgres::Config,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        lsn: Lsn,
+    ) -> Result<Option<SystemTime>> {
+        let mut client = config.connect(NoTls)?;
+        let cmd = format!("lease lsn {} {} {} ", tenant_shard_id, timeline_id, lsn);
+        let res = client.simple_query(&cmd)?;
+        let msg = match res.first() {
+            Some(msg) => msg,
+            None => bail!("empty response"),
+        };
+        let row = match msg {
+            SimpleQueryMessage::Row(row) => row,
+            _ => bail!("error parsing lsn lease response"),
+        };
+
+        // Note: this will be None if a lease is explicitly not granted.
+        let valid_until_str = row.get("valid_until");
+
+        let valid_until = valid_until_str.map(|s| {
+            SystemTime::UNIX_EPOCH
+                .checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
+                .expect("Time larger than max SystemTime could handle")
+        });
+        Ok(valid_until)
+    }
+
+    let shard_count = configs.len();
+
+    let valid_until = if shard_count > 1 {
+        configs
+            .iter()
+            .enumerate()
+            .map(|(shard_number, config)| {
+                let tenant_shard_id = TenantShardId {
+                    tenant_id,
+                    shard_count: ShardCount::new(shard_count as u8),
+                    shard_number: ShardNumber(shard_number as u8),
+                };
+                get_valid_until(config, tenant_shard_id, timeline_id, lsn)
+            })
+            .collect::<Result<Vec<Option<SystemTime>>>>()?
+            .into_iter()
+            .min()
+            .unwrap()
+    } else {
+        get_valid_until(
+            &configs[0],
+            TenantShardId::unsharded(tenant_id),
+            timeline_id,
+            lsn,
+        )?
+    };
+
+    Ok(valid_until)
+}
diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs
new file mode 100644
index 0000000000..22ab145eda
--- /dev/null
+++ b/compute_tools/src/migration.rs
@@ -0,0 +1,105 @@
+use anyhow::{Context, Result};
+use postgres::Client;
+use tracing::info;
+
+pub(crate) struct MigrationRunner<'m> {
+    client: &'m mut Client,
+    migrations: &'m [&'m str],
+}
+
+impl<'m> MigrationRunner<'m> {
+    pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self {
+        // The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64
+        assert!(migrations.len() + 1 < i64::MAX as usize);
+
+        Self { client, migrations }
+    }
+
+    fn get_migration_id(&mut self) -> Result<i64> {
+        let query = "SELECT id FROM neon_migration.migration_id";
+        let row = self
+            .client
+            .query_one(query, &[])
+            .context("run_migrations get migration_id")?;
+
+        Ok(row.get::<&str, i64>("id"))
+    }
+
+    fn update_migration_id(&mut self, migration_id: i64) -> Result<()> {
+        let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id);
+
+        self.client
+            .simple_query(&setval)
+            .context("run_migrations update id")?;
+
+        Ok(())
+    }
+
+    fn prepare_migrations(&mut self) -> Result<()> {
+        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
+        self.client.simple_query(query)?;
+
+        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
+        self.client.simple_query(query)?;
+
+        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
+        self.client.simple_query(query)?;
+
+        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
+        self.client.simple_query(query)?;
+
+        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
+        self.client.simple_query(query)?;
+
+        Ok(())
+    }
+
+    pub fn run_migrations(mut self) -> Result<()> {
+        self.prepare_migrations()?;
+
+        let mut current_migration = self.get_migration_id()? as usize;
+        while current_migration < self.migrations.len() {
+            macro_rules!
migration_id { + ($cm:expr) => { + ($cm + 1) as i64 + }; + } + + let migration = self.migrations[current_migration]; + + if migration.starts_with("-- SKIP") { + info!("Skipping migration id={}", migration_id!(current_migration)); + } else { + info!( + "Running migration id={}:\n{}\n", + migration_id!(current_migration), + migration + ); + + self.client + .simple_query("BEGIN") + .context("begin migration")?; + + self.client.simple_query(migration).with_context(|| { + format!( + "run_migrations migration id={}", + migration_id!(current_migration) + ) + })?; + + // Migration IDs start at 1 + self.update_migration_id(migration_id!(current_migration))?; + + self.client + .simple_query("COMMIT") + .context("commit migration")?; + + info!("Finished migration id={}", migration_id!(current_migration)); + } + + current_migration += 1; + } + + Ok(()) + } +} diff --git a/compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql b/compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql new file mode 100644 index 0000000000..73b36a37f6 --- /dev/null +++ b/compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql @@ -0,0 +1 @@ +ALTER ROLE neon_superuser BYPASSRLS; diff --git a/compute_tools/src/migrations/0002-alter_roles.sql b/compute_tools/src/migrations/0002-alter_roles.sql new file mode 100644 index 0000000000..6cb49f873f --- /dev/null +++ b/compute_tools/src/migrations/0002-alter_roles.sql @@ -0,0 +1,18 @@ +DO $$ +DECLARE + role_name text; +BEGIN + FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member') + LOOP + RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name); + EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT'; + END LOOP; + + FOR role_name IN SELECT rolname FROM pg_roles + WHERE + NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_') + LOOP + RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name); + EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS'; + END LOOP; +END $$; diff --git a/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql new file mode 100644 index 0000000000..37f0ce211f --- /dev/null +++ b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql @@ -0,0 +1,6 @@ +DO $$ +BEGIN + IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN + EXECUTE 'GRANT pg_create_subscription TO neon_superuser'; + END IF; +END $$; diff --git a/compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql new file mode 100644 index 0000000000..11afd3b635 --- /dev/null +++ b/compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql @@ -0,0 +1 @@ +GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION; diff --git a/compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql b/compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql new file mode 100644 index 0000000000..8abe052494 --- /dev/null +++ b/compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql @@ -0,0 +1,4 @@ +-- SKIP: Deemed insufficient for allowing relations created by extensions to be +-- interacted with by neon_superuser without permission issues. 
+ +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser; diff --git a/compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql b/compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql new file mode 100644 index 0000000000..5bcb026e0c --- /dev/null +++ b/compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql @@ -0,0 +1,4 @@ +-- SKIP: Deemed insufficient for allowing relations created by extensions to be +-- interacted with by neon_superuser without permission issues. + +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser; diff --git a/compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql new file mode 100644 index 0000000000..ce7c96753e --- /dev/null +++ b/compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql @@ -0,0 +1,3 @@ +-- SKIP: Moved inline to the handle_grants() functions. + +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION; diff --git a/compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql new file mode 100644 index 0000000000..72baf920cd --- /dev/null +++ b/compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql @@ -0,0 +1,3 @@ +-- SKIP: Moved inline to the handle_grants() functions. + +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION; diff --git a/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql b/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql new file mode 100644 index 0000000000..47129d65b8 --- /dev/null +++ b/compute_tools/src/migrations/0009-revoke_replication_for_previously_allowed_roles.sql @@ -0,0 +1,13 @@ +-- SKIP: The original goal of this migration was to prevent creating +-- subscriptions, but this migration was insufficient. + +DO $$ +DECLARE + role_name TEXT; +BEGIN + FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE + LOOP + RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name); + EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION'; + END LOOP; +END $$; diff --git a/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql new file mode 100644 index 0000000000..28750e00dd --- /dev/null +++ b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql @@ -0,0 +1,7 @@ +DO $$ +BEGIN + IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN + EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO neon_superuser'; + EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO neon_superuser'; + END IF; +END $$; diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index f09bd02664..d7127aac32 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -17,7 +17,11 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500); // should be handled gracefully. 
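// The monitor hunk below tags its connection with an application_name so it
// can be told apart in pg_stat_activity. The same `url` crate manipulation in
// isolation (compute.connstr is a url::Url elsewhere in this patch, e.g. in
// apply_config; the helper name is made up):

use url::Url;

fn with_app_name(connstr: &Url, app: &str) -> Url {
    let mut url = connstr.clone();
    // query_pairs_mut() keeps any existing query parameters and appends ours.
    url.query_pairs_mut().append_pair("application_name", app);
    url
}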
 fn watch_compute_activity(compute: &ComputeNode) {
     // Suppose that `connstr` doesn't change
-    let connstr = compute.connstr.as_str();
+    let mut connstr = compute.connstr.clone();
+    connstr
+        .query_pairs_mut()
+        .append_pair("application_name", "compute_activity_monitor");
+    let connstr = connstr.as_str();
 
     // During startup and configuration we connect to every Postgres database,
     // but we don't want to count this as some user activity. So wait until
@@ -138,6 +142,34 @@ fn watch_compute_activity(compute: &ComputeNode) {
             }
         }
         //
+        // Don't suspend compute if there is an active logical replication subscription
+        //
+        // `where pid is not null` – to filter out read-only computes and subscriptions on branches
+        //
+        let logical_subscriptions_query =
+            "select count(*) from pg_stat_subscription where pid is not null;";
+        match cli.query_one(logical_subscriptions_query, &[]) {
+            Ok(row) => match row.try_get::<&str, i64>("count") {
+                Ok(num_subscribers) => {
+                    if num_subscribers > 0 {
+                        compute.update_last_active(Some(Utc::now()));
+                        continue;
+                    }
+                }
+                Err(e) => {
+                    warn!("failed to parse `pg_stat_subscription` count: {:?}", e);
+                    continue;
+                }
+            },
+            Err(e) => {
+                warn!(
+                    "failed to get list of active logical replication subscriptions: {:?}",
+                    e
+                );
+                continue;
+            }
+        }
+        //
         // Do not suspend compute if autovacuum is running
         //
         let autovacuum_count_query = "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'";
diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs
index ce704385c6..b2dc265864 100644
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -22,9 +22,10 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
 
 const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
 
-/// Escape a string for including it in a SQL literal. Wrapping the result
-/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
-/// SQL string literal, e.g. `'db'''` or `E'db\\'`.
+/// Escape a string for including it in a SQL literal.
+///
+/// Wrapping the result with `E'{}'` or `'{}'` is not required,
+/// as it returns a ready-to-use SQL string literal, e.g. `'db'''` or `E'db\\'`.
 /// See
 /// for the original implementation.
 pub fn escape_literal(s: &str) -> String {
@@ -44,7 +45,7 @@ pub fn escape_conf_value(s: &str) -> String {
     format!("'{}'", res)
 }
 
-trait GenericOptionExt {
+pub trait GenericOptionExt {
     fn to_pg_option(&self) -> String;
     fn to_pg_setting(&self) -> String;
 }
@@ -264,9 +265,10 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
     // case we miss some events for some reason. Not strictly necessary, but
     // better safe than sorry.
     let (tx, rx) = std::sync::mpsc::channel();
-    let (mut watcher, rx): (Box<dyn Watcher>, _) = match notify::recommended_watcher(move |res| {
         let _ = tx.send(res);
-    }) {
+    let watcher_res = notify::recommended_watcher(move |res| {
+        let _ = tx.send(res);
+    });
+    let (mut watcher, rx): (Box<dyn Watcher>, _) = match watcher_res {
         Ok(watcher) => (Box::new(watcher), rx),
         Err(e) => {
             match e.kind {
@@ -488,7 +490,7 @@ pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()>
Buffer is flushed on one of the following conditions: /// - next line starts with timestamp /// - EOF -/// - no new lines were written for the last second +/// - no new lines were written for the last 100 milliseconds async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> { let mut lines = tokio::io::BufReader::new(stderr).lines(); let timeout_duration = Duration::from_millis(100); diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index e87dc0b732..6a87263821 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -2,7 +2,7 @@ use std::fs::File; use std::path::Path; use std::str::FromStr; -use anyhow::{anyhow, bail, Result}; +use anyhow::{anyhow, bail, Context, Result}; use postgres::config::Config; use postgres::{Client, NoTls}; use reqwest::StatusCode; @@ -10,6 +10,7 @@ use tracing::{error, info, info_span, instrument, span_enabled, warn, Level}; use crate::config; use crate::logger::inlinify; +use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; @@ -490,7 +491,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { "rename_db" => { let new_name = op.new_name.as_ref().unwrap(); - if existing_dbs.get(&op.name).is_some() { + if existing_dbs.contains_key(&op.name) { let query: String = format!( "ALTER DATABASE {} RENAME TO {}", op.name.pg_quote(), @@ -581,7 +582,12 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants /// to allow users creating trusted extensions and re-creating `public` schema, for example. #[instrument(skip_all)] -pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> Result<()> { +pub fn handle_grants( + spec: &ComputeSpec, + client: &mut Client, + connstr: &str, + enable_anon_extension: bool, +) -> Result<()> { info!("modifying database permissions"); let existing_dbs = get_existing_dbs(client)?; @@ -650,6 +656,9 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> // remove this code if possible. The worst thing that could happen is that // user won't be able to use public schema in NEW databases created in the // very OLD project. + // + // Also, alter default permissions so that relations created by extensions can be + // used by neon_superuser without permission issues. 
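+    //
+    // (Editorial note on semantics: ALTER DEFAULT PRIVILEGES only affects objects
+    // created *after* it runs, and by default only objects created by the role that
+    // issued it. A minimal illustration:
+    //
+    //     ALTER DEFAULT PRIVILEGES IN SCHEMA public
+    //         GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;
+    //     CREATE TABLE public.t (id int);  -- neon_superuser can use and re-grant t
+    //
+    // Pre-existing relations are not touched and still need an explicit GRANT, which
+    // is why running this per database at configuration time, rather than as a
+    // one-shot migration, was deemed necessary.)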
let grant_query = "DO $$\n\ BEGIN\n\ IF EXISTS(\n\ @@ -668,6 +677,15 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> GRANT CREATE ON SCHEMA public TO web_access;\n\ END IF;\n\ END IF;\n\ + IF EXISTS(\n\ + SELECT nspname\n\ + FROM pg_catalog.pg_namespace\n\ + WHERE nspname = 'public'\n\ + )\n\ + THEN\n\ + ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\ + ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\ + END IF;\n\ END\n\ $$;" .to_string(); @@ -678,6 +696,12 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> inlinify(&grant_query) ); db_client.simple_query(&grant_query)?; + + // it is important to run this after all grants + if enable_anon_extension { + handle_extension_anon(spec, &db.owner, &mut db_client, false) + .context("handle_grants handle_extension_anon")?; + } } Ok(()) @@ -722,7 +746,22 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> { // - extension was just installed // - extension was already installed and is up to date let query = "ALTER EXTENSION neon UPDATE"; - info!("update neon extension schema with query: {}", query); + info!("update neon extension version with query: {}", query); + if let Err(e) = client.simple_query(query) { + error!( + "failed to upgrade neon extension during `handle_extension_neon`: {}", + e + ); + } + + Ok(()) +} + +#[instrument(skip_all)] +pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { + info!("handle neon extension upgrade"); + let query = "ALTER EXTENSION neon UPDATE"; + info!("update neon extension version with query: {}", query); client.simple_query(query)?; Ok(()) @@ -736,70 +775,146 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { // !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN! // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + // Add new migrations in numerical order. 
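+    //
+    // (Editorial note: the runner tracks a single high-water-mark migration id, as
+    // the previous inline implementation below did with neon_migration.migration_id,
+    // so entries must never be reordered or removed once shipped. Presumably this is
+    // also why the files marked `-- SKIP:` remain in the array as numbered
+    // placeholders instead of being deleted.)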
     let migrations = [
-        "ALTER ROLE neon_superuser BYPASSRLS",
-        r#"
-DO $$
-DECLARE
-    role_name text;
-BEGIN
-    FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
-    LOOP
-        RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
-        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
-    END LOOP;
-
-    FOR role_name IN SELECT rolname FROM pg_roles
-    WHERE
-        NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
-    LOOP
-        RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
-        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
-    END LOOP;
-END $$;
-"#,
+        include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"),
+        include_str!("./migrations/0002-alter_roles.sql"),
+        include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"),
+        include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"),
+        include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"),
+        include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"),
+        include_str!(
+            "./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
+        ),
+        include_str!(
+            "./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
+        ),
+        include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"),
+        include_str!(
+            "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
+        ),
     ];
 
-    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
-    client.simple_query(query)?;
+    MigrationRunner::new(client, &migrations).run_migrations()?;
+
+    Ok(())
+}
+
+/// Connect to the database as superuser and pre-create anon extension
+/// if it is present in shared_preload_libraries
+#[instrument(skip_all)]
+pub fn handle_extension_anon(
+    spec: &ComputeSpec,
+    db_owner: &str,
+    db_client: &mut Client,
+    grants_only: bool,
+) -> Result<()> {
+    info!("handle extension anon");
+
+    if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
+        if libs.contains("anon") {
+            if !grants_only {
+                // check if extension is already initialized using anon.is_initialized()
+                let query = "SELECT anon.is_initialized()";
+                match db_client.query(query, &[]) {
+                    Ok(rows) => {
+                        if !rows.is_empty() {
+                            let is_initialized: bool = rows[0].get(0);
+                            if is_initialized {
+                                info!("anon extension is already initialized");
+                                return Ok(());
+                            }
+                        }
+                    }
+                    Err(e) => {
+                        warn!(
+                            "anon extension is_initialized check failed with expected error: {}",
+                            e
+                        );
+                    }
+                };
+
+                // Create anon extension if this compute needs it
+                // Users cannot create it themselves, because superuser is required.
+                let mut query = "CREATE EXTENSION IF NOT EXISTS anon CASCADE";
+                info!("creating anon extension with query: {}", query);
+                match db_client.query(query, &[]) {
+                    Ok(_) => {}
+                    Err(e) => {
+                        error!("anon extension creation failed with error: {}", e);
+                        return Ok(());
+                    }
+                }
+
+                // check that extension is installed
+                query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
+                let rows = db_client.query(query, &[])?;
+                if rows.is_empty() {
+                    error!("anon extension is not installed");
+                    return Ok(());
+                }
+
+                // Initialize anon extension
+                // This also requires superuser privileges, so users cannot do it themselves.
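+                // (Editorial note: in upstream postgresql_anonymizer, anon.init()
+                // loads the fake-data dictionaries used by the masking functions,
+                // which is why it, like CREATE EXTENSION above, must be run here as
+                // superuser rather than left to the database owner.)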
+                query = "SELECT anon.init()";
+                match db_client.query(query, &[]) {
+                    Ok(_) => {}
+                    Err(e) => {
+                        error!("anon.init() failed with error: {}", e);
+                        return Ok(());
+                    }
+                }
+            }
+
+            // check that extension is installed, if not bail early
+            let query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
+            match db_client.query(query, &[]) {
+                Ok(rows) => {
+                    if rows.is_empty() {
+                        error!("anon extension is not installed");
+                        return Ok(());
+                    }
+                }
+                Err(e) => {
+                    error!("anon extension check failed with error: {}", e);
+                    return Ok(());
+                }
+            };
+
+            let query = format!("GRANT ALL ON SCHEMA anon TO {}", db_owner);
+            info!("granting anon extension permissions with query: {}", query);
+            db_client.simple_query(&query)?;
+
+            // Grant permissions to db_owner to use anon extension functions
+            let query = format!("GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", db_owner);
+            info!("granting anon extension permissions with query: {}", query);
+            db_client.simple_query(&query)?;
+
+            // This is needed because some functions are defined as SECURITY DEFINER.
+            // In Postgres, SECURITY DEFINER functions are executed with the privileges
+            // of the owner.
+            // In the anon extension this is needed to access some GUCs, which are only
+            // accessible to superuser. But we've patched Postgres to allow db_owner to
+            // access them as well, so we need to change the owner of these functions
+            // to db_owner.
+            let query = format!("
+                SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {};'
+                from pg_proc p
+                join pg_namespace nsp ON p.pronamespace = nsp.oid
+                where nsp.nspname = 'anon';", db_owner);
+
+            info!("change anon extension functions owner to db owner");
+            db_client.simple_query(&query)?;
+
+            // affects views as well
+            let query = format!("GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", db_owner);
+            info!("granting anon extension permissions with query: {}", query);
+            db_client.simple_query(&query)?;
+
+            let query = format!("GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", db_owner);
+            info!("granting anon extension permissions with query: {}", query);
+            db_client.simple_query(&query)?;
+        }
+    }
 
-    query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
-    client.simple_query(query)?;
-
-    query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
-    client.simple_query(query)?;
-
-    query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
-    client.simple_query(query)?;
-
-    query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
-    client.simple_query(query)?;
-
-    query = "SELECT id FROM neon_migration.migration_id";
-    let row = client.query_one(query, &[])?;
-    let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
-    let starting_migration_id = current_migration;
-
-    query = "BEGIN";
-    client.simple_query(query)?;
-
-    while current_migration < migrations.len() {
-        info!("Running migration:\n{}\n", migrations[current_migration]);
-        client.simple_query(migrations[current_migration])?;
-        current_migration += 1;
-    }
-    let setval = format!(
-        "UPDATE neon_migration.migration_id SET id={}",
-        migrations.len()
-    );
-    client.simple_query(&setval)?;
-
-    query = "COMMIT";
-    client.simple_query(query)?;
-
-    info!(
-        "Ran {} migrations",
-        (migrations.len() - starting_migration_id)
-    );
 
     Ok(())
 }
diff --git a/compute_tools/src/swap.rs b/compute_tools/src/swap.rs
new file mode 100644
index 0000000000..024c5b338e
--- /dev/null
+++ 
b/compute_tools/src/swap.rs @@ -0,0 +1,45 @@ +use std::path::Path; + +use anyhow::{anyhow, Context}; +use tracing::warn; + +pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap"; + +pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> { + // run `/neonvm/bin/resize-swap --once {size_bytes}` + // + // Passing '--once' causes resize-swap to delete itself after successful completion, which + // means that if compute_ctl restarts later, we won't end up calling 'swapoff' while + // postgres is running. + // + // NOTE: resize-swap is not very clever. If present, --once MUST be the first arg. + let child_result = std::process::Command::new("/usr/bin/sudo") + .arg(RESIZE_SWAP_BIN) + .arg("--once") + .arg(size_bytes.to_string()) + .spawn(); + + child_result + .context("spawn() failed") + .and_then(|mut child| child.wait().context("wait() failed")) + .and_then(|status| match status.success() { + true => Ok(()), + false => { + // The command failed. Maybe it was because the resize-swap file doesn't exist? + // The --once flag causes it to delete itself on success so we don't disable swap + // while postgres is running; maybe this is fine. + match Path::new(RESIZE_SWAP_BIN).try_exists() { + Err(_) | Ok(true) => Err(anyhow!("process exited with {status}")), + // The path doesn't exist; we're actually ok + Ok(false) => { + warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running"); + Ok(()) + }, + } + } + }) + // wrap any prior error with the overall context that we couldn't run the command + .with_context(|| { + format!("could not run `/usr/bin/sudo {RESIZE_SWAP_BIN} --once {size_bytes}`") + }) +} diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 75e5dcb7f8..6fca59b368 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -6,16 +6,17 @@ license.workspace = true [dependencies] anyhow.workspace = true -async-trait.workspace = true camino.workspace = true clap.workspace = true comfy-table.workspace = true futures.workspace = true git-version.workspace = true +humantime.workspace = true nix.workspace = true once_cell.workspace = true postgres.workspace = true hex.workspace = true +humantime-serde.workspace = true hyper.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["blocking", "json"] } @@ -26,6 +27,7 @@ serde_with.workspace = true tar.workspace = true thiserror.workspace = true toml.workspace = true +toml_edit.workspace = true tokio.workspace = true tokio-postgres.workspace = true tokio-util.workspace = true @@ -37,6 +39,7 @@ safekeeper_api.workspace = true postgres_connection.workspace = true storage_broker.workspace = true utils.workspace = true +whoami.workspace = true compute_api.workspace = true workspace_hack.workspace = true diff --git a/control_plane/README.md b/control_plane/README.md new file mode 100644 index 0000000000..827aba5c1f --- /dev/null +++ b/control_plane/README.md @@ -0,0 +1,26 @@ +# Control Plane and Neon Local + +This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command. + +## Example: Start with Postgres 16 + +To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 3 of the start-up commands. 
+ +```shell +cargo neon init --pg-version 16 +cargo neon start +cargo neon tenant create --set-default --pg-version 16 +cargo neon endpoint create main --pg-version 16 +cargo neon endpoint start main +``` + +## Example: Create Test User and Database + +By default, `cargo neon` starts an endpoint with `cloud_admin` and `postgres` database. If you want to have a role and a database similar to what we have on the cloud service, you can do it with the following commands when starting an endpoint. + +```shell +cargo neon endpoint create main --pg-version 16 --update-catalog true +cargo neon endpoint start main --create-test-user true +``` + +The first command creates `neon_superuser` and necessary roles. The second command creates `test` user and `neondb` database. You will see a connection string that connects you to the test user after running the second command. diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml deleted file mode 100644 index 743dd806c4..0000000000 --- a/control_plane/attachment_service/Cargo.toml +++ /dev/null @@ -1,33 +0,0 @@ -[package] -name = "attachment_service" -version = "0.1.0" -edition.workspace = true -license.workspace = true - -[dependencies] -anyhow.workspace = true -camino.workspace = true -clap.workspace = true -futures.workspace = true -git-version.workspace = true -hyper.workspace = true -pageserver_api.workspace = true -pageserver_client.workspace = true -postgres_connection.workspace = true -scopeguard.workspace = true -serde.workspace = true -serde_json.workspace = true -thiserror.workspace = true -tokio.workspace = true -tokio-util.workspace = true -tracing.workspace = true - -# TODO: remove this after DB persistence is added, it is only used for -# a parsing function when loading pageservers from neon_local LocalEnv -postgres_backend.workspace = true - -utils = { path = "../../libs/utils/" } -metrics = { path = "../../libs/metrics/" } -control_plane = { path = ".." } -workspace_hack = { version = "0.1", path = "../../workspace_hack" } - diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs deleted file mode 100644 index 02617cd065..0000000000 --- a/control_plane/attachment_service/src/compute_hook.rs +++ /dev/null @@ -1,116 +0,0 @@ -use std::collections::HashMap; - -use control_plane::endpoint::ComputeControlPlane; -use control_plane::local_env::LocalEnv; -use pageserver_api::shard::{ShardCount, ShardIndex, TenantShardId}; -use postgres_connection::parse_host_port; -use utils::id::{NodeId, TenantId}; - -pub(super) struct ComputeHookTenant { - shards: Vec<(ShardIndex, NodeId)>, -} - -impl ComputeHookTenant { - pub(super) async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> anyhow::Result<()> { - // Find the highest shard count and drop any shards that aren't - // for that shard count. - let shard_count = self.shards.iter().map(|(k, _v)| k.shard_count).max(); - let Some(shard_count) = shard_count else { - // No shards, nothing to do. 
- tracing::info!("ComputeHookTenant::maybe_reconfigure: no shards"); - return Ok(()); - }; - - self.shards.retain(|(k, _v)| k.shard_count == shard_count); - self.shards - .sort_by_key(|(shard, _node_id)| shard.shard_number); - - if self.shards.len() == shard_count.0 as usize || shard_count == ShardCount(0) { - // We have pageservers for all the shards: proceed to reconfigure compute - let env = match LocalEnv::load_config() { - Ok(e) => e, - Err(e) => { - tracing::warn!( - "Couldn't load neon_local config, skipping compute update ({e})" - ); - return Ok(()); - } - }; - let cplane = ComputeControlPlane::load(env.clone()) - .expect("Error loading compute control plane"); - - let compute_pageservers = self - .shards - .iter() - .map(|(_shard, node_id)| { - let ps_conf = env - .get_pageserver_conf(*node_id) - .expect("Unknown pageserver"); - let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr) - .expect("Unable to parse listen_pg_addr"); - (pg_host, pg_port.unwrap_or(5432)) - }) - .collect::>(); - - for (endpoint_name, endpoint) in &cplane.endpoints { - if endpoint.tenant_id == tenant_id && endpoint.status() == "running" { - tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,); - endpoint.reconfigure(compute_pageservers.clone()).await?; - } - } - } else { - tracing::info!( - "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})", - self.shards.len(), - shard_count.0 - ); - } - - Ok(()) - } -} - -/// The compute hook is a destination for notifications about changes to tenant:pageserver -/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures -/// the compute connection string. -pub(super) struct ComputeHook { - state: tokio::sync::Mutex>, -} - -impl ComputeHook { - pub(super) fn new() -> Self { - Self { - state: Default::default(), - } - } - - pub(super) async fn notify( - &self, - tenant_shard_id: TenantShardId, - node_id: NodeId, - ) -> anyhow::Result<()> { - tracing::info!("ComputeHook::notify: {}->{}", tenant_shard_id, node_id); - let mut locked = self.state.lock().await; - let entry = locked - .entry(tenant_shard_id.tenant_id) - .or_insert_with(|| ComputeHookTenant { shards: Vec::new() }); - - let shard_index = ShardIndex { - shard_count: tenant_shard_id.shard_count, - shard_number: tenant_shard_id.shard_number, - }; - - let mut set = false; - for (existing_shard, existing_node) in &mut entry.shards { - if *existing_shard == shard_index { - *existing_node = node_id; - set = true; - } - } - if !set { - entry.shards.push((shard_index, node_id)); - } - - entry.maybe_reconfigure(tenant_shard_id.tenant_id).await - } -} diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs deleted file mode 100644 index 30f6dd66ee..0000000000 --- a/control_plane/attachment_service/src/http.rs +++ /dev/null @@ -1,218 +0,0 @@ -use crate::reconciler::ReconcileError; -use crate::service::Service; -use hyper::{Body, Request, Response}; -use hyper::{StatusCode, Uri}; -use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest}; -use pageserver_api::shard::TenantShardId; -use std::sync::Arc; -use utils::auth::SwappableJwtAuth; -use utils::http::endpoint::{auth_middleware, request_span}; -use utils::http::request::parse_request_param; -use utils::id::TenantId; - -use utils::{ - http::{ - endpoint::{self}, - error::ApiError, - json::{json_request, json_response}, - RequestExt, RouterBuilder, - }, - id::NodeId, -}; - -use pageserver_api::control_api::{ReAttachRequest, ValidateRequest}; 
- -use control_plane::attachment_service::{ - AttachHookRequest, InspectRequest, NodeConfigureRequest, NodeRegisterRequest, - TenantShardMigrateRequest, -}; - -/// State available to HTTP request handlers -#[derive(Clone)] -pub struct HttpState { - service: Arc, - auth: Option>, - allowlist_routes: Vec, -} - -impl HttpState { - pub fn new(service: Arc, auth: Option>) -> Self { - let allowlist_routes = ["/status"] - .iter() - .map(|v| v.parse().unwrap()) - .collect::>(); - Self { - service, - auth, - allowlist_routes, - } - } -} - -#[inline(always)] -fn get_state(request: &Request) -> &HttpState { - request - .data::>() - .expect("unknown state type") - .as_ref() -} - -/// Pageserver calls into this on startup, to learn which tenants it should attach -async fn handle_re_attach(mut req: Request) -> Result, ApiError> { - let reattach_req = json_request::(&mut req).await?; - let state = get_state(&req); - json_response( - StatusCode::OK, - state - .service - .re_attach(reattach_req) - .await - .map_err(ApiError::InternalServerError)?, - ) -} - -/// Pageserver calls into this before doing deletions, to confirm that it still -/// holds the latest generation for the tenants with deletions enqueued -async fn handle_validate(mut req: Request) -> Result, ApiError> { - let validate_req = json_request::(&mut req).await?; - let state = get_state(&req); - json_response(StatusCode::OK, state.service.validate(validate_req)) -} - -/// Call into this before attaching a tenant to a pageserver, to acquire a generation number -/// (in the real control plane this is unnecessary, because the same program is managing -/// generation numbers and doing attachments). -async fn handle_attach_hook(mut req: Request) -> Result, ApiError> { - let attach_req = json_request::(&mut req).await?; - let state = get_state(&req); - - json_response( - StatusCode::OK, - state - .service - .attach_hook(attach_req) - .await - .map_err(ApiError::InternalServerError)?, - ) -} - -async fn handle_inspect(mut req: Request) -> Result, ApiError> { - let inspect_req = json_request::(&mut req).await?; - - let state = get_state(&req); - - json_response(StatusCode::OK, state.service.inspect(inspect_req)) -} - -async fn handle_tenant_create(mut req: Request) -> Result, ApiError> { - let create_req = json_request::(&mut req).await?; - let state = get_state(&req); - json_response( - StatusCode::OK, - state.service.tenant_create(create_req).await?, - ) -} - -async fn handle_tenant_timeline_create(mut req: Request) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; - let create_req = json_request::(&mut req).await?; - - let state = get_state(&req); - json_response( - StatusCode::OK, - state - .service - .tenant_timeline_create(tenant_id, create_req) - .await?, - ) -} - -async fn handle_tenant_locate(req: Request) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; - let state = get_state(&req); - - json_response(StatusCode::OK, state.service.tenant_locate(tenant_id)?) 
-} - -async fn handle_node_register(mut req: Request) -> Result, ApiError> { - let register_req = json_request::(&mut req).await?; - let state = get_state(&req); - state.service.node_register(register_req).await?; - json_response(StatusCode::OK, ()) -} - -async fn handle_node_configure(mut req: Request) -> Result, ApiError> { - let node_id: NodeId = parse_request_param(&req, "node_id")?; - let config_req = json_request::(&mut req).await?; - if node_id != config_req.node_id { - return Err(ApiError::BadRequest(anyhow::anyhow!( - "Path and body node_id differ" - ))); - } - let state = get_state(&req); - - json_response(StatusCode::OK, state.service.node_configure(config_req)?) -} - -async fn handle_tenant_shard_migrate(mut req: Request) -> Result, ApiError> { - let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; - let migrate_req = json_request::(&mut req).await?; - let state = get_state(&req); - json_response( - StatusCode::OK, - state - .service - .tenant_shard_migrate(tenant_shard_id, migrate_req) - .await?, - ) -} - -/// Status endpoint is just used for checking that our HTTP listener is up -async fn handle_status(_req: Request) -> Result, ApiError> { - json_response(StatusCode::OK, ()) -} - -impl From for ApiError { - fn from(value: ReconcileError) -> Self { - ApiError::Conflict(format!("Reconciliation error: {}", value)) - } -} - -pub fn make_router( - service: Arc, - auth: Option>, -) -> RouterBuilder { - let mut router = endpoint::make_router(); - if auth.is_some() { - router = router.middleware(auth_middleware(|request| { - let state = get_state(request); - if state.allowlist_routes.contains(request.uri()) { - None - } else { - state.auth.as_deref() - } - })) - } - - router - .data(Arc::new(HttpState::new(service, auth))) - .get("/status", |r| request_span(r, handle_status)) - .post("/re-attach", |r| request_span(r, handle_re_attach)) - .post("/validate", |r| request_span(r, handle_validate)) - .post("/attach-hook", |r| request_span(r, handle_attach_hook)) - .post("/inspect", |r| request_span(r, handle_inspect)) - .post("/node", |r| request_span(r, handle_node_register)) - .put("/node/:node_id/config", |r| { - request_span(r, handle_node_configure) - }) - .post("/tenant", |r| request_span(r, handle_tenant_create)) - .post("/tenant/:tenant_id/timeline", |r| { - request_span(r, handle_tenant_timeline_create) - }) - .get("/tenant/:tenant_id/locate", |r| { - request_span(r, handle_tenant_locate) - }) - .put("/tenant/:tenant_shard_id/migrate", |r| { - request_span(r, handle_tenant_shard_migrate) - }) -} diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs deleted file mode 100644 index 38e51b9a9e..0000000000 --- a/control_plane/attachment_service/src/main.rs +++ /dev/null @@ -1,100 +0,0 @@ -/// The attachment service mimics the aspects of the control plane API -/// that are required for a pageserver to operate. -/// -/// This enables running & testing pageservers without a full-blown -/// deployment of the Neon cloud platform. 
-/// -use anyhow::anyhow; -use attachment_service::http::make_router; -use attachment_service::persistence::Persistence; -use attachment_service::service::{Config, Service}; -use camino::Utf8PathBuf; -use clap::Parser; -use metrics::launch_timestamp::LaunchTimestamp; -use std::sync::Arc; -use utils::auth::{JwtAuth, SwappableJwtAuth}; -use utils::logging::{self, LogFormat}; -use utils::signals::{ShutdownSignals, Signal}; - -use utils::{project_build_tag, project_git_version, tcp_listener}; - -project_git_version!(GIT_VERSION); -project_build_tag!(BUILD_TAG); - -#[derive(Parser)] -#[command(author, version, about, long_about = None)] -#[command(arg_required_else_help(true))] -struct Cli { - /// Host and port to listen on, like `127.0.0.1:1234` - #[arg(short, long)] - listen: std::net::SocketAddr, - - /// Path to public key for JWT authentication of clients - #[arg(long)] - public_key: Option, - - /// Token for authenticating this service with the pageservers it controls - #[arg(short, long)] - jwt_token: Option, - - /// Path to the .json file to store state (will be created if it doesn't exist) - #[arg(short, long)] - path: Utf8PathBuf, -} - -#[tokio::main] -async fn main() -> anyhow::Result<()> { - let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate())); - - logging::init( - LogFormat::Plain, - logging::TracingErrorLayerEnablement::Disabled, - logging::Output::Stdout, - )?; - - let args = Cli::parse(); - tracing::info!( - "version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}", - GIT_VERSION, - launch_ts.to_string(), - BUILD_TAG, - args.path, - args.listen - ); - - let config = Config { - jwt_token: args.jwt_token, - }; - - let persistence = Arc::new(Persistence::spawn(&args.path).await); - - let service = Service::spawn(config, persistence).await?; - - let http_listener = tcp_listener::bind(args.listen)?; - - let auth = if let Some(public_key_path) = &args.public_key { - let jwt_auth = JwtAuth::from_key_path(public_key_path)?; - Some(Arc::new(SwappableJwtAuth::new(jwt_auth))) - } else { - None - }; - let router = make_router(service, auth) - .build() - .map_err(|err| anyhow!(err))?; - let service = utils::http::RouterService::new(router).unwrap(); - let server = hyper::Server::from_tcp(http_listener)?.serve(service); - - tracing::info!("Serving on {0}", args.listen); - - tokio::task::spawn(server); - - ShutdownSignals::handle(|signal| match signal { - Signal::Interrupt | Signal::Terminate | Signal::Quit => { - tracing::info!("Got {}. Terminating", signal.name()); - // We're just a test helper: no graceful shutdown. - std::process::exit(0); - } - })?; - - Ok(()) -} diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs deleted file mode 100644 index efd3f8f49b..0000000000 --- a/control_plane/attachment_service/src/node.rs +++ /dev/null @@ -1,37 +0,0 @@ -use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy}; -use utils::id::NodeId; - -#[derive(Clone)] -pub(crate) struct Node { - pub(crate) id: NodeId, - - pub(crate) availability: NodeAvailability, - pub(crate) scheduling: NodeSchedulingPolicy, - - pub(crate) listen_http_addr: String, - pub(crate) listen_http_port: u16, - - pub(crate) listen_pg_addr: String, - pub(crate) listen_pg_port: u16, -} - -impl Node { - pub(crate) fn base_url(&self) -> String { - format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) - } - - /// Is this node elegible to have work scheduled onto it? 
- pub(crate) fn may_schedule(&self) -> bool { - match self.availability { - NodeAvailability::Active => {} - NodeAvailability::Offline => return false, - } - - match self.scheduling { - NodeSchedulingPolicy::Active => true, - NodeSchedulingPolicy::Draining => false, - NodeSchedulingPolicy::Filling => true, - NodeSchedulingPolicy::Pause => false, - } - } -} diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs deleted file mode 100644 index e944a2e9ed..0000000000 --- a/control_plane/attachment_service/src/persistence.rs +++ /dev/null @@ -1,311 +0,0 @@ -use std::{collections::HashMap, str::FromStr}; - -use camino::{Utf8Path, Utf8PathBuf}; -use control_plane::{ - attachment_service::{NodeAvailability, NodeSchedulingPolicy}, - local_env::LocalEnv, -}; -use pageserver_api::{ - models::TenantConfig, - shard::{ShardCount, ShardNumber, TenantShardId}, -}; -use postgres_connection::parse_host_port; -use serde::{Deserialize, Serialize}; -use tracing::info; -use utils::{ - generation::Generation, - id::{NodeId, TenantId}, -}; - -use crate::{node::Node, PlacementPolicy}; - -/// Placeholder for storage. This will be replaced with a database client. -pub struct Persistence { - inner: std::sync::Mutex, -} - -struct Inner { - state: PersistentState, - write_queue_tx: tokio::sync::mpsc::UnboundedSender, -} - -#[derive(Serialize, Deserialize)] -struct PersistentState { - tenants: HashMap, -} - -struct PendingWrite { - bytes: Vec, - done_tx: tokio::sync::oneshot::Sender<()>, -} - -impl PersistentState { - async fn load(path: &Utf8Path) -> anyhow::Result { - let bytes = tokio::fs::read(path).await?; - let mut decoded = serde_json::from_slice::(&bytes)?; - - for (tenant_id, tenant) in &mut decoded.tenants { - // Backward compat: an old attachments.json from before PR #6251, replace - // empty strings with proper defaults. - if tenant.tenant_id.is_empty() { - tenant.tenant_id = format!("{}", tenant_id); - tenant.config = serde_json::to_string(&TenantConfig::default())?; - tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())?; - } - } - - Ok(decoded) - } - - async fn load_or_new(path: &Utf8Path) -> Self { - match Self::load(path).await { - Ok(s) => { - tracing::info!("Loaded state file at {}", path); - s - } - Err(e) - if e.downcast_ref::() - .map(|e| e.kind() == std::io::ErrorKind::NotFound) - .unwrap_or(false) => - { - tracing::info!("Will create state file at {}", path); - Self { - tenants: HashMap::new(), - } - } - Err(e) => { - panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path) - } - } - } -} - -impl Persistence { - pub async fn spawn(path: &Utf8Path) -> Self { - let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); - let state = PersistentState::load_or_new(path).await; - tokio::spawn(Self::writer_task(rx, path.to_owned())); - Self { - inner: std::sync::Mutex::new(Inner { - state, - write_queue_tx: tx, - }), - } - } - - async fn writer_task( - mut rx: tokio::sync::mpsc::UnboundedReceiver, - path: Utf8PathBuf, - ) { - scopeguard::defer! 
{ - info!("persistence writer task exiting"); - }; - loop { - match rx.recv().await { - Some(write) => { - tokio::task::spawn_blocking({ - let path = path.clone(); - move || { - let tmp_path = - utils::crashsafe::path_with_suffix_extension(&path, "___new"); - utils::crashsafe::overwrite(&path, &tmp_path, &write.bytes) - } - }) - .await - .expect("spawn_blocking") - .expect("write file"); - let _ = write.done_tx.send(()); // receiver may lose interest any time - } - None => { - return; - } - } - } - } - - /// Perform a modification on our [`PersistentState`]. - /// Return a future that completes once our modification has been persisted. - /// The output of the future is the return value of the `txn`` closure. - async fn mutating_transaction(&self, txn: F) -> R - where - F: FnOnce(&mut PersistentState) -> R, - { - let (ret, done_rx) = { - let mut inner = self.inner.lock().unwrap(); - let ret = txn(&mut inner.state); - let (done_tx, done_rx) = tokio::sync::oneshot::channel(); - let write = PendingWrite { - bytes: serde_json::to_vec(&inner.state).expect("Serialization error"), - done_tx, - }; - inner - .write_queue_tx - .send(write) - .expect("writer task always outlives self"); - (ret, done_rx) - }; - // the write task can go away once we start .await'ing - let _: () = done_rx.await.expect("writer task dead, check logs"); - ret - } - - /// When registering a node, persist it so that on next start we will be able to - /// iterate over known nodes to synchronize their tenant shard states with our observed state. - pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> { - // TODO: node persitence will come with database backend - Ok(()) - } - - /// At startup, we populate the service's list of nodes, and use this list to call into - /// each node to do an initial reconciliation of the state of the world with our in-memory - /// observed state. - pub(crate) async fn list_nodes(&self) -> anyhow::Result> { - let env = LocalEnv::load_config()?; - // TODO: node persitence will come with database backend - - // XXX hack: enable test_backward_compatibility to work by populating our list of - // nodes from LocalEnv when it is not present in persistent storage. Otherwise at - // first startup in the compat test, we may have shards but no nodes. - let mut result = Vec::new(); - tracing::info!( - "Loaded {} pageserver nodes from LocalEnv", - env.pageservers.len() - ); - for ps_conf in env.pageservers { - let (pg_host, pg_port) = - parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); - let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr) - .expect("Unable to parse listen_http_addr"); - result.push(Node { - id: ps_conf.id, - listen_pg_addr: pg_host.to_string(), - listen_pg_port: pg_port.unwrap_or(5432), - listen_http_addr: http_host.to_string(), - listen_http_port: http_port.unwrap_or(80), - availability: NodeAvailability::Active, - scheduling: NodeSchedulingPolicy::Active, - }); - } - - Ok(result) - } - - /// At startup, we populate our map of tenant shards from persistent storage. - pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result> { - let inner = self.inner.lock().unwrap(); - Ok(inner.state.tenants.values().cloned().collect()) - } - - /// Tenants must be persisted before we schedule them for the first time. This enables us - /// to correctly retain generation monotonicity, and the externally provided placement policy & config. 
- pub(crate) async fn insert_tenant_shards( - &self, - shards: Vec, - ) -> anyhow::Result<()> { - self.mutating_transaction(|locked| { - for shard in shards { - let tenant_shard_id = TenantShardId { - tenant_id: TenantId::from_str(shard.tenant_id.as_str())?, - shard_number: ShardNumber(shard.shard_number as u8), - shard_count: ShardCount(shard.shard_count as u8), - }; - - locked.tenants.insert(tenant_shard_id, shard); - } - Ok(()) - }) - .await - } - - /// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically - /// advancing generation number. We also store the NodeId for which the generation was issued, so that in - /// [`Self::re_attach`] we can do a bulk UPDATE on the generations for that node. - pub(crate) async fn increment_generation( - &self, - tenant_shard_id: TenantShardId, - node_id: NodeId, - ) -> anyhow::Result { - self.mutating_transaction(|locked| { - let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else { - anyhow::bail!("Tried to increment generation of unknown shard"); - }; - - shard.generation += 1; - shard.generation_pageserver = Some(node_id); - - let gen = Generation::new(shard.generation); - Ok(gen) - }) - .await - } - - pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { - self.mutating_transaction(|locked| { - let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else { - anyhow::bail!("Tried to increment generation of unknown shard"); - }; - shard.generation_pageserver = None; - shard.placement_policy = serde_json::to_string(&PlacementPolicy::Detached).unwrap(); - Ok(()) - }) - .await - } - - pub(crate) async fn re_attach( - &self, - node_id: NodeId, - ) -> anyhow::Result> { - self.mutating_transaction(|locked| { - let mut result = HashMap::new(); - for (tenant_shard_id, shard) in locked.tenants.iter_mut() { - if shard.generation_pageserver == Some(node_id) { - shard.generation += 1; - result.insert(*tenant_shard_id, Generation::new(shard.generation)); - } - } - Ok(result) - }) - .await - } - - // TODO: when we start shard splitting, we must durably mark the tenant so that - // on restart, we know that we must go through recovery (list shards that exist - // and pick up where we left off and/or revert to parent shards). - #[allow(dead_code)] - pub(crate) async fn begin_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> { - todo!(); - } - - // TODO: when we finish shard splitting, we must atomically clean up the old shards - // and insert the new shards, and clear the splitting marker. 
- #[allow(dead_code)] - pub(crate) async fn complete_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> { - todo!(); - } -} - -/// Parts of [`crate::tenant_state::TenantState`] that are stored durably -#[derive(Serialize, Deserialize, Clone)] -pub(crate) struct TenantShardPersistence { - #[serde(default)] - pub(crate) tenant_id: String, - #[serde(default)] - pub(crate) shard_number: i32, - #[serde(default)] - pub(crate) shard_count: i32, - #[serde(default)] - pub(crate) shard_stripe_size: i32, - - // Currently attached pageserver - #[serde(rename = "pageserver")] - pub(crate) generation_pageserver: Option, - - // Latest generation number: next time we attach, increment this - // and use the incremented number when attaching - pub(crate) generation: u32, - - #[serde(default)] - pub(crate) placement_policy: String, - #[serde(default)] - pub(crate) config: String, -} diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs deleted file mode 100644 index d7f4c0406a..0000000000 --- a/control_plane/attachment_service/src/reconciler.rs +++ /dev/null @@ -1,495 +0,0 @@ -use crate::persistence::Persistence; -use crate::service; -use control_plane::attachment_service::NodeAvailability; -use pageserver_api::models::{ - LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, -}; -use pageserver_api::shard::{ShardIdentity, TenantShardId}; -use pageserver_client::mgmt_api; -use std::collections::HashMap; -use std::sync::Arc; -use std::time::Duration; -use tokio_util::sync::CancellationToken; -use utils::generation::Generation; -use utils::id::{NodeId, TimelineId}; -use utils::lsn::Lsn; - -use crate::compute_hook::ComputeHook; -use crate::node::Node; -use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation}; - -/// Object with the lifetime of the background reconcile task that is created -/// for tenants which have a difference between their intent and observed states. -pub(super) struct Reconciler { - /// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot - /// of a tenant's state from when we spawned a reconcile task. - pub(super) tenant_shard_id: TenantShardId, - pub(crate) shard: ShardIdentity, - pub(crate) generation: Generation, - pub(crate) intent: IntentState, - pub(crate) config: TenantConfig, - pub(crate) observed: ObservedState, - - pub(crate) service_config: service::Config, - - /// A snapshot of the pageservers as they were when we were asked - /// to reconcile. - pub(crate) pageservers: Arc>, - - /// A hook to notify the running postgres instances when we change the location - /// of a tenant - pub(crate) compute_hook: Arc, - - /// A means to abort background reconciliation: it is essential to - /// call this when something changes in the original TenantState that - /// will make this reconciliation impossible or unnecessary, for - /// example when a pageserver node goes offline, or the PlacementPolicy for - /// the tenant is changed. 
- pub(crate) cancel: CancellationToken, - - /// Access to persistent storage for updating generation numbers - pub(crate) persistence: Arc, -} - -#[derive(thiserror::Error, Debug)] -pub enum ReconcileError { - #[error(transparent)] - Other(#[from] anyhow::Error), -} - -impl Reconciler { - async fn location_config( - &mut self, - node_id: NodeId, - config: LocationConfig, - flush_ms: Option, - ) -> anyhow::Result<()> { - let node = self - .pageservers - .get(&node_id) - .expect("Pageserver may not be removed while referenced"); - - self.observed - .locations - .insert(node.id, ObservedStateLocation { conf: None }); - - tracing::info!("location_config({}) calling: {:?}", node_id, config); - let client = - mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); - client - .location_config(self.tenant_shard_id, config.clone(), flush_ms) - .await?; - tracing::info!("location_config({}) complete: {:?}", node_id, config); - - self.observed - .locations - .insert(node.id, ObservedStateLocation { conf: Some(config) }); - - Ok(()) - } - - async fn maybe_live_migrate(&mut self) -> Result<(), ReconcileError> { - let destination = if let Some(node_id) = self.intent.attached { - match self.observed.locations.get(&node_id) { - Some(conf) => { - // We will do a live migration only if the intended destination is not - // currently in an attached state. - match &conf.conf { - Some(conf) if conf.mode == LocationConfigMode::Secondary => { - // Fall through to do a live migration - node_id - } - None | Some(_) => { - // Attached or uncertain: don't do a live migration, proceed - // with a general-case reconciliation - tracing::info!("maybe_live_migrate: destination is None or attached"); - return Ok(()); - } - } - } - None => { - // Our destination is not attached: maybe live migrate if some other - // node is currently attached. Fall through. - node_id - } - } - } else { - // No intent to be attached - tracing::info!("maybe_live_migrate: no attached intent"); - return Ok(()); - }; - - let mut origin = None; - for (node_id, state) in &self.observed.locations { - if let Some(observed_conf) = &state.conf { - if observed_conf.mode == LocationConfigMode::AttachedSingle { - let node = self - .pageservers - .get(node_id) - .expect("Nodes may not be removed while referenced"); - // We will only attempt live migration if the origin is not offline: this - // avoids trying to do it while reconciling after responding to an HA failover. 
- if !matches!(node.availability, NodeAvailability::Offline) { - origin = Some(*node_id); - break; - } - } - } - } - - let Some(origin) = origin else { - tracing::info!("maybe_live_migrate: no origin found"); - return Ok(()); - }; - - // We have an origin and a destination: proceed to do the live migration - tracing::info!("Live migrating {}->{}", origin, destination); - self.live_migrate(origin, destination).await?; - - Ok(()) - } - - async fn get_lsns( - &self, - tenant_shard_id: TenantShardId, - node_id: &NodeId, - ) -> anyhow::Result> { - let node = self - .pageservers - .get(node_id) - .expect("Pageserver may not be removed while referenced"); - - let client = - mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); - - let timelines = client.timeline_list(&tenant_shard_id).await?; - Ok(timelines - .into_iter() - .map(|t| (t.timeline_id, t.last_record_lsn)) - .collect()) - } - - async fn secondary_download(&self, tenant_shard_id: TenantShardId, node_id: &NodeId) { - let node = self - .pageservers - .get(node_id) - .expect("Pageserver may not be removed while referenced"); - - let client = - mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); - - match client.tenant_secondary_download(tenant_shard_id).await { - Ok(()) => {} - Err(_) => { - tracing::info!(" (skipping, destination wasn't in secondary mode)") - } - } - } - - async fn await_lsn( - &self, - tenant_shard_id: TenantShardId, - pageserver_id: &NodeId, - baseline: HashMap, - ) -> anyhow::Result<()> { - loop { - let latest = match self.get_lsns(tenant_shard_id, pageserver_id).await { - Ok(l) => l, - Err(e) => { - println!( - "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})", - pageserver_id - ); - std::thread::sleep(Duration::from_millis(500)); - continue; - } - }; - - let mut any_behind: bool = false; - for (timeline_id, baseline_lsn) in &baseline { - match latest.get(timeline_id) { - Some(latest_lsn) => { - println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}"); - if latest_lsn < baseline_lsn { - any_behind = true; - } - } - None => { - // Expected timeline isn't yet visible on migration destination. - // (IRL we would have to account for timeline deletion, but this - // is just test helper) - any_behind = true; - } - } - } - - if !any_behind { - println!("✅ LSN caught up. 
Proceeding..."); - break; - } else { - std::thread::sleep(Duration::from_millis(500)); - } - } - - Ok(()) - } - - pub async fn live_migrate( - &mut self, - origin_ps_id: NodeId, - dest_ps_id: NodeId, - ) -> anyhow::Result<()> { - // `maybe_live_migrate` is responsibble for sanity of inputs - assert!(origin_ps_id != dest_ps_id); - - fn build_location_config( - shard: &ShardIdentity, - config: &TenantConfig, - mode: LocationConfigMode, - generation: Option, - secondary_conf: Option, - ) -> LocationConfig { - LocationConfig { - mode, - generation: generation.map(|g| g.into().unwrap()), - secondary_conf, - tenant_conf: config.clone(), - shard_number: shard.number.0, - shard_count: shard.count.0, - shard_stripe_size: shard.stripe_size.0, - } - } - - tracing::info!( - "🔁 Switching origin pageserver {} to stale mode", - origin_ps_id - ); - - // FIXME: it is incorrect to use self.generation here, we should use the generation - // from the ObservedState of the origin pageserver (it might be older than self.generation) - let stale_conf = build_location_config( - &self.shard, - &self.config, - LocationConfigMode::AttachedStale, - Some(self.generation), - None, - ); - self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10))) - .await?; - - let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps_id).await?); - - // If we are migrating to a destination that has a secondary location, warm it up first - if let Some(destination_conf) = self.observed.locations.get(&dest_ps_id) { - if let Some(destination_conf) = &destination_conf.conf { - if destination_conf.mode == LocationConfigMode::Secondary { - tracing::info!( - "🔁 Downloading latest layers to destination pageserver {}", - dest_ps_id, - ); - self.secondary_download(self.tenant_shard_id, &dest_ps_id) - .await; - } - } - } - - // Increment generation before attaching to new pageserver - self.generation = self - .persistence - .increment_generation(self.tenant_shard_id, dest_ps_id) - .await?; - - let dest_conf = build_location_config( - &self.shard, - &self.config, - LocationConfigMode::AttachedMulti, - Some(self.generation), - None, - ); - - tracing::info!("🔁 Attaching to pageserver {}", dest_ps_id); - self.location_config(dest_ps_id, dest_conf, None).await?; - - if let Some(baseline) = baseline_lsns { - tracing::info!("🕑 Waiting for LSN to catch up..."); - self.await_lsn(self.tenant_shard_id, &dest_ps_id, baseline) - .await?; - } - - tracing::info!("🔁 Notifying compute to use pageserver {}", dest_ps_id); - self.compute_hook - .notify(self.tenant_shard_id, dest_ps_id) - .await?; - - // Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then - // this location will be deleted in the general case reconciliation that runs after this. - let origin_secondary_conf = build_location_config( - &self.shard, - &self.config, - LocationConfigMode::Secondary, - None, - Some(LocationConfigSecondary { warm: true }), - ); - self.location_config(origin_ps_id, origin_secondary_conf.clone(), None) - .await?; - // TODO: we should also be setting the ObservedState on earlier API calls, in case we fail - // partway through. In fact, all location conf API calls should be in a wrapper that sets - // the observed state to None, then runs, then sets it to what we wrote. 
- self.observed.locations.insert( - origin_ps_id, - ObservedStateLocation { - conf: Some(origin_secondary_conf), - }, - ); - - println!( - "🔁 Switching to AttachedSingle mode on pageserver {}", - dest_ps_id - ); - let dest_final_conf = build_location_config( - &self.shard, - &self.config, - LocationConfigMode::AttachedSingle, - Some(self.generation), - None, - ); - self.location_config(dest_ps_id, dest_final_conf.clone(), None) - .await?; - self.observed.locations.insert( - dest_ps_id, - ObservedStateLocation { - conf: Some(dest_final_conf), - }, - ); - - println!("✅ Migration complete"); - - Ok(()) - } - - /// Reconciling a tenant makes API calls to pageservers until the observed state - /// matches the intended state. - /// - /// First we apply special case handling (e.g. for live migrations), and then a - /// general case reconciliation where we walk through the intent by pageserver - /// and call out to the pageserver to apply the desired state. - pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> { - // TODO: if any of self.observed is None, call to remote pageservers - // to learn correct state. - - // Special case: live migration - self.maybe_live_migrate().await?; - - // If the attached pageserver is not attached, do so now. - if let Some(node_id) = self.intent.attached { - let mut wanted_conf = - attached_location_conf(self.generation, &self.shard, &self.config); - match self.observed.locations.get(&node_id) { - Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { - // Nothing to do - tracing::info!("Observed configuration already correct.") - } - _ => { - // In all cases other than a matching observed configuration, we will - // reconcile this location. This includes locations with different configurations, as well - // as locations with unknown (None) observed state. - self.generation = self - .persistence - .increment_generation(self.tenant_shard_id, node_id) - .await?; - wanted_conf.generation = self.generation.into(); - tracing::info!("Observed configuration requires update."); - self.location_config(node_id, wanted_conf, None).await?; - if let Err(e) = self - .compute_hook - .notify(self.tenant_shard_id, node_id) - .await - { - tracing::warn!( - "Failed to notify compute of newly attached pageserver {node_id}: {e}" - ); - } - } - } - } - - // Configure secondary locations: if these were previously attached this - // implicitly downgrades them from attached to secondary. - let mut changes = Vec::new(); - for node_id in &self.intent.secondary { - let wanted_conf = secondary_location_conf(&self.shard, &self.config); - match self.observed.locations.get(node_id) { - Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { - // Nothing to do - tracing::info!(%node_id, "Observed configuration already correct.") - } - _ => { - // In all cases other than a matching observed configuration, we will - // reconcile this location. - tracing::info!(%node_id, "Observed configuration requires update."); - changes.push((*node_id, wanted_conf)) - } - } - } - - // Detach any extraneous pageservers that are no longer referenced - // by our intent. - let all_pageservers = self.intent.all_pageservers(); - for node_id in self.observed.locations.keys() { - if all_pageservers.contains(node_id) { - // We are only detaching pageservers that aren't used at all. 
- continue; - } - - changes.push(( - *node_id, - LocationConfig { - mode: LocationConfigMode::Detached, - generation: None, - secondary_conf: None, - shard_number: self.shard.number.0, - shard_count: self.shard.count.0, - shard_stripe_size: self.shard.stripe_size.0, - tenant_conf: self.config.clone(), - }, - )); - } - - for (node_id, conf) in changes { - self.location_config(node_id, conf, None).await?; - } - - Ok(()) - } -} - -pub(crate) fn attached_location_conf( - generation: Generation, - shard: &ShardIdentity, - config: &TenantConfig, -) -> LocationConfig { - LocationConfig { - mode: LocationConfigMode::AttachedSingle, - generation: generation.into(), - secondary_conf: None, - shard_number: shard.number.0, - shard_count: shard.count.0, - shard_stripe_size: shard.stripe_size.0, - tenant_conf: config.clone(), - } -} - -pub(crate) fn secondary_location_conf( - shard: &ShardIdentity, - config: &TenantConfig, -) -> LocationConfig { - LocationConfig { - mode: LocationConfigMode::Secondary, - generation: None, - secondary_conf: Some(LocationConfigSecondary { warm: true }), - shard_number: shard.number.0, - shard_count: shard.count.0, - shard_stripe_size: shard.stripe_size.0, - tenant_conf: config.clone(), - } -} diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs deleted file mode 100644 index 1966a7ea2a..0000000000 --- a/control_plane/attachment_service/src/scheduler.rs +++ /dev/null @@ -1,89 +0,0 @@ -use pageserver_api::shard::TenantShardId; -use std::collections::{BTreeMap, HashMap}; -use utils::{http::error::ApiError, id::NodeId}; - -use crate::{node::Node, tenant_state::TenantState}; - -/// Scenarios in which we cannot find a suitable location for a tenant shard -#[derive(thiserror::Error, Debug)] -pub enum ScheduleError { - #[error("No pageservers found")] - NoPageservers, - #[error("No pageserver found matching constraint")] - ImpossibleConstraint, -} - -impl From for ApiError { - fn from(value: ScheduleError) -> Self { - ApiError::Conflict(format!("Scheduling error: {}", value)) - } -} - -pub(crate) struct Scheduler { - tenant_counts: HashMap, -} - -impl Scheduler { - pub(crate) fn new( - tenants: &BTreeMap, - nodes: &HashMap, - ) -> Self { - let mut tenant_counts = HashMap::new(); - for node_id in nodes.keys() { - tenant_counts.insert(*node_id, 0); - } - - for tenant in tenants.values() { - if let Some(ps) = tenant.intent.attached { - let entry = tenant_counts.entry(ps).or_insert(0); - *entry += 1; - } - } - - for (node_id, node) in nodes { - if !node.may_schedule() { - tenant_counts.remove(node_id); - } - } - - Self { tenant_counts } - } - - pub(crate) fn schedule_shard( - &mut self, - hard_exclude: &[NodeId], - ) -> Result { - if self.tenant_counts.is_empty() { - return Err(ScheduleError::NoPageservers); - } - - let mut tenant_counts: Vec<(NodeId, usize)> = self - .tenant_counts - .iter() - .filter_map(|(k, v)| { - if hard_exclude.contains(k) { - None - } else { - Some((*k, *v)) - } - }) - .collect(); - - // Sort by tenant count. Nodes with the same tenant count are sorted by ID. 
-        tenant_counts.sort_by_key(|i| (i.1, i.0));
-
-        if tenant_counts.is_empty() {
-            // After applying constraints, no pageservers were left
-            return Err(ScheduleError::ImpossibleConstraint);
-        }
-
-        for (node_id, count) in &tenant_counts {
-            tracing::info!("tenant_counts[{node_id}]={count}");
-        }
-
-        let node_id = tenant_counts.first().unwrap().0;
-        tracing::info!("scheduler selected node {node_id}");
-        *self.tenant_counts.get_mut(&node_id).unwrap() += 1;
-        Ok(node_id)
-    }
-}
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
deleted file mode 100644
index c9ed07ae5f..0000000000
--- a/control_plane/attachment_service/src/service.rs
+++ /dev/null
@@ -1,1149 +0,0 @@
-use std::{
-    collections::{BTreeMap, HashMap},
-    str::FromStr,
-    sync::Arc,
-    time::{Duration, Instant},
-};
-
-use control_plane::attachment_service::{
-    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, NodeAvailability,
-    NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, TenantCreateResponse,
-    TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard,
-    TenantShardMigrateRequest, TenantShardMigrateResponse,
-};
-use hyper::StatusCode;
-use pageserver_api::{
-    control_api::{
-        ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
-        ValidateResponse, ValidateResponseTenant,
-    },
-    models,
-    models::{
-        LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest,
-        TimelineCreateRequest, TimelineInfo,
-    },
-    shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
-};
-use pageserver_client::mgmt_api;
-use utils::{
-    generation::Generation,
-    http::error::ApiError,
-    id::{NodeId, TenantId},
-    seqwait::SeqWait,
-};
-
-use crate::{
-    compute_hook::ComputeHook,
-    node::Node,
-    persistence::{Persistence, TenantShardPersistence},
-    scheduler::Scheduler,
-    tenant_state::{
-        IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
-        ReconcilerWaiter, TenantState,
-    },
-    PlacementPolicy, Sequence,
-};
-
-const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
-
-// Top level state available to all HTTP handlers
-struct ServiceState {
-    tenants: BTreeMap<TenantShardId, TenantState>,
-
-    nodes: Arc<HashMap<NodeId, Node>>,
-
-    compute_hook: Arc<ComputeHook>,
-
-    result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
-}
-
-impl ServiceState {
-    fn new(
-        result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
-        nodes: HashMap<NodeId, Node>,
-        tenants: BTreeMap<TenantShardId, TenantState>,
-    ) -> Self {
-        Self {
-            tenants,
-            nodes: Arc::new(nodes),
-            compute_hook: Arc::new(ComputeHook::new()),
-            result_tx,
-        }
-    }
-}
-
-#[derive(Clone)]
-pub struct Config {
-    // All pageservers managed by one instance of this service must have
-    // the same public key.
-    pub jwt_token: Option<String>,
-}
-
-pub struct Service {
-    inner: Arc<std::sync::RwLock<ServiceState>>,
-    config: Config,
-    persistence: Arc<Persistence>,
-}
-
-impl From<ReconcileWaitError> for ApiError {
-    fn from(value: ReconcileWaitError) -> Self {
-        match value {
-            ReconcileWaitError::Shutdown => ApiError::ShuttingDown,
-            e @ ReconcileWaitError::Timeout(_) => ApiError::Timeout(format!("{e}").into()),
-            e @ ReconcileWaitError::Failed(..) => ApiError::InternalServerError(anyhow::anyhow!(e)),
-        }
-    }
-}
-
-impl Service {
-    pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
-        let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
-
-        tracing::info!("Loading nodes from database...");
-        let mut nodes = persistence.list_nodes().await?;
-        tracing::info!("Loaded {} nodes from database.", nodes.len());
-
-        tracing::info!("Loading shards from database...");
-        let tenant_shard_persistence = persistence.list_tenant_shards().await?;
-        tracing::info!(
-            "Loaded {} shards from database.",
-            tenant_shard_persistence.len()
-        );
-
-        let mut tenants = BTreeMap::new();
-
-        for tsp in tenant_shard_persistence {
-            let tenant_shard_id = TenantShardId {
-                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
-                shard_number: ShardNumber(tsp.shard_number as u8),
-                shard_count: ShardCount(tsp.shard_count as u8),
-            };
-            let shard_identity = if tsp.shard_count == 0 {
-                ShardIdentity::unsharded()
-            } else {
-                ShardIdentity::new(
-                    ShardNumber(tsp.shard_number as u8),
-                    ShardCount(tsp.shard_count as u8),
-                    ShardStripeSize(tsp.shard_stripe_size as u32),
-                )?
-            };
-            let new_tenant = TenantState {
-                tenant_shard_id,
-                shard: shard_identity,
-                sequence: Sequence::initial(),
-                // Note that we load generation, but don't care about generation_pageserver. We will either end up finding
-                // our existing attached location and it will match generation_pageserver, or we will attach somewhere new
-                // and update generation_pageserver in the process.
-                generation: Generation::new(tsp.generation),
-                policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
-                intent: IntentState::new(),
-                observed: ObservedState::new(),
-                config: serde_json::from_str(&tsp.config).unwrap(),
-                reconciler: None,
-                waiter: Arc::new(SeqWait::new(Sequence::initial())),
-                error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
-                last_error: Arc::default(),
-            };
-
-            tenants.insert(tenant_shard_id, new_tenant);
-        }
-
-        // For all tenant shards, a vector of observed states on nodes (where None means
-        // indeterminate, same as in [`ObservedStateLocation`])
-        let mut observed = HashMap::new();
-
-        // TODO: issue these requests concurrently
-        for node in &mut nodes {
-            let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref());
-
-            tracing::info!("Scanning shards on node {}...", node.id);
-            match client.list_location_config().await {
-                Err(e) => {
-                    tracing::warn!("Could not contact pageserver {} ({e})", node.id);
-                    // TODO: be more tolerant, apply a generous 5-10 second timeout
-                    // TODO: setting a node to Offline is a dramatic thing to do, and can
-                    // prevent neon_local from starting up (it starts this service before
-                    // any pageservers are running). It may make sense to give nodes
-                    // a Pending state to accommodate this situation, and allow (but deprioritize)
-                    // scheduling on Pending nodes.
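// The first TODO above could be addressed by bounding each scan with tokio's
// timeout wrapper. A sketch under that assumption (generic over the request
// future; a hypothetical helper, not part of the deleted code):
use std::time::Duration;

async fn scan_with_timeout<T>(
    fut: impl std::future::Future<Output = anyhow::Result<T>>,
) -> anyhow::Result<T> {
    // Treat a slow pageserver the same as an unreachable one.
    tokio::time::timeout(Duration::from_secs(10), fut)
        .await
        .map_err(|_| anyhow::anyhow!("pageserver did not respond within 10s"))?
}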
-                    //node.availability = NodeAvailability::Offline;
-                }
-                Ok(listing) => {
-                    tracing::info!(
-                        "Received {} shard statuses from pageserver {}, setting it to Active",
-                        listing.tenant_shards.len(),
-                        node.id
-                    );
-                    node.availability = NodeAvailability::Active;
-
-                    for (tenant_shard_id, conf_opt) in listing.tenant_shards {
-                        observed.insert(tenant_shard_id, (node.id, conf_opt));
-                    }
-                }
-            }
-        }
-
-        let mut cleanup = Vec::new();
-
-        // Populate intent and observed states for all tenants, based on reported state on pageservers
-        for (tenant_shard_id, (node_id, observed_loc)) in observed {
-            let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else {
-                cleanup.push((tenant_shard_id, node_id));
-                continue;
-            };
-
-            tenant_state
-                .observed
-                .locations
-                .insert(node_id, ObservedStateLocation { conf: observed_loc });
-        }
-
-        // State of nodes is now frozen, transform to a HashMap.
-        let mut nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
-
-        // Populate each tenant's intent state
-        let mut scheduler = Scheduler::new(&tenants, &nodes);
-        for (tenant_shard_id, tenant_state) in tenants.iter_mut() {
-            tenant_state.intent_from_observed();
-            if let Err(e) = tenant_state.schedule(&mut scheduler) {
-                // Non-fatal error: we are unable to properly schedule the tenant, perhaps because
-                // not enough pageservers are available. The tenant may well still be available
-                // to clients.
-                tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
-            }
-        }
-
-        // Clean up any tenants that were found on pageservers but are not known to us.
-        for (tenant_shard_id, node_id) in cleanup {
-            // A node reported a tenant_shard_id which is unknown to us: detach it.
-            let node = nodes
-                .get_mut(&node_id)
-                .expect("Always exists: only known nodes are scanned");
-
-            let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref());
-            match client
-                .location_config(
-                    tenant_shard_id,
-                    LocationConfig {
-                        mode: LocationConfigMode::Detached,
-                        generation: None,
-                        secondary_conf: None,
-                        shard_number: tenant_shard_id.shard_number.0,
-                        shard_count: tenant_shard_id.shard_count.0,
-                        shard_stripe_size: 0,
-                        tenant_conf: models::TenantConfig::default(),
-                    },
-                    None,
-                )
-                .await
-            {
-                Ok(()) => {
-                    tracing::info!(
-                        "Detached unknown shard {tenant_shard_id} on pageserver {node_id}"
-                    );
-                }
-                Err(e) => {
-                    // Non-fatal error: leaving a tenant shard behind that we are not managing shouldn't
-                    // break anything.
-                    tracing::error!(
-                        "Failed to detach unknown shard {tenant_shard_id} on pageserver {node_id}: {e}"
-                    );
-                }
-            }
-        }
-
-        let shard_count = tenants.len();
-        let this = Arc::new(Self {
-            inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
-                result_tx, nodes, tenants,
-            ))),
-            config,
-            persistence,
-        });
-
-        let result_task_this = this.clone();
-        tokio::task::spawn(async move {
-            while let Some(result) = result_rx.recv().await {
-                tracing::info!(
-                    "Reconcile result for sequence {}, ok={}",
-                    result.sequence,
-                    result.result.is_ok()
-                );
-                let mut locked = result_task_this.inner.write().unwrap();
-                let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else {
-                    // A reconciliation result might race with removing a tenant: drop results for
-                    // tenants that aren't in our map.
-                    continue;
-                };
-
-                // Usually generation should only be updated via this path, so the max() isn't
-                // needed, but it is used to handle out-of-band updates via e.g. the test hook.
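// Taking the max, as the assignment below does, makes the update monotonic: a
// result carrying a stale generation can never move the tenant backwards. The
// invariant in isolation (u32 standing in for Generation; hypothetical helper):
fn apply_generation(current: &mut u32, from_result: u32) {
    // Out-of-band increments (e.g. the test hook) may have raced ahead of this
    // reconciler; never regress.
    *current = (*current).max(from_result);
}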
- tenant.generation = std::cmp::max(tenant.generation, result.generation); - - match result.result { - Ok(()) => { - for (node_id, loc) in &result.observed.locations { - if let Some(conf) = &loc.conf { - tracing::info!( - "Updating observed location {}: {:?}", - node_id, - conf - ); - } else { - tracing::info!("Setting observed location {} to None", node_id,) - } - } - tenant.observed = result.observed; - tenant.waiter.advance(result.sequence); - } - Err(e) => { - tracing::warn!( - "Reconcile error on tenant {}: {}", - tenant.tenant_shard_id, - e - ); - - // Ordering: populate last_error before advancing error_seq, - // so that waiters will see the correct error after waiting. - *(tenant.last_error.lock().unwrap()) = format!("{e}"); - tenant.error_waiter.advance(result.sequence); - - for (node_id, o) in result.observed.locations { - tenant.observed.locations.insert(node_id, o); - } - } - } - } - }); - - // Finally, now that the service is up and running, launch reconcile operations for any tenants - // which require it: under normal circumstances this should only include tenants that were in some - // transient state before we restarted. - let reconcile_tasks = this.reconcile_all(); - tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); - - Ok(this) - } - - pub(crate) async fn attach_hook( - &self, - attach_req: AttachHookRequest, - ) -> anyhow::Result { - // This is a test hook. To enable using it on tenants that were created directly with - // the pageserver API (not via this service), we will auto-create any missing tenant - // shards with default state. - let insert = { - let locked = self.inner.write().unwrap(); - !locked.tenants.contains_key(&attach_req.tenant_shard_id) - }; - - if insert { - let tsp = TenantShardPersistence { - tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(), - shard_number: attach_req.tenant_shard_id.shard_number.0 as i32, - shard_count: attach_req.tenant_shard_id.shard_count.0 as i32, - shard_stripe_size: 0, - generation: 0, - generation_pageserver: None, - placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(), - config: serde_json::to_string(&TenantConfig::default()).unwrap(), - }; - - self.persistence.insert_tenant_shards(vec![tsp]).await?; - - let mut locked = self.inner.write().unwrap(); - locked.tenants.insert( - attach_req.tenant_shard_id, - TenantState::new( - attach_req.tenant_shard_id, - ShardIdentity::unsharded(), - PlacementPolicy::Single, - ), - ); - } - - let new_generation = if let Some(req_node_id) = attach_req.node_id { - Some( - self.persistence - .increment_generation(attach_req.tenant_shard_id, req_node_id) - .await?, - ) - } else { - self.persistence.detach(attach_req.tenant_shard_id).await?; - None - }; - - let mut locked = self.inner.write().unwrap(); - let tenant_state = locked - .tenants - .get_mut(&attach_req.tenant_shard_id) - .expect("Checked for existence above"); - - if let Some(new_generation) = new_generation { - tenant_state.generation = new_generation; - } else { - // This is a detach notification. We must update placement policy to avoid re-attaching - // during background scheduling/reconciliation, or during attachment service restart. 
- assert!(attach_req.node_id.is_none()); - tenant_state.policy = PlacementPolicy::Detached; - } - - if let Some(attaching_pageserver) = attach_req.node_id.as_ref() { - tracing::info!( - tenant_id = %attach_req.tenant_shard_id, - ps_id = %attaching_pageserver, - generation = ?tenant_state.generation, - "issuing", - ); - } else if let Some(ps_id) = tenant_state.intent.attached { - tracing::info!( - tenant_id = %attach_req.tenant_shard_id, - %ps_id, - generation = ?tenant_state.generation, - "dropping", - ); - } else { - tracing::info!( - tenant_id = %attach_req.tenant_shard_id, - "no-op: tenant already has no pageserver"); - } - tenant_state.intent.attached = attach_req.node_id; - - tracing::info!( - "attach_hook: tenant {} set generation {:?}, pageserver {}", - attach_req.tenant_shard_id, - tenant_state.generation, - // TODO: this is an odd number of 0xf's - attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)) - ); - - Ok(AttachHookResponse { - gen: attach_req - .node_id - .map(|_| tenant_state.generation.into().unwrap()), - }) - } - - pub(crate) fn inspect(&self, inspect_req: InspectRequest) -> InspectResponse { - let locked = self.inner.read().unwrap(); - - let tenant_state = locked.tenants.get(&inspect_req.tenant_shard_id); - - InspectResponse { - attachment: tenant_state.and_then(|s| { - s.intent - .attached - .map(|ps| (s.generation.into().unwrap(), ps)) - }), - } - } - - pub(crate) async fn re_attach( - &self, - reattach_req: ReAttachRequest, - ) -> anyhow::Result { - // Ordering: we must persist generation number updates before making them visible in the in-memory state - let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?; - - // Apply the updated generation to our in-memory state - let mut locked = self.inner.write().unwrap(); - - let mut response = ReAttachResponse { - tenants: Vec::new(), - }; - - for (tenant_shard_id, new_gen) in incremented_generations { - response.tenants.push(ReAttachResponseTenant { - id: tenant_shard_id, - gen: new_gen.into().unwrap(), - }); - - // Apply the new generation number to our in-memory state - let shard_state = locked.tenants.get_mut(&tenant_shard_id); - let Some(shard_state) = shard_state else { - // Not fatal. This edge case requires a re-attach to happen - // between inserting a new tenant shard in to the database, and updating our in-memory - // state to know about the shard, _and_ that the state inserted to the database referenced - // a pageserver. Should never happen, but handle it rather than panicking, since it should - // be harmless. - tracing::error!( - "Shard {} is in database for node {} but not in-memory state", - tenant_shard_id, - reattach_req.node_id - ); - continue; - }; - - shard_state.generation = std::cmp::max(shard_state.generation, new_gen); - - // TODO: cancel/restart any running reconciliation for this tenant, it might be trying - // to call location_conf API with an old generation. Wait for cancellation to complete - // before responding to this request. Requires well implemented CancellationToken logic - // all the way to where we call location_conf. Even then, there can still be a location_conf - // request in flight over the network: TODO handle that by making location_conf API refuse - // to go backward in generations. 
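// The guard sketched by the TODO above might look like this: reject any
// location_conf request whose generation is older than the latest one issued
// (u32 standing in for Generation; a hypothetical helper, not the pageserver API):
fn check_not_stale(latest: u32, requested: u32) -> anyhow::Result<()> {
    if requested < latest {
        anyhow::bail!("stale generation {requested}, latest is {latest}");
    }
    Ok(())
}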
- } - Ok(response) - } - - pub(crate) fn validate(&self, validate_req: ValidateRequest) -> ValidateResponse { - let locked = self.inner.read().unwrap(); - - let mut response = ValidateResponse { - tenants: Vec::new(), - }; - - for req_tenant in validate_req.tenants { - if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) { - let valid = tenant_state.generation == Generation::new(req_tenant.gen); - tracing::info!( - "handle_validate: {}(gen {}): valid={valid} (latest {:?})", - req_tenant.id, - req_tenant.gen, - tenant_state.generation - ); - response.tenants.push(ValidateResponseTenant { - id: req_tenant.id, - valid, - }); - } - } - response - } - - pub(crate) async fn tenant_create( - &self, - create_req: TenantCreateRequest, - ) -> Result { - // Shard count 0 is valid: it means create a single shard (ShardCount(0) means "unsharded") - let literal_shard_count = if create_req.shard_parameters.is_unsharded() { - 1 - } else { - create_req.shard_parameters.count.0 - }; - - // This service expects to handle sharding itself: it is an error to try and directly create - // a particular shard here. - let tenant_id = if create_req.new_tenant_id.shard_count > ShardCount(1) { - return Err(ApiError::BadRequest(anyhow::anyhow!( - "Attempted to create a specific shard, this API is for creating the whole tenant" - ))); - } else { - create_req.new_tenant_id.tenant_id - }; - - tracing::info!( - "Creating tenant {}, shard_count={:?}", - create_req.new_tenant_id, - create_req.shard_parameters.count, - ); - - let create_ids = (0..literal_shard_count) - .map(|i| TenantShardId { - tenant_id, - shard_number: ShardNumber(i), - shard_count: create_req.shard_parameters.count, - }) - .collect::>(); - - // TODO: enable specifying this. Using Single as a default helps legacy tests to work (they - // have no expectation of HA). - let placement_policy: PlacementPolicy = PlacementPolicy::Single; - - // Ordering: we persist tenant shards before creating them on the pageserver. This enables a caller - // to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart - // during the creation, rather than risking leaving orphan objects in S3. 
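// For context on `create_ids` above: shard ids are just (number, count) pairs
// under one tenant id. A sketch with hypothetical simplified types (u128 for
// the tenant id, tuples standing in for TenantShardId):
fn shard_ids(tenant: u128, count: u8) -> Vec<(u128, u8, u8)> {
    // ShardCount(0) means "unsharded" but still yields one literal shard,
    // mirroring `literal_shard_count` above.
    let literal = if count == 0 { 1 } else { count };
    (0..literal).map(|number| (tenant, number, count)).collect()
}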
-        let persist_tenant_shards = create_ids
-            .iter()
-            .map(|tenant_shard_id| TenantShardPersistence {
-                tenant_id: tenant_shard_id.tenant_id.to_string(),
-                shard_number: tenant_shard_id.shard_number.0 as i32,
-                shard_count: tenant_shard_id.shard_count.0 as i32,
-                shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
-                generation: 0,
-                generation_pageserver: None,
-                placement_policy: serde_json::to_string(&placement_policy).unwrap(),
-                config: serde_json::to_string(&create_req.config).unwrap(),
-            })
-            .collect();
-        self.persistence
-            .insert_tenant_shards(persist_tenant_shards)
-            .await
-            .map_err(|e| {
-                // TODO: distinguish primary key constraint (idempotent, OK), from other errors
-                ApiError::InternalServerError(anyhow::anyhow!(e))
-            })?;
-
-        let (waiters, response_shards) = {
-            let mut locked = self.inner.write().unwrap();
-
-            let mut response_shards = Vec::new();
-
-            let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes);
-
-            for tenant_shard_id in create_ids {
-                tracing::info!("Creating shard {tenant_shard_id}...");
-
-                use std::collections::btree_map::Entry;
-                match locked.tenants.entry(tenant_shard_id) {
-                    Entry::Occupied(mut entry) => {
-                        tracing::info!(
-                            "Tenant shard {tenant_shard_id} already exists while creating"
-                        );
-
-                        // TODO: schedule() should take an anti-affinity expression that pushes
-                        // attached and secondary locations (independently) away from those
-                        // pageservers also holding a shard for this tenant.
-
-                        entry.get_mut().schedule(&mut scheduler).map_err(|e| {
-                            ApiError::Conflict(format!(
-                                "Failed to schedule shard {tenant_shard_id}: {e}"
-                            ))
-                        })?;
-
-                        response_shards.push(TenantCreateResponseShard {
-                            node_id: entry
-                                .get()
-                                .intent
-                                .attached
-                                .expect("We just set pageserver if it was None"),
-                            generation: entry.get().generation.into().unwrap(),
-                        });
-
-                        continue;
-                    }
-                    Entry::Vacant(entry) => {
-                        let mut state = TenantState::new(
-                            tenant_shard_id,
-                            ShardIdentity::from_params(
-                                tenant_shard_id.shard_number,
-                                &create_req.shard_parameters,
-                            ),
-                            placement_policy.clone(),
-                        );
-
-                        if let Some(create_gen) = create_req.generation {
-                            state.generation = Generation::new(create_gen);
-                        }
-                        state.config = create_req.config.clone();
-
-                        state.schedule(&mut scheduler).map_err(|e| {
-                            ApiError::Conflict(format!(
-                                "Failed to schedule shard {tenant_shard_id}: {e}"
-                            ))
-                        })?;
-
-                        response_shards.push(TenantCreateResponseShard {
-                            node_id: state
-                                .intent
-                                .attached
-                                .expect("We just set pageserver if it was None"),
-                            generation: state.generation.into().unwrap(),
-                        });
-                        entry.insert(state)
-                    }
-                };
-            }
-
-            // Take a snapshot of pageservers
-            let pageservers = locked.nodes.clone();
-
-            let result_tx = locked.result_tx.clone();
-            let compute_hook = locked.compute_hook.clone();
-
-            let waiters = locked
-                .tenants
-                .range_mut(TenantShardId::tenant_range(tenant_id))
-                .filter_map(|(_shard_id, shard)| {
-                    shard.maybe_reconcile(
-                        result_tx.clone(),
-                        &pageservers,
-                        &compute_hook,
-                        &self.config,
-                        &self.persistence,
-                    )
-                })
-                .collect::<Vec<_>>();
-            (waiters, response_shards)
-        };
-
-        let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
-        for waiter in waiters {
-            let timeout = deadline.duration_since(Instant::now());
-            waiter.wait_timeout(timeout).await?;
-        }
-        Ok(TenantCreateResponse {
-            shards: response_shards,
-        })
-    }
-
-    pub(crate) async fn tenant_timeline_create(
-        &self,
-        tenant_id: TenantId,
-        mut create_req: TimelineCreateRequest,
-    ) -> Result<TimelineInfo, ApiError> {
-        let mut timeline_info = None;
-
-        let ensure_waiters = {
-
let locked = self.inner.write().unwrap(); - tracing::info!( - "Creating timeline {}/{}, have {} pageservers", - tenant_id, - create_req.new_timeline_id, - locked.nodes.len() - ); - - self.ensure_attached(locked, tenant_id) - .map_err(ApiError::InternalServerError)? - }; - - let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap(); - for waiter in ensure_waiters { - let timeout = deadline.duration_since(Instant::now()); - waiter.wait_timeout(timeout).await?; - } - - let targets = { - let locked = self.inner.read().unwrap(); - let mut targets = Vec::new(); - - for (tenant_shard_id, shard) in - locked.tenants.range(TenantShardId::tenant_range(tenant_id)) - { - let node_id = shard.intent.attached.ok_or_else(|| { - ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) - })?; - let node = locked - .nodes - .get(&node_id) - .expect("Pageservers may not be deleted while referenced"); - - targets.push((*tenant_shard_id, node.clone())); - } - targets - }; - - if targets.is_empty() { - return Err(ApiError::NotFound( - anyhow::anyhow!("Tenant not found").into(), - )); - } - - for (tenant_shard_id, node) in targets { - // TODO: issue shard timeline creates in parallel, once the 0th is done. - - let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); - - tracing::info!( - "Creating timeline on shard {}/{}, attached to node {}", - tenant_shard_id, - create_req.new_timeline_id, - node.id - ); - - let shard_timeline_info = client - .timeline_create(tenant_shard_id, &create_req) - .await - .map_err(|e| match e { - mgmt_api::Error::ApiError(status, msg) - if status == StatusCode::INTERNAL_SERVER_ERROR - || status == StatusCode::NOT_ACCEPTABLE => - { - // TODO: handle more error codes, e.g. 503 should be passed through. Make a general wrapper - // for pass-through API calls. - ApiError::InternalServerError(anyhow::anyhow!(msg)) - } - _ => ApiError::Conflict(format!("Failed to create timeline: {e}")), - })?; - - if timeline_info.is_none() { - // If the caller specified an ancestor but no ancestor LSN, we are responsible for - // propagating the LSN chosen by the first shard to the other shards: it is important - // that all shards end up with the same ancestor_start_lsn. 
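// The LSN pinning described above, in isolation: the first shard's response
// fixes the value that the remaining shards must reuse. A sketch with
// hypothetical simplified types (u64 standing in for Lsn); the real code also
// requires that an ancestor timeline was actually specified:
fn pin_ancestor_lsn(req_lsn: &mut Option<u64>, first_shard_lsn: Option<u64>) {
    // Only fill the LSN in when the caller left it unspecified.
    if req_lsn.is_none() {
        *req_lsn = first_shard_lsn;
    }
}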
- if create_req.ancestor_timeline_id.is_some() - && create_req.ancestor_start_lsn.is_none() - { - create_req.ancestor_start_lsn = shard_timeline_info.ancestor_lsn; - } - - // We will return the TimelineInfo from the first shard - timeline_info = Some(shard_timeline_info); - } - } - Ok(timeline_info.expect("targets cannot be empty")) - } - - pub(crate) fn tenant_locate( - &self, - tenant_id: TenantId, - ) -> Result { - let locked = self.inner.read().unwrap(); - tracing::info!("Locating shards for tenant {tenant_id}"); - - // Take a snapshot of pageservers - let pageservers = locked.nodes.clone(); - - let mut result = Vec::new(); - let mut shard_params: Option = None; - - for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) - { - let node_id = shard - .intent - .attached - .ok_or(ApiError::BadRequest(anyhow::anyhow!( - "Cannot locate a tenant that is not attached" - )))?; - - let node = pageservers - .get(&node_id) - .expect("Pageservers may not be deleted while referenced"); - - result.push(TenantLocateResponseShard { - shard_id: *tenant_shard_id, - node_id, - listen_http_addr: node.listen_http_addr.clone(), - listen_http_port: node.listen_http_port, - listen_pg_addr: node.listen_pg_addr.clone(), - listen_pg_port: node.listen_pg_port, - }); - - match &shard_params { - None => { - shard_params = Some(ShardParameters { - stripe_size: shard.shard.stripe_size, - count: shard.shard.count, - }); - } - Some(params) => { - if params.stripe_size != shard.shard.stripe_size { - // This should never happen. We enforce at runtime because it's simpler than - // adding an extra per-tenant data structure to store the things that should be the same - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Inconsistent shard stripe size parameters!" - ))); - } - } - } - } - - if result.is_empty() { - return Err(ApiError::NotFound( - anyhow::anyhow!("No shards for this tenant ID found").into(), - )); - } - let shard_params = shard_params.expect("result is non-empty, therefore this is set"); - tracing::info!( - "Located tenant {} with params {:?} on shards {}", - tenant_id, - shard_params, - result - .iter() - .map(|s| format!("{:?}", s)) - .collect::>() - .join(",") - ); - - Ok(TenantLocateResponse { - shards: result, - shard_params, - }) - } - - pub(crate) async fn tenant_shard_migrate( - &self, - tenant_shard_id: TenantShardId, - migrate_req: TenantShardMigrateRequest, - ) -> Result { - let waiter = { - let mut locked = self.inner.write().unwrap(); - - let result_tx = locked.result_tx.clone(); - let pageservers = locked.nodes.clone(); - let compute_hook = locked.compute_hook.clone(); - - let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else { - return Err(ApiError::NotFound( - anyhow::anyhow!("Tenant shard not found").into(), - )); - }; - - if shard.intent.attached == Some(migrate_req.node_id) { - // No-op case: we will still proceed to wait for reconciliation in case it is - // incomplete from an earlier update to the intent. - tracing::info!("Migrating: intent is unchanged {:?}", shard.intent); - } else { - let old_attached = shard.intent.attached; - - match shard.policy { - PlacementPolicy::Single => { - shard.intent.secondary.clear(); - } - PlacementPolicy::Double(_n) => { - // If our new attached node was a secondary, it no longer should be. 
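// The intent rewrite for a Double(n) migration, in isolation: the target stops
// being a secondary and the previously attached node is demoted to one. A
// sketch with hypothetical simplified types (u64 node ids):
fn migrate_intent(attached: &mut Option<u64>, secondary: &mut Vec<u64>, target: u64) {
    // The new attached location must not also be listed as a secondary.
    secondary.retain(|s| *s != target);
    // Demote the old attached location rather than dropping it.
    if let Some(old) = attached.replace(target) {
        secondary.push(old);
    }
}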
- shard.intent.secondary.retain(|s| s != &migrate_req.node_id); - - // If we were already attached to something, demote that to a secondary - if let Some(old_attached) = old_attached { - shard.intent.secondary.push(old_attached); - } - } - PlacementPolicy::Detached => { - return Err(ApiError::BadRequest(anyhow::anyhow!( - "Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first" - ))) - } - } - shard.intent.attached = Some(migrate_req.node_id); - - tracing::info!("Migrating: new intent {:?}", shard.intent); - shard.sequence = shard.sequence.next(); - } - - shard.maybe_reconcile( - result_tx, - &pageservers, - &compute_hook, - &self.config, - &self.persistence, - ) - }; - - if let Some(waiter) = waiter { - waiter.wait_timeout(RECONCILE_TIMEOUT).await?; - } else { - tracing::warn!("Migration is a no-op"); - } - - Ok(TenantShardMigrateResponse {}) - } - - pub(crate) async fn node_register( - &self, - register_req: NodeRegisterRequest, - ) -> Result<(), ApiError> { - // Pre-check for an already-existing node - { - let locked = self.inner.read().unwrap(); - if let Some(node) = locked.nodes.get(®ister_req.node_id) { - // Note that we do not do a total equality of the struct, because we don't require - // the availability/scheduling states to agree for a POST to be idempotent. - if node.listen_http_addr == register_req.listen_http_addr - && node.listen_http_port == register_req.listen_http_port - && node.listen_pg_addr == register_req.listen_pg_addr - && node.listen_pg_port == register_req.listen_pg_port - { - tracing::info!( - "Node {} re-registered with matching address", - register_req.node_id - ); - return Ok(()); - } else { - // TODO: decide if we want to allow modifying node addresses without removing and re-adding - // the node. Safest/simplest thing is to refuse it, and usually we deploy with - // a fixed address through the lifetime of a node. - tracing::warn!( - "Node {} tried to register with different address", - register_req.node_id - ); - return Err(ApiError::Conflict( - "Node is already registered with different address".to_string(), - )); - } - } - } - - // Ordering: we must persist the new node _before_ adding it to in-memory state. - // This ensures that before we use it for anything or expose it via any external - // API, it is guaranteed to be available after a restart. - let new_node = Node { - id: register_req.node_id, - listen_http_addr: register_req.listen_http_addr, - listen_http_port: register_req.listen_http_port, - listen_pg_addr: register_req.listen_pg_addr, - listen_pg_port: register_req.listen_pg_port, - scheduling: NodeSchedulingPolicy::Filling, - // TODO: we shouldn't really call this Active until we've heartbeated it. 
- availability: NodeAvailability::Active, - }; - // TODO: idempotency if the node already exists in the database - self.persistence - .insert_node(&new_node) - .await - .map_err(ApiError::InternalServerError)?; - - let mut locked = self.inner.write().unwrap(); - let mut new_nodes = (*locked.nodes).clone(); - - new_nodes.insert(register_req.node_id, new_node); - - locked.nodes = Arc::new(new_nodes); - - tracing::info!( - "Registered pageserver {}, now have {} pageservers", - register_req.node_id, - locked.nodes.len() - ); - Ok(()) - } - - pub(crate) fn node_configure(&self, config_req: NodeConfigureRequest) -> Result<(), ApiError> { - let mut locked = self.inner.write().unwrap(); - let result_tx = locked.result_tx.clone(); - let compute_hook = locked.compute_hook.clone(); - - let mut new_nodes = (*locked.nodes).clone(); - - let Some(node) = new_nodes.get_mut(&config_req.node_id) else { - return Err(ApiError::NotFound( - anyhow::anyhow!("Node not registered").into(), - )); - }; - - let mut offline_transition = false; - let mut active_transition = false; - - if let Some(availability) = &config_req.availability { - match (availability, &node.availability) { - (NodeAvailability::Offline, NodeAvailability::Active) => { - tracing::info!("Node {} transition to offline", config_req.node_id); - offline_transition = true; - } - (NodeAvailability::Active, NodeAvailability::Offline) => { - tracing::info!("Node {} transition to active", config_req.node_id); - active_transition = true; - } - _ => { - tracing::info!("Node {} no change during config", config_req.node_id); - // No change - } - }; - node.availability = *availability; - } - - if let Some(scheduling) = config_req.scheduling { - node.scheduling = scheduling; - - // TODO: once we have a background scheduling ticker for fill/drain, kick it - // to wake up and start working. - } - - let new_nodes = Arc::new(new_nodes); - - let mut scheduler = Scheduler::new(&locked.tenants, &new_nodes); - if offline_transition { - for (tenant_shard_id, tenant_state) in &mut locked.tenants { - if let Some(observed_loc) = - tenant_state.observed.locations.get_mut(&config_req.node_id) - { - // When a node goes offline, we set its observed configuration to None, indicating unknown: we will - // not assume our knowledge of the node's configuration is accurate until it comes back online - observed_loc.conf = None; - } - - if tenant_state.intent.notify_offline(config_req.node_id) { - tenant_state.sequence = tenant_state.sequence.next(); - match tenant_state.schedule(&mut scheduler) { - Err(e) => { - // It is possible that some tenants will become unschedulable when too many pageservers - // go offline: in this case there isn't much we can do other than make the issue observable. - // TODO: give TenantState a scheduling error attribute to be queried later. - tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id); - } - Ok(()) => { - tenant_state.maybe_reconcile( - result_tx.clone(), - &new_nodes, - &compute_hook, - &self.config, - &self.persistence, - ); - } - } - } - } - } - - if active_transition { - // When a node comes back online, we must reconcile any tenant that has a None observed - // location on the node. 
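// Only shards whose observed state on the returning node is indeterminate
// (present but conf == None) need another pass, which is what the loop below
// checks. The predicate in isolation, with hypothetical simplified types
// (u64 node ids, String standing in for LocationConfig):
use std::collections::HashMap;

fn needs_reconcile(observed: &HashMap<u64, Option<String>>, node: u64) -> bool {
    // Present-but-None means "we may have state there, contents unknown".
    matches!(observed.get(&node), Some(None))
}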
-            for tenant_state in locked.tenants.values_mut() {
-                if let Some(observed_loc) =
-                    tenant_state.observed.locations.get_mut(&config_req.node_id)
-                {
-                    if observed_loc.conf.is_none() {
-                        tenant_state.maybe_reconcile(
-                            result_tx.clone(),
-                            &new_nodes,
-                            &compute_hook,
-                            &self.config,
-                            &self.persistence,
-                        );
-                    }
-                }
-            }
-
-            // TODO: in the background, we should balance work back onto this pageserver
-        }
-
-        locked.nodes = new_nodes;
-
-        Ok(())
-    }
-
-    /// Helper for methods that will try and call pageserver APIs for
-    /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
-    /// is attached somewhere.
-    fn ensure_attached(
-        &self,
-        mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
-        tenant_id: TenantId,
-    ) -> Result<Vec<ReconcilerWaiter>, anyhow::Error> {
-        let mut waiters = Vec::new();
-        let result_tx = locked.result_tx.clone();
-        let compute_hook = locked.compute_hook.clone();
-        let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes);
-        let pageservers = locked.nodes.clone();
-
-        for (_tenant_shard_id, shard) in locked
-            .tenants
-            .range_mut(TenantShardId::tenant_range(tenant_id))
-        {
-            shard.schedule(&mut scheduler)?;
-
-            if let Some(waiter) = shard.maybe_reconcile(
-                result_tx.clone(),
-                &pageservers,
-                &compute_hook,
-                &self.config,
-                &self.persistence,
-            ) {
-                waiters.push(waiter);
-            }
-        }
-        Ok(waiters)
-    }
-
-    /// Check all tenants for pending reconciliation work, and reconcile those in need
-    ///
-    /// Returns how many reconciliation tasks were started
-    fn reconcile_all(&self) -> usize {
-        let mut locked = self.inner.write().unwrap();
-        let result_tx = locked.result_tx.clone();
-        let compute_hook = locked.compute_hook.clone();
-        let pageservers = locked.nodes.clone();
-        locked
-            .tenants
-            .iter_mut()
-            .filter_map(|(_tenant_shard_id, shard)| {
-                shard.maybe_reconcile(
-                    result_tx.clone(),
-                    &pageservers,
-                    &compute_hook,
-                    &self.config,
-                    &self.persistence,
-                )
-            })
-            .count()
-    }
-}
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
deleted file mode 100644
index 5290197d84..0000000000
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ /dev/null
@@ -1,467 +0,0 @@
-use std::{collections::HashMap, sync::Arc, time::Duration};
-
-use control_plane::attachment_service::NodeAvailability;
-use pageserver_api::{
-    models::{LocationConfig, LocationConfigMode, TenantConfig},
-    shard::{ShardIdentity, TenantShardId},
-};
-use tokio::task::JoinHandle;
-use tokio_util::sync::CancellationToken;
-use utils::{
-    generation::Generation,
-    id::NodeId,
-    seqwait::{SeqWait, SeqWaitError},
-};
-
-use crate::{
-    compute_hook::ComputeHook,
-    node::Node,
-    persistence::Persistence,
-    reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
-    scheduler::{ScheduleError, Scheduler},
-    service, PlacementPolicy, Sequence,
-};
-
-pub(crate) struct TenantState {
-    pub(crate) tenant_shard_id: TenantShardId,
-
-    pub(crate) shard: ShardIdentity,
-
-    // Runtime only: sequence used to coordinate updates to this object while
-    // background reconcilers may be running. A reconciler runs to a particular
-    // sequence.
-    pub(crate) sequence: Sequence,
-
-    // Latest generation number: next time we attach, increment this
-    // and use the incremented number when attaching
-    pub(crate) generation: Generation,
-
-    // High level description of how the tenant should be set up. Provided
-    // externally.
-    pub(crate) policy: PlacementPolicy,
-
-    // Low level description of exactly which pageservers should fulfil
-    // which role. Generated by `Self::schedule`.
-    pub(crate) intent: IntentState,
-
-    // Low level description of how the tenant is configured on pageservers:
-    // if this does not match `Self::intent` then the tenant needs reconciliation
-    // with `Self::reconcile`.
-    pub(crate) observed: ObservedState,
-
-    // Tenant configuration, passed through opaquely to the pageserver. Identical
-    // for all shards in a tenant.
-    pub(crate) config: TenantConfig,
-
-    /// If a reconcile task is currently in flight, it may be joined here (it is
-    /// only safe to join if either the result has been received or the reconciler's
-    /// cancellation token has been fired)
-    pub(crate) reconciler: Option<ReconcilerHandle>,
-
-    /// Optionally wait for reconciliation to complete up to a particular
-    /// sequence number.
-    pub(crate) waiter: std::sync::Arc<SeqWait<Sequence>>,
-
-    /// Indicates sequence number for which we have encountered an error reconciling. If
-    /// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred,
-    /// and callers should stop waiting for `waiter` and propagate the error.
-    pub(crate) error_waiter: std::sync::Arc<SeqWait<Sequence>>,
-
-    /// The most recent error from a reconcile on this tenant
-    /// TODO: generalize to an array of recent events
-    /// TODO: use an ArcSwap instead of a mutex for faster reads?
-    pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,
-}
-
-#[derive(Default, Clone, Debug)]
-pub(crate) struct IntentState {
-    pub(crate) attached: Option<NodeId>,
-    pub(crate) secondary: Vec<NodeId>,
-}
-
-#[derive(Default, Clone)]
-pub(crate) struct ObservedState {
-    pub(crate) locations: HashMap<NodeId, ObservedStateLocation>,
-}
-
-/// Our latest knowledge of how this tenant is configured in the outside world.
-///
-/// Meaning:
-///  * No instance of this type exists for a node: we are certain that we have nothing configured on that
-///    node for this shard.
-///  * Instance exists with conf==None: we *might* have some state on that node, but we don't know
-///    what it is (e.g. we failed partway through configuring it)
-///  * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
-///    and that configuration will still be present unless something external interfered.
-#[derive(Clone)]
-pub(crate) struct ObservedStateLocation {
-    /// If None, it means we do not know the status of this shard's location on this node, but
-    /// we know that we might have some state on this node.
-    pub(crate) conf: Option<LocationConfig>,
-}
-pub(crate) struct ReconcilerWaiter {
-    // For observability purposes, remember the ID of the shard we're
-    // waiting for.
-    pub(crate) tenant_shard_id: TenantShardId,
-
-    seq_wait: std::sync::Arc<SeqWait<Sequence>>,
-    error_seq_wait: std::sync::Arc<SeqWait<Sequence>>,
-    error: std::sync::Arc<std::sync::Mutex<String>>,
-    seq: Sequence,
-}
-
-#[derive(thiserror::Error, Debug)]
-pub enum ReconcileWaitError {
-    #[error("Timeout waiting for shard {0}")]
-    Timeout(TenantShardId),
-    #[error("shutting down")]
-    Shutdown,
-    #[error("Reconcile error on shard {0}: {1}")]
-    Failed(TenantShardId, String),
-}
-
-impl ReconcilerWaiter {
-    pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
-        tokio::select! {
-            result = self.seq_wait.wait_for_timeout(self.seq, timeout) => {
-                result.map_err(|e| match e {
-                    SeqWaitError::Timeout => ReconcileWaitError::Timeout(self.tenant_shard_id),
-                    SeqWaitError::Shutdown => ReconcileWaitError::Shutdown
-                })?;
-            },
-            result = self.error_seq_wait.wait_for(self.seq) => {
-                result.map_err(|e| match e {
-                    SeqWaitError::Shutdown => ReconcileWaitError::Shutdown,
-                    SeqWaitError::Timeout => unreachable!()
-                })?;
-
-                return Err(ReconcileWaitError::Failed(self.tenant_shard_id, self.error.lock().unwrap().clone()))
-            }
-        }
-
-        Ok(())
-    }
-}
-
-/// Having spawned a reconciler task, the tenant shard's state will carry enough
-/// information to optionally cancel & await it later.
-pub(crate) struct ReconcilerHandle {
-    sequence: Sequence,
-    handle: JoinHandle<()>,
-    cancel: CancellationToken,
-}
-
-/// When a reconcile task completes, it sends this result object
-/// to be applied to the primary TenantState.
-pub(crate) struct ReconcileResult {
-    pub(crate) sequence: Sequence,
-    /// On errors, `observed` should be treated as an incomplete description
-    /// of state (i.e. any nodes present in the result should override nodes
-    /// present in the parent tenant state, but any unmentioned nodes should
-    /// not be removed from parent tenant state)
-    pub(crate) result: Result<(), ReconcileError>,
-
-    pub(crate) tenant_shard_id: TenantShardId,
-    pub(crate) generation: Generation,
-    pub(crate) observed: ObservedState,
-}
-
-impl IntentState {
-    pub(crate) fn new() -> Self {
-        Self {
-            attached: None,
-            secondary: vec![],
-        }
-    }
-    pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
-        let mut result = Vec::new();
-        if let Some(p) = self.attached {
-            result.push(p)
-        }
-
-        result.extend(self.secondary.iter().copied());
-
-        result
-    }
-
-    /// When a node goes offline, we update intents to avoid using it
-    /// as their attached pageserver.
-    ///
-    /// Returns true if a change was made
-    pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
-        if self.attached == Some(node_id) {
-            self.attached = None;
-            self.secondary.push(node_id);
-            true
-        } else {
-            false
-        }
-    }
-}
-
-impl ObservedState {
-    pub(crate) fn new() -> Self {
-        Self {
-            locations: HashMap::new(),
-        }
-    }
-}
-
-impl TenantState {
-    pub(crate) fn new(
-        tenant_shard_id: TenantShardId,
-        shard: ShardIdentity,
-        policy: PlacementPolicy,
-    ) -> Self {
-        Self {
-            tenant_shard_id,
-            policy,
-            intent: IntentState::default(),
-            generation: Generation::new(0),
-            shard,
-            observed: ObservedState::default(),
-            config: TenantConfig::default(),
-            reconciler: None,
-            sequence: Sequence(1),
-            waiter: Arc::new(SeqWait::new(Sequence(0))),
-            error_waiter: Arc::new(SeqWait::new(Sequence(0))),
-            last_error: Arc::default(),
-        }
-    }
-
-    /// For use on startup when learning state from pageservers: generate my [`IntentState`] from my
-    /// [`ObservedState`], even if it violates my [`PlacementPolicy`]. Call [`Self::schedule`] next,
-    /// to get an intent state that complies with placement policy. The overall goal is to do scheduling
-    /// in a way that makes use of any configured locations that already exist in the outside world.
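// The generation-based choice made by intent_from_observed below, in
// isolation: among observed attached locations, prefer the one issued most
// recently. A sketch with hypothetical simplified types ((node id, generation)
// pairs); None generations sort lowest, as in the sort used below:
fn choose_attached(attached_locs: &[(u64, Option<u32>)]) -> Option<u64> {
    attached_locs
        .iter()
        .max_by_key(|(_, generation)| *generation)
        .map(|(node_id, _)| *node_id)
}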
- pub(crate) fn intent_from_observed(&mut self) { - // Choose an attached location by filtering observed locations, and then sorting to get the highest - // generation - let mut attached_locs = self - .observed - .locations - .iter() - .filter_map(|(node_id, l)| { - if let Some(conf) = &l.conf { - if conf.mode == LocationConfigMode::AttachedMulti - || conf.mode == LocationConfigMode::AttachedSingle - || conf.mode == LocationConfigMode::AttachedStale - { - Some((node_id, conf.generation)) - } else { - None - } - } else { - None - } - }) - .collect::>(); - - attached_locs.sort_by_key(|i| i.1); - if let Some((node_id, _gen)) = attached_locs.into_iter().last() { - self.intent.attached = Some(*node_id); - } - - // All remaining observed locations generate secondary intents. This includes None - // observations, as these may well have some local content on disk that is usable (this - // is an edge case that might occur if we restarted during a migration or other change) - self.observed.locations.keys().for_each(|node_id| { - if Some(*node_id) != self.intent.attached { - self.intent.secondary.push(*node_id); - } - }); - } - - pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> { - // TODO: before scheduling new nodes, check if any existing content in - // self.intent refers to pageservers that are offline, and pick other - // pageservers if so. - - // Build the set of pageservers already in use by this tenant, to avoid scheduling - // more work on the same pageservers we're already using. - let mut used_pageservers = self.intent.all_pageservers(); - let mut modified = false; - - use PlacementPolicy::*; - match self.policy { - Single => { - // Should have exactly one attached, and zero secondaries - if self.intent.attached.is_none() { - let node_id = scheduler.schedule_shard(&used_pageservers)?; - self.intent.attached = Some(node_id); - used_pageservers.push(node_id); - modified = true; - } - if !self.intent.secondary.is_empty() { - self.intent.secondary.clear(); - modified = true; - } - } - Double(secondary_count) => { - // Should have exactly one attached, and N secondaries - if self.intent.attached.is_none() { - let node_id = scheduler.schedule_shard(&used_pageservers)?; - self.intent.attached = Some(node_id); - used_pageservers.push(node_id); - modified = true; - } - - while self.intent.secondary.len() < secondary_count { - let node_id = scheduler.schedule_shard(&used_pageservers)?; - self.intent.secondary.push(node_id); - used_pageservers.push(node_id); - modified = true; - } - } - Detached => { - // Should have no attached or secondary pageservers - if self.intent.attached.is_some() { - self.intent.attached = None; - modified = true; - } - - if !self.intent.secondary.is_empty() { - self.intent.secondary.clear(); - modified = true; - } - } - } - - if modified { - self.sequence.0 += 1; - } - - Ok(()) - } - - fn dirty(&self) -> bool { - if let Some(node_id) = self.intent.attached { - let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config); - match self.observed.locations.get(&node_id) { - Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} - Some(_) | None => { - return true; - } - } - } - - for node_id in &self.intent.secondary { - let wanted_conf = secondary_location_conf(&self.shard, &self.config); - match self.observed.locations.get(node_id) { - Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} - Some(_) | None => { - return true; - } - } - } - - false - } - - pub(crate) fn maybe_reconcile( - &mut self, - 
result_tx: tokio::sync::mpsc::UnboundedSender, - pageservers: &Arc>, - compute_hook: &Arc, - service_config: &service::Config, - persistence: &Arc, - ) -> Option { - // If there are any ambiguous observed states, and the nodes they refer to are available, - // we should reconcile to clean them up. - let mut dirty_observed = false; - for (node_id, observed_loc) in &self.observed.locations { - let node = pageservers - .get(node_id) - .expect("Nodes may not be removed while referenced"); - if observed_loc.conf.is_none() - && !matches!(node.availability, NodeAvailability::Offline) - { - dirty_observed = true; - break; - } - } - - if !self.dirty() && !dirty_observed { - tracing::info!("Not dirty, no reconciliation needed."); - return None; - } - - // Reconcile already in flight for the current sequence? - if let Some(handle) = &self.reconciler { - if handle.sequence == self.sequence { - return Some(ReconcilerWaiter { - tenant_shard_id: self.tenant_shard_id, - seq_wait: self.waiter.clone(), - error_seq_wait: self.error_waiter.clone(), - error: self.last_error.clone(), - seq: self.sequence, - }); - } - } - - // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before - // doing our sequence's work. - let old_handle = self.reconciler.take(); - - let cancel = CancellationToken::new(); - let mut reconciler = Reconciler { - tenant_shard_id: self.tenant_shard_id, - shard: self.shard, - generation: self.generation, - intent: self.intent.clone(), - config: self.config.clone(), - observed: self.observed.clone(), - pageservers: pageservers.clone(), - compute_hook: compute_hook.clone(), - service_config: service_config.clone(), - cancel: cancel.clone(), - persistence: persistence.clone(), - }; - - let reconcile_seq = self.sequence; - - tracing::info!("Spawning Reconciler for sequence {}", self.sequence); - let join_handle = tokio::task::spawn(async move { - // Wait for any previous reconcile task to complete before we start - if let Some(old_handle) = old_handle { - old_handle.cancel.cancel(); - if let Err(e) = old_handle.handle.await { - // We can't do much with this other than log it: the task is done, so - // we may proceed with our work. - tracing::error!("Unexpected join error waiting for reconcile task: {e}"); - } - } - - // Early check for cancellation before doing any work - // TODO: wrap all remote API operations in cancellation check - // as well. 
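// The TODO above asks for cancellation checks around every remote call, not
// just here at the entry point. One way to do that, sketched with tokio_util's
// CancellationToken (a hypothetical wrapper, not part of the deleted code):
use tokio_util::sync::CancellationToken;

async fn run_unless_cancelled<T>(
    cancel: &CancellationToken,
    fut: impl std::future::Future<Output = T>,
) -> Option<T> {
    tokio::select! {
        // Give up promptly once the token fires...
        _ = cancel.cancelled() => None,
        // ...otherwise return the operation's result.
        result = fut => Some(result),
    }
}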
- if reconciler.cancel.is_cancelled() { - return; - } - - let result = reconciler.reconcile().await; - result_tx - .send(ReconcileResult { - sequence: reconcile_seq, - result, - tenant_shard_id: reconciler.tenant_shard_id, - generation: reconciler.generation, - observed: reconciler.observed, - }) - .ok(); - }); - - self.reconciler = Some(ReconcilerHandle { - sequence: self.sequence, - handle: join_handle, - cancel, - }); - - Some(ReconcilerWaiter { - tenant_shard_id: self.tenant_shard_id, - seq_wait: self.waiter.clone(), - error_seq_wait: self.error_waiter.clone(), - error: self.last_error.clone(), - seq: self.sequence, - }) - } -} diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs deleted file mode 100644 index 2d43c46270..0000000000 --- a/control_plane/src/attachment_service.rs +++ /dev/null @@ -1,422 +0,0 @@ -use crate::{background_process, local_env::LocalEnv}; -use camino::Utf8PathBuf; -use hyper::Method; -use pageserver_api::{ - models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo}, - shard::TenantShardId, -}; -use pageserver_client::mgmt_api::ResponseErrorMessageExt; -use postgres_backend::AuthType; -use postgres_connection::parse_host_port; -use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use std::{path::PathBuf, str::FromStr}; -use tracing::instrument; -use utils::{ - auth::{Claims, Scope}, - id::{NodeId, TenantId}, -}; - -pub struct AttachmentService { - env: LocalEnv, - listen: String, - path: PathBuf, - jwt_token: Option, - public_key_path: Option, - client: reqwest::Client, -} - -const COMMAND: &str = "attachment_service"; - -#[derive(Serialize, Deserialize)] -pub struct AttachHookRequest { - pub tenant_shard_id: TenantShardId, - pub node_id: Option, -} - -#[derive(Serialize, Deserialize)] -pub struct AttachHookResponse { - pub gen: Option, -} - -#[derive(Serialize, Deserialize)] -pub struct InspectRequest { - pub tenant_shard_id: TenantShardId, -} - -#[derive(Serialize, Deserialize)] -pub struct InspectResponse { - pub attachment: Option<(u32, NodeId)>, -} - -#[derive(Serialize, Deserialize)] -pub struct TenantCreateResponseShard { - pub node_id: NodeId, - pub generation: u32, -} - -#[derive(Serialize, Deserialize)] -pub struct TenantCreateResponse { - pub shards: Vec, -} - -#[derive(Serialize, Deserialize)] -pub struct NodeRegisterRequest { - pub node_id: NodeId, - - pub listen_pg_addr: String, - pub listen_pg_port: u16, - - pub listen_http_addr: String, - pub listen_http_port: u16, -} - -#[derive(Serialize, Deserialize)] -pub struct NodeConfigureRequest { - pub node_id: NodeId, - - pub availability: Option, - pub scheduling: Option, -} - -#[derive(Serialize, Deserialize, Debug)] -pub struct TenantLocateResponseShard { - pub shard_id: TenantShardId, - pub node_id: NodeId, - - pub listen_pg_addr: String, - pub listen_pg_port: u16, - - pub listen_http_addr: String, - pub listen_http_port: u16, -} - -#[derive(Serialize, Deserialize)] -pub struct TenantLocateResponse { - pub shards: Vec, - pub shard_params: ShardParameters, -} - -/// Explicitly migrating a particular shard is a low level operation -/// TODO: higher level "Reschedule tenant" operation where the request -/// specifies some constraints, e.g. 
asking it to get off particular node(s) -#[derive(Serialize, Deserialize, Debug)] -pub struct TenantShardMigrateRequest { - pub tenant_shard_id: TenantShardId, - pub node_id: NodeId, -} - -#[derive(Serialize, Deserialize, Clone, Copy)] -pub enum NodeAvailability { - // Normal, happy state - Active, - // Offline: Tenants shouldn't try to attach here, but they may assume that their - // secondary locations on this node still exist. Newly added nodes are in this - // state until we successfully contact them. - Offline, -} - -impl FromStr for NodeAvailability { - type Err = anyhow::Error; - - fn from_str(s: &str) -> Result<Self, Self::Err> { - match s { - "active" => Ok(Self::Active), - "offline" => Ok(Self::Offline), - _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")), - } - } -} - -/// FIXME: this is a duplicate of the type in the attachment_service crate, because the -/// type needs to be defined with diesel traits in there. -#[derive(Serialize, Deserialize, Clone, Copy)] -pub enum NodeSchedulingPolicy { - Active, - Filling, - Pause, - Draining, -} - -impl FromStr for NodeSchedulingPolicy { - type Err = anyhow::Error; - - fn from_str(s: &str) -> Result<Self, Self::Err> { - match s { - "active" => Ok(Self::Active), - "filling" => Ok(Self::Filling), - "pause" => Ok(Self::Pause), - "draining" => Ok(Self::Draining), - _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")), - } - } -} - -impl From<NodeSchedulingPolicy> for String { - fn from(value: NodeSchedulingPolicy) -> String { - use NodeSchedulingPolicy::*; - match value { - Active => "active", - Filling => "filling", - Pause => "pause", - Draining => "draining", - } - .to_string() - } -} - -#[derive(Serialize, Deserialize, Debug)] -pub struct TenantShardMigrateResponse {} - -impl AttachmentService { - pub fn from_env(env: &LocalEnv) -> Self { - let path = env.base_data_dir.join("attachments.json"); - - // Makes no sense to construct this if pageservers aren't going to use it: assume - // pageservers have control plane API set - let listen_url = env.control_plane_api.clone().unwrap(); - - let listen = format!( - "{}:{}", - listen_url.host_str().unwrap(), - listen_url.port().unwrap() - ); - - // Assume all pageservers have symmetric auth configuration: this service - // expects to use one JWT token to talk to all of them. - let ps_conf = env - .pageservers - .first() - .expect("Config is validated to contain at least one pageserver"); - let (jwt_token, public_key_path) = match ps_conf.http_auth_type { - AuthType::Trust => (None, None), - AuthType::NeonJWT => { - let jwt_token = env - .generate_auth_token(&Claims::new(None, Scope::PageServerApi)) - .unwrap(); - - // If pageserver auth is enabled, this implicitly enables auth for this service, - // using the same credentials.
- let public_key_path = - camino::Utf8PathBuf::try_from(env.base_data_dir.join("auth_public_key.pem")) - .unwrap(); - (Some(jwt_token), Some(public_key_path)) - } - }; - - Self { - env: env.clone(), - path, - listen, - jwt_token, - public_key_path, - client: reqwest::ClientBuilder::new() - .build() - .expect("Failed to construct http client"), - } - } - - fn pid_file(&self) -> Utf8PathBuf { - Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("attachment_service.pid")) - .expect("non-Unicode path") - } - - pub async fn start(&self) -> anyhow::Result<()> { - let path_str = self.path.to_string_lossy(); - - let mut args = vec!["-l", &self.listen, "-p", &path_str] - .into_iter() - .map(|s| s.to_string()) - .collect::<Vec<_>>(); - if let Some(jwt_token) = &self.jwt_token { - args.push(format!("--jwt-token={jwt_token}")); - } - - if let Some(public_key_path) = &self.public_key_path { - args.push(format!("--public-key={public_key_path}")); - } - - let result = background_process::start_process( - COMMAND, - &self.env.base_data_dir, - &self.env.attachment_service_bin(), - args, - [( - "NEON_REPO_DIR".to_string(), - self.env.base_data_dir.to_string_lossy().to_string(), - )], - background_process::InitialPidFile::Create(self.pid_file()), - || async { - match self.status().await { - Ok(_) => Ok(true), - Err(_) => Ok(false), - } - }, - ) - .await; - - // TODO: shouldn't we bail if we fail to spawn the process? - for ps_conf in &self.env.pageservers { - let (pg_host, pg_port) = - parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); - let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr) - .expect("Unable to parse listen_http_addr"); - self.node_register(NodeRegisterRequest { - node_id: ps_conf.id, - listen_pg_addr: pg_host.to_string(), - listen_pg_port: pg_port.unwrap_or(5432), - listen_http_addr: http_host.to_string(), - listen_http_port: http_port.unwrap_or(80), - }) - .await?; - } - - result - } - - pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { - background_process::stop_process(immediate, COMMAND, &self.pid_file()) - } - /// Simple HTTP request wrapper for calling into attachment service - async fn dispatch<RQ, RS>( - &self, - method: hyper::Method, - path: String, - body: Option<RQ>, - ) -> anyhow::Result<RS> - where - RQ: Serialize + Sized, - RS: DeserializeOwned + Sized, - { - let url = self - .env - .control_plane_api - .clone() - .unwrap() - .join(&path) - .unwrap(); - - let mut builder = self.client.request(method, url); - if let Some(body) = body { - builder = builder.json(&body) - } - if let Some(jwt_token) = &self.jwt_token { - builder = builder.header( - reqwest::header::AUTHORIZATION, - format!("Bearer {jwt_token}"), - ); - } - - let response = builder.send().await?; - let response = response.error_from_body().await?; - - Ok(response - .json() - .await - .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?)
- } - - /// Call into the attach_hook API, for use before handing out attachments to pageservers - #[instrument(skip(self))] - pub async fn attach_hook( - &self, - tenant_shard_id: TenantShardId, - pageserver_id: NodeId, - ) -> anyhow::Result<Option<u32>> { - let request = AttachHookRequest { - tenant_shard_id, - node_id: Some(pageserver_id), - }; - - let response = self - .dispatch::<_, AttachHookResponse>( - Method::POST, - "attach-hook".to_string(), - Some(request), - ) - .await?; - - Ok(response.gen) - } - - #[instrument(skip(self))] - pub async fn inspect( - &self, - tenant_shard_id: TenantShardId, - ) -> anyhow::Result<Option<(u32, NodeId)>> { - let request = InspectRequest { tenant_shard_id }; - - let response = self - .dispatch::<_, InspectResponse>(Method::POST, "inspect".to_string(), Some(request)) - .await?; - - Ok(response.attachment) - } - - #[instrument(skip(self))] - pub async fn tenant_create( - &self, - req: TenantCreateRequest, - ) -> anyhow::Result<TenantCreateResponse> { - self.dispatch(Method::POST, "tenant".to_string(), Some(req)) - .await - } - - #[instrument(skip(self))] - pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> { - self.dispatch::<(), _>(Method::GET, format!("tenant/{tenant_id}/locate"), None) - .await - } - - #[instrument(skip(self))] - pub async fn tenant_migrate( - &self, - tenant_shard_id: TenantShardId, - node_id: NodeId, - ) -> anyhow::Result<TenantShardMigrateResponse> { - self.dispatch( - Method::PUT, - format!("tenant/{tenant_shard_id}/migrate"), - Some(TenantShardMigrateRequest { - tenant_shard_id, - node_id, - }), - ) - .await - } - - #[instrument(skip_all, fields(node_id=%req.node_id))] - pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> { - self.dispatch::<_, ()>(Method::POST, "node".to_string(), Some(req)) - .await - } - - #[instrument(skip_all, fields(node_id=%req.node_id))] - pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> { - self.dispatch::<_, ()>( - Method::PUT, - format!("node/{}/config", req.node_id), - Some(req), - ) - .await - } - - #[instrument(skip(self))] - pub async fn status(&self) -> anyhow::Result<()> { - self.dispatch::<(), ()>(Method::GET, "status".to_string(), None) - .await - } - - #[instrument(skip_all, fields(%tenant_id, timeline_id=%req.new_timeline_id))] - pub async fn tenant_timeline_create( - &self, - tenant_id: TenantId, - req: TimelineCreateRequest, - ) -> anyhow::Result<TimelineInfo> { - self.dispatch( - Method::POST, - format!("tenant/{tenant_id}/timeline"), - Some(req), - ) - .await - } -} diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 3ffb8734d0..619c5bce3e 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -36,11 +36,11 @@ use utils::pid_file::{self, PidFileRead}; // it's waiting. If the process hasn't started/stopped after 5 seconds, // it prints a notice that it's taking long, but keeps waiting.
// -const RETRY_UNTIL_SECS: u64 = 10; -const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS; -const RETRY_INTERVAL_MILLIS: u64 = 100; -const DOT_EVERY_RETRIES: u64 = 10; -const NOTICE_AFTER_RETRIES: u64 = 50; +const STOP_RETRY_TIMEOUT: Duration = Duration::from_secs(10); +const STOP_RETRIES: u128 = STOP_RETRY_TIMEOUT.as_millis() / RETRY_INTERVAL.as_millis(); +const RETRY_INTERVAL: Duration = Duration::from_millis(100); +const DOT_EVERY_RETRIES: u128 = 10; +const NOTICE_AFTER_RETRIES: u128 = 50; /// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates /// it itself. @@ -52,6 +52,7 @@ pub enum InitialPidFile { } /// Start a background child process using the parameters given. +#[allow(clippy::too_many_arguments)] pub async fn start_process<F, Fut, AI, A, EI>( process_name: &str, datadir: &Path, @@ -59,6 +60,7 @@ args: AI, envs: EI, initial_pid_file: InitialPidFile, + retry_timeout: &Duration, process_status_check: F, ) -> anyhow::Result<()> where @@ -69,10 +71,13 @@ // Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference EI: IntoIterator<Item = (String, String)>, { + let retries: u128 = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis(); + if !datadir.metadata().context("stat datadir")?.is_dir() { + anyhow::bail!("`datadir` must be a directory when calling this function: {datadir:?}"); + } let log_path = datadir.join(format!("{process_name}.log")); let process_log_file = fs::OpenOptions::new() .create(true) - .write(true) .append(true) .open(&log_path) .with_context(|| { @@ -86,8 +91,17 @@ let background_command = command .stdout(process_log_file) .stderr(same_file_for_stderr) - .args(args); - let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command)); + .args(args) + // spawn all child processes in their datadir, useful for all kinds of things, + // not least cleaning up child processes e.g. after an unclean exit from the test suite: + // ``` + // lsof -d cwd -a +D Users/cs/src/neon/test_output + // ``` + .current_dir(datadir); + + let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars( + fill_rust_env_vars(background_command), + )); filled_cmd.envs(envs); let pid_file_to_check = match &initial_pid_file { @@ -119,7 +133,7 @@ .unwrap(); }); - for retries in 0..RETRIES { + for retries in 0..retries { match process_started(pid, pid_file_to_check, &process_status_check).await { Ok(true) => { println!("\n{process_name} started and passed status check, pid: {pid}"); @@ -137,7 +151,7 @@ print!("."); io::stdout().flush().unwrap(); } - thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS)); + thread::sleep(RETRY_INTERVAL); } Err(e) => { println!("error starting process {process_name:?}: {e:#}"); @@ -146,9 +160,10 @@ } } println!(); - anyhow::bail!( - "{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds" - ); + anyhow::bail!(format!( + "{} did not start+pass status checks within {:?}", + process_name, retry_timeout + )); } /// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
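The start path above now derives its retry budget from a caller-supplied `Duration` instead of the old fixed constants. A minimal sketch of that polling pattern, assuming a tokio runtime; `wait_until` and its parameters are illustrative names, not the crate's API:

```rust
use std::time::Duration;

// Sketch of the retry loop: derive the number of polls from the timeout,
// poll a fallible status check at a fixed interval, and fail once the
// budget is exhausted.
async fn wait_until<F, Fut>(retry_timeout: Duration, mut check: F) -> anyhow::Result<()>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = anyhow::Result<bool>>,
{
    const RETRY_INTERVAL: Duration = Duration::from_millis(100);
    // Integer division, as in start_process: a 10s budget with a 100ms
    // interval yields 100 polls.
    let retries = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis();
    for _ in 0..retries {
        if check().await? {
            return Ok(());
        }
        tokio::time::sleep(RETRY_INTERVAL).await;
    }
    anyhow::bail!("status check did not pass within {retry_timeout:?}")
}
```

The real `start_process` additionally prints progress dots (`DOT_EVERY_RETRIES`) and a notice when startup is slow (`NOTICE_AFTER_RETRIES`) while it waits.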
@@ -204,7 +219,7 @@ pub fn stop_process( } pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> { - for retries in 0..RETRIES { + for retries in 0..STOP_RETRIES { match process_has_stopped(pid) { Ok(true) => { println!("\n{process_name} stopped"); @@ -220,7 +235,7 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> { print!("."); io::stdout().flush().unwrap(); } - thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS)); + thread::sleep(RETRY_INTERVAL); } Err(e) => { println!("{process_name} with pid {pid} failed to stop: {e:#}"); @@ -229,7 +244,10 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> { } } println!(); - anyhow::bail!("{process_name} with pid {pid} did not stop in {RETRY_UNTIL_SECS} seconds"); + anyhow::bail!(format!( + "{} with pid {} did not stop in {:?}", + process_name, pid, STOP_RETRY_TIMEOUT + )); } fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command { @@ -256,7 +274,9 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { for env_key in [ "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", - "AWS_SESSION_TOKEN", + "AWS_PROFILE", + // HOME is needed in combination with `AWS_PROFILE` to pick up the SSO sessions. + "HOME", "AZURE_STORAGE_ACCOUNT", "AZURE_STORAGE_ACCESS_KEY", ] { @@ -267,6 +287,15 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { cmd } +fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command { + for (var, val) in std::env::vars() { + if var.starts_with("NEON_") { + cmd = cmd.env(var, val); + } + } + cmd +} + /// Add a `pre_exec` to the cmd that, in between fork() and exec(), /// 1. Claims a pidfile with a fcntl lock on it and /// 2. Sets up the pidfile's file descriptor so that it (and the lock) @@ -293,7 +322,7 @@ where // is in state 'taken' but the thread that would unlock it is // not there. // 2. A rust object that represented some external resource in the - // parent now got implicitly copied by the the fork, even though + // parent now got implicitly copied by the fork, even though // the object's type is not `Copy`. The parent program may use // non-copyability as a way to enforce unique ownership of an // external resource in the typesystem.
The fork breaks that @@ -350,7 +379,7 @@ where } } -fn process_has_stopped(pid: Pid) -> anyhow::Result { +pub(crate) fn process_has_stopped(pid: Pid) -> anyhow::Result { match kill(pid, None) { // Process exists, keep waiting Ok(_) => Ok(false), diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 279c47398f..1d66532d49 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -8,22 +8,26 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum}; use compute_api::spec::ComputeMode; -use control_plane::attachment_service::{ - AttachmentService, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, -}; use control_plane::endpoint::ComputeControlPlane; -use control_plane::local_env::{InitForceMode, LocalEnv}; -use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; -use control_plane::safekeeper::SafekeeperNode; -use control_plane::{broker, local_env}; -use pageserver_api::models::{ - ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, +use control_plane::local_env::{ + InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf, + SafekeeperConf, }; -use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; -use pageserver_api::{ +use control_plane::pageserver::PageServerNode; +use control_plane::safekeeper::SafekeeperNode; +use control_plane::storage_controller::{ + NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController, +}; +use control_plane::{broker, local_env}; +use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, }; +use pageserver_api::controller_api::{ + NodeAvailabilityWrapper, PlacementPolicy, TenantCreateRequest, +}; +use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo}; +use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; use safekeeper_api::{ @@ -34,6 +38,7 @@ use std::collections::{BTreeSet, HashMap}; use std::path::PathBuf; use std::process::exit; use std::str::FromStr; +use std::time::Duration; use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR; use url::Host; use utils::{ @@ -49,47 +54,9 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); -const DEFAULT_PG_VERSION: &str = "15"; +const DEFAULT_PG_VERSION: &str = "16"; -const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/"; - -fn default_conf(num_pageservers: u16) -> String { - let mut template = format!( - r#" -# Default built-in configuration, defined in main.rs -control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}' - -[broker] -listen_addr = '{DEFAULT_BROKER_ADDR}' - -[[safekeepers]] -id = {DEFAULT_SAFEKEEPER_ID} -pg_port = {DEFAULT_SAFEKEEPER_PG_PORT} -http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT} - -"#, - ); - - for i in 0..num_pageservers { - let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64); - let pg_port = DEFAULT_PAGESERVER_PG_PORT + i; - let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i; - - template += &format!( - r#" -[[pageservers]] -id = {pageserver_id} -listen_pg_addr = '127.0.0.1:{pg_port}' -listen_http_addr = '127.0.0.1:{http_port}' -pg_auth_type = '{trust_auth}' -http_auth_type = '{trust_auth}' -"#, - trust_auth = 
AuthType::Trust, - ) - } - - template -} +const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; /// /// Timelines tree element used as a value in the HashMap. @@ -123,7 +90,8 @@ fn main() -> Result<()> { handle_init(sub_args).map(Some) } else { // all other commands need an existing config - let mut env = LocalEnv::load_config().context("Error loading config")?; + let mut env = + LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?; let original_env = env.clone(); let rt = tokio::runtime::Builder::new_current_thread() @@ -134,10 +102,10 @@ fn main() -> Result<()> { let subcommand_result = match sub_name { "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)), "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)), - "start" => rt.block_on(handle_start_all(sub_args, &env)), - "stop" => handle_stop_all(sub_args, &env), + "start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))), + "stop" => rt.block_on(handle_stop_all(sub_args, &env)), "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)), - "attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)), + "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)), "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)), "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)), "mappings" => handle_mappings(sub_args, &mut env), @@ -153,7 +121,7 @@ fn main() -> Result<()> { }; match subcommand_result { - Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?, + Ok(Some(updated_env)) => updated_env.persist_config()?, Ok(None) => (), Err(e) => { eprintln!("command failed: {e:?}"); @@ -342,48 +310,66 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId>> fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> { - let num_pageservers = init_match - .get_one::<u16>("num-pageservers") - .expect("num-pageservers arg has a default"); - // Create config file - let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") { + let num_pageservers = init_match.get_one::<u16>("num-pageservers"); + + let force = init_match.get_one("force").expect("we set a default value"); + + // Create the in-memory `LocalEnv` that we'd normally load from disk in `load_config`. + let init_conf: NeonLocalInitConf = if let Some(config_path) = + init_match.get_one::<PathBuf>("config") + { + // User (likely the Python test suite) provided a description of the environment. + if num_pageservers.is_some() { + bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead"); + } // load and parse the file - std::fs::read_to_string(config_path).with_context(|| { + let contents = std::fs::read_to_string(config_path).with_context(|| { format!( "Could not read configuration file '{}'", config_path.display() ) - })? + })?; + toml_edit::de::from_str(&contents)?
} else { - // Built-in default config - default_conf(*num_pageservers) + // User (likely interactive) did not provide a description of the environment, give them the default + NeonLocalInitConf { + control_plane_api: Some(Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap())), + broker: NeonBroker { + listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(), + }, + safekeepers: vec![SafekeeperConf { + id: DEFAULT_SAFEKEEPER_ID, + pg_port: DEFAULT_SAFEKEEPER_PG_PORT, + http_port: DEFAULT_SAFEKEEPER_HTTP_PORT, + ..Default::default() + }], + pageservers: (0..num_pageservers.copied().unwrap_or(1)) + .map(|i| { + let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64); + let pg_port = DEFAULT_PAGESERVER_PG_PORT + i; + let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i; + NeonLocalInitPageserverConf { + id: pageserver_id, + listen_pg_addr: format!("127.0.0.1:{pg_port}"), + listen_http_addr: format!("127.0.0.1:{http_port}"), + pg_auth_type: AuthType::Trust, + http_auth_type: AuthType::Trust, + other: Default::default(), + } + }) + .collect(), + pg_distrib_dir: None, + neon_distrib_dir: None, + default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)), + storage_controller: None, + control_plane_compute_hook_api: None, + } }; - let pg_version = init_match - .get_one::("pg-version") - .copied() - .context("Failed to parse postgres version from the argument string")?; - - let mut env = - LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; - let force = init_match.get_one("force").expect("we set a default value"); - env.init(pg_version, force) - .context("Failed to initialize neon repository")?; - - // Create remote storage location for default LocalFs remote storage - std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?; - - // Initialize pageserver, create initial tenant and timeline. - for ps_conf in &env.pageservers { - PageServerNode::from_env(&env, ps_conf) - .initialize(&pageserver_config_overrides(init_match)) - .unwrap_or_else(|e| { - eprintln!("pageserver init failed: {e:?}"); - exit(1); - }); - } - - Ok(env) + LocalEnv::init(init_conf, force) + .context("materialize initial neon_local environment on disk")?; + Ok(LocalEnv::load_config(&local_env::base_path()) + .expect("freshly written config should be loadable")) } /// The default pageserver is the one where CLI tenant/timeline operations are sent by default. 
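For reference, a `--config` file describing the same environment that the built-in defaults above construct could look roughly like this. The shape follows `NeonLocalInitConf` and the deleted `default_conf` template; the concrete addresses and ports are illustrative placeholders, not authoritative values:

```toml
# Hypothetical input for `neon_local init --config`; values are placeholders.
control_plane_api = 'http://127.0.0.1:1234/upcall/v1/'

[broker]
listen_addr = '127.0.0.1:50051'

[[safekeepers]]
id = 1
pg_port = 5454
http_port = 7676

[[pageservers]]
id = 1
listen_pg_addr = '127.0.0.1:64000'
listen_http_addr = '127.0.0.1:9898'
pg_auth_type = 'Trust'
http_auth_type = 'Trust'
```

Passing such a file together with `--num-pageservers` is rejected, since the `pageservers` array already encodes how many pageservers to create.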
@@ -398,15 +384,6 @@ fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode { PageServerNode::from_env(env, ps_conf) } -fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { - init_match - .get_many::("pageserver-config-override") - .into_iter() - .flatten() - .map(String::as_str) - .collect() -} - async fn handle_tenant( tenant_match: &ArgMatches, env: &mut local_env::LocalEnv, @@ -418,6 +395,54 @@ async fn handle_tenant( println!("{} {:?}", t.id, t.state); } } + Some(("import", import_match)) => { + let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate); + + let storage_controller = StorageController::from_env(env); + let create_response = storage_controller.tenant_import(tenant_id).await?; + + let shard_zero = create_response + .shards + .first() + .expect("Import response omitted shards"); + + let attached_pageserver_id = shard_zero.node_id; + let pageserver = + PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?); + + println!( + "Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}" + ); + + let timelines = pageserver + .http_client + .list_timelines(shard_zero.shard_id) + .await?; + + // Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names + let main_timeline = timelines + .iter() + .find(|t| t.ancestor_timeline_id.is_none()) + .expect("No timelines found") + .timeline_id; + + let mut branch_i = 0; + for timeline in timelines.iter() { + let branch_name = if timeline.timeline_id == main_timeline { + "main".to_string() + } else { + branch_i += 1; + format!("branch_{branch_i}") + }; + + println!( + "Importing timeline {tenant_id}/{} as branch {branch_name}", + timeline.timeline_id + ); + + env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?; + } + } Some(("create", create_match)) => { let tenant_conf: HashMap<_, _> = create_match .get_many::("config") @@ -434,27 +459,33 @@ async fn handle_tenant( let shard_stripe_size: Option = create_match.get_one::("shard-stripe-size").cloned(); + let placement_policy = match create_match.get_one::("placement-policy") { + Some(s) if !s.is_empty() => serde_json::from_str::(s)?, + _ => PlacementPolicy::Attached(0), + }; + let tenant_conf = PageServerNode::parse_config(tenant_conf)?; // If tenant ID was not specified, generate one let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate); - // We must register the tenant with the attachment service, so + // We must register the tenant with the storage controller, so // that when the pageserver restarts, it will be re-attached. 
- let attachment_service = AttachmentService::from_env(env); - attachment_service + let storage_controller = StorageController::from_env(env); + storage_controller .tenant_create(TenantCreateRequest { // Note that ::unsharded here isn't actually because the tenant is unsharded, it's because the - // attachment service expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest - // type is used both in attachment service (for creating tenants) and in pageserver (for creating shards) + // storage controller expects a shard-naive tenant_id in this attribute, and the TenantCreateRequest + // type is used both in storage controller (for creating tenants) and in pageserver (for creating shards) new_tenant_id: TenantShardId::unsharded(tenant_id), generation: None, shard_parameters: ShardParameters { - count: ShardCount(shard_count), + count: ShardCount::new(shard_count), stripe_size: shard_stripe_size .map(ShardStripeSize) .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE), }, + placement_policy: Some(placement_policy), config: tenant_conf, }) .await?; @@ -469,9 +500,9 @@ .context("Failed to parse postgres version from the argument string")?; // FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have - // different shards picking different start lsns. Maybe we have to teach attachment service + // different shards picking different start lsns. Maybe we have to teach storage controller // to let shard 0 branch first and then propagate the chosen LSN to other shards. - attachment_service + storage_controller .tenant_timeline_create( tenant_id, TimelineCreateRequest { @@ -516,65 +547,7 @@ .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?; println!("tenant {tenant_id} successfully configured on the pageserver"); } - Some(("migrate", matches)) => { - let tenant_shard_id = get_tenant_shard_id(matches, env)?; - let new_pageserver = get_pageserver(env, matches)?; - let new_pageserver_id = new_pageserver.conf.id; - let attachment_service = AttachmentService::from_env(env); - attachment_service - .tenant_migrate(tenant_shard_id, new_pageserver_id) - .await?; - - println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id); - } - Some(("status", matches)) => { - let tenant_id = get_tenant_id(matches, env)?; - - let mut shard_table = comfy_table::Table::new(); - shard_table.set_header(["Shard", "Pageserver", "Physical Size"]); - - let mut tenant_synthetic_size = None; - - let attachment_service = AttachmentService::from_env(env); - for shard in attachment_service.tenant_locate(tenant_id).await?.shards { - let pageserver = - PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?); - - let size = pageserver - .http_client - .tenant_details(shard.shard_id) - .await?
- .tenant_info - .current_physical_size - .unwrap(); - - shard_table.add_row([ - format!("{}", shard.shard_id.shard_slug()), - format!("{}", shard.node_id.0), - format!("{} MiB", size / (1024 * 1024)), - ]); - - if shard.shard_id.is_zero() { - tenant_synthetic_size = - Some(pageserver.tenant_synthetic_size(shard.shard_id).await?); - } - } - - let Some(synthetic_size) = tenant_synthetic_size else { - bail!("Shard 0 not found") - }; - - let mut tenant_table = comfy_table::Table::new(); - tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]); - tenant_table.add_row([ - "Synthetic size".to_string(), - format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)), - ]); - - println!("{tenant_table}"); - println!("{shard_table}"); - } Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), None => bail!("no tenant subcommand provided"), } @@ -586,7 +559,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local match timeline_match.subcommand() { Some(("list", list_match)) => { - // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service + // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller // where shard 0 is attached, and query there. let tenant_shard_id = get_tenant_shard_id(list_match, env)?; let timelines = pageserver.timeline_list(&tenant_shard_id).await?; @@ -606,7 +579,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local let new_timeline_id_opt = parse_timeline_id(create_match)?; let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate()); - let attachment_service = AttachmentService::from_env(env); + let storage_controller = StorageController::from_env(env); let create_req = TimelineCreateRequest { new_timeline_id, ancestor_timeline_id: None, @@ -614,7 +587,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local ancestor_start_lsn: None, pg_version: Some(pg_version), }; - let timeline_info = attachment_service + let timeline_info = storage_controller .tenant_timeline_create(tenant_id, create_req) .await?; @@ -629,9 +602,9 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local Some(("import", import_match)) => { let tenant_id = get_tenant_id(import_match, env)?; let timeline_id = parse_timeline_id(import_match)?.expect("No timeline id provided"); - let name = import_match - .get_one::("node-name") - .ok_or_else(|| anyhow!("No node name provided"))?; + let branch_name = import_match + .get_one::("branch-name") + .ok_or_else(|| anyhow!("No branch name provided"))?; // Parse base inputs let base_tarfile = import_match @@ -658,23 +631,11 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local .copied() .context("Failed to parse postgres version from the argument string")?; - let mut cplane = ComputeControlPlane::load(env.clone())?; println!("Importing timeline into pageserver ..."); pageserver .timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version) .await?; - env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; - - println!("Creating endpoint for imported timeline ..."); - cplane.new_endpoint( - name, - tenant_id, - timeline_id, - None, - None, - pg_version, - ComputeMode::Primary, - )?; + env.register_branch_mapping(branch_name.to_string(), tenant_id, timeline_id)?; println!("Done"); } Some(("branch", branch_match)) => { @@ -698,7 +659,7 @@ async fn 
handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local .transpose() .context("Failed to parse ancestor start Lsn from the request")?; let new_timeline_id = TimelineId::generate(); - let attachment_service = AttachmentService::from_env(env); + let storage_controller = StorageController::from_env(env); let create_req = TimelineCreateRequest { new_timeline_id, ancestor_timeline_id: Some(ancestor_timeline_id), @@ -706,7 +667,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local ancestor_start_lsn: start_lsn, pg_version: None, }; - let timeline_info = attachment_service + let timeline_info = storage_controller .tenant_timeline_create(tenant_id, create_req) .await?; @@ -735,7 +696,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re match sub_name { "list" => { - // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service + // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller // where shard 0 is attached, and query there. let tenant_shard_id = get_tenant_shard_id(sub_args, env)?; let timeline_infos = get_timeline_infos(env, &tenant_shard_id) @@ -795,7 +756,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re &endpoint.timeline_id.to_string(), branch_name, lsn_str.as_str(), - endpoint.status(), + &format!("{}", endpoint.status()), ]); } @@ -811,6 +772,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .get_one::("endpoint_id") .map(String::to_string) .unwrap_or_else(|| format!("ep-{branch_name}")); + let update_catalog = sub_args + .get_one::("update-catalog") + .cloned() + .unwrap_or_default(); let lsn = sub_args .get_one::("lsn") @@ -833,6 +798,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .copied() .unwrap_or(false); + let allow_multiple = sub_args.get_flag("allow-multiple"); + let mode = match (lsn, hot_standby) { (Some(lsn), false) => ComputeMode::Static(lsn), (None, true) => ComputeMode::Replica, @@ -850,7 +817,9 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re _ => {} } - cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; + if !allow_multiple { + cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; + } cplane.new_endpoint( &endpoint_id, @@ -860,6 +829,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re http_port, pg_version, mode, + !update_catalog, )?; } "start" => { @@ -878,31 +848,33 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re let remote_ext_config = sub_args.get_one::("remote-ext-config"); - // If --safekeepers argument is given, use only the listed safekeeper nodes. - let safekeepers = - if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { - let mut safekeepers: Vec = Vec::new(); - for sk_id in safekeepers_str.split(',').map(str::trim) { - let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| { - anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list") - })?); - safekeepers.push(sk_id); - } - safekeepers - } else { - env.safekeepers.iter().map(|sk| sk.id).collect() - }; + let allow_multiple = sub_args.get_flag("allow-multiple"); + + // If --safekeepers argument is given, use only the listed + // safekeeper nodes; otherwise all from the env. + let safekeepers = if let Some(safekeepers) = parse_safekeepers(sub_args)? 
{ + safekeepers + } else { + env.safekeepers.iter().map(|sk| sk.id).collect() + }; let endpoint = cplane .endpoints .get(endpoint_id.as_str()) .ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?; - cplane.check_conflicting_endpoints( - endpoint.mode, - endpoint.tenant_id, - endpoint.timeline_id, - )?; + let create_test_user = sub_args + .get_one::<bool>("create-test-user") + .cloned() + .unwrap_or_default(); + + if !allow_multiple { + cplane.check_conflicting_endpoints( + endpoint.mode, + endpoint.tenant_id, + endpoint.timeline_id, + )?; + } let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id { let conf = env.get_pageserver_conf(pageserver_id).unwrap(); @@ -910,21 +882,21 @@ ( vec![(parsed.0, parsed.1.unwrap_or(5432))], // If caller is telling us what pageserver to use, this is not a tenant which is - // full managed by attachment service, therefore not sharded. + // fully managed by storage controller, therefore not sharded. ShardParameters::DEFAULT_STRIPE_SIZE, ) } else { // Look up the currently attached location of the tenant, and its striping metadata, // to pass these on to postgres. - let attachment_service = AttachmentService::from_env(env); - let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?; + let storage_controller = StorageController::from_env(env); + let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?; let pageservers = locate_result .shards .into_iter() .map(|shard| { ( Host::parse(&shard.listen_pg_addr) - .expect("Attachment service reported bad hostname"), + .expect("Storage controller reported bad hostname"), shard.listen_pg_port, ) }) @@ -952,6 +924,7 @@ pageservers, remote_ext_config, stripe_size.0 as usize, + create_test_user, ) .await?; } @@ -972,8 +945,8 @@ pageserver.pg_connection_config.port(), )] } else { - let attachment_service = AttachmentService::from_env(env); - attachment_service + let storage_controller = StorageController::from_env(env); + storage_controller .tenant_locate(endpoint.tenant_id) .await? .shards .map(|shard| { ( Host::parse(&shard.listen_pg_addr) - .expect("Attachment service reported malformed host"), + .expect("Storage controller reported malformed host"), shard.listen_pg_port, ) }) .collect::<Vec<_>>() }; - endpoint.reconfigure(pageservers).await?; + // If --safekeepers argument is given, use only the listed + // safekeeper nodes; otherwise all from the env. + let safekeepers = parse_safekeepers(sub_args)?; + endpoint.reconfigure(pageservers, None, safekeepers).await?; } "stop" => { let endpoint_id = sub_args .get_one::<String>("endpoint_id") .ok_or_else(|| anyhow!("No endpoint ID was provided to stop"))?; let destroy = sub_args.get_flag("destroy"); + let mode = sub_args.get_one::<String>("mode").expect("has a default"); let endpoint = cplane .endpoints .get(endpoint_id.as_str()) .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; - endpoint.stop(destroy)?; + endpoint.stop(mode, destroy)?; _ => bail!("Unexpected endpoint subcommand '{sub_name}'"), @@ -1008,6 +985,23 @@ Ok(()) } +/// Parse --safekeepers as list of safekeeper ids.
+fn parse_safekeepers(sub_args: &ArgMatches) -> Result<Option<Vec<NodeId>>> { + if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") { + let mut safekeepers: Vec<NodeId> = Vec::new(); + for sk_id in safekeepers_str.split(',').map(str::trim) { + let sk_id = NodeId( + u64::from_str(sk_id) + .map_err(|_| anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list"))?, + ); + safekeepers.push(sk_id); + } + Ok(Some(safekeepers)) + } else { + Ok(None) + } +} + fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { let (sub_name, sub_args) = match sub_match.subcommand() { Some(ep_subcommand_data) => ep_subcommand_data, @@ -1053,11 +1047,48 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> +fn get_start_timeout(args: &ArgMatches) -> &Duration { + let humantime_duration = args + .get_one::<humantime::Duration>("start-timeout") + .expect("invalid value for start-timeout"); + humantime_duration.as_ref() +} + +fn storage_controller_start_args(args: &ArgMatches) -> NeonStorageControllerStartArgs { + let maybe_instance_id = args.get_one::<u8>("instance-id"); + + let base_port = args.get_one::<u16>("base-port"); + + if maybe_instance_id.is_some() && base_port.is_none() { + panic!("storage-controller start specified instance-id but did not provide base-port"); + } + + let start_timeout = args + .get_one::<humantime::Duration>("start-timeout") + .expect("invalid value for start-timeout"); + + NeonStorageControllerStartArgs { + instance_id: maybe_instance_id.copied().unwrap_or(1), + base_port: base_port.copied(), + start_timeout: *start_timeout, + } +} + +fn storage_controller_stop_args(args: &ArgMatches) -> NeonStorageControllerStopArgs { + let maybe_instance_id = args.get_one::<u8>("instance-id"); + let immediate = args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate"); + + NeonStorageControllerStopArgs { + instance_id: maybe_instance_id.copied().unwrap_or(1), + immediate, + } +} + async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { match sub_match.subcommand() { Some(("start", subcommand_args)) => { if let Err(e) = get_pageserver(env, subcommand_args)? - .start(&pageserver_config_overrides(subcommand_args)) + .start(get_start_timeout(subcommand_args)) .await { eprintln!("pageserver start failed: {e}"); @@ -1085,47 +1116,12 @@ exit(1); } - if let Err(e) = pageserver - .start(&pageserver_config_overrides(subcommand_args)) - .await - { + if let Err(e) = pageserver.start(get_start_timeout(sub_match)).await { eprintln!("pageserver start failed: {e}"); exit(1); } } - Some(("migrate", subcommand_args)) => { - let pageserver = get_pageserver(env, subcommand_args)?; - //TODO what shutdown strategy should we use here?
- if let Err(e) = pageserver.stop(false) { - eprintln!("pageserver stop failed: {}", e); - exit(1); - } - - if let Err(e) = pageserver - .start(&pageserver_config_overrides(subcommand_args)) - .await - { - eprintln!("pageserver start failed: {e}"); - exit(1); - } - } - - Some(("set-state", subcommand_args)) => { - let pageserver = get_pageserver(env, subcommand_args)?; - let scheduling = subcommand_args.get_one("scheduling"); - let availability = subcommand_args.get_one("availability"); - - let attachment_service = AttachmentService::from_env(env); - attachment_service - .node_configure(NodeConfigureRequest { - node_id: pageserver.conf.id, - scheduling: scheduling.cloned(), - availability: availability.cloned(), - }) - .await?; - } - Some(("status", subcommand_args)) => { match get_pageserver(env, subcommand_args)?.check_status().await { Ok(_) => println!("Page server is up and running"), @@ -1142,32 +1138,27 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Ok(()) } -async fn handle_attachment_service( +async fn handle_storage_controller( sub_match: &ArgMatches, env: &local_env::LocalEnv, ) -> Result<()> { - let svc = AttachmentService::from_env(env); + let svc = StorageController::from_env(env); match sub_match.subcommand() { - Some(("start", _start_match)) => { - if let Err(e) = svc.start().await { + Some(("start", start_match)) => { + if let Err(e) = svc.start(storage_controller_start_args(start_match)).await { eprintln!("start failed: {e}"); exit(1); } } Some(("stop", stop_match)) => { - let immediate = stop_match - .get_one::("stop-mode") - .map(|s| s.as_str()) - == Some("immediate"); - - if let Err(e) = svc.stop(immediate) { + if let Err(e) = svc.stop(storage_controller_stop_args(stop_match)).await { eprintln!("stop failed: {}", e); exit(1); } } - Some((sub_name, _)) => bail!("Unexpected attachment_service subcommand '{}'", sub_name), - None => bail!("no attachment_service subcommand provided"), + Some((sub_name, _)) => bail!("Unexpected storage_controller subcommand '{}'", sub_name), + None => bail!("no storage_controller subcommand provided"), } Ok(()) } @@ -1208,7 +1199,10 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> "start" => { let extra_opts = safekeeper_extra_opts(sub_args); - if let Err(e) = safekeeper.start(extra_opts).await { + if let Err(e) = safekeeper + .start(extra_opts, get_start_timeout(sub_args)) + .await + { eprintln!("safekeeper start failed: {}", e); exit(1); } @@ -1234,7 +1228,10 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> } let extra_opts = safekeeper_extra_opts(sub_args); - if let Err(e) = safekeeper.start(extra_opts).await { + if let Err(e) = safekeeper + .start(extra_opts, get_start_timeout(sub_args)) + .await + { eprintln!("safekeeper start failed: {}", e); exit(1); } @@ -1247,59 +1244,125 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Ok(()) } -async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { +async fn handle_start_all( + env: &local_env::LocalEnv, + retry_timeout: &Duration, +) -> anyhow::Result<()> { // Endpoints are not started automatically - broker::start_broker_process(env).await?; + broker::start_broker_process(env, retry_timeout).await?; - // Only start the attachment service if the pageserver is configured to need it + // Only start the storage controller if the pageserver is configured to need it if env.control_plane_api.is_some() { - let 
attachment_service = AttachmentService::from_env(env); - if let Err(e) = attachment_service.start().await { - eprintln!("attachment_service start failed: {:#}", e); - try_stop_all(env, true); + let storage_controller = StorageController::from_env(env); + if let Err(e) = storage_controller + .start(NeonStorageControllerStartArgs::with_default_instance_id( + (*retry_timeout).into(), + )) + .await + { + eprintln!("storage_controller start failed: {:#}", e); + try_stop_all(env, true).await; exit(1); } } for ps_conf in &env.pageservers { let pageserver = PageServerNode::from_env(env, ps_conf); - if let Err(e) = pageserver - .start(&pageserver_config_overrides(sub_match)) - .await - { + if let Err(e) = pageserver.start(retry_timeout).await { eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e); - try_stop_all(env, true); + try_stop_all(env, true).await; exit(1); } } for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); - if let Err(e) = safekeeper.start(vec![]).await { + if let Err(e) = safekeeper.start(vec![], retry_timeout).await { eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e); - try_stop_all(env, false); + try_stop_all(env, false).await; exit(1); } } + + neon_start_status_check(env, retry_timeout).await?; + Ok(()) } -fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +async fn neon_start_status_check( + env: &local_env::LocalEnv, + retry_timeout: &Duration, +) -> anyhow::Result<()> { + const RETRY_INTERVAL: Duration = Duration::from_millis(100); + const NOTICE_AFTER_RETRIES: Duration = Duration::from_secs(5); + + if env.control_plane_api.is_none() { + return Ok(()); + } + + let storcon = StorageController::from_env(env); + + let retries = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis(); + let notice_after_retries = retry_timeout.as_millis() / NOTICE_AFTER_RETRIES.as_millis(); + + println!("\nRunning neon status check"); + + for retry in 0..retries { + if retry == notice_after_retries { + println!("\nNeon status check has not passed yet, continuing to wait") + } + + let mut passed = true; + let mut nodes = storcon.node_list().await?; + let mut pageservers = env.pageservers.clone(); + + if nodes.len() != pageservers.len() { + continue; + } + + nodes.sort_by_key(|ps| ps.id); + pageservers.sort_by_key(|ps| ps.id); + + for (idx, pageserver) in pageservers.iter().enumerate() { + let node = &nodes[idx]; + if node.id != pageserver.id { + passed = false; + break; + } + + if !matches!(node.availability, NodeAvailabilityWrapper::Active) { + passed = false; + break; + } + } + + if passed { + println!("\nNeon started and passed status check"); + return Ok(()); + } + + tokio::time::sleep(RETRY_INTERVAL).await; + } + + anyhow::bail!("\nNeon did not pass status check within {retry_timeout:?}") +} + +async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let immediate = sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate"); - try_stop_all(env, immediate); + try_stop_all(env, immediate).await; Ok(()) } -fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { +async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { // Stop all endpoints match ComputeControlPlane::load(env.clone()) { Ok(cplane) => { for (_k, node) in cplane.endpoints { - if let Err(e) = node.stop(false) { + if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) { eprintln!("postgres stop failed: {e:#}"); } } @@ -1327,15 +1390,35 @@ fn try_stop_all(env: &local_env::LocalEnv,
immediate: bool) { eprintln!("neon broker stop failed: {e:#}"); } - if env.control_plane_api.is_some() { - let attachment_service = AttachmentService::from_env(env); - if let Err(e) = attachment_service.stop(immediate) { - eprintln!("attachment service stop failed: {e:#}"); + // Stop all storage controller instances. In the most common case there's only one, + // but iterate though the base data directory in order to discover the instances. + let storcon_instances = env + .storage_controller_instances() + .await + .expect("Must inspect data dir"); + for (instance_id, _instance_dir_path) in storcon_instances { + let storage_controller = StorageController::from_env(env); + let stop_args = NeonStorageControllerStopArgs { + instance_id, + immediate, + }; + + if let Err(e) = storage_controller.stop(stop_args).await { + eprintln!("Storage controller instance {instance_id} stop failed: {e:#}"); } } } fn cli() -> Command { + let timeout_arg = Arg::new("start-timeout") + .long("start-timeout") + .short('t') + .global(true) + .help("timeout until we fail the command, e.g. 30s") + .value_parser(value_parser!(humantime::Duration)) + .default_value("10s") + .required(false); + let branch_name_arg = Arg::new("branch-name") .long("branch-name") .help("Name of the branch to be created or used as an alias for other services") @@ -1408,13 +1491,6 @@ fn cli() -> Command { .required(false) .value_name("stop-mode"); - let pageserver_config_args = Arg::new("pageserver-config-override") - .long("pageserver-config-override") - .num_args(1) - .action(ArgAction::Append) - .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more") - .required(false); - let remote_ext_config_args = Arg::new("remote-ext-config") .long("remote-ext-config") .num_args(1) @@ -1448,9 +1524,37 @@ fn cli() -> Command { let num_pageservers_arg = Arg::new("num-pageservers") .value_parser(value_parser!(u16)) .long("num-pageservers") - .help("How many pageservers to create (default 1)") - .required(false) - .default_value("1"); + .help("How many pageservers to create (default 1)"); + + let update_catalog = Arg::new("update-catalog") + .value_parser(value_parser!(bool)) + .long("update-catalog") + .help("If set, will set up the catalog for neon_superuser") + .required(false); + + let create_test_user = Arg::new("create-test-user") + .value_parser(value_parser!(bool)) + .long("create-test-user") + .help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`") + .required(false); + + let allow_multiple = Arg::new("allow-multiple") + .help("Allow multiple primary endpoints running on the same branch. 
Shouldn't be used normally, but useful for tests.") .long("allow-multiple") .action(ArgAction::SetTrue) .required(false); + + let instance_id = Arg::new("instance-id") + .long("instance-id") + .help("Identifier used to distinguish storage controller instances (default 1)") + .value_parser(value_parser!(u8)) + .required(false); + + let base_port = Arg::new("base-port") + .long("base-port") + .help("Base port for the storage controller instance identified by instance-id (defaults to pageserver cplane api)") + .value_parser(value_parser!(u16)) + .required(false); Command::new("Neon CLI") .arg_required_else_help(true) @@ -1458,14 +1562,13 @@ .subcommand( Command::new("init") .about("Initialize a new Neon repository, preparing configs for services to start with") - .arg(pageserver_config_args.clone()) .arg(num_pageservers_arg.clone()) .arg( Arg::new("config") .long("config") .required(false) .value_parser(value_parser!(PathBuf)) - .value_name("config"), + .value_name("config") ) .arg(pg_version_arg.clone()) .arg(force_arg) @@ -1473,6 +1576,7 @@ .subcommand( Command::new("timeline") .about("Manage timelines") + .arg_required_else_help(true) .subcommand(Command::new("list") .about("List all timelines, available to this pageserver") .arg(tenant_id_arg.clone())) @@ -1495,8 +1599,7 @@ .about("Import timeline from basebackup directory") .arg(tenant_id_arg.clone()) .arg(timeline_id_arg.clone()) - .arg(Arg::new("node-name").long("node-name") - .help("Name to assign to the imported timeline")) + .arg(branch_name_arg.clone()) .arg(Arg::new("base-tarfile") .long("base-tarfile") .value_parser(value_parser!(PathBuf)) @@ -1527,19 +1630,15 @@ .help("Use this tenant in future CLI commands where tenant_id is needed, but not specified")) .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)")) .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages")) + .arg(Arg::new("placement-policy").value_parser(value_parser!(String)).long("placement-policy").action(ArgAction::Set).help("Placement policy for shards in this tenant")) ) .subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true)) .about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified")) .subcommand(Command::new("config") .arg(tenant_id_arg.clone()) .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))) - .subcommand(Command::new("migrate") - .about("Migrate a tenant from one pageserver to another") - .arg(tenant_id_arg.clone()) - .arg(pageserver_id_arg.clone())) - .subcommand(Command::new("status") - .about("Human readable summary of the tenant's shards and attachment locations") - .arg(tenant_id_arg.clone())) + .subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true)) - .about("Import a tenant that is present in remote storage, and create branches for its timelines")) hmm + .about("Import a tenant that is present in remote storage, and create branches for its timelines")) ) .subcommand( Command::new("pageserver") @@ -1549,7 +1648,7 @@ .subcommand(Command::new("status")) .subcommand(Command::new("start") .about("Start local pageserver") - .arg(pageserver_config_args.clone()) + .arg(timeout_arg.clone()) ) .subcommand(Command::new("stop") .about("Stop local pageserver") .arg(stop_mode_arg.clone()) )
.subcommand(Command::new("restart") .about("Restart local pageserver") - .arg(pageserver_config_args.clone()) - ) - .subcommand(Command::new("set-state") - .arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active")) - .arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active")) - .about("Set scheduling or availability state of pageserver node") - .arg(pageserver_config_args.clone()) + .arg(timeout_arg.clone()) ) ) .subcommand( - Command::new("attachment_service") + Command::new("storage_controller") .arg_required_else_help(true) - .about("Manage attachment_service") - .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone())) - .subcommand(Command::new("stop").about("Stop local pageserver") - .arg(stop_mode_arg.clone())) + .about("Manage storage_controller") + .subcommand(Command::new("start").about("Start storage controller") + .arg(timeout_arg.clone()) + .arg(instance_id.clone()) + .arg(base_port)) + .subcommand(Command::new("stop").about("Stop storage controller") + .arg(stop_mode_arg.clone()) + .arg(instance_id)) ) .subcommand( Command::new("safekeeper") @@ -1582,6 +1679,7 @@ fn cli() -> Command { .about("Start local safekeeper") .arg(safekeeper_id_arg.clone()) .arg(safekeeper_extra_opt_arg.clone()) + .arg(timeout_arg.clone()) ) .subcommand(Command::new("stop") .about("Stop local safekeeper") @@ -1593,6 +1691,7 @@ fn cli() -> Command { .arg(safekeeper_id_arg) .arg(stop_mode_arg.clone()) .arg(safekeeper_extra_opt_arg) + .arg(timeout_arg.clone()) ) ) .subcommand( @@ -1616,17 +1715,23 @@ fn cli() -> Command { .required(false)) .arg(pg_version_arg.clone()) .arg(hot_standby_arg.clone()) + .arg(update_catalog) + .arg(allow_multiple.clone()) ) .subcommand(Command::new("start") .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") .arg(endpoint_id_arg.clone()) .arg(endpoint_pageserver_id_arg.clone()) - .arg(safekeepers_arg) + .arg(safekeepers_arg.clone()) .arg(remote_ext_config_args) + .arg(create_test_user) + .arg(allow_multiple.clone()) + .arg(timeout_arg.clone()) ) .subcommand(Command::new("reconfigure") .about("Reconfigure the endpoint") .arg(endpoint_pageserver_id_arg) + .arg(safekeepers_arg) .arg(endpoint_id_arg.clone()) .arg(tenant_id_arg.clone()) ) @@ -1639,7 +1744,16 @@ fn cli() -> Command { .long("destroy") .action(ArgAction::SetTrue) .required(false) - ) + ) + .arg( + Arg::new("mode") + .help("Postgres shutdown mode, passed to \"pg_ctl -m \"") + .long("mode") + .action(ArgAction::Set) + .required(false) + .value_parser(["smart", "fast", "immediate"]) + .default_value("fast") + ) ) ) @@ -1665,7 +1779,7 @@ fn cli() -> Command { .subcommand( Command::new("start") .about("Start page server and safekeepers") - .arg(pageserver_config_args) + .arg(timeout_arg.clone()) ) .subcommand( Command::new("stop") diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index f40705863b..c8ac5d8981 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -1,17 +1,22 @@ //! Code to manage the storage broker //! -//! In the local test environment, the data for each safekeeper is stored in +//! In the local test environment, the storage broker stores its data directly in //! //! ```text -//! .neon/safekeepers/ +//! .neon //! 
``` +use std::time::Duration; + use anyhow::Context; use camino::Utf8PathBuf; use crate::{background_process, local_env}; -pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { +pub async fn start_broker_process( + env: &local_env::LocalEnv, + retry_timeout: &Duration, +) -> anyhow::Result<()> { let broker = &env.broker; let listen_addr = &broker.listen_addr; @@ -27,6 +32,7 @@ pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<( args, [], background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)), + retry_timeout, || async { let url = broker.client_url(); let status_url = url.join("status").with_context(|| { diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index d3b0366d31..9f879c4b08 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -12,7 +12,7 @@ //! //! The endpoint is managed by the `compute_ctl` binary. When an endpoint is //! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads -//! the basebackup from the pageserver to initialize the the data directory, and +//! the basebackup from the pageserver to initialize the data directory, and //! finally launches the PostgreSQL process. It watches the PostgreSQL process //! until it exits. //! @@ -41,20 +41,25 @@ use std::net::SocketAddr; use std::net::TcpStream; use std::path::PathBuf; use std::process::Command; +use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use anyhow::{anyhow, bail, Context, Result}; +use compute_api::spec::Database; +use compute_api::spec::PgIdent; use compute_api::spec::RemoteExtSpec; +use compute_api::spec::Role; use nix::sys::signal::kill; use nix::sys::signal::Signal; +use pageserver_api::shard::ShardStripeSize; use serde::{Deserialize, Serialize}; use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; -use crate::attachment_service::AttachmentService; use crate::local_env::LocalEnv; use crate::postgresql_conf::PostgresConf; +use crate::storage_controller::StorageController; use compute_api::responses::{ComputeState, ComputeStatus}; use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec}; @@ -122,6 +127,7 @@ impl ComputeControlPlane { http_port: Option, pg_version: u32, mode: ComputeMode, + skip_pg_catalog_updates: bool, ) -> Result> { let pg_port = pg_port.unwrap_or_else(|| self.get_port()); let http_port = http_port.unwrap_or_else(|| self.get_port() + 1); @@ -140,7 +146,7 @@ impl ComputeControlPlane { // before and after start are the same. So, skip catalog updates, // with this we basically test a case of waking up an idle compute, where // we also skip catalog updates in the cloud. 
- skip_pg_catalog_updates: true, + skip_pg_catalog_updates, features: vec![], }); @@ -155,7 +161,7 @@ impl ComputeControlPlane { http_port, pg_port, pg_version, - skip_pg_catalog_updates: true, + skip_pg_catalog_updates, features: vec![], })?, )?; @@ -184,7 +190,7 @@ impl ComputeControlPlane { v.tenant_id == tenant_id && v.timeline_id == timeline_id && v.mode == mode - && v.status() != "stopped" + && v.status() != EndpointStatus::Stopped }); if let Some((key, _)) = duplicates.next() { @@ -223,6 +229,26 @@ pub struct Endpoint { features: Vec, } +#[derive(PartialEq, Eq)] +pub enum EndpointStatus { + Running, + Stopped, + Crashed, + RunningNoPidfile, +} + +impl std::fmt::Display for EndpointStatus { + fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result { + let s = match self { + Self::Running => "running", + Self::Stopped => "stopped", + Self::Crashed => "crashed", + Self::RunningNoPidfile => "running, no pidfile", + }; + write!(writer, "{}", s) + } +} + impl Endpoint { fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result { if !entry.file_type()?.is_dir() { @@ -380,16 +406,16 @@ impl Endpoint { self.endpoint_path().join("pgdata") } - pub fn status(&self) -> &str { + pub fn status(&self) -> EndpointStatus { let timeout = Duration::from_millis(300); let has_pidfile = self.pgdata().join("postmaster.pid").exists(); let can_connect = TcpStream::connect_timeout(&self.pg_address, timeout).is_ok(); match (has_pidfile, can_connect) { - (true, true) => "running", - (false, false) => "stopped", - (true, false) => "crashed", - (false, true) => "running, no pidfile", + (true, true) => EndpointStatus::Running, + (false, false) => EndpointStatus::Stopped, + (true, false) => EndpointStatus::Crashed, + (false, true) => EndpointStatus::RunningNoPidfile, } } @@ -438,7 +464,7 @@ impl Endpoint { } fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> { - // TODO use background_process::stop_process instead + // TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482 let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?; let pid = nix::unistd::Pid::from_raw(pid as i32); @@ -473,6 +499,23 @@ impl Endpoint { .join(",") } + /// Map safekeepers ids to the actual connection strings. 
+    fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
+        let mut safekeeper_connstrings = Vec::new();
+        if self.mode == ComputeMode::Primary {
+            for sk_id in sk_ids {
+                let sk = self
+                    .env
+                    .safekeepers
+                    .iter()
+                    .find(|node| node.id == sk_id)
+                    .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
+                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
+            }
+        }
+        Ok(safekeeper_connstrings)
+    }
+
     pub async fn start(
         &self,
         auth_token: &Option<String>,
@@ -480,8 +523,9 @@ impl Endpoint {
         pageservers: Vec<(Host, u16)>,
         remote_ext_config: Option<&String>,
         shard_stripe_size: usize,
+        create_test_user: bool,
     ) -> Result<()> {
-        if self.status() == "running" {
+        if self.status() == EndpointStatus::Running {
             anyhow::bail!("The endpoint is already running");
         }

@@ -496,18 +540,7 @@ impl Endpoint {
         let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
         assert!(!pageserver_connstring.is_empty());

-        let mut safekeeper_connstrings = Vec::new();
-        if self.mode == ComputeMode::Primary {
-            for sk_id in safekeepers {
-                let sk = self
-                    .env
-                    .safekeepers
-                    .iter()
-                    .find(|node| node.id == sk_id)
-                    .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
-                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
-            }
-        }
+        let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;

         // check for file remote_extensions_spec.json
         // if it is present, read it and pass to compute_ctl
@@ -527,12 +560,31 @@ impl Endpoint {
             format_version: 1.0,
             operation_uuid: None,
             features: self.features.clone(),
+            swap_size_bytes: None,
             cluster: Cluster {
                 cluster_id: None, // project ID: not used
                 name: None,       // project name: not used
                 state: None,
-                roles: vec![],
-                databases: vec![],
+                roles: if create_test_user {
+                    vec![Role {
+                        name: PgIdent::from_str("test").unwrap(),
+                        encrypted_password: None,
+                        options: None,
+                    }]
+                } else {
+                    Vec::new()
+                },
+                databases: if create_test_user {
+                    vec![Database {
+                        name: PgIdent::from_str("neondb").unwrap(),
+                        owner: PgIdent::from_str("test").unwrap(),
+                        options: None,
+                        restrict_conn: false,
+                        invalid: false,
+                    }]
+                } else {
+                    Vec::new()
+                },
                 settings: None,
                 postgresql_conf: Some(postgresql_conf),
             },
@@ -557,11 +609,16 @@ impl Endpoint {
             .open(self.endpoint_path().join("compute.log"))?;

         // Launch compute_ctl
-        println!("Starting postgres node at '{}'", self.connstr());
+        let conn_str = self.connstr("cloud_admin", "postgres");
+        println!("Starting postgres node at '{}'", conn_str);
+        if create_test_user {
+            let conn_str = self.connstr("test", "neondb");
+            println!("Also at '{}'", conn_str);
+        }
         let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
         cmd.args(["--http-port", &self.http_address.port().to_string()])
             .args(["--pgdata", self.pgdata().to_str().unwrap()])
-            .args(["--connstr", &self.connstr()])
+            .args(["--connstr", &conn_str])
             .args([
                 "--spec-path",
                 self.endpoint_path().join("spec.json").to_str().unwrap(),
@@ -583,9 +640,21 @@ impl Endpoint {
         }

         let child = cmd.spawn()?;
+        // set up a scopeguard to kill & wait for the child in case we panic or bail below
+        let child = scopeguard::guard(child, |mut child| {
+            println!("SIGKILL & wait the started process");
+            (|| {
+                // TODO: use another signal that can be caught by the child so it can clean up any children it spawned
+                child.kill().context("SIGKILL child")?;
+                child.wait().context("wait() for child process")?;
+                anyhow::Ok(())
+            })()
+            .with_context(|| format!("scopeguard kill&wait child {child:?}"))
+            .unwrap();
+        });

         // Write down the pid so we can wait for it when we want to stop
-        // TODO use background_process::start_process instead
+        // TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482
         let pid = child.id();
         let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
         std::fs::write(pidfile_path, pid.to_string())?;
@@ -593,7 +662,7 @@ impl Endpoint {
         // Wait for it to start
         let mut attempt = 0;
         const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
-        const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s
+        const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min
         loop {
             attempt += 1;
             match self.get_status().await {
@@ -620,7 +689,9 @@ impl Endpoint {
                     }
                     ComputeStatus::Empty
                     | ComputeStatus::ConfigurationPending
-                    | ComputeStatus::Configuration => {
+                    | ComputeStatus::Configuration
+                    | ComputeStatus::TerminationPending
+                    | ComputeStatus::Terminated => {
                         bail!("unexpected compute status: {:?}", state.status)
                     }
                 }
@@ -634,6 +705,9 @@ impl Endpoint {
             std::thread::sleep(ATTEMPT_INTERVAL);
         }

+        // disarm the scopeguard, let the child outlive this function (and the neon_local invocation)
+        drop(scopeguard::ScopeGuard::into_inner(child));
+
         Ok(())
     }

@@ -668,7 +742,12 @@ impl Endpoint {
         }
     }

-    pub async fn reconfigure(&self, mut pageservers: Vec<(Host, u16)>) -> Result<()> {
+    pub async fn reconfigure(
+        &self,
+        mut pageservers: Vec<(Host, u16)>,
+        stripe_size: Option<ShardStripeSize>,
+        safekeepers: Option<Vec<NodeId>>,
+    ) -> Result<()> {
         let mut spec: ComputeSpec = {
             let spec_path = self.endpoint_path().join("spec.json");
             let file = std::fs::File::open(spec_path)?;
@@ -678,17 +757,17 @@ impl Endpoint {
         let postgresql_conf = self.read_postgresql_conf()?;
         spec.cluster.postgresql_conf = Some(postgresql_conf);

-        // If we weren't given explicit pageservers, query the attachment service
+        // If we weren't given explicit pageservers, query the storage controller
         if pageservers.is_empty() {
-            let attachment_service = AttachmentService::from_env(&self.env);
-            let locate_result = attachment_service.tenant_locate(self.tenant_id).await?;
+            let storage_controller = StorageController::from_env(&self.env);
+            let locate_result = storage_controller.tenant_locate(self.tenant_id).await?;
             pageservers = locate_result
                 .shards
                 .into_iter()
                 .map(|shard| {
                     (
                         Host::parse(&shard.listen_pg_addr)
-                            .expect("Attachment service reported bad hostname"),
+                            .expect("Storage controller reported bad hostname"),
                         shard.listen_pg_port,
                     )
                 })
@@ -698,8 +777,20 @@ impl Endpoint {
         let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
         assert!(!pageserver_connstr.is_empty());
         spec.pageserver_connstring = Some(pageserver_connstr);
+        if stripe_size.is_some() {
+            spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
+        }

-        let client = reqwest::Client::new();
+        // If safekeepers are not specified, don't change them.
+        if let Some(safekeepers) = safekeepers {
+            let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;
+            spec.safekeeper_connstrings = safekeeper_connstrings;
+        }
+
+        let client = reqwest::Client::builder()
+            .timeout(Duration::from_secs(30))
+            .build()
+            .unwrap();
         let response = client
             .post(format!(
                 "http://{}:{}/configure",
@@ -726,32 +817,19 @@ impl Endpoint {
         }
     }

-    pub fn stop(&self, destroy: bool) -> Result<()> {
-        // If we are going to destroy data directory,
-        // use immediate shutdown mode, otherwise,
-        // shutdown gracefully to leave the data directory sane.
-        //
-        // Postgres is always started from scratch, so stop
-        // without destroy only used for testing and debugging.
-        //
-        self.pg_ctl(
-            if destroy {
-                &["-m", "immediate", "stop"]
-            } else {
-                &["stop"]
-            },
-            &None,
-        )?;
+    pub fn stop(&self, mode: &str, destroy: bool) -> Result<()> {
+        self.pg_ctl(&["-m", mode, "stop"], &None)?;

         // Also wait for the compute_ctl process to die. It might have some
         // cleanup work to do after postgres stops, like syncing safekeepers,
         // etc.
         //
-        // If destroying, send it SIGTERM before waiting. Sometimes we do *not*
-        // want this cleanup: tests intentionally do stop when majority of
-        // safekeepers is down, so sync-safekeepers would hang otherwise. This
-        // could be a separate flag though.
-        self.wait_for_compute_ctl_to_exit(destroy)?;
+        // If destroying or stop mode is immediate, send it SIGTERM before
+        // waiting. Sometimes we do *not* want this cleanup: tests intentionally
+        // do stop when majority of safekeepers is down, so sync-safekeepers
+        // would hang otherwise. This could be a separate flag though.
+        let send_sigterm = destroy || mode == "immediate";
+        self.wait_for_compute_ctl_to_exit(send_sigterm)?;
         if destroy {
             println!(
                 "Destroying postgres data directory '{}'",
@@ -762,13 +840,13 @@ impl Endpoint {
         Ok(())
     }

-    pub fn connstr(&self) -> String {
+    pub fn connstr(&self, user: &str, db_name: &str) -> String {
         format!(
             "postgresql://{}@{}:{}/{}",
-            "cloud_admin",
+            user,
             self.pg_address.ip(),
             self.pg_address.port(),
-            "postgres"
+            db_name
         )
     }
 }
diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs
index bb79d36bfc..2af272f388 100644
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -6,7 +6,6 @@
 //! local installations.

 #![deny(clippy::undocumented_unsafe_blocks)]

-pub mod attachment_service;
 mod background_process;
 pub mod broker;
 pub mod endpoint;
@@ -14,3 +13,4 @@ pub mod local_env;
 pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
+pub mod storage_controller;
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 4460fdd3a6..5dbc3bcbbc 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -3,7 +3,7 @@
 //! Now it also provides init method which acts like a stub for proper installation
 //! script which will use local paths.

-use anyhow::{bail, ensure, Context};
+use anyhow::{bail, Context};
 use clap::ValueEnum;
 use postgres_backend::AuthType;
@@ -17,14 +17,17 @@
 use std::net::Ipv4Addr;
 use std::net::SocketAddr;
 use std::path::{Path, PathBuf};
 use std::process::{Command, Stdio};
+use std::time::Duration;
 use utils::{
     auth::{encode_from_key_file, Claims},
     id::{NodeId, TenantId, TenantTimelineId, TimelineId},
 };

+use crate::pageserver::PageServerNode;
+use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
 use crate::safekeeper::SafekeeperNode;

-pub const DEFAULT_PG_VERSION: u32 = 15;
+pub const DEFAULT_PG_VERSION: u32 = 16;

 //
 // This data structures represents neon_local CLI config
 //
 // It is deserialized from the .neon/config file, or the config file passed
 // to 'neon_local init --config=' option. See control_plane/simple.conf for
 // an example.
 //
-#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
+#[derive(PartialEq, Eq, Clone, Debug)]
 pub struct LocalEnv {
     // Base directory for all the nodes (the pageserver, safekeepers and
     // compute endpoints).
     //
     // This is not stored in the config file. Rather, this is the path where the
-    // config file itself is. It is read from the NEON_REPO_DIR env variable or
-    // '.neon' if not given.
-    #[serde(skip)]
+    // config file itself is. It is read from the NEON_REPO_DIR env variable which
+    // must be an absolute path. If the env var is not set, $PWD/.neon is used.
     pub base_data_dir: PathBuf,

     // Path to postgres distribution. It's expected that "bin", "include",
     // "lib", "share" from postgres distribution are there. If at some point
     // in time we will be able to run against vanilla postgres we may split that
     // to four separate paths and match OS-specific installation layout.
-    #[serde(default)]
     pub pg_distrib_dir: PathBuf,

     // Path to pageserver binary.
-    #[serde(default)]
     pub neon_distrib_dir: PathBuf,

     // Default tenant ID to use with the 'neon_local' command line utility, when
     // --tenant_id is not explicitly specified.
-    #[serde(default)]
     pub default_tenant_id: Option<TenantId>,

     // used to issue tokens during e.g pg start
-    #[serde(default)]
     pub private_key_path: PathBuf,

     pub broker: NeonBroker,

+    // Configuration for the storage controller (1 per neon_local environment)
+    pub storage_controller: NeonStorageControllerConf,
+
+    /// This Vec must always contain at least one pageserver
+    /// Populated by [`Self::load_config`] from the individual `pageserver.toml`s.
+    /// NB: not used anymore except for informing users that they need to change their `.neon/config`.
     pub pageservers: Vec<PageServerConf>,

-    #[serde(default)]
     pub safekeepers: Vec<SafekeeperConf>,

-    // Control plane location: if None, we will not run attachment_service. If set, this will
+    // Control plane upcall API for pageserver: if None, we will not run storage_controller. If set, this will
     // be propagated into each pageserver's configuration.
-    #[serde(default)]
     pub control_plane_api: Option<Url>,

+    // Control plane upcall API for storage controller. If set, this will be propagated into the
+    // storage controller's configuration.
+    pub control_plane_compute_hook_api: Option<Url>,
+
     /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
-    #[serde(default)]
     // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
     // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
     // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
+    pub branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
+}
+
+/// On-disk state stored in `.neon/config`.
+#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)]
+#[serde(default, deny_unknown_fields)]
+pub struct OnDiskConfig {
+    pub pg_distrib_dir: PathBuf,
+    pub neon_distrib_dir: PathBuf,
+    pub default_tenant_id: Option<TenantId>,
+    pub private_key_path: PathBuf,
+    pub broker: NeonBroker,
+    pub storage_controller: NeonStorageControllerConf,
+    #[serde(
+        skip_serializing,
+        deserialize_with = "fail_if_pageservers_field_specified"
+    )]
+    pub pageservers: Vec<PageServerConf>,
+    pub safekeepers: Vec<SafekeeperConf>,
+    pub control_plane_api: Option<Url>,
+    pub control_plane_compute_hook_api: Option<Url>,
     branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
 }

+fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result<Vec<PageServerConf>, D::Error>
+where
+    D: serde::Deserializer<'de>,
+{
+    Err(serde::de::Error::custom(
+        "The 'pageservers' field is no longer used; pageserver.toml is now authoritative; \
+         Please remove the `pageservers` from your .neon/config.",
+    ))
+}
+
+/// The description of the neon_local env to be initialized by `neon_local init --config`.
+#[derive(Clone, Debug, Deserialize)]
+#[serde(deny_unknown_fields)]
+pub struct NeonLocalInitConf {
+    // TODO: do we need this? Seems unused
+    pub pg_distrib_dir: Option<PathBuf>,
+    // TODO: do we need this? Seems unused
+    pub neon_distrib_dir: Option<PathBuf>,
+    pub default_tenant_id: TenantId,
+    pub broker: NeonBroker,
+    pub storage_controller: Option<NeonStorageControllerConf>,
+    pub pageservers: Vec<NeonLocalInitPageserverConf>,
+    pub safekeepers: Vec<SafekeeperConf>,
+    pub control_plane_api: Option<Option<Url>>,
+    pub control_plane_compute_hook_api: Option<Option<Url>>,
+}
+
 /// Broker config for cluster internal communication.
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default)]
@@ -93,6 +145,55 @@ pub struct NeonBroker {
     pub listen_addr: SocketAddr,
 }

+/// Storage controller config for the neon_local environment.
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
+#[serde(default)]
+pub struct NeonStorageControllerConf {
+    /// Heartbeat timeout before marking a node offline
+    #[serde(with = "humantime_serde")]
+    pub max_offline: Duration,
+
+    #[serde(with = "humantime_serde")]
+    pub max_warming_up: Duration,
+
+    pub start_as_candidate: bool,
+
+    /// Database url used when running multiple storage controller instances
+    pub database_url: Option<String>,
+
+    /// Threshold for auto-splitting a tenant into shards
+    pub split_threshold: Option<u64>,
+
+    pub max_secondary_lag_bytes: Option<u64>,
+
+    #[serde(with = "humantime_serde")]
+    pub heartbeat_interval: Duration,
+}
+
+impl NeonStorageControllerConf {
+    // Use a shorter pageserver unavailability interval than the default to speed up tests.
+    const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
+
+    const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30);
+
+    // Very tight heartbeat interval to speed up tests
+    const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100);
+}
+
+impl Default for NeonStorageControllerConf {
+    fn default() -> Self {
+        Self {
+            max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
+            max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
+            start_as_candidate: false,
+            database_url: None,
+            split_threshold: None,
+            max_secondary_lag_bytes: None,
+            heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
+        }
+    }
+}
+
 // Dummy Default impl to satisfy Deserialize derive.
 impl Default for NeonBroker {
     fn default() -> Self {
@@ -108,17 +209,16 @@ impl NeonBroker {
     }
 }

+// neon_local needs to know this subset of pageserver configuration.
+// For legacy reasons, this information is duplicated from `pageserver.toml` into `.neon/config`.
+// It can get stale if `pageserver.toml` is changed.
+// TODO(christian): don't store this at all in `.neon/config`, always load it from `pageserver.toml`
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
-#[serde(default)]
+#[serde(default, deny_unknown_fields)]
 pub struct PageServerConf {
-    // node id
     pub id: NodeId,
-
-    // Pageserver connection settings
     pub listen_pg_addr: String,
     pub listen_http_addr: String,
-
-    // auth type used for the PG and HTTP ports
     pub pg_auth_type: AuthType,
     pub http_auth_type: AuthType,
 }
@@ -135,6 +235,40 @@ impl Default for PageServerConf {
     }
 }
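The duration fields on `NeonStorageControllerConf` above use `#[serde(with = "humantime_serde")]`, so `.neon/config` can spell timeouts in human-readable form. A minimal illustrative sketch of that mechanism, assuming the `serde`, `toml`, and `humantime_serde` crates (the struct here is mine, not from the patch):

```rust
use std::time::Duration;

use serde::Deserialize;

#[derive(Deserialize)]
struct Sketch {
    // Strings like "10s", "100ms", or "2h" are parsed into a std Duration.
    #[serde(with = "humantime_serde")]
    max_offline: Duration,
}

fn main() {
    let s: Sketch = toml::from_str(r#"max_offline = "10s""#).unwrap();
    assert_eq!(s.max_offline, Duration::from_secs(10));
}
```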
+/// The toml that can be passed to `neon_local init --config`.
+/// This is a subset of the `pageserver.toml` configuration.
+// TODO(christian): use pageserver_api::config::ConfigToml (PR #7656)
+#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
+pub struct NeonLocalInitPageserverConf {
+    pub id: NodeId,
+    pub listen_pg_addr: String,
+    pub listen_http_addr: String,
+    pub pg_auth_type: AuthType,
+    pub http_auth_type: AuthType,
+    #[serde(flatten)]
+    pub other: HashMap<String, toml::Value>,
+}
+
+impl From<&NeonLocalInitPageserverConf> for PageServerConf {
+    fn from(conf: &NeonLocalInitPageserverConf) -> Self {
+        let NeonLocalInitPageserverConf {
+            id,
+            listen_pg_addr,
+            listen_http_addr,
+            pg_auth_type,
+            http_auth_type,
+            other: _,
+        } = conf;
+        Self {
+            id: *id,
+            listen_pg_addr: listen_pg_addr.clone(),
+            listen_http_addr: listen_http_addr.clone(),
+            pg_auth_type: *pg_auth_type,
+            http_auth_type: *http_auth_type,
+        }
+    }
+}
+
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default)]
 pub struct SafekeeperConf {
@@ -146,6 +280,7 @@ pub struct SafekeeperConf {
     pub remote_storage: Option<String>,
     pub backup_threads: Option<u32>,
     pub auth_enabled: bool,
+    pub listen_addr: Option<String>,
 }

 impl Default for SafekeeperConf {
@@ -159,6 +294,7 @@ impl Default for SafekeeperConf {
             remote_storage: None,
             backup_threads: None,
             auth_enabled: false,
+            listen_addr: None,
         }
     }
 }
@@ -211,19 +347,28 @@ impl LocalEnv {
         }
     }

-    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
-        Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
+    pub fn pg_dir(&self, pg_version: u32, dir_name: &str) -> anyhow::Result<PathBuf> {
+        Ok(self.pg_distrib_dir(pg_version)?.join(dir_name))
     }
+
+    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
+        self.pg_dir(pg_version, "bin")
+    }
+
     pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
-        Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
+        self.pg_dir(pg_version, "lib")
     }

     pub fn pageserver_bin(&self) -> PathBuf {
         self.neon_distrib_dir.join("pageserver")
     }

-    pub fn attachment_service_bin(&self) -> PathBuf {
-        self.neon_distrib_dir.join("attachment_service")
+    pub fn storage_controller_bin(&self) -> PathBuf {
+        // Irrespective of configuration, storage controller binary is always
+        // run from the same location as neon_local. This means that for compatibility
+        // tests that run old pageserver/safekeeper, they still run latest storage controller.
+        let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
+        neon_local_bin_dir.join("storage_controller")
     }

     pub fn safekeeper_bin(&self) -> PathBuf {
@@ -261,6 +406,36 @@ impl LocalEnv {
         }
     }

+    /// Inspect the base data directory and extract the instance id and instance directory path
+    /// for all storage controller instances
+    pub async fn storage_controller_instances(&self) -> std::io::Result<Vec<(u8, PathBuf)>> {
+        let mut instances = Vec::default();
+
+        let dir = std::fs::read_dir(self.base_data_dir.clone())?;
+        for dentry in dir {
+            let dentry = dentry?;
+            let is_dir = dentry.metadata()?.is_dir();
+            let filename = dentry.file_name().into_string().unwrap();
+            let parsed_instance_id = match filename.strip_prefix("storage_controller_") {
+                Some(suffix) => suffix.parse::<u8>().ok(),
+                None => None,
+            };
+
+            let is_instance_dir = is_dir && parsed_instance_id.is_some();
+
+            if !is_instance_dir {
+                continue;
+            }
+
+            instances.push((
+                parsed_instance_id.expect("Checked previously"),
+                dentry.path(),
+            ));
+        }
+
+        Ok(instances)
+    }
+
     pub fn register_branch_mapping(
         &mut self,
         branch_name: String,
@@ -312,44 +487,8 @@ impl LocalEnv {
             .collect()
     }

-    /// Create a LocalEnv from a config file.
-    ///
-    /// Unlike 'load_config', this function fills in any defaults that are missing
-    /// from the config file.
-    pub fn parse_config(toml: &str) -> anyhow::Result<LocalEnv> {
-        let mut env: LocalEnv = toml::from_str(toml)?;
-
-        // Find postgres binaries.
-        // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
-        // Note that later in the code we assume, that distrib dirs follow the same pattern
-        // for all postgres versions.
-        if env.pg_distrib_dir == Path::new("") {
-            if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
-                env.pg_distrib_dir = postgres_bin.into();
-            } else {
-                let cwd = env::current_dir()?;
-                env.pg_distrib_dir = cwd.join("pg_install")
-            }
-        }
-
-        // Find neon binaries.
-        if env.neon_distrib_dir == Path::new("") {
-            env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
-        }
-
-        if env.pageservers.is_empty() {
-            anyhow::bail!("Configuration must contain at least one pageserver");
-        }
-
-        env.base_data_dir = base_path();
-
-        Ok(env)
-    }
-
-    /// Locate and load config
-    pub fn load_config() -> anyhow::Result<LocalEnv> {
-        let repopath = base_path();
-
+    /// Construct `Self` from on-disk state.
+    pub fn load_config(repopath: &Path) -> anyhow::Result<LocalEnv> {
         if !repopath.exists() {
             bail!(
                 "Neon config is not found in {}. You need to run 'neon_local init' first",
@@ -360,38 +499,140 @@ impl LocalEnv {
         // TODO: check that it looks like a neon repository

         // load and parse file
-        let config = fs::read_to_string(repopath.join("config"))?;
-        let mut env: LocalEnv = toml::from_str(config.as_str())?;
+        let config_file_contents = fs::read_to_string(repopath.join("config"))?;
+        let on_disk_config: OnDiskConfig = toml::from_str(config_file_contents.as_str())?;
+        let mut env = {
+            let OnDiskConfig {
+                pg_distrib_dir,
+                neon_distrib_dir,
+                default_tenant_id,
+                private_key_path,
+                broker,
+                storage_controller,
+                pageservers,
+                safekeepers,
+                control_plane_api,
+                control_plane_compute_hook_api,
+                branch_name_mappings,
+            } = on_disk_config;
+            LocalEnv {
+                base_data_dir: repopath.to_owned(),
+                pg_distrib_dir,
+                neon_distrib_dir,
+                default_tenant_id,
+                private_key_path,
+                broker,
+                storage_controller,
+                pageservers,
+                safekeepers,
+                control_plane_api,
+                control_plane_compute_hook_api,
+                branch_name_mappings,
+            }
+        };

-        env.base_data_dir = repopath;
+        // The source of truth for pageserver configuration is the pageserver.toml.
+        assert!(
+            env.pageservers.is_empty(),
+            "we ensure this during deserialization"
+        );
+        env.pageservers = {
+            let iter = std::fs::read_dir(repopath).context("open dir")?;
+            let mut pageservers = Vec::new();
+            for res in iter {
+                let dentry = res?;
+                const PREFIX: &str = "pageserver_";
+                let dentry_name = dentry
+                    .file_name()
+                    .into_string()
+                    .ok()
+                    .with_context(|| format!("non-utf8 dentry: {:?}", dentry.path()))
+                    .unwrap();
+                if !dentry_name.starts_with(PREFIX) {
+                    continue;
+                }
+                if !dentry.file_type().context("determine file type")?.is_dir() {
+                    anyhow::bail!("expected a directory, got {:?}", dentry.path());
+                }
+                let id = dentry_name[PREFIX.len()..]
+                    .parse::<NodeId>()
+                    .with_context(|| format!("parse id from {:?}", dentry.path()))?;
+                // TODO(christian): use pageserver_api::config::ConfigToml (PR #7656)
+                #[derive(serde::Serialize, serde::Deserialize)]
+                // (allow unknown fields, unlike PageServerConf)
+                struct PageserverConfigTomlSubset {
+                    listen_pg_addr: String,
+                    listen_http_addr: String,
+                    pg_auth_type: AuthType,
+                    http_auth_type: AuthType,
+                }
+                let config_toml_path = dentry.path().join("pageserver.toml");
+                let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str(
+                    &std::fs::read_to_string(&config_toml_path)
+                        .with_context(|| format!("read {:?}", config_toml_path))?,
+                )
+                .context("parse pageserver.toml")?;
+                let identity_toml_path = dentry.path().join("identity.toml");
+                #[derive(serde::Serialize, serde::Deserialize)]
+                struct IdentityTomlSubset {
+                    id: NodeId,
+                }
+                let identity_toml: IdentityTomlSubset = toml_edit::de::from_str(
+                    &std::fs::read_to_string(&identity_toml_path)
+                        .with_context(|| format!("read {:?}", identity_toml_path))?,
+                )
+                .context("parse identity.toml")?;
+                let PageserverConfigTomlSubset {
+                    listen_pg_addr,
+                    listen_http_addr,
+                    pg_auth_type,
+                    http_auth_type,
+                } = config_toml;
+                let IdentityTomlSubset {
+                    id: identity_toml_id,
+                } = identity_toml;
+                let conf = PageServerConf {
+                    id: {
+                        anyhow::ensure!(
+                            identity_toml_id == id,
+                            "id mismatch: identity.toml:id={identity_toml_id} pageserver_(.*) id={id}",
+                        );
+                        id
+                    },
+                    listen_pg_addr,
+                    listen_http_addr,
+                    pg_auth_type,
+                    http_auth_type,
+                };
+                pageservers.push(conf);
+            }
+            pageservers
+        };

         Ok(env)
     }

-    pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
-        // Currently, the user first passes a config file with 'neon_local init --config='
-        // We read that in, in `create_config`, and fill any missing defaults. Then it's saved
-        // to .neon/config. TODO: We lose any formatting and comments along the way, which is
-        // a bit sad.
-        let mut conf_content = r#"# This file describes a local deployment of the page server
-# and safekeeeper node. It is read by the 'neon_local' command-line
-# utility.
-"#
-        .to_string();
-
-        // Convert the LocalEnv to a toml file.
-        //
-        // This could be as simple as this:
-        //
-        // conf_content += &toml::to_string_pretty(env)?;
-        //
-        // But it results in a "values must be emitted before tables". I'm not sure
-        // why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
-        // Maybe rust reorders the fields to squeeze avoid padding or something?
-        // In any case, converting to toml::Value first, and serializing that, works.
-        // See https://github.com/alexcrichton/toml-rs/issues/142
-        conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?;
+    pub fn persist_config(&self) -> anyhow::Result<()> {
+        Self::persist_config_impl(
+            &self.base_data_dir,
+            &OnDiskConfig {
+                pg_distrib_dir: self.pg_distrib_dir.clone(),
+                neon_distrib_dir: self.neon_distrib_dir.clone(),
+                default_tenant_id: self.default_tenant_id,
+                private_key_path: self.private_key_path.clone(),
+                broker: self.broker.clone(),
+                storage_controller: self.storage_controller.clone(),
+                pageservers: vec![], // it's skip_serializing anyway
+                safekeepers: self.safekeepers.clone(),
+                control_plane_api: self.control_plane_api.clone(),
+                control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(),
+                branch_name_mappings: self.branch_name_mappings.clone(),
+            },
+        )
+    }

+    pub fn persist_config_impl(base_path: &Path, config: &OnDiskConfig) -> anyhow::Result<()> {
+        let conf_content = &toml::to_string_pretty(config)?;
         let target_config_path = base_path.join("config");
         fs::write(&target_config_path, conf_content).with_context(|| {
             format!(
@@ -403,27 +644,26 @@ impl LocalEnv {

     // this function is used only for testing purposes in CLI e g generate tokens during init
     pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
-        let private_key_path = if self.private_key_path.is_absolute() {
-            self.private_key_path.to_path_buf()
-        } else {
-            self.base_data_dir.join(&self.private_key_path)
-        };
-
+        let private_key_path = self.get_private_key_path();
         let key_data = fs::read(private_key_path)?;
         encode_from_key_file(claims, &key_data)
     }

-    //
-    // Initialize a new Neon repository
-    //
-    pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> {
-        // check if config already exists
-        let base_path = &self.base_data_dir;
-        ensure!(
-            base_path != Path::new(""),
-            "repository base path is missing"
-        );
+    pub fn get_private_key_path(&self) -> PathBuf {
+        if self.private_key_path.is_absolute() {
+            self.private_key_path.to_path_buf()
+        } else {
+            self.base_data_dir.join(&self.private_key_path)
+        }
+    }

+    /// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`].
+    pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> {
+        let base_path = base_path();
+        assert_ne!(base_path, Path::new(""));
+        let base_path = &base_path;
+
+        // create base_path dir
         if base_path.exists() {
             match force {
                 InitForceMode::MustNotExist => {
@@ -455,74 +695,115 @@ impl LocalEnv {
                 }
             }
         }
-
-        if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
-            bail!(
-                "Can't find postgres binary at {}",
-                self.pg_bin_dir(pg_version)?.display()
-            );
-        }
-        for binary in ["pageserver", "safekeeper"] {
-            if !self.neon_distrib_dir.join(binary).exists() {
-                bail!(
-                    "Can't find binary '{binary}' in neon distrib dir '{}'",
-                    self.neon_distrib_dir.display()
-                );
-            }
-        }
-
         if !base_path.exists() {
             fs::create_dir(base_path)?;
         }

+        let NeonLocalInitConf {
+            pg_distrib_dir,
+            neon_distrib_dir,
+            default_tenant_id,
+            broker,
+            storage_controller,
+            pageservers,
+            safekeepers,
+            control_plane_api,
+            control_plane_compute_hook_api,
+        } = conf;
+
+        // Find postgres binaries.
+        // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
+        // Note that later in the code we assume that distrib dirs follow the same pattern
+        // for all postgres versions.
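The comment above describes a three-step lookup that the code just below implements. Condensed into a standalone helper for clarity (illustrative only, not part of the patch):

```rust
use std::path::PathBuf;

// Resolution order: explicit config value, then the POSTGRES_DISTRIB_DIR
// environment variable, then ./pg_install under the current directory.
fn resolve_pg_distrib_dir(configured: Option<PathBuf>) -> PathBuf {
    configured
        .or_else(|| std::env::var_os("POSTGRES_DISTRIB_DIR").map(PathBuf::from))
        .unwrap_or_else(|| {
            std::env::current_dir()
                .expect("determine current directory")
                .join("pg_install")
        })
}
```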
+        let pg_distrib_dir = pg_distrib_dir.unwrap_or_else(|| {
+            if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
+                postgres_bin.into()
+            } else {
+                let cwd = env::current_dir().unwrap();
+                cwd.join("pg_install")
+            }
+        });
+
+        // Find neon binaries.
+        let neon_distrib_dir = neon_distrib_dir
+            .unwrap_or_else(|| env::current_exe().unwrap().parent().unwrap().to_owned());
+
         // Generate keypair for JWT.
         //
         // The keypair is only needed if authentication is enabled in any of the
         // components. For convenience, we generate the keypair even if authentication
         // is not enabled, so that you can easily enable it after the initialization
-        // step. However, if the key generation fails, we treat it as non-fatal if
-        // authentication was not enabled.
-        if self.private_key_path == PathBuf::new() {
-            match generate_auth_keys(
-                base_path.join("auth_private_key.pem").as_path(),
-                base_path.join("auth_public_key.pem").as_path(),
-            ) {
-                Ok(()) => {
-                    self.private_key_path = PathBuf::from("auth_private_key.pem");
-                }
-                Err(e) => {
-                    if !self.auth_keys_needed() {
-                        eprintln!("Could not generate keypair for JWT authentication: {e}");
-                        eprintln!("Continuing anyway because authentication was not enabled");
-                        self.private_key_path = PathBuf::from("auth_private_key.pem");
-                    } else {
-                        return Err(e);
-                    }
-                }
-            }
+        // step.
+        generate_auth_keys(
+            base_path.join("auth_private_key.pem").as_path(),
+            base_path.join("auth_public_key.pem").as_path(),
+        )
+        .context("generate auth keys")?;
+        let private_key_path = PathBuf::from("auth_private_key.pem");
+
+        // create the runtime type because the remaining initialization code below needs
+        // a LocalEnv instance to operate on
+        // TODO: refactor to avoid this, LocalEnv should only be constructed from on-disk state
+        let env = LocalEnv {
+            base_data_dir: base_path.clone(),
+            pg_distrib_dir,
+            neon_distrib_dir,
+            default_tenant_id: Some(default_tenant_id),
+            private_key_path,
+            broker,
+            storage_controller: storage_controller.unwrap_or_default(),
+            pageservers: pageservers.iter().map(Into::into).collect(),
+            safekeepers,
+            control_plane_api: control_plane_api.unwrap_or_default(),
+            control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(),
+            branch_name_mappings: Default::default(),
+        };
+
+        // create endpoints dir
+        fs::create_dir_all(env.endpoints_path())?;
+
+        // create safekeeper dirs
+        for safekeeper in &env.safekeepers {
+            fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?;
         }

-        fs::create_dir_all(self.endpoints_path())?;
-
-        for safekeeper in &self.safekeepers {
-            fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?;
+        // initialize pageserver state
+        for (i, ps) in pageservers.into_iter().enumerate() {
+            let runtime_ps = &env.pageservers[i];
+            assert_eq!(&PageServerConf::from(&ps), runtime_ps);
+            fs::create_dir(env.pageserver_data_dir(ps.id))?;
+            PageServerNode::from_env(&env, runtime_ps)
+                .initialize(ps)
+                .context("pageserver init failed")?;
         }

-        self.persist_config(base_path)
-    }
+        // set up the remote location for the default LocalFs remote storage
+        std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;

-    fn auth_keys_needed(&self) -> bool {
-        self.pageservers.iter().any(|ps| {
-            ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT
-        }) || self.safekeepers.iter().any(|sk| sk.auth_enabled)
+        env.persist_config()
     }
 }

-fn base_path() -> PathBuf {
-    match std::env::var_os("NEON_REPO_DIR") {
-        Some(val) => PathBuf::from(val),
-        None => PathBuf::from(".neon"),
-    }
+pub fn base_path() -> PathBuf {
+    let path = match std::env::var_os("NEON_REPO_DIR") {
+        Some(val) => {
+            let path = PathBuf::from(val);
+            if !path.is_absolute() {
+                // repeat the env var in the error because our default is always absolute
+                panic!("NEON_REPO_DIR must be an absolute path, got {path:?}");
+            }
+            path
+        }
+        None => {
+            let pwd = std::env::current_dir()
+                // technically this can fail but it's quite unlikely
+                .expect("determine current directory");
+            let pwd_abs = pwd.canonicalize().expect("canonicalize current directory");
+            pwd_abs.join(".neon")
+        }
+    };
+    assert!(path.is_absolute());
+    path
 }

 /// Generate a public/private key pair for JWT authentication
@@ -561,31 +842,3 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
     }
     Ok(())
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn simple_conf_parsing() {
-        let simple_conf_toml = include_str!("../simple.conf");
-        let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml);
-        assert!(
-            simple_conf_parse_result.is_ok(),
-            "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}"
-        );
-
-        let string_to_replace = "listen_addr = '127.0.0.1:50051'";
-        let spoiled_url_str = "listen_addr = '!@$XOXO%^&'";
-        let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str);
-        assert!(
-            spoiled_url_toml.contains(spoiled_url_str),
-            "Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}"
-        );
-        let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml);
-        assert!(
-            spoiled_url_parse_result.is_err(),
-            "expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}"
-        );
-    }
-}
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index b6e6b5fdca..e879674d66 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -1,36 +1,37 @@
 //! Code to manage pageservers
 //!
-//! In the local test environment, the pageserver stores its data directly in
+//! In the local test environment, the data for each pageserver is stored in
 //!
-//!   .neon/
+//! ```text
+//!   .neon/pageserver_
+//! ```
 //!
-use std::borrow::Cow;
 use std::collections::HashMap;
 use std::io;
 use std::io::Write;
 use std::num::NonZeroU64;
 use std::path::PathBuf;
-use std::process::Command;
+use std::str::FromStr;
 use std::time::Duration;

 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
-use futures::SinkExt;
 use pageserver_api::models::{
-    self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
+    self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
 use utils::auth::{Claims, Scope};
+use utils::id::NodeId;
 use utils::{
     id::{TenantId, TimelineId},
     lsn::Lsn,
 };

-use crate::local_env::PageServerConf;
+use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf};
 use crate::{background_process, local_env::LocalEnv};

 /// Directory within .neon which will be used by default for LocalFs remote storage.
@@ -74,34 +75,27 @@ impl PageServerNode {
         }
     }

-    /// Merge overrides provided by the user on the command line with our default overides derived from neon_local configuration.
-    ///
-    /// These all end up on the command line of the `pageserver` binary.
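For context on what is being deleted here: the old mechanism rendered every override as a `-c key=value` pair on the pageserver command line (see `pageserver_basic_args`, removed further down). An illustrative reduction of that old pattern, not code from the patch:

```rust
// Old style: the datadir plus each toml fragment as `-c <fragment>`.
fn override_args(datadir: &str, overrides: &[String]) -> Vec<String> {
    let mut args = vec!["-D".to_owned(), datadir.to_owned()];
    for fragment in overrides {
        args.push("-c".to_owned());
        args.push(fragment.clone());
    }
    args
}
```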
-    fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
-        let id = format!("id={}", self.conf.id);
+    fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document {
+        toml_edit::Document::from_str(&format!("id={node_id}")).unwrap()
+    }
+
+    fn pageserver_init_make_toml(
+        &self,
+        conf: NeonLocalInitPageserverConf,
+    ) -> anyhow::Result<toml_edit::Document> {
+        assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");
+
+        // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)
+        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotes etc.
         let pg_distrib_dir_param = format!(
             "pg_distrib_dir='{}'",
             self.env.pg_distrib_dir_raw().display()
         );
-        let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type);
-        let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr);
-
-        let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type);
-        let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr);
-
         let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());

-        let mut overrides = vec![
-            id,
-            pg_distrib_dir_param,
-            http_auth_type_param,
-            pg_auth_type_param,
-            listen_http_addr_param,
-            listen_pg_addr_param,
-            broker_endpoint_param,
-        ];
+        let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param];

         if let Some(control_plane_api) = &self.env.control_plane_api {
             overrides.push(format!(
@@ -109,43 +103,54 @@ impl PageServerNode {
                 control_plane_api.as_str()
             ));

-            // Attachment service uses the same auth as pageserver: if JWT is enabled
+            // Storage controller uses the same auth as pageserver: if JWT is enabled
             // for us, we will also need it to talk to them.
-            if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
+            if matches!(conf.http_auth_type, AuthType::NeonJWT) {
                 let jwt_token = self
                     .env
-                    .generate_auth_token(&Claims::new(None, Scope::PageServerApi))
+                    .generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
                     .unwrap();
                 overrides.push(format!("control_plane_api_token='{}'", jwt_token));
             }
         }

-        if !cli_overrides
-            .iter()
-            .any(|c| c.starts_with("remote_storage"))
-        {
+        if !conf.other.contains_key("remote_storage") {
             overrides.push(format!(
                 "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
             ));
         }

-        if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
-        {
+        if conf.http_auth_type != AuthType::Trust || conf.pg_auth_type != AuthType::Trust {
             // Keys are generated in the toplevel repo dir, pageservers' workdirs
             // are one level below that, so refer to keys with ../
             overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
         }

         // Apply the user-provided overrides
-        overrides.extend(cli_overrides.iter().map(|&c| c.to_owned()));
+        overrides.push({
+            let mut doc =
+                toml_edit::ser::to_document(&conf).expect("we deserialized this from toml earlier");
+            // `id` is written out to `identity.toml` instead of `pageserver.toml`
+            doc.remove("id").expect("it's part of the struct");
+            doc.to_string()
+        });

-        overrides
+        // Turn `overrides` into a toml document.
+        // TODO: above code is legacy code, it should be refactored to use toml_edit directly.
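The merge loop that follows combines the override fragments into one `pageserver.toml` document. The same toml_edit pattern as a self-contained sketch (later fragments win on top-level key collisions):

```rust
use std::str::FromStr;

use toml_edit::Document;

fn merge_fragments(fragments: &[&str]) -> Document {
    let mut merged = Document::new();
    for fragment_str in fragments {
        // Each fragment must itself be a valid toml document.
        let fragment = Document::from_str(fragment_str).expect("valid toml fragment");
        for (key, item) in fragment.iter() {
            merged.insert(key, item.clone());
        }
    }
    merged
}

fn main() {
    let doc = merge_fragments(&["id=1", "listen_pg_addr='127.0.0.1:64000'", "id=2"]);
    print!("{doc}"); // `id` resolves to 2
}
```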
+        let mut config_toml = toml_edit::Document::new();
+        for fragment_str in overrides {
+            let fragment = toml_edit::Document::from_str(&fragment_str)
+                .expect("all fragments in `overrides` are valid toml documents, this function controls that");
+            for (key, item) in fragment.iter() {
+                config_toml.insert(key, item.clone());
+            }
+        }
+        Ok(config_toml)
     }

     /// Initializes a pageserver node by creating its config with the overrides provided.
-    pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
-        // First, run `pageserver --init` and wait for it to write a config into FS and exit.
-        self.pageserver_init(config_overrides)
+    pub fn initialize(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
+        self.pageserver_init(conf)
             .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id))
     }

@@ -161,11 +166,11 @@ impl PageServerNode {
             .expect("non-Unicode path")
     }

-    pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
-        self.start_node(config_overrides, false).await
+    pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
+        self.start_node(retry_timeout).await
     }

-    fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
+    fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
         let datadir = self.repo_path();
         let node_id = self.conf.id;
         println!(
@@ -176,45 +181,90 @@ impl PageServerNode {
         );
         io::stdout().flush()?;

-        if !datadir.exists() {
-            std::fs::create_dir(&datadir)?;
-        }
+        // If the config file we got as a CLI argument includes the `availability_zone`
+        // config, then use that to populate the `metadata.json` file for the pageserver.
+        // In production the deployment orchestrator does this for us.
+        let az_id = conf
+            .other
+            .get("availability_zone")
+            .map(|toml| {
+                let az_str = toml.to_string();
+                // Trim the (") chars from the toml representation
+                if az_str.starts_with('"') && az_str.ends_with('"') {
+                    az_str[1..az_str.len() - 1].to_string()
+                } else {
+                    az_str
+                }
+            })
+            .unwrap_or("local".to_string());

-        let datadir_path_str = datadir.to_str().with_context(|| {
-            format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
-        })?;
-        let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
-        args.push(Cow::Borrowed("--init"));
+        let config = self
+            .pageserver_init_make_toml(conf)
+            .context("make pageserver toml")?;
+        let config_file_path = datadir.join("pageserver.toml");
+        let mut config_file = std::fs::OpenOptions::new()
+            .create_new(true)
+            .write(true)
+            .open(&config_file_path)
+            .with_context(|| format!("open pageserver toml for write: {config_file_path:?}"))?;
+        config_file
+            .write_all(config.to_string().as_bytes())
+            .context("write pageserver toml")?;
+        drop(config_file);

-        let init_output = Command::new(self.env.pageserver_bin())
-            .args(args.iter().map(Cow::as_ref))
-            .envs(self.pageserver_env_variables()?)
-            .output()
-            .with_context(|| format!("Failed to run pageserver init for node {node_id}"))?;
+        let identity_file_path = datadir.join("identity.toml");
+        let mut identity_file = std::fs::OpenOptions::new()
+            .create_new(true)
+            .write(true)
+            .open(&identity_file_path)
+            .with_context(|| format!("open identity toml for write: {identity_file_path:?}"))?;
+        let identity_toml = self.pageserver_make_identity_toml(node_id);
+        identity_file
+            .write_all(identity_toml.to_string().as_bytes())
+            .context("write identity toml")?;
+        drop(identity_file);

-        anyhow::ensure!(
-            init_output.status.success(),
-            "Pageserver init for node {} did not finish successfully, stdout: {}, stderr: {}",
-            node_id,
-            String::from_utf8_lossy(&init_output.stdout),
-            String::from_utf8_lossy(&init_output.stderr),
-        );
+        // TODO: invoke a TBD config-check command to validate that pageserver will start with the written config
+
+        // Write metadata file, used by pageserver on startup to register itself with
+        // the storage controller
+        let metadata_path = datadir.join("metadata.json");
+
+        let (_http_host, http_port) =
+            parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
+        let http_port = http_port.unwrap_or(9898);
+
+        // Intentionally hand-craft JSON: this acts as an implicit format compat test
+        // in case the pageserver-side structure is edited, and reflects the real life
+        // situation: the metadata is written by some other script.
+        std::fs::write(
+            metadata_path,
+            serde_json::to_vec(&pageserver_api::config::NodeMetadata {
+                postgres_host: "localhost".to_string(),
+                postgres_port: self.pg_connection_config.port(),
+                http_host: "localhost".to_string(),
+                http_port,
+                other: HashMap::from([(
+                    "availability_zone_id".to_string(),
+                    serde_json::json!(az_id),
+                )]),
+            })
+            .unwrap(),
+        )
+        .expect("Failed to write metadata file");

         Ok(())
     }

-    async fn start_node(
-        &self,
-        config_overrides: &[&str],
-        update_config: bool,
-    ) -> anyhow::Result<()> {
+    async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
         // TODO: using a thread here because start_process() is not async but we need to call check_status()
         let datadir = self.repo_path();
         print!(
-            "Starting pageserver node {} at '{}' in {:?}",
+            "Starting pageserver node {} at '{}' in {:?}, retrying for {:?}",
             self.conf.id,
             self.pg_connection_config.raw_address(),
-            datadir
+            datadir,
+            retry_timeout
         );
         io::stdout().flush().context("flush stdout")?;
@@ -224,17 +274,15 @@ impl PageServerNode {
                 self.conf.id, datadir,
             )
         })?;
-        let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
-        if update_config {
-            args.push(Cow::Borrowed("--update-config"));
-        }
+        let args = vec!["-D", datadir_path_str];
         background_process::start_process(
             "pageserver",
             &datadir,
             &self.env.pageserver_bin(),
-            args.iter().map(Cow::as_ref),
+            args,
             self.pageserver_env_variables()?,
             background_process::InitialPidFile::Expect(self.pid_file()),
+            retry_timeout,
             || async {
                 let res = tokio::time::timeout(Duration::from_secs(1), self.http_client.status()).await;
@@ -246,23 +294,9 @@ impl PageServerNode {
                 }
             },
         )
-        .await
-    }
+        .await?;

-    fn pageserver_basic_args<'a>(
-        &self,
-        config_overrides: &'a [&'a str],
-        datadir_path_str: &'a str,
-    ) -> Vec<Cow<'a, str>> {
-        let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)];
-
-        let overrides = self.neon_local_overrides(config_overrides);
-        for config_override in overrides {
-            args.push(Cow::Borrowed("-c"));
-            args.push(Cow::Owned(config_override));
-        }
-
-        args
+        Ok(())
     }

     fn pageserver_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
@@ -331,6 +365,11 @@ impl PageServerNode {
                 .remove("compaction_threshold")
                 .map(|x| x.parse::<usize>())
                 .transpose()?,
+            compaction_algorithm: settings
+                .remove("compaction_algorithm")
+                .map(serde_json::from_str)
+                .transpose()
+                .context("Failed to parse 'compaction_algorithm' json")?,
             gc_horizon: settings
                 .remove("gc_horizon")
                 .map(|x| x.parse::<u64>())
@@ -340,6 +379,10 @@ impl PageServerNode {
                 .remove("image_creation_threshold")
                 .map(|x| x.parse::<usize>())
                 .transpose()?,
+            image_layer_creation_check_threshold: settings
+                .remove("image_layer_creation_check_threshold")
+                .map(|x| x.parse::<u8>())
+                .transpose()?,
             pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
             walreceiver_connect_timeout: settings
                 .remove("walreceiver_connect_timeout")
@@ -352,11 +395,6 @@ impl PageServerNode {
                 .map(|x| x.parse::<NonZeroU64>())
                 .transpose()
                 .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
-            trace_read_requests: settings
-                .remove("trace_read_requests")
-                .map(|x| x.parse::<bool>())
-                .transpose()
-                .context("Failed to parse 'trace_read_requests' as bool")?,
             eviction_policy: settings
                 .remove("eviction_policy")
                 .map(serde_json::from_str)
@@ -370,12 +408,26 @@ impl PageServerNode {
             evictions_low_residence_duration_metric_threshold: settings
                 .remove("evictions_low_residence_duration_metric_threshold")
                 .map(|x| x.to_string()),
-            gc_feedback: settings
-                .remove("gc_feedback")
+            heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
+            lazy_slru_download: settings
+                .remove("lazy_slru_download")
                 .map(|x| x.parse::<bool>())
                 .transpose()
-                .context("Failed to parse 'gc_feedback' as bool")?,
-            heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
+                .context("Failed to parse 'lazy_slru_download' as bool")?,
+            timeline_get_throttle: settings
+                .remove("timeline_get_throttle")
+                .map(serde_json::from_str)
+                .transpose()
+                .context("parse `timeline_get_throttle` from json")?,
+            switch_aux_file_policy: settings
+                .remove("switch_aux_file_policy")
+                .map(|x| x.parse::<AuxFilePolicy>())
+                .transpose()
+                .context("Failed to parse 'switch_aux_file_policy'")?,
+            lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
+            lsn_lease_length_for_ts: settings
+                .remove("lsn_lease_length_for_ts")
+                .map(|x| x.to_string()),
         };
         if !settings.is_empty() {
             bail!("Unrecognized tenant settings: {settings:?}")
@@ -384,26 +436,6 @@ impl PageServerNode {
         }
     }

-    pub async fn tenant_create(
-        &self,
-        new_tenant_id: TenantId,
-        generation: Option<u32>,
-        settings: HashMap<&str, &str>,
-    ) -> anyhow::Result<TenantId> {
-        let config = Self::parse_config(settings.clone())?;
-
-        let request = models::TenantCreateRequest {
-            new_tenant_id: TenantShardId::unsharded(new_tenant_id),
-            generation,
-            config,
-            shard_parameters: ShardParameters::default(),
-        };
-        if !settings.is_empty() {
-            bail!("Unrecognized tenant settings: {settings:?}")
-        }
-        Ok(self.http_client.tenant_create(&request).await?)
-    }
-
     pub async fn tenant_config(
         &self,
         tenant_id: TenantId,
@@ -429,6 +461,11 @@ impl PageServerNode {
                 .map(|x| x.parse::<usize>())
                 .transpose()
                 .context("Failed to parse 'compaction_threshold' as an integer")?,
+            compaction_algorithm: settings
+                .remove("compaction_algorithm")
+                .map(serde_json::from_str)
+                .transpose()
+                .context("Failed to parse 'compaction_algorithm' json")?,
             gc_horizon: settings
                 .remove("gc_horizon")
                 .map(|x| x.parse::<u64>())
@@ -440,6 +477,12 @@ impl PageServerNode {
                 .map(|x| x.parse::<usize>())
                 .transpose()
                 .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
+            image_layer_creation_check_threshold: settings
+                .remove("image_layer_creation_check_threshold")
+                .map(|x| x.parse::<u8>())
+                .transpose()
+                .context("Failed to parse 'image_layer_creation_check_threshold' as integer")?,
+
             pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
             walreceiver_connect_timeout: settings
                 .remove("walreceiver_connect_timeout")
@@ -452,11 +495,6 @@ impl PageServerNode {
                 .map(|x| x.parse::<NonZeroU64>())
                 .transpose()
                 .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
-            trace_read_requests: settings
-                .remove("trace_read_requests")
-                .map(|x| x.parse::<bool>())
-                .transpose()
-                .context("Failed to parse 'trace_read_requests' as bool")?,
             eviction_policy: settings
                 .remove("eviction_policy")
                 .map(serde_json::from_str)
@@ -470,12 +508,26 @@ impl PageServerNode {
             evictions_low_residence_duration_metric_threshold: settings
                 .remove("evictions_low_residence_duration_metric_threshold")
                 .map(|x| x.to_string()),
-            gc_feedback: settings
-                .remove("gc_feedback")
+            heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
+            lazy_slru_download: settings
+                .remove("lazy_slru_download")
                 .map(|x| x.parse::<bool>())
                 .transpose()
-                .context("Failed to parse 'gc_feedback' as bool")?,
-            heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
+                .context("Failed to parse 'lazy_slru_download' as bool")?,
+            timeline_get_throttle: settings
+                .remove("timeline_get_throttle")
+                .map(serde_json::from_str)
+                .transpose()
+                .context("parse `timeline_get_throttle` from json")?,
+            switch_aux_file_policy: settings
+                .remove("switch_aux_file_policy")
+                .map(|x| x.parse::<AuxFilePolicy>())
+                .transpose()
+                .context("Failed to parse 'switch_aux_file_policy'")?,
+            lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
+            lsn_lease_length_for_ts: settings
+                .remove("lsn_lease_length_for_ts")
+                .map(|x| x.to_string()),
             }
         };

@@ -495,10 +547,11 @@ impl PageServerNode {
         tenant_shard_id: TenantShardId,
         config: LocationConfig,
         flush_ms: Option<Duration>,
+        lazy: bool,
     ) -> anyhow::Result<()> {
         Ok(self
             .http_client
-            .location_config(tenant_shard_id, config, flush_ms)
+            .location_config(tenant_shard_id, config, flush_ms, lazy)
             .await?)
     }

@@ -509,13 +562,6 @@ impl PageServerNode {
         Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
     }

-    pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
-        Ok(self
-            .http_client
-            .tenant_secondary_download(*tenant_id)
-            .await?)
-    }
-
     pub async fn timeline_create(
         &self,
         tenant_shard_id: TenantShardId,
@@ -555,60 +601,39 @@ impl PageServerNode {
         pg_wal: Option<(Lsn, PathBuf)>,
         pg_version: u32,
     ) -> anyhow::Result<()> {
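`timeline_import` below switches from pushing tarballs through a postgres COPY IN session to streaming them as HTTP request bodies. A condensed sketch of the streaming pattern it adopts, assuming `reqwest` built with its `stream` feature plus `tokio` and `tokio-util` (the helper name is mine):

```rust
use tokio_util::io::ReaderStream;

// Turn a file on disk into an async byte stream usable as an HTTP body,
// so large tarballs never have to be buffered in memory.
async fn file_body(path: &std::path::Path) -> anyhow::Result<reqwest::Body> {
    let file = tokio::fs::File::open(path).await?;
    Ok(reqwest::Body::wrap_stream(ReaderStream::new(file)))
}
```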
-        let (client, conn) = self.page_server_psql_client().await?;
-        // The connection object performs the actual communication with the database,
-        // so spawn it off to run on its own.
-        tokio::spawn(async move {
-            if let Err(e) = conn.await {
-                eprintln!("connection error: {}", e);
-            }
-        });
-        tokio::pin!(client);
-
         // Init base reader
         let (start_lsn, base_tarfile_path) = base;
         let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
-        let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile);
+        let base_tarfile =
+            mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(base_tarfile));

         // Init wal reader if necessary
         let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
             let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
-            let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile);
+            let wal_reader =
+                mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(wal_tarfile));
             (end_lsn, Some(wal_reader))
         } else {
             (start_lsn, None)
         };

-        let copy_in = |reader, cmd| {
-            let client = &client;
-            async move {
-                let writer = client.copy_in(&cmd).await?;
-                let writer = std::pin::pin!(writer);
-                let mut writer = writer.sink_map_err(|e| {
-                    std::io::Error::new(std::io::ErrorKind::Other, format!("{e}"))
-                });
-                let mut reader = std::pin::pin!(reader);
-                writer.send_all(&mut reader).await?;
-                writer.into_inner().finish().await?;
-                anyhow::Ok(())
-            }
-        };
-
         // Import base
-        copy_in(
-            base_tarfile,
-            format!(
-                "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
-            ),
-        )
-        .await?;
-        // Import wal if necessary
-        if let Some(wal_reader) = wal_reader {
-            copy_in(
-                wal_reader,
-                format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"),
+        self.http_client
+            .import_basebackup(
+                tenant_id,
+                timeline_id,
+                start_lsn,
+                end_lsn,
+                pg_version,
+                base_tarfile,
             )
             .await?;
+
+        // Import wal if necessary
+        if let Some(wal_reader) = wal_reader {
+            self.http_client
+                .import_wal(tenant_id, timeline_id, start_lsn, end_lsn, wal_reader)
+                .await?;
         }

         Ok(())
diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs
index 6ac71dfe51..573f1688d5 100644
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -5,8 +5,10 @@
 //! ```text
 //!   .neon/safekeepers/
 //! ```
+use std::future::Future;
 use std::io::Write;
 use std::path::PathBuf;
+use std::time::Duration;
 use std::{io, result};

 use anyhow::Context;
@@ -14,6 +16,7 @@
 use camino::Utf8PathBuf;
 use postgres_connection::PgConnectionConfig;
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
+use utils::auth::{Claims, Scope};
 use utils::{http::error::HttpErrorBody, id::NodeId};

 use crate::{
@@ -32,12 +35,10 @@ pub enum SafekeeperHttpError {
 }

 type Result<T> = result::Result<T, SafekeeperHttpError>;

-#[async_trait::async_trait]
-pub trait ResponseErrorMessageExt: Sized {
-    async fn error_from_body(self) -> Result<Self>;
+pub(crate) trait ResponseErrorMessageExt: Sized {
+    fn error_from_body(self) -> impl Future<Output = Result<Self>> + Send;
 }

-#[async_trait::async_trait]
 impl ResponseErrorMessageExt for reqwest::Response {
     async fn error_from_body(self) -> Result<Self> {
         let status = self.status();
@@ -70,24 +71,31 @@ pub struct SafekeeperNode {
     pub pg_connection_config: PgConnectionConfig,
     pub env: LocalEnv,
     pub http_client: reqwest::Client,
+    pub listen_addr: String,
     pub http_base_url: String,
 }

 impl SafekeeperNode {
     pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
+        let listen_addr = if let Some(ref listen_addr) = conf.listen_addr {
+            listen_addr.clone()
+        } else {
+            "127.0.0.1".to_string()
+        };
         SafekeeperNode {
             id: conf.id,
             conf: conf.clone(),
-            pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
+            pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port),
             env: env.clone(),
             http_client: reqwest::Client::new(),
-            http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
+            http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port),
+            listen_addr,
         }
     }

     /// Construct libpq connection string for connecting to this safekeeper.
-    fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
-        PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port)
+    fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig {
+        PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port)
     }

     pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
@@ -103,16 +111,21 @@ impl SafekeeperNode {
             .expect("non-Unicode path")
     }

-    pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
+    pub async fn start(
+        &self,
+        extra_opts: Vec<String>,
+        retry_timeout: &Duration,
+    ) -> anyhow::Result<()> {
         print!(
-            "Starting safekeeper at '{}' in '{}'",
+            "Starting safekeeper at '{}' in '{}', retrying for {:?}",
             self.pg_connection_config.raw_address(),
-            self.datadir_path().display()
+            self.datadir_path().display(),
+            retry_timeout,
         );
         io::stdout().flush().unwrap();

-        let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
-        let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
+        let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port);
+        let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port);
         let id = self.id;
         let datadir = self.datadir_path();
@@ -139,7 +152,7 @@ impl SafekeeperNode {
             availability_zone,
         ];
         if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
-            let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
+            let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port);
             args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
         }
         if !self.conf.sync {
@@ -190,8 +203,9 @@ impl SafekeeperNode {
             &datadir,
             &self.env.safekeeper_bin(),
             &args,
-            [],
+            self.safekeeper_env_variables()?,
background_process::InitialPidFile::Expect(self.pid_file()), + retry_timeout, || async { match self.check_status().await { Ok(()) => Ok(true), @@ -203,6 +217,18 @@ impl SafekeeperNode { .await } + fn safekeeper_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> { + // Generate a token to connect from safekeeper to peers + if self.conf.auth_enabled { + let token = self + .env + .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?; + Ok(vec![("SAFEKEEPER_AUTH_TOKEN".to_owned(), token)]) + } else { + Ok(Vec::new()) + } + } + /// /// Stop the server. /// diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs new file mode 100644 index 0000000000..c715d6b789 --- /dev/null +++ b/control_plane/src/storage_controller.rs @@ -0,0 +1,823 @@ +use crate::{ + background_process, + local_env::{LocalEnv, NeonStorageControllerConf}, +}; +use camino::{Utf8Path, Utf8PathBuf}; +use hyper::Uri; +use nix::unistd::Pid; +use pageserver_api::{ + controller_api::{ + NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, + TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest, + TenantShardMigrateResponse, + }, + models::{ + TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, + }, + shard::{ShardStripeSize, TenantShardId}, +}; +use pageserver_client::mgmt_api::ResponseErrorMessageExt; +use postgres_backend::AuthType; +use reqwest::Method; +use serde::{de::DeserializeOwned, Deserialize, Serialize}; +use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock}; +use tokio::process::Command; +use tracing::instrument; +use url::Url; +use utils::{ + auth::{encode_from_key_file, Claims, Scope}, + id::{NodeId, TenantId}, +}; + +pub struct StorageController { + env: LocalEnv, + private_key: Option<Vec<u8>>, + public_key: Option<String>, + client: reqwest::Client, + config: NeonStorageControllerConf, + + // The listen address is learned when starting the storage controller, + // hence the use of OnceLock to init it at the right time. + listen: OnceLock<SocketAddr>, +} + +const COMMAND: &str = "storage_controller"; + +const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; + +const DB_NAME: &str = "storage_controller"; + +pub struct NeonStorageControllerStartArgs { + pub instance_id: u8, + pub base_port: Option<u16>, + pub start_timeout: humantime::Duration, +} + +impl NeonStorageControllerStartArgs { + pub fn with_default_instance_id(start_timeout: humantime::Duration) -> Self { + Self { + instance_id: 1, + base_port: None, + start_timeout, + } + } +} + +pub struct NeonStorageControllerStopArgs { + pub instance_id: u8, + pub immediate: bool, +} + +impl NeonStorageControllerStopArgs { + pub fn with_default_instance_id(immediate: bool) -> Self { + Self { + instance_id: 1, + immediate, + } + } +} + +#[derive(Serialize, Deserialize)] +pub struct AttachHookRequest { + pub tenant_shard_id: TenantShardId, + pub node_id: Option<NodeId>, + pub generation_override: Option<u32>, +} + +#[derive(Serialize, Deserialize)] +pub struct AttachHookResponse { + pub gen: Option<u32>, +} + +#[derive(Serialize, Deserialize)] +pub struct InspectRequest { + pub tenant_shard_id: TenantShardId, +} + +#[derive(Serialize, Deserialize)] +pub struct InspectResponse { + pub attachment: Option<(u32, NodeId)>, +} + +impl StorageController { + pub fn from_env(env: &LocalEnv) -> Self { + // Assume all pageservers have symmetric auth configuration: this service + // expects to use one JWT token to talk to all of them.
+ let ps_conf = env + .pageservers + .first() + .expect("Config is validated to contain at least one pageserver"); + let (private_key, public_key) = match ps_conf.http_auth_type { + AuthType::Trust => (None, None), + AuthType::NeonJWT => { + let private_key_path = env.get_private_key_path(); + let private_key = fs::read(private_key_path).expect("failed to read private key"); + + // If pageserver auth is enabled, this implicitly enables auth for this service, + // using the same credentials. + let public_key_path = + camino::Utf8PathBuf::try_from(env.base_data_dir.join("auth_public_key.pem")) + .unwrap(); + + // This service takes keys as a string rather than as a path to a file/dir: read the key into memory. + let public_key = if std::fs::metadata(&public_key_path) + .expect("Can't stat public key") + .is_dir() + { + // Our config may specify a directory: this is for the pageserver's ability to handle multiple + // keys. We only use one key at a time, so, arbitrarily load the first one in the directory. + let mut dir = + std::fs::read_dir(&public_key_path).expect("Can't readdir public key path"); + let dent = dir + .next() + .expect("Empty key dir") + .expect("Error reading key dir"); + + std::fs::read_to_string(dent.path()).expect("Can't read public key") + } else { + std::fs::read_to_string(&public_key_path).expect("Can't read public key") + }; + (Some(private_key), Some(public_key)) + } + }; + + Self { + env: env.clone(), + private_key, + public_key, + client: reqwest::ClientBuilder::new() + .build() + .expect("Failed to construct http client"), + config: env.storage_controller.clone(), + listen: OnceLock::default(), + } + } + + fn storage_controller_instance_dir(&self, instance_id: u8) -> PathBuf { + self.env + .base_data_dir + .join(format!("storage_controller_{}", instance_id)) + } + + fn pid_file(&self, instance_id: u8) -> Utf8PathBuf { + Utf8PathBuf::from_path_buf( + self.storage_controller_instance_dir(instance_id) + .join("storage_controller.pid"), + ) + .expect("non-Unicode path") + } + + /// PIDFile for the postgres instance used to store storage controller state + fn postgres_pid_file(&self) -> Utf8PathBuf { + Utf8PathBuf::from_path_buf( + self.env + .base_data_dir + .join("storage_controller_postgres.pid"), + ) + .expect("non-Unicode path") + } + + /// Find the directory containing postgres subdirectories, such as `bin` and `lib` + /// + /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back + /// to other versions if that one isn't found. Some automated tests create circumstances + /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`. + async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result<Utf8PathBuf> { + let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14]; + + for v in prefer_versions { + let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap(); + if tokio::fs::try_exists(&path).await?
{ + return Ok(path); + } + } + + // Fall through + anyhow::bail!( + "Postgres directory '{}' not found in {}", + dir_name, + self.env.pg_distrib_dir.display(), + ); + } + + pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> { + self.get_pg_dir("bin").await + } + + pub async fn get_pg_lib_dir(&self) -> anyhow::Result<Utf8PathBuf> { + self.get_pg_dir("lib").await + } + + /// Readiness check for our postgres process + async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result<bool> { + let bin_path = pg_bin_dir.join("pg_isready"); + let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)]; + let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?; + + Ok(exitcode.success()) + } + + /// Create our database if it doesn't exist + /// + /// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement + /// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers + /// who just want to run `cargo neon_local` without knowing about diesel. + /// + /// Returns the database url + pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result<String> { + let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port); + + let pg_bin_dir = self.get_pg_bin_dir().await?; + let createdb_path = pg_bin_dir.join("createdb"); + let output = Command::new(&createdb_path) + .args([ + "-h", + "localhost", + "-p", + &format!("{}", postgres_port), + DB_NAME, + ]) + .output() + .await + .expect("Failed to spawn createdb"); + + if !output.status.success() { + let stderr = String::from_utf8(output.stderr).expect("Non-UTF8 output from createdb"); + if stderr.contains("already exists") { + tracing::info!("Database {DB_NAME} already exists"); + } else { + anyhow::bail!("createdb failed with status {}: {stderr}", output.status); + } + } + + Ok(database_url) + } + + pub async fn connect_to_database( + &self, + postgres_port: u16, + ) -> anyhow::Result<( + tokio_postgres::Client, + tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>, + )> { + tokio_postgres::Config::new() + .host("localhost") + .port(postgres_port) + // The user is the ambient operating system user name. + // That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400 + // + // Until we get there, use the ambient operating system user name. + // Recent tokio-postgres versions default to this if the user isn't specified.
+ // But tokio-postgres fork doesn't have this upstream commit: + // https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79 + // => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399 + .user(&whoami::username()) + .dbname(DB_NAME) + .connect(tokio_postgres::NoTls) + .await + .map_err(anyhow::Error::new) + } + + pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> { + let instance_dir = self.storage_controller_instance_dir(start_args.instance_id); + if let Err(err) = tokio::fs::create_dir(&instance_dir).await { + if err.kind() != std::io::ErrorKind::AlreadyExists { + panic!("Failed to create instance dir {instance_dir:?}"); + } + } + + let (listen, postgres_port) = { + if let Some(base_port) = start_args.base_port { + ( + format!("127.0.0.1:{base_port}"), + self.config + .database_url + .expect("--base-port requires NeonStorageControllerConf::database_url") + .port(), + ) + } else { + let listen_url = self.env.control_plane_api.clone().unwrap(); + + let listen = format!( + "{}:{}", + listen_url.host_str().unwrap(), + listen_url.port().unwrap() + ); + + (listen, listen_url.port().unwrap() + 1) + } + }; + + let socket_addr = listen + .parse() + .expect("listen address is a valid socket address"); + self.listen + .set(socket_addr) + .expect("StorageController::listen is only set here"); + + // Do we remove the pid file on stop? + let pg_started = self.is_postgres_running().await?; + let pg_lib_dir = self.get_pg_lib_dir().await?; + + if !pg_started { + // Start a vanilla Postgres process used by the storage controller for persistence. + let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone()) + .unwrap() + .join("storage_controller_db"); + let pg_bin_dir = self.get_pg_bin_dir().await?; + let pg_log_path = pg_data_path.join("postgres.log"); + + if !tokio::fs::try_exists(&pg_data_path).await? { + // Initialize empty database + let initdb_path = pg_bin_dir.join("initdb"); + let mut child = Command::new(&initdb_path) + .envs(vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]) + .args(["-D", pg_data_path.as_ref()]) + .spawn() + .expect("Failed to spawn initdb"); + let status = child.wait().await?; + if !status.success() { + anyhow::bail!("initdb failed with status {status}"); + } + }; + + // Write a minimal config file: + // - Specify the port, since this is chosen dynamically + // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing + // the storage controller we don't want a slow local disk to interfere with that. + // + // NB: it's important that we rewrite this file on each start command so we propagate changes + // from `LocalEnv`'s config file (`.neon/config`). 
tokio::fs::write( + &pg_data_path.join("postgresql.conf"), + format!("port = {}\nfsync=off\n", postgres_port), + ) + .await?; + + println!("Starting storage controller database..."); + let db_start_args = [ + "-w", + "-D", + pg_data_path.as_ref(), + "-l", + pg_log_path.as_ref(), + "start", + ]; + + background_process::start_process( + "storage_controller_db", + &self.env.base_data_dir, + pg_bin_dir.join("pg_ctl").as_std_path(), + db_start_args, + vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ], + background_process::InitialPidFile::Create(self.postgres_pid_file()), + &start_args.start_timeout, + || self.pg_isready(&pg_bin_dir, postgres_port), + ) + .await?; + + self.setup_database(postgres_port).await?; + } + + let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port); + + // We support running a startup SQL script to fiddle with the database before we launch storcon. + // This is used by the test suite. + let startup_script_path = self + .env + .base_data_dir + .join("storage_controller_db.startup.sql"); + let startup_script = match tokio::fs::read_to_string(&startup_script_path).await { + Ok(script) => { + tokio::fs::remove_file(startup_script_path).await?; + script + } + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + // always run some startup script so that this code path doesn't bit rot + "BEGIN; COMMIT;".to_string() + } else { + anyhow::bail!("Failed to read startup script: {e}") + } + } + }; + let (mut client, conn) = self.connect_to_database(postgres_port).await?; + let conn = tokio::spawn(conn); + let tx = client.build_transaction(); + let tx = tx.start().await?; + tx.batch_execute(&startup_script).await?; + tx.commit().await?; + drop(client); + conn.await??; + + let listen = self + .listen + .get() + .expect("cell is set earlier in this function"); + let address_for_peers = Uri::builder() + .scheme("http") + .authority(format!("{}:{}", listen.ip(), listen.port())) + .path_and_query("") + .build() + .unwrap(); + + let mut args = vec![ + "-l", + &listen.to_string(), + "--dev", + "--database-url", + &database_url, + "--max-offline-interval", + &humantime::Duration::from(self.config.max_offline).to_string(), + "--max-warming-up-interval", + &humantime::Duration::from(self.config.max_warming_up).to_string(), + "--heartbeat-interval", + &humantime::Duration::from(self.config.heartbeat_interval).to_string(), + "--address-for-peers", + &address_for_peers.to_string(), + ] + .into_iter() + .map(|s| s.to_string()) + .collect::<Vec<_>>(); + + if self.config.start_as_candidate { + args.push("--start-as-candidate".to_string()); + } + + if let Some(private_key) = &self.private_key { + let claims = Claims::new(None, Scope::PageServerApi); + let jwt_token = + encode_from_key_file(&claims, private_key).expect("failed to generate jwt token"); + args.push(format!("--jwt-token={jwt_token}")); + + let peer_claims = Claims::new(None, Scope::Admin); + let peer_jwt_token = encode_from_key_file(&peer_claims, private_key) + .expect("failed to generate jwt token"); + args.push(format!("--peer-jwt-token={peer_jwt_token}")); + } + + if let Some(public_key) = &self.public_key { + args.push(format!("--public-key=\"{public_key}\"")); + } + + if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api { + args.push(format!( + "--compute-hook-url={control_plane_compute_hook_api}" + )); + } + + if let Some(split_threshold) = self.config.split_threshold.as_ref() {
args.push(format!("--split-threshold={split_threshold}")) + } + + if let Some(lag) = self.config.max_secondary_lag_bytes.as_ref() { + args.push(format!("--max-secondary-lag-bytes={lag}")) + } + + args.push(format!( + "--neon-local-repo-dir={}", + self.env.base_data_dir.display() + )); + + background_process::start_process( + COMMAND, + &instance_dir, + &self.env.storage_controller_bin(), + args, + vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ], + background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)), + &start_args.start_timeout, + || async { + match self.ready().await { + Ok(_) => Ok(true), + Err(_) => Ok(false), + } + }, + ) + .await?; + + Ok(()) + } + + pub async fn stop(&self, stop_args: NeonStorageControllerStopArgs) -> anyhow::Result<()> { + background_process::stop_process( + stop_args.immediate, + COMMAND, + &self.pid_file(stop_args.instance_id), + )?; + + let storcon_instances = self.env.storage_controller_instances().await?; + for (instance_id, instanced_dir_path) in storcon_instances { + if instance_id == stop_args.instance_id { + continue; + } + + let pid_file = instanced_dir_path.join("storage_controller.pid"); + let pid = tokio::fs::read_to_string(&pid_file) + .await + .map_err(|err| { + anyhow::anyhow!("Failed to read storcon pid file at {pid_file:?}: {err}") + })? + .parse::() + .expect("pid is valid i32"); + + let other_proc_alive = !background_process::process_has_stopped(Pid::from_raw(pid))?; + if other_proc_alive { + // There is another storage controller instance running, so we return + // and leave the database running. + return Ok(()); + } + } + + let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); + let pg_bin_dir = self.get_pg_bin_dir().await?; + + println!("Stopping storage controller database..."); + let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"]; + let stop_status = Command::new(pg_bin_dir.join("pg_ctl")) + .args(pg_stop_args) + .spawn()? + .wait() + .await?; + if !stop_status.success() { + match self.is_postgres_running().await { + Ok(false) => { + println!("Storage controller database is already stopped"); + return Ok(()); + } + Ok(true) => { + anyhow::bail!("Failed to stop storage controller database"); + } + Err(err) => { + anyhow::bail!("Failed to stop storage controller database: {err}"); + } + } + } + + Ok(()) + } + + async fn is_postgres_running(&self) -> anyhow::Result { + let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); + let pg_bin_dir = self.get_pg_bin_dir().await?; + + let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"]; + let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl")) + .args(pg_status_args) + .spawn()? + .wait() + .await?; + + // pg_ctl status returns this exit code if postgres is not running: in this case it is + // fine that stop failed. Otherwise it is an error that stop failed. 
+ const PG_STATUS_NOT_RUNNING: i32 = 3; + const PG_NO_DATA_DIR: i32 = 4; + const PG_STATUS_RUNNING: i32 = 0; + match status_exitcode.code() { + Some(PG_STATUS_NOT_RUNNING) => Ok(false), + Some(PG_NO_DATA_DIR) => Ok(false), + Some(PG_STATUS_RUNNING) => Ok(true), + Some(code) => Err(anyhow::anyhow!( + "pg_ctl status returned unexpected status code: {:?}", + code + )), + None => Err(anyhow::anyhow!("pg_ctl status returned no status code")), + } + } + + fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> { + let category = match path.find('/') { + Some(idx) => &path[..idx], + None => path, + }; + + match category { + "status" | "ready" => Ok(None), + "control" | "debug" => Ok(Some(Claims::new(None, Scope::Admin))), + "v1" => Ok(Some(Claims::new(None, Scope::PageServerApi))), + _ => Err(anyhow::anyhow!("Failed to determine claims for {}", path)), + } + } + + /// Simple HTTP request wrapper for calling into storage controller + async fn dispatch<RQ, RS>( + &self, + method: reqwest::Method, + path: String, + body: Option<RQ>, + ) -> anyhow::Result<RS> + where + RQ: Serialize + Sized, + RS: DeserializeOwned + Sized, + { + // In the special case of the `storage_controller start` subcommand, we wish + // to use the API endpoint of the newly started storage controller in order + // to pass the readiness check. In this scenario [`Self::listen`] will be set + // (see [`Self::start`]). + // + // Otherwise, we infer the storage controller api endpoint from the configured + // control plane API. + let url = if let Some(socket_addr) = self.listen.get() { + Url::from_str(&format!( + "http://{}:{}/{path}", + socket_addr.ip().to_canonical(), + socket_addr.port() + )) + .unwrap() + } else { + // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out + // for general purpose API access. + let listen_url = self.env.control_plane_api.clone().unwrap(); + Url::from_str(&format!( + "http://{}:{}/{path}", + listen_url.host_str().unwrap(), + listen_url.port().unwrap() + )) + .unwrap() + }; + + let mut builder = self.client.request(method, url); + if let Some(body) = body { + builder = builder.json(&body) + } + if let Some(private_key) = &self.private_key { + println!("Getting claims for path {}", path); + if let Some(required_claims) = Self::get_claims_for_path(&path)? { + println!("Got claims {:?} for path {}", required_claims, path); + let jwt_token = encode_from_key_file(&required_claims, private_key)?; + builder = builder.header( + reqwest::header::AUTHORIZATION, + format!("Bearer {jwt_token}"), + ); + } + } + + let response = builder.send().await?; + let response = response.error_from_body().await?; + + Ok(response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?)
+ } + + /// Call into the attach_hook API, for use before handing out attachments to pageservers + #[instrument(skip(self))] + pub async fn attach_hook( + &self, + tenant_shard_id: TenantShardId, + pageserver_id: NodeId, + ) -> anyhow::Result<Option<u32>> { + let request = AttachHookRequest { + tenant_shard_id, + node_id: Some(pageserver_id), + generation_override: None, + }; + + let response = self + .dispatch::<_, AttachHookResponse>( + Method::POST, + "debug/v1/attach-hook".to_string(), + Some(request), + ) + .await?; + + Ok(response.gen) + } + + #[instrument(skip(self))] + pub async fn inspect( + &self, + tenant_shard_id: TenantShardId, + ) -> anyhow::Result<Option<(u32, NodeId)>> { + let request = InspectRequest { tenant_shard_id }; + + let response = self + .dispatch::<_, InspectResponse>( + Method::POST, + "debug/v1/inspect".to_string(), + Some(request), + ) + .await?; + + Ok(response.attachment) + } + + #[instrument(skip(self))] + pub async fn tenant_create( + &self, + req: TenantCreateRequest, + ) -> anyhow::Result<TenantCreateResponse> { + self.dispatch(Method::POST, "v1/tenant".to_string(), Some(req)) + .await + } + + #[instrument(skip(self))] + pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result<TenantCreateResponse> { + self.dispatch::<(), TenantCreateResponse>( + Method::POST, + format!("debug/v1/tenant/{tenant_id}/import"), + None, + ) + .await + } + + #[instrument(skip(self))] + pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> { + self.dispatch::<(), _>( + Method::GET, + format!("debug/v1/tenant/{tenant_id}/locate"), + None, + ) + .await + } + + #[instrument(skip(self))] + pub async fn tenant_migrate( + &self, + tenant_shard_id: TenantShardId, + node_id: NodeId, + ) -> anyhow::Result<TenantShardMigrateResponse> { + self.dispatch( + Method::PUT, + format!("control/v1/tenant/{tenant_shard_id}/migrate"), + Some(TenantShardMigrateRequest { + tenant_shard_id, + node_id, + }), + ) + .await + } + + #[instrument(skip(self), fields(%tenant_id, %new_shard_count))] + pub async fn tenant_split( + &self, + tenant_id: TenantId, + new_shard_count: u8, + new_stripe_size: Option<ShardStripeSize>, + ) -> anyhow::Result<TenantShardSplitResponse> { + self.dispatch( + Method::PUT, + format!("control/v1/tenant/{tenant_id}/shard_split"), + Some(TenantShardSplitRequest { + new_shard_count, + new_stripe_size, + }), + ) + .await + } + + #[instrument(skip_all, fields(node_id=%req.node_id))] + pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> { + self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req)) + .await + } + + #[instrument(skip_all, fields(node_id=%req.node_id))] + pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> { + self.dispatch::<_, ()>( + Method::PUT, + format!("control/v1/node/{}/config", req.node_id), + Some(req), + ) + .await + } + + pub async fn node_list(&self) -> anyhow::Result<Vec<NodeDescribeResponse>> { + self.dispatch::<(), Vec<NodeDescribeResponse>>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await + } + + #[instrument(skip(self))] + pub async fn ready(&self) -> anyhow::Result<()> { + self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None) + .await + } + + #[instrument(skip_all, fields(%tenant_id, timeline_id=%req.new_timeline_id))] + pub async fn tenant_timeline_create( + &self, + tenant_id: TenantId, + req: TimelineCreateRequest, + ) -> anyhow::Result<TimelineInfo> { + self.dispatch( + Method::POST, + format!("v1/tenant/{tenant_id}/timeline"), + Some(req), + ) + .await + } +} diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml new file mode 100644 index 0000000000..be69208d0d ---
/dev/null +++ b/control_plane/storcon_cli/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "storcon_cli" +version = "0.1.0" +edition.workspace = true +license.workspace = true + + +[dependencies] +anyhow.workspace = true +clap.workspace = true +comfy-table.workspace = true +futures.workspace = true +humantime.workspace = true +hyper.workspace = true +pageserver_api.workspace = true +pageserver_client.workspace = true +reqwest.workspace = true +serde.workspace = true +serde_json = { workspace = true, features = ["raw_value"] } +storage_controller_client.workspace = true +thiserror.workspace = true +tokio.workspace = true +tracing.workspace = true +utils.workspace = true +workspace_hack.workspace = true + diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs new file mode 100644 index 0000000000..2a81a3d825 --- /dev/null +++ b/control_plane/storcon_cli/src/main.rs @@ -0,0 +1,943 @@ +use futures::StreamExt; +use std::{str::FromStr, time::Duration}; + +use clap::{Parser, Subcommand}; +use pageserver_api::{ + controller_api::{ + NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest, + TenantDescribeResponse, TenantPolicyRequest, + }, + models::{ + EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, + ShardParameters, TenantConfig, TenantConfigRequest, TenantShardSplitRequest, + TenantShardSplitResponse, + }, + shard::{ShardStripeSize, TenantShardId}, +}; +use pageserver_client::mgmt_api::{self}; +use reqwest::{Method, StatusCode, Url}; +use utils::id::{NodeId, TenantId}; + +use pageserver_api::controller_api::{ + NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, + TenantShardMigrateRequest, TenantShardMigrateResponse, +}; +use storage_controller_client::control_api::Client; + +#[derive(Subcommand, Debug)] +enum Command { + /// Register a pageserver with the storage controller. This shouldn't usually be necessary, + /// since pageservers auto-register when they start up + NodeRegister { + #[arg(long)] + node_id: NodeId, + + #[arg(long)] + listen_pg_addr: String, + #[arg(long)] + listen_pg_port: u16, + + #[arg(long)] + listen_http_addr: String, + #[arg(long)] + listen_http_port: u16, + #[arg(long)] + availability_zone_id: String, + }, + + /// Modify a node's configuration in the storage controller + NodeConfigure { + #[arg(long)] + node_id: NodeId, + + /// Availability is usually auto-detected based on heartbeats. Set 'offline' here to + /// manually mark a node offline + #[arg(long)] + availability: Option<NodeAvailabilityArg>, + /// Scheduling policy controls whether tenant shards may be scheduled onto this node. + #[arg(long)] + scheduling: Option<NodeSchedulingPolicy>, + }, + NodeDelete { + #[arg(long)] + node_id: NodeId, + }, + /// Modify a tenant's policies in the storage controller + TenantPolicy { + #[arg(long)] + tenant_id: TenantId, + /// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`), + /// or is in the normal attached state with N secondary locations (`attached:N`) + #[arg(long)] + placement: Option<PlacementPolicyArg>, + /// Scheduling policy enables pausing the controller's scheduling activity involving this tenant. `active` is normal, + /// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents + /// all reconciliation activity including for scheduling changes already made. `pause` and `stop` can make a tenant + /// unavailable, and are only for use in emergencies.
+ #[arg(long)] + scheduling: Option<ShardSchedulingPolicyArg>, + }, + /// List nodes known to the storage controller + Nodes {}, + /// List tenants known to the storage controller + Tenants {}, + /// Create a new tenant in the storage controller, and by extension on pageservers. + TenantCreate { + #[arg(long)] + tenant_id: TenantId, + }, + /// Delete a tenant in the storage controller, and by extension on pageservers. + TenantDelete { + #[arg(long)] + tenant_id: TenantId, + }, + /// Split an existing tenant into a higher number of shards than its current shard count. + TenantShardSplit { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + shard_count: u8, + /// Optional, in 8kiB pages. e.g. set 2048 for 16MB stripes. + #[arg(long)] + stripe_size: Option<u32>, + }, + /// Migrate the attached location for a tenant shard to a specific pageserver. + TenantShardMigrate { + #[arg(long)] + tenant_shard_id: TenantShardId, + #[arg(long)] + node: NodeId, + }, + /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure + /// that is passed through to pageservers, and does not affect storage controller behavior. + TenantConfig { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + config: String, + }, + /// Print details about a particular tenant, including all its shards' states. + TenantDescribe { + #[arg(long)] + tenant_id: TenantId, + }, + /// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary + /// mode so that it can warm up content on a pageserver. + TenantWarmup { + #[arg(long)] + tenant_id: TenantId, + }, + /// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate + /// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region. + TenantDrop { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + unclean: bool, + }, + NodeDrop { + #[arg(long)] + node_id: NodeId, + #[arg(long)] + unclean: bool, + }, + TenantSetTimeBasedEviction { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + period: humantime::Duration, + #[arg(long)] + threshold: humantime::Duration, + }, + // Migrate away from a set of specified pageservers by moving the primary attachments to pageservers + // outside of the specified set. + BulkMigrate { + // Set of pageserver node ids to drain. + #[arg(long)] + nodes: Vec<NodeId>, + // Optional: migration concurrency (default is 8) + #[arg(long)] + concurrency: Option<usize>, + // Optional: maximum number of shards to migrate + #[arg(long)] + max_shards: Option<usize>, + // Optional: when set to true, nothing is migrated, but the plan is printed to stdout + #[arg(long)] + dry_run: Option<bool>, + }, + /// Start draining the specified pageserver. + /// The drain is complete when the scheduling policy returns to active. + StartDrain { + #[arg(long)] + node_id: NodeId, + }, + /// Cancel draining the specified pageserver and wait for `timeout` + /// for the operation to be canceled. May be retried. + CancelDrain { + #[arg(long)] + node_id: NodeId, + #[arg(long)] + timeout: humantime::Duration, + }, + /// Start filling the specified pageserver. + /// The fill is complete when the scheduling policy returns to active. + StartFill { + #[arg(long)] + node_id: NodeId, + }, + /// Cancel filling the specified pageserver and wait for `timeout` + /// for the operation to be canceled. May be retried.
+ CancelFill { + #[arg(long)] + node_id: NodeId, + #[arg(long)] + timeout: humantime::Duration, + }, +} + +#[derive(Parser)] +#[command( + author, + version, + about, + long_about = "CLI for Storage Controller Support/Debug" +)] +#[command(arg_required_else_help(true))] +struct Cli { + #[arg(long)] + /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local` + api: Url, + + #[arg(long)] + /// JWT token for authenticating with storage controller. Depending on the API used, this + /// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint + /// a token with both scopes to use with this tool. + jwt: Option<String>, + + #[command(subcommand)] + command: Command, +} + +#[derive(Debug, Clone)] +struct PlacementPolicyArg(PlacementPolicy); + +impl FromStr for PlacementPolicyArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result<Self, Self::Err> { + match s { + "detached" => Ok(Self(PlacementPolicy::Detached)), + "secondary" => Ok(Self(PlacementPolicy::Secondary)), + _ if s.starts_with("attached:") => { + let mut splitter = s.split(':'); + let _prefix = splitter.next().unwrap(); + match splitter.next().and_then(|s| s.parse::<usize>().ok()) { + Some(n) => Ok(Self(PlacementPolicy::Attached(n))), + None => Err(anyhow::anyhow!( + "Invalid format '{s}', a valid example is 'attached:1'" + )), + } + } + _ => Err(anyhow::anyhow!( + "Unknown placement policy '{s}', try detached,secondary,attached:<n>" + )), + } + } +} + +#[derive(Debug, Clone)] +struct ShardSchedulingPolicyArg(ShardSchedulingPolicy); + +impl FromStr for ShardSchedulingPolicyArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result<Self, Self::Err> { + match s { + "active" => Ok(Self(ShardSchedulingPolicy::Active)), + "essential" => Ok(Self(ShardSchedulingPolicy::Essential)), + "pause" => Ok(Self(ShardSchedulingPolicy::Pause)), + "stop" => Ok(Self(ShardSchedulingPolicy::Stop)), + _ => Err(anyhow::anyhow!( + "Unknown scheduling policy '{s}', try active,essential,pause,stop" + )), + } + } +} + +#[derive(Debug, Clone)] +struct NodeAvailabilityArg(NodeAvailabilityWrapper); + +impl FromStr for NodeAvailabilityArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result<Self, Self::Err> { + match s { + "active" => Ok(Self(NodeAvailabilityWrapper::Active)), + "offline" => Ok(Self(NodeAvailabilityWrapper::Offline)), + _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")), + } + } +} + +async fn wait_for_scheduling_policy<F>( + client: Client, + node_id: NodeId, + timeout: Duration, + f: F, +) -> anyhow::Result<NodeSchedulingPolicy> +where + F: Fn(NodeSchedulingPolicy) -> bool, +{ + let waiter = tokio::time::timeout(timeout, async move { + loop { + let node = client + .dispatch::<(), NodeDescribeResponse>( + Method::GET, + format!("control/v1/node/{node_id}"), + None, + ) + .await?; + + if f(node.scheduling) { + return Ok::<NodeSchedulingPolicy, mgmt_api::Error>(node.scheduling); + } + } + }); + + Ok(waiter.await??)
+} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let cli = Cli::parse(); + + let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone()); + + let mut trimmed = cli.api.to_string(); + trimmed.pop(); + let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref()); + + match cli.command { + Command::NodeRegister { + node_id, + listen_pg_addr, + listen_pg_port, + listen_http_addr, + listen_http_port, + availability_zone_id, + } => { + storcon_client + .dispatch::<_, ()>( + Method::POST, + "control/v1/node".to_string(), + Some(NodeRegisterRequest { + node_id, + listen_pg_addr, + listen_pg_port, + listen_http_addr, + listen_http_port, + availability_zone_id, + }), + ) + .await?; + } + Command::TenantCreate { tenant_id } => { + storcon_client + .dispatch::<_, ()>( + Method::POST, + "v1/tenant".to_string(), + Some(TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation: None, + shard_parameters: ShardParameters::default(), + placement_policy: Some(PlacementPolicy::Attached(1)), + config: TenantConfig::default(), + }), + ) + .await?; + } + Command::TenantDelete { tenant_id } => { + let status = vps_client + .tenant_delete(TenantShardId::unsharded(tenant_id)) + .await?; + tracing::info!("Delete status: {}", status); + } + Command::Nodes {} => { + let mut resp = storcon_client + .dispatch::<(), Vec<NodeDescribeResponse>>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + + resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr)); + + let mut table = comfy_table::Table::new(); + table.set_header(["Id", "Hostname", "Scheduling", "Availability"]); + for node in resp { + table.add_row([ + format!("{}", node.id), + node.listen_http_addr, + format!("{:?}", node.scheduling), + format!("{:?}", node.availability), + ]); + } + println!("{table}"); + } + Command::NodeConfigure { + node_id, + availability, + scheduling, + } => { + let req = NodeConfigureRequest { + node_id, + availability: availability.map(|a| a.0), + scheduling, + }; + storcon_client + .dispatch::<_, ()>( + Method::PUT, + format!("control/v1/node/{node_id}/config"), + Some(req), + ) + .await?; + } + Command::Tenants {} => { + let mut resp = storcon_client + .dispatch::<(), Vec<TenantDescribeResponse>>( + Method::GET, + "control/v1/tenant".to_string(), + None, + ) + .await?; + + resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id)); + + let mut table = comfy_table::Table::new(); + table.set_header([ + "TenantId", + "ShardCount", + "StripeSize", + "Placement", + "Scheduling", + ]); + for tenant in resp { + let shard_zero = tenant.shards.into_iter().next().unwrap(); + table.add_row([ + format!("{}", tenant.tenant_id), + format!("{}", shard_zero.tenant_shard_id.shard_count.literal()), + format!("{:?}", tenant.stripe_size), + format!("{:?}", tenant.policy), + format!("{:?}", shard_zero.scheduling_policy), + ]); + } + + println!("{table}"); + } + Command::TenantPolicy { + tenant_id, + placement, + scheduling, + } => { + let req = TenantPolicyRequest { + scheduling: scheduling.map(|s| s.0), + placement: placement.map(|p| p.0), + }; + storcon_client + .dispatch::<_, ()>( + Method::PUT, + format!("control/v1/tenant/{tenant_id}/policy"), + Some(req), + ) + .await?; + } + Command::TenantShardSplit { + tenant_id, + shard_count, + stripe_size, + } => { + let req = TenantShardSplitRequest { + new_shard_count: shard_count, + new_stripe_size: stripe_size.map(ShardStripeSize), + }; + + let response = storcon_client + .dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>( + Method::PUT, + format!("control/v1/tenant/{tenant_id}/shard_split"), + Some(req), + )
.await?; + println!( + "Split tenant {} into {} shards: {}", + tenant_id, + shard_count, + response + .new_shards + .iter() + .map(|s| format!("{:?}", s)) + .collect::<Vec<_>>() + .join(",") + ); + } + Command::TenantShardMigrate { + tenant_shard_id, + node, + } => { + let req = TenantShardMigrateRequest { + tenant_shard_id, + node_id: node, + }; + + storcon_client + .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>( + Method::PUT, + format!("control/v1/tenant/{tenant_shard_id}/migrate"), + Some(req), + ) + .await?; + } + Command::TenantConfig { tenant_id, config } => { + let tenant_conf = serde_json::from_str(&config)?; + + vps_client + .tenant_config(&TenantConfigRequest { + tenant_id, + config: tenant_conf, + }) + .await?; + } + Command::TenantDescribe { tenant_id } => { + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await?; + let shards = describe_response.shards; + let mut table = comfy_table::Table::new(); + table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]); + for shard in shards { + let secondary = shard + .node_secondary + .iter() + .map(|n| format!("{}", n)) + .collect::<Vec<_>>() + .join(","); + + let mut status_parts = Vec::new(); + if shard.is_reconciling { + status_parts.push("reconciling"); + } + + if shard.is_pending_compute_notification { + status_parts.push("pending_compute"); + } + + if shard.is_splitting { + status_parts.push("splitting"); + } + let status = status_parts.join(","); + + table.add_row([ + format!("{}", shard.tenant_shard_id), + shard + .node_attached + .map(|n| format!("{}", n)) + .unwrap_or(String::new()), + secondary, + shard.last_error, + status, + ]); + } + println!("{table}"); + } + Command::TenantWarmup { tenant_id } => { + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await; + match describe_response { + Ok(describe) => { + if matches!(describe.policy, PlacementPolicy::Secondary) { + // Fine: it's already known to controller in secondary mode: calling + // again to put it into secondary mode won't cause problems. + } else { + anyhow::bail!("Tenant already present with policy {:?}", describe.policy); + } + } + Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => { + // Fine: this tenant isn't known to the storage controller yet.
+ } + Err(e) => { + // Unexpected API error + return Err(e.into()); + } + } + + vps_client + .location_config( + TenantShardId::unsharded(tenant_id), + pageserver_api::models::LocationConfig { + mode: pageserver_api::models::LocationConfigMode::Secondary, + generation: None, + secondary_conf: Some(LocationConfigSecondary { warm: true }), + shard_number: 0, + shard_count: 0, + shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0, + tenant_conf: TenantConfig::default(), + }, + None, + true, + ) + .await?; + + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await?; + + let secondary_ps_id = describe_response + .shards + .first() + .unwrap() + .node_secondary + .first() + .unwrap(); + + println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}"); + loop { + let (status, progress) = vps_client + .tenant_secondary_download( + TenantShardId::unsharded(tenant_id), + Some(Duration::from_secs(10)), + ) + .await?; + println!( + "Progress: {}/{} layers, {}/{} bytes", + progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); + match status { + StatusCode::OK => { + println!("Download complete"); + break; + } + StatusCode::ACCEPTED => { + // Loop + } + _ => { + anyhow::bail!("Unexpected download status: {status}"); + } + } + } + } + Command::TenantDrop { tenant_id, unclean } => { + if !unclean { + anyhow::bail!("This command is not a tenant deletion, and uncleanly drops all controller state for the tenant. If you know what you're doing, add `--unclean` to proceed.") + } + storcon_client + .dispatch::<(), ()>( + Method::POST, + format!("debug/v1/tenant/{tenant_id}/drop"), + None, + ) + .await?; + } + Command::NodeDrop { node_id, unclean } => { + if !unclean { + anyhow::bail!("This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it. If you know what you're doing, add `--unclean` to proceed.") + } + storcon_client + .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None) + .await?; + } + Command::NodeDelete { node_id } => { + storcon_client + .dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None) + .await?; + } + Command::TenantSetTimeBasedEviction { + tenant_id, + period, + threshold, + } => { + vps_client + .tenant_config(&TenantConfigRequest { + tenant_id, + config: TenantConfig { + eviction_policy: Some(EvictionPolicy::LayerAccessThreshold( + EvictionPolicyLayerAccessThreshold { + period: period.into(), + threshold: threshold.into(), + }, + )), + heatmap_period: Some("300s".to_string()), + ..Default::default() + }, + }) + .await?; + } + Command::BulkMigrate { + nodes, + concurrency, + max_shards, + dry_run, + } => { + // Load the list of nodes, split them up into the drained and filled sets, + // and validate that draining is possible. 
+ let node_descs = storcon_client + .dispatch::<(), Vec<NodeDescribeResponse>>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + + let mut node_to_drain_descs = Vec::new(); + let mut node_to_fill_descs = Vec::new(); + + for desc in node_descs { + let to_drain = nodes.iter().any(|id| *id == desc.id); + if to_drain { + node_to_drain_descs.push(desc); + } else { + node_to_fill_descs.push(desc); + } + } + + if nodes.len() != node_to_drain_descs.len() { + anyhow::bail!("Bulk migration requested away from a node which doesn't exist.") + } + + node_to_fill_descs.retain(|desc| { + matches!(desc.availability, NodeAvailabilityWrapper::Active) + && matches!( + desc.scheduling, + NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Filling + ) + }); + + if node_to_fill_descs.is_empty() { + anyhow::bail!("There are no nodes to migrate to") + } + + // Set the node scheduling policy to draining for the nodes which + // we plan to drain. + for node_desc in node_to_drain_descs.iter() { + let req = NodeConfigureRequest { + node_id: node_desc.id, + availability: None, + scheduling: Some(NodeSchedulingPolicy::Draining), + }; + + storcon_client + .dispatch::<_, ()>( + Method::PUT, + format!("control/v1/node/{}/config", node_desc.id), + Some(req), + ) + .await?; + } + + // Perform the migration: move each tenant shard scheduled on a node to + // be drained to a node which is being filled. A simple round robin + // strategy is used to pick the new node. + let tenants = storcon_client + .dispatch::<(), Vec<TenantDescribeResponse>>( + Method::GET, + "control/v1/tenant".to_string(), + None, + ) + .await?; + + let mut selected_node_idx = 0; + + struct MigrationMove { + tenant_shard_id: TenantShardId, + from: NodeId, + to: NodeId, + } + + let mut moves: Vec<MigrationMove> = Vec::new(); + + let shards = tenants + .into_iter() + .flat_map(|tenant| tenant.shards.into_iter()); + for shard in shards { + if let Some(max_shards) = max_shards { + if moves.len() >= max_shards { + println!( + "Stop planning shard moves since the requested maximum was reached" + ); + break; + } + } + + let should_migrate = { + if let Some(attached_to) = shard.node_attached { + node_to_drain_descs + .iter() + .map(|desc| desc.id) + .any(|id| id == attached_to) + } else { + false + } + }; + + if !should_migrate { + continue; + } + + moves.push(MigrationMove { + tenant_shard_id: shard.tenant_shard_id, + from: shard + .node_attached + .expect("We only migrate attached tenant shards"), + to: node_to_fill_descs[selected_node_idx].id, + }); + selected_node_idx = (selected_node_idx + 1) % node_to_fill_descs.len(); + } + + let total_moves = moves.len(); + + if dry_run == Some(true) { + println!("Dryrun requested.
Planned {total_moves} moves:"); + for mv in &moves { + println!("{}: {} -> {}", mv.tenant_shard_id, mv.from, mv.to) + } + + return Ok(()); + } + + const DEFAULT_MIGRATE_CONCURRENCY: usize = 8; + let mut stream = futures::stream::iter(moves) + .map(|mv| { + let client = Client::new(cli.api.clone(), cli.jwt.clone()); + async move { + client + .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>( + Method::PUT, + format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id), + Some(TenantShardMigrateRequest { + tenant_shard_id: mv.tenant_shard_id, + node_id: mv.to, + }), + ) + .await + .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e)) + } + }) + .buffered(concurrency.unwrap_or(DEFAULT_MIGRATE_CONCURRENCY)); + + let mut success = 0; + let mut failure = 0; + + while let Some(res) = stream.next().await { + match res { + Ok(_) => { + success += 1; + } + Err((tenant_shard_id, from, to, error)) => { + failure += 1; + println!( + "Failed to migrate {} from node {} to node {}: {}", + tenant_shard_id, from, to, error + ); + } + } + + if (success + failure) % 20 == 0 { + println!( + "Processed {}/{} shards: {} succeeded, {} failed", + success + failure, + total_moves, + success, + failure + ); + } + } + + println!( + "Processed {}/{} shards: {} succeeded, {} failed", + success + failure, + total_moves, + success, + failure + ); + } + Command::StartDrain { node_id } => { + storcon_client + .dispatch::<(), ()>( + Method::PUT, + format!("control/v1/node/{node_id}/drain"), + None, + ) + .await?; + println!("Drain started for {node_id}"); + } + Command::CancelDrain { node_id, timeout } => { + storcon_client + .dispatch::<(), ()>( + Method::DELETE, + format!("control/v1/node/{node_id}/drain"), + None, + ) + .await?; + + println!("Waiting for node {node_id} to quiesce on scheduling policy ..."); + + let final_policy = + wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| { + use NodeSchedulingPolicy::*; + matches!(sched, Active | PauseForRestart) + }) + .await?; + + println!( + "Drain was cancelled for node {node_id}. Scheduling policy is now {final_policy:?}" + ); + } + Command::StartFill { node_id } => { + storcon_client + .dispatch::<(), ()>(Method::PUT, format!("control/v1/node/{node_id}/fill"), None) + .await?; + + println!("Fill started for {node_id}"); + } + Command::CancelFill { node_id, timeout } => { + storcon_client + .dispatch::<(), ()>( + Method::DELETE, + format!("control/v1/node/{node_id}/fill"), + None, + ) + .await?; + + println!("Waiting for node {node_id} to quiesce on scheduling policy ..."); + + let final_policy = + wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| { + use NodeSchedulingPolicy::*; + matches!(sched, Active) + }) + .await?; + + println!( + "Fill was cancelled for node {node_id}. Scheduling policy is now {final_policy:?}" + ); + } + } + + Ok(()) +} diff --git a/deny.toml b/deny.toml index 22e39a2ca3..327ac58db7 100644 --- a/deny.toml +++ b/deny.toml @@ -4,6 +4,7 @@ # to your expectations and requirements.
# Root options +[graph] targets = [ { triple = "x86_64-unknown-linux-gnu" }, { triple = "aarch64-unknown-linux-gnu" }, @@ -12,6 +13,7 @@ targets = [ ] all-features = false no-default-features = false +[output] feature-depth = 1 # This section is considered when running `cargo deny check advisories` @@ -19,17 +21,16 @@ feature-depth = 1 # https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html [advisories] db-urls = ["https://github.com/rustsec/advisory-db"] -vulnerability = "deny" -unmaintained = "warn" yanked = "warn" -notice = "warn" -ignore = [] + +[[advisories.ignore]] +id = "RUSTSEC-2023-0071" +reason = "the marvin attack only affects private key decryption, not public key signature verification" # This section is considered when running `cargo deny check licenses` # More documentation for the licenses section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html [licenses] -unlicensed = "deny" allow = [ "Apache-2.0", "Artistic-2.0", @@ -42,10 +43,6 @@ allow = [ "OpenSSL", "Unicode-DFS-2016", ] -deny = [] -copyleft = "warn" -allow-osi-fsf-free = "neither" -default = "deny" confidence-threshold = 0.8 exceptions = [ # Zlib license has some restrictions if we decide to change sth @@ -99,6 +96,13 @@ name = "async-executor" [[bans.deny]] name = "smol" +[[bans.deny]] +# We want to use rustls instead of the platform's native tls implementation. +name = "native-tls" + +[[bans.deny]] +name = "openssl" + # This section is considered when running `cargo deny check sources`. # More documentation about the 'sources' section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html diff --git a/diesel.toml b/diesel.toml new file mode 100644 index 0000000000..558c54a1e1 --- /dev/null +++ b/diesel.toml @@ -0,0 +1,9 @@ +# For documentation on how to configure this file, +# see https://diesel.rs/guides/configuring-diesel-cli + +[print_schema] +file = "storage_controller/src/schema.rs" +custom_type_derives = ["diesel::query_builder::QueryId"] + +[migrations_directory] +dir = "storage_controller/migrations" diff --git a/docker-compose/README.md b/docker-compose/README.md new file mode 100644 index 0000000000..bd47805a67 --- /dev/null +++ b/docker-compose/README.md @@ -0,0 +1,10 @@ + +# Example docker compose configuration + +The configuration in this directory is used for testing Neon docker images: it is +not intended for deploying a usable system. To run a development environment where +you can experiment with a miniature Neon system, use `cargo neon` rather than container images. + +This configuration does not start the storage controller, because the controller +needs a way to reconfigure running computes, and no such thing exists in this setup.
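For reference, the `cargo neon` development environment that the README above points to can be brought up roughly as follows. This is a sketch rather than an authoritative recipe: the exact subcommands and flags are defined by `neon_local` in this repository and may differ between versions, so consult the top-level README.

```bash
# Build the workspace and initialize a local Neon data directory (.neon/).
cargo neon init

# Start the storage services: pageserver, safekeeper, and storage broker.
cargo neon start

# Create a tenant, make it the default, then create and start a compute endpoint.
cargo neon tenant create --set-default
cargo neon endpoint create main
cargo neon endpoint start main

# Connect to the running compute with ordinary psql.
psql -h 127.0.0.1 -p 55432 -U cloud_admin postgres
```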
+ diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile index f1b1986072..8378f37b48 100644 --- a/docker-compose/compute_wrapper/Dockerfile +++ b/docker-compose/compute_wrapper/Dockerfile @@ -1,4 +1,4 @@ -ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG REPOSITORY=neondatabase ARG COMPUTE_IMAGE=compute-node-v14 ARG TAG=latest @@ -8,6 +8,11 @@ USER root RUN apt-get update && \ apt-get install -y curl \ jq \ + python3-pip \ netcat +#Faker is required for the pg_anon test +RUN pip3 install Faker +#This is required for the pg_hintplan test +RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src -USER postgres +USER postgres \ No newline at end of file diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index 22660a63ce..33455e458a 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -23,18 +23,17 @@ echo "Page server is ready." echo "Create a tenant and timeline" generate_id tenant_id PARAMS=( - -sb - -X POST + -X PUT -H "Content-Type: application/json" - -d "{\"new_tenant_id\": \"${tenant_id}\"}" - http://pageserver:9898/v1/tenant/ + -d "{\"mode\": \"AttachedSingle\", \"generation\": 1, \"tenant_conf\": {}}" + "http://pageserver:9898/v1/tenant/${tenant_id}/location_config" ) result=$(curl "${PARAMS[@]}") echo $result | jq . generate_id timeline_id PARAMS=( - -sb + -sbf -X POST -H "Content-Type: application/json" -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}" diff --git a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json index ccf0a91b90..8e582e74e1 100644 --- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json +++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json @@ -95,7 +95,7 @@ }, { "name": "shared_preload_libraries", - "value": "neon", + "value": "neon,pg_cron,timescaledb,pg_stat_statements", "vartype": "string" }, { @@ -127,6 +127,16 @@ "name": "max_replication_flush_lag", "value": "10GB", "vartype": "string" + }, + { + "name": "cron.database", + "value": "postgres", + "vartype": "string" + }, + { + "name": "session_preload_libraries", + "value": "anon", + "vartype": "string" } ] }, diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 9777d1fdd2..6e15fdbe0d 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -1,5 +1,3 @@ -version: '3' - services: minio: restart: always @@ -33,25 +31,14 @@ services: restart: always image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} environment: - - BROKER_ENDPOINT='http://storage_broker:50051' - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password #- RUST_BACKTRACE=1 ports: #- 6400:6400 # pg protocol handler - 9898:9898 # http endpoints - entrypoint: - - "/bin/sh" - - "-c" - command: - - "/usr/local/bin/pageserver -D /data/.neon/ - -c \"broker_endpoint=$$BROKER_ENDPOINT\" - -c \"listen_pg_addr='0.0.0.0:6400'\" - -c \"listen_http_addr='0.0.0.0:9898'\" - -c \"remote_storage={endpoint='http://minio:9000', - bucket_name='neon', - bucket_region='eu-north-1', - prefix_in_bucket='/pageserver/'}\"" + volumes: + - ./pageserver_config:/data/.neon/ depends_on: - storage_broker - minio_create_buckets @@ -161,12 +148,12 @@ services: context: ./compute_wrapper/ args: - REPOSITORY=${REPOSITORY:-neondatabase} - - 
COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14} + - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16} - TAG=${TAG:-latest} - http_proxy=$http_proxy - https_proxy=$https_proxy environment: - - PG_VERSION=${PG_VERSION:-14} + - PG_VERSION=${PG_VERSION:-16} #- RUST_BACKTRACE=1 # Mount the test files directly, for faster editing cycle. volumes: @@ -194,3 +181,14 @@ services: done" depends_on: - compute + + neon-test-extensions: + profiles: ["test-extensions"] + image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest} + entrypoint: + - "/bin/bash" + - "-c" + command: + - sleep 1800 + depends_on: + - compute diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index e18b0f9176..10805a9952 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -7,54 +7,94 @@ # Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file # Their defaults point at DockerHub `neondatabase/neon:latest` image.`, # to verify custom image builds (e.g pre-published ones). - -# XXX: Current does not work on M1 macs due to x86_64 Docker images compiled only, and no seccomp support in M1 Docker emulation layer. - +# +# A test script for postgres extensions +# Currently supports only v16 +# set -eux -o pipefail -SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml - +COMPOSE_FILE='docker-compose.yml' +cd $(dirname $0) COMPUTE_CONTAINER_NAME=docker-compose-compute-1 -SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;" -PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres" +TEST_CONTAINER_NAME=docker-compose-neon-test-extensions-1 +PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres" +: ${http_proxy:=} +: ${https_proxy:=} +export http_proxy https_proxy cleanup() { echo "show container information" docker ps - docker compose -f $COMPOSE_FILE logs + docker compose --profile test-extensions -f $COMPOSE_FILE logs echo "stop containers..." - docker compose -f $COMPOSE_FILE down + docker compose --profile test-extensions -f $COMPOSE_FILE down } -echo "clean up containers if exists" -cleanup - for pg_version in 14 15 16; do - echo "start containers (pg_version=$pg_version)." - PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d + echo "clean up containers if exists" + cleanup + PG_TEST_VERSION=$(($pg_version < 16 ? 16 : $pg_version)) + PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d echo "wait until the compute is ready. timeout after 60s. " cnt=0 - while sleep 1; do + while sleep 3; do # check timeout - cnt=`expr $cnt + 1` + cnt=`expr $cnt + 3` if [ $cnt -gt 60 ]; then echo "timeout before the compute is ready." cleanup exit 1 fi - - # check if the compute is ready - set +o pipefail - result=`docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l` - set -o pipefail - if [ $result -eq 1 ]; then + if docker compose --profile test-extensions -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then echo "OK. The compute is ready to connect." echo "execute simple queries." 
docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION" - cleanup break fi done + + if [ $pg_version -ge 16 ] + then + echo Enabling trust connection + docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' " + echo Adding postgres role + docker exec $COMPUTE_CONTAINER_NAME psql $PSQL_OPTION -c "CREATE ROLE postgres SUPERUSER LOGIN" + # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail + # It cannot be moved to Dockerfile now because the database directory is created after the start of the container + echo Adding dummy config + docker exec $COMPUTE_CONTAINER_NAME touch /var/db/postgres/compute/compute_ctl_temp_override.conf + # This block is required for the pg_anon extension test. + # The test assumes that it is running on the same host as the postgres engine. + # In our case it's not true, that's why we are copying files to the compute node + TMPDIR=$(mktemp -d) + docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data + echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv + docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data + rm -rf $TMPDIR + TMPDIR=$(mktemp -d) + # The following block does the same for the pg_hint_plan test + docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data + docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ + rm -rf $TMPDIR + # We are running tests now + if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ + $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt + then + cleanup + else + FAILED=$(tail -1 testout.txt) + for d in $FAILED + do + mkdir $d + docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.diffs $d || true + docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.out $d || true + cat $d/regression.out $d/regression.diffs || true + done + rm -rf $FAILED + cleanup + exit 1 + fi + fi + cleanup done diff --git a/docker-compose/pageserver_config/identity.toml b/docker-compose/pageserver_config/identity.toml new file mode 100644 index 0000000000..20121327c7 --- /dev/null +++ b/docker-compose/pageserver_config/identity.toml @@ -0,0 +1 @@ +id=1234 diff --git a/docker-compose/pageserver_config/pageserver.toml b/docker-compose/pageserver_config/pageserver.toml new file mode 100644 index 0000000000..76935453b6 --- /dev/null +++ b/docker-compose/pageserver_config/pageserver.toml @@ -0,0 +1,5 @@ +broker_endpoint='http://storage_broker:50051' +pg_distrib_dir='/usr/local/' +listen_pg_addr='0.0.0.0:6400' +listen_http_addr='0.0.0.0:9898' +remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' } diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh new file mode 100644 index 0000000000..3fc0b90071 --- /dev/null +++ b/docker-compose/run-tests.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -x + +cd /ext-src || exit 2 +FAILED= +LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u) +for d in ${LIST} +do + [ -d "${d}" ] || continue + psql -c "select 1" >/dev/null || break + USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}" +done +[ -z "${FAILED}" ] && exit 0 +echo "${FAILED}" +exit 1 \ No newline at end of file diff --git 
a/docs/SUMMARY.md b/docs/SUMMARY.md index b275349168..5fd4080c28 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -1,13 +1,18 @@ # Summary +# Looking for `neon.tech` docs? + +This page links to a selection of technical content about the open source code in this repository. + +Please visit https://neon.tech/docs for documentation about using the Neon service, which is based on the code +in this repository. + +# Architecture + [Introduction]() - [Separation of Compute and Storage](./separation-compute-storage.md) -# Architecture - - [Compute]() - - [WAL proposer]() - - [WAL Backpressure]() - [Postgres changes](./core_changes.md) - [Pageserver](./pageserver.md) @@ -16,33 +21,15 @@ - [WAL Redo](./pageserver-walredo.md) - [Page cache](./pageserver-pagecache.md) - [Storage](./pageserver-storage.md) - - [Datadir mapping]() - - [Layer files]() - - [Branching]() - - [Garbage collection]() - - [Cloud Storage]() - [Processing a GetPage request](./pageserver-processing-getpage.md) - [Processing WAL](./pageserver-processing-wal.md) - - [Management API]() - - [Tenant Rebalancing]() - [WAL Service](walservice.md) - [Consensus protocol](safekeeper-protocol.md) - - [Management API]() - - [Rebalancing]() - -- [Control Plane]() - -- [Proxy]() - [Source view](./sourcetree.md) - [docker.md](./docker.md) — Docker images and building pipeline. - [Error handling and logging](./error-handling.md) - - [Testing]() - - [Unit testing]() - - [Integration testing]() - - [Benchmarks]() - - [Glossary](./glossary.md) @@ -58,28 +45,6 @@ # RFCs -- [RFCs](./rfcs/README.md) - -- [002-storage](rfcs/002-storage.md) -- [003-laptop-cli](rfcs/003-laptop-cli.md) -- [004-durability](rfcs/004-durability.md) -- [005-zenith_local](rfcs/005-zenith_local.md) -- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md) -- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md) -- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md) -- [008-push-pull](rfcs/008-push-pull.md) -- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md) -- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md) -- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md) -- [010-storage_details](rfcs/010-storage_details.md) -- [011-retention-policy](rfcs/011-retention-policy.md) -- [012-background-tasks](rfcs/012-background-tasks.md) -- [013-term-history](rfcs/013-term-history.md) -- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md) -- [014-storage-lsm](rfcs/014-storage-lsm.md) -- [015-storage-messaging](rfcs/015-storage-messaging.md) -- [016-connection-routing](rfcs/016-connection-routing.md) -- [017-timeline-data-management](rfcs/017-timeline-data-management.md) -- [018-storage-messaging-2](rfcs/018-storage-messaging-2.md) -- [019-tenant-timeline-lifecycles](rfcs/019-tenant-timeline-lifecycles.md) -- [cluster-size-limits](rfcs/cluster-size-limits.md) +Major changes are documented in RFCs: +- See [RFCs](./rfcs/README.md) for more information +- View the RFCs at https://github.com/neondatabase/neon/tree/main/docs/rfcs diff --git a/docs/authentication.md b/docs/authentication.md index f768b04c5b..522c5481b4 100644 --- a/docs/authentication.md +++ b/docs/authentication.md @@ -70,6 +70,9 @@ Should only be used e.g. for status check/tenant creation/list. Should only be used e.g. for status check. Currently also used for connection from any pageserver to any safekeeper. 
+"generations_api": Provides access to the upcall APIs served by the storage controller or the control plane. + +"admin": Provides access to the control plane and admin APIs of the storage controller. ### CLI CLI generates a key pair during call to `neon_local init` with the following commands: diff --git a/docs/core_changes.md b/docs/core_changes.md index ea219adae9..1388317728 100644 --- a/docs/core_changes.md +++ b/docs/core_changes.md @@ -11,15 +11,28 @@ page server. We currently use the same binary for both, with --wal-redo runtime the WAL redo mode. Some PostgreSQL changes are needed in the compute node, while others are just for the WAL redo process. -In addition to core PostgreSQL changes, there is a Neon extension in contrib/neon, to hook into the -smgr interface. Once all the core changes have been submitted to upstream or eliminated some other -way, the extension could live outside the postgres repository and build against vanilla PostgreSQL. +In addition to core PostgreSQL changes, there is a Neon extension in the pgxn/neon directory that +hooks into the smgr interface, and rmgr extension in pgxn/neon_rmgr. The extensions are loaded into +the Postgres processes with shared_preload_libraries. Most of the Neon-specific code is in the +extensions, and for any new features, that is preferred over modifying core PostgreSQL code. Below is a list of all the PostgreSQL source code changes, categorized into changes needed for compute, and changes needed for the WAL redo process: # Changes for Compute node +## Prefetching + +There are changes in many places to perform prefetching, for example for sequential scans. Neon +doesn't benefit from OS readahead, and the latency to pageservers is quite high compared to local +disk, so prefetching is critical for performance, also for sequential scans. + +### How to get rid of the patch + +Upcoming "streaming read" work in v17 might simplify this. And async I/O work in v18 will hopefully +do more. + + ## Add t_cid to heap WAL records ``` @@ -37,54 +50,11 @@ The problem is that the XLOG_HEAP_INSERT record does not include the command id Bite the bullet and submit the patch to PostgreSQL, to add the t_cid to the WAL records. It makes the WAL records larger, which could make this unpopular in the PostgreSQL community. However, it might simplify some logical decoding code; Andres Freund briefly mentioned in PGCon 2022 discussion on Heikki's Neon presentation that logical decoding currently needs to jump through some hoops to reconstruct the same information. +Update from Heikki (2024-04-17): I tried to write an upstream patch for that, to use the t_cid field for logical decoding, but it was not as straightforward as it first sounded. ### Alternatives Perhaps we could write an extra WAL record with the t_cid information, when a page is evicted that contains rows that were touched a transaction that's still running. However, that seems very complicated. -## ginfast.c - -``` -diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c -index e0d9940946..2d964c02e9 100644 ---- a/src/backend/access/gin/ginfast.c -+++ b/src/backend/access/gin/ginfast.c -@@ -285,6 +285,17 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) - memset(&sublist, 0, sizeof(GinMetaPageData)); - makeSublist(index, collector->tuples, collector->ntuples, &sublist); - -+ if (metadata->head != InvalidBlockNumber) -+ { -+ /* -+ * ZENITH: Get buffer before XLogBeginInsert() to avoid recursive call -+ * of XLogBeginInsert(). 
Reading a new buffer might evict a dirty page from -+ * the buffer cache, and if that page happens to be an FSM or VM page, zenith_write() -+ * will try to WAL-log an image of the page. -+ */ -+ buffer = ReadBuffer(index, metadata->tail); -+ } -+ - if (needWal) - XLogBeginInsert(); - -@@ -316,7 +327,6 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) - data.prevTail = metadata->tail; - data.newRightlink = sublist.head; - -- buffer = ReadBuffer(index, metadata->tail); - LockBuffer(buffer, GIN_EXCLUSIVE); - page = BufferGetPage(buffer); -``` - -The problem is explained in the comment above - -### How to get rid of the patch - -Can we stop WAL-logging FSM or VM pages? Or delay the WAL logging until we're out of the critical -section or something. - -Maybe some bigger rewrite of FSM and VM would help to avoid WAL-logging FSM and VM page images? - - ## Mark index builds that use buffer manager without logging explicitly ``` @@ -95,6 +65,8 @@ Maybe some bigger rewrite of FSM and VM would help to avoid WAL-logging FSM and also some changes in src/backend/storage/smgr/smgr.c ``` +pgvector 0.6.0 also needs a similar change, which would be very nice to get rid of too. + When a GIN index is built, for example, it is built by inserting the entries into the index more or less normally, but without WAL-logging anything. After the index has been built, we iterate through all pages and write them to the WAL. That doesn't work for Neon, because if a page is not WAL-logged @@ -109,6 +81,10 @@ an operation: `smgr_start_unlogged_build`, `smgr_finish_unlogged_build_phase_1` I think it would make sense to be more explicit about that in PostgreSQL too. So extract these changes to a patch and post to pgsql-hackers. +Perhaps we could deduce that an unlogged index build has started when we see a page being evicted +with zero LSN. How to be sure it's an unlogged index build rather than a bug? Currently we have a +check for that and PANIC if we see a page with zero LSN being evicted. And how do we detect when the +index build has finished? See https://github.com/neondatabase/neon/pull/7440 for an attempt at that. ## Track last-written page LSN @@ -140,57 +116,6 @@ The old method is still available, though. Wait until v15? -## Cache relation sizes - -The Neon extension contains a little cache for smgrnblocks() and smgrexists() calls, to avoid going -to the page server every time. It might be useful to cache those in PostgreSQL, maybe in the -relcache? (I think we do cache nblocks in relcache already, check why that's not good enough for -Neon) - - -## Use buffer manager when extending VM or FSM - -``` - src/backend/storage/freespace/freespace.c | 14 +- - src/backend/access/heap/visibilitymap.c | 15 +- - -diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c -index e198df65d8..addfe93eac 100644 ---- a/src/backend/access/heap/visibilitymap.c -+++ b/src/backend/access/heap/visibilitymap.c -@@ -652,10 +652,19 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) - /* Now extend the file */ - while (vm_nblocks_now < vm_nblocks) - { -- PageSetChecksumInplace((Page) pg.data, vm_nblocks_now); -+ /* -+ * ZENITH: Initialize VM pages through buffer cache to prevent loading -+ * them from pageserver. 
-+ */ -+ Buffer buffer = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, P_NEW, -+ RBM_ZERO_AND_LOCK, NULL); -+ Page page = BufferGetPage(buffer); -+ -+ PageInit((Page) page, BLCKSZ, 0); -+ PageSetChecksumInplace(page, vm_nblocks_now); -+ MarkBufferDirty(buffer); -+ UnlockReleaseBuffer(buffer); - -- smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now, -- pg.data, false); - vm_nblocks_now++; - } -``` - -### Problem we're trying to solve - -??? - -### How to get rid of the patch - -Maybe this would be a reasonable change in PostgreSQL too? - - ## Allow startup without reading checkpoint record In Neon, the compute node is stateless. So when we are launching compute node, we need to provide @@ -231,7 +156,7 @@ index 0415df9ccb..9f9db3c8bc 100644 * crash we can lose (skip over) as many values as we pre-logged. */ -#define SEQ_LOG_VALS 32 -+/* Zenith XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */ ++/* Neon XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */ +/* #define SEQ_LOG_VALS 32 */ +#define SEQ_LOG_VALS 0 ``` @@ -250,66 +175,6 @@ would be weird if the sequence moved backwards though, think of PITR. Or add a GUC for the amount to prefix to PostgreSQL, and force it to 1 in Neon. -## Walproposer - -``` - src/Makefile | 1 + - src/backend/replication/libpqwalproposer/Makefile | 37 + - src/backend/replication/libpqwalproposer/libpqwalproposer.c | 416 ++++++++++++ - src/backend/postmaster/bgworker.c | 4 + - src/backend/postmaster/postmaster.c | 6 + - src/backend/replication/Makefile | 4 +- - src/backend/replication/walproposer.c | 2350 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - src/backend/replication/walproposer_utils.c | 402 +++++++++++ - src/backend/replication/walreceiver.c | 7 + - src/backend/replication/walsender.c | 320 ++++++--- - src/backend/storage/ipc/ipci.c | 6 + - src/include/replication/walproposer.h | 565 ++++++++++++++++ -``` - -WAL proposer is communicating with safekeeper and ensures WAL durability by quorum writes. It is -currently implemented as patch to standard WAL sender. - -### How to get rid of the patch - -Refactor into an extension. Submit hooks or APIs into upstream if necessary. - -@MMeent did some work on this already: https://github.com/neondatabase/postgres/pull/96 - -## Ignore unexpected data beyond EOF in bufmgr.c - -``` -@@ -922,11 +928,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, - */ - bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr); - if (!PageIsNew((Page) bufBlock)) -- ereport(ERROR, -+ { -+ // XXX-ZENITH -+ MemSet((char *) bufBlock, 0, BLCKSZ); -+ ereport(DEBUG1, - (errmsg("unexpected data beyond EOF in block %u of relation %s", - blockNum, relpath(smgr->smgr_rnode, forkNum)), - errhint("This has been seen to occur with buggy kernels; consider updating your system."))); -- -+ } - /* - * We *must* do smgrextend before succeeding, else the page will not - * be reserved by the kernel, and the next P_NEW call will decide to -``` - -PostgreSQL is a bit sloppy with extending relations. Usually, the relation is extended with zeros -first, then the page is filled, and finally the new page WAL-logged. But if multiple backends extend -a relation at the same time, the pages can be WAL-logged in different order. - -I'm not sure what scenario exactly required this change in Neon, though. 
- -### How to get rid of the patch - -Submit patches to pgsql-hackers, to tighten up the WAL-logging around relation extension. It's a bit -confusing even in PostgreSQL. Maybe WAL log the intention to extend first, then extend the relation, -and finally WAL-log that the extension succeeded. - ## Make smgr interface available to extensions ``` @@ -321,6 +186,8 @@ and finally WAL-log that the extension succeeded. Submit to upstream. This could be useful for the Disk Encryption patches too, or for compression. +We have submitted this to upstream, but it's moving at a glacial speed. +https://commitfest.postgresql.org/47/4428/ ## Added relpersistence argument to smgropen() @@ -444,6 +311,148 @@ Ignore it. This is only needed for disaster recovery, so once we've eliminated a patches, we can just keep it around as a patch or as separate branch in a repo. +## pg_waldump flags to ignore errors + +After creating a new project or branch in Neon, the first timeline can begin in the middle of a WAL segment. pg_waldump chokes on that, so we added some flags to make it possible to ignore errors. + +### How to get rid of the patch + +Like the previous one, ignore it. + + + +## Backpressure if pageserver doesn't ingest WAL fast enough + +``` +@@ -3200,6 +3202,7 @@ ProcessInterrupts(void) + return; + InterruptPending = false; + ++retry: + if (ProcDiePending) + { + ProcDiePending = false; +@@ -3447,6 +3450,13 @@ ProcessInterrupts(void) + + if (ParallelApplyMessagePending) + HandleParallelApplyMessages(); ++ ++ /* Call registered callback if any */ ++ if (ProcessInterruptsCallback) ++ { ++ if (ProcessInterruptsCallback()) ++ goto retry; ++ } + } +``` + + +### How to get rid of the patch + +Submit a patch to upstream, for a hook in ProcessInterrupts. Could be useful for other extensions +too. + + +## SLRU on-demand download + +``` + src/backend/access/transam/slru.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------- + 1 file changed, 92 insertions(+), 13 deletions(-) +``` + +### Problem we're trying to solve + +Previously, SLRU files were included in the basebackup, but the total size of them can be large, +several GB, and downloading them all made the startup time too long. + +### Alternatives + +FUSE hook or LD_PRELOAD trick to intercept the reads on SLRU files + + +## WAL-log an all-zeros page as one large hole + +- In XLogRecordAssemble() + +### Problem we're trying to solve + +This change was made in v16. Starting with v16, when PostgreSQL extends a relation, it first extends +it with zeros, and it can extend the relation more than one block at a time. The all-zeros page is WAL-logged, but it's very wasteful to include 8 kB of zeros in the WAL for that. This hack was made so that we WAL logged a compact record with a whole-page "hole". However, PostgreSQL has assertions that prevent such WAL records from being replayed, so this breaks compatibility such that unmodified PostgreSQL cannot process Neon-generated WAL. + +### How to get rid of the patch + +Find another compact representation for a full-page image of an all-zeros page. A compressed image perhaps. 
+ + +## Shut down walproposer after checkpointer + +``` ++ /* Neon: Also allow walproposer background worker to be treated like a WAL sender, so that it's shut down last */ ++ if ((bp->bkend_type == BACKEND_TYPE_NORMAL || bp->bkend_type == BACKEND_TYPE_BGWORKER) && +``` + +This change was needed so that postmaster shuts down the walproposer process only after the shutdown checkpoint record is written. Otherwise, the shutdown record will never make it to the safekeepers. + +### How to get rid of the patch + +Do a bigger refactoring of the postmaster state machine, such that a background worker can specify +the shutdown ordering by itself. The postmaster state machine has grown pretty complicated, and +would benefit from a refactoring for the sake of readability anyway. + + +## EXPLAIN changes for prefetch and LFC + +### How to get rid of the patch + +Konstantin submitted a patch to -hackers already: https://commitfest.postgresql.org/47/4643/. Get that into a committable state. + + +## On-demand download of extensions + +### How to get rid of the patch + +FUSE or LD_PRELOAD trickery to intercept reads? + + +## Publication superuser checks + +We have hacked CreatePublication so that also neon_superuser can create them. + +### How to get rid of the patch + +Create an upstream patch with more fine-grained privileges for publications CREATE/DROP that can be GRANTed to users. + + +## WAL log replication slots + +### How to get rid of the patch + +Utilize the upcoming v17 "slot sync worker", or a similar neon-specific background worker process, to periodically WAL-log the slots, or to export them somewhere else. + + +## WAL-log replication snapshots + +### How to get rid of the patch + +WAL-log them periodically, from a background worker. + + +## WAL-log relmapper files + +Similarly to replication snapshot files, the CID mapping files generated during VACUUM FULL of a catalog table are WAL-logged + +### How to get rid of the patch + +WAL-log them periodically, from a background worker. + + +## XLogWaitForReplayOf() + +?? + + + + # Not currently committed but proposed ## Disable ring buffer buffer manager strategies @@ -472,23 +481,10 @@ hint bits are set. Wal logging hint bits updates requires FPI which significantl Add special WAL record for setting page hints. -## Prefetching - -### Why? - -As far as pages in Neon are loaded on demand, to reduce node startup time -and also speedup some massive queries we need some mechanism for bulk loading to -reduce page request round-trip overhead. - -Currently Postgres is supporting prefetching only for bitmap scan. -In Neon we should also use prefetch for sequential and index scans, because the OS is not doing it for us. -For sequential scan we could prefetch some number of following pages. For index scan we could prefetch pages -of heap relation addressed by TIDs. - ## Prewarming ### Why? -Short downtime (or, in other words, fast compute node restart time) is one of the key feature of Zenith. +Short downtime (or, in other words, fast compute node restart time) is one of the key features of Neon. But overhead of request-response round-trip for loading pages on demand can make started node warm-up quite slow. We can capture state of compute node buffer cache and send bulk request for this pages at startup. 
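[Editor's note, not part of the patch: the prewarming section above ends with the idea of capturing the compute node's buffer-cache state and bulk-requesting those pages at startup. The capture half can be prototyped with the stock pg_buffercache contrib extension. A minimal sketch follows; the connection parameters are borrowed from the docker-compose example below, and the output file path is illustrative only.]

```bash
#!/bin/bash
# Sketch: record which pages are currently in the compute's buffer cache,
# so they could be bulk-requested from the pageserver after a restart.
# Assumes the docker-compose example's cloud_admin user on port 55433;
# the snapshot file name is made up for illustration.
set -euo pipefail

PSQL="psql -h localhost -p 55433 -U cloud_admin -d postgres"

$PSQL -c "CREATE EXTENSION IF NOT EXISTS pg_buffercache"

# One (database, relfilenode, fork, block) row per cached page.
$PSQL -Atc "SELECT reldatabase, relfilenode, relforknumber, relblocknumber
            FROM pg_buffercache
            WHERE relfilenode IS NOT NULL" > /tmp/bufcache-snapshot.txt
```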
diff --git a/docs/docker.md b/docs/docker.md index 9761cc4346..ce806c4e6c 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -4,24 +4,24 @@ Currently we build two main images: -- [neondatabase/neon](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). -- [neondatabase/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). +- [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). +- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. And additional intermediate image: - [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools. -## Building pipeline +## Build pipeline We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs -1. `neondatabase/compute-tools` and `neondatabase/compute-node` +1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14) 2. `neondatabase/neon` ## Docker Compose example -You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following conatainers. +You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following containers. - pageserver x 1 - safekeeper x 3 @@ -34,12 +34,12 @@ You can see a [docker compose](https://docs.docker.com/compose/) example to crea 1. create containers You can specify version of neon cluster using following environment values. -- PG_VERSION: postgres version for compute (default is 14) -- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml) +- PG_VERSION: postgres version for compute (default is 16 as of this writing) +- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags), which is tagged in [CI test](/.github/workflows/build_and_test.yml). Default is 'latest' ``` $ cd docker-compose/ -$ docker-compose down # remove the conainers if exists -$ PG_VERSION=15 TAG=2937 docker-compose up --build -d # You can specify the postgres and image version +$ docker-compose down # remove the containers if they exist +$ PG_VERSION=16 TAG=latest docker-compose up --build -d # You can specify the postgres and image version Creating network "dockercompose_default" with the default driver Creating docker-compose_storage_broker_1 ... done (...omit...) @@ -47,29 +47,31 @@ Creating docker-compose_storage_broker_1 ... done 2. 
connect compute node ``` -$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass -$ chmod 600 ~/.pgpass -$ psql -h localhost -p 55433 -U cloud_admin +$ psql postgresql://cloud_admin:cloud_admin@localhost:55433/postgres +psql (16.3) +Type "help" for help. + postgres=# CREATE TABLE t(key int primary key, value text); CREATE TABLE -postgres=# insert into t values(1,1); +postgres=# insert into t values(1, 1); INSERT 0 1 postgres=# select * from t; - key | value + key | value -----+------- 1 | 1 (1 row) + ``` 3. If you want to see the log, you can use `docker-compose logs` command. ``` # check the container name you want to see $ docker ps -CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES -d6968a5ae912 dockercompose_compute "/shell/compute.sh" 5 minutes ago Up 5 minutes 0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp dockercompose_compute_1 +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +3582f6d76227 docker-compose_compute "/shell/compute.sh" 2 minutes ago Up 2 minutes 0.0.0.0:3080->3080/tcp, :::3080->3080/tcp, 0.0.0.0:55433->55433/tcp, :::55433->55433/tcp docker-compose_compute_1 (...omit...) -$ docker logs -f dockercompose_compute_1 +$ docker logs -f docker-compose_compute_1 2022-10-21 06:15:48.757 GMT [56] LOG: connection authorized: user=cloud_admin database=postgres application_name=psql 2022-10-21 06:17:00.307 GMT [56] LOG: [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400' (...omit...) diff --git a/docs/pageserver-pagecache.md b/docs/pageserver-pagecache.md index d9b120bbb9..d022742dff 100644 --- a/docs/pageserver-pagecache.md +++ b/docs/pageserver-pagecache.md @@ -5,4 +5,3 @@ TODO: - shared across tenants - store pages from layer files - store pages from "in-memory layer" -- store materialized pages diff --git a/docs/pageserver-services.md b/docs/pageserver-services.md index ba5d3c423e..11d984eb08 100644 --- a/docs/pageserver-services.md +++ b/docs/pageserver-services.md @@ -101,11 +101,12 @@ or ```toml [remote_storage] container_name = 'some-container-name' +storage_account = 'somestorageaccnt' container_region = 'us-east' prefix_in_container = '/test-prefix/' ``` -`AZURE_STORAGE_ACCOUNT` and `AZURE_STORAGE_ACCESS_KEY` env variables can be used to specify the azure credentials if needed. +The `AZURE_STORAGE_ACCESS_KEY` env variable can be used to specify the azure credentials if needed. ## Repository background tasks diff --git a/docs/pageserver-storage.md b/docs/pageserver-storage.md index 77e7ff35bc..9902f6b930 100644 --- a/docs/pageserver-storage.md +++ b/docs/pageserver-storage.md @@ -64,7 +64,7 @@ Storage. The LayerMap tracks what layers exist in a timeline. -Currently, the layer map is just a resizeable array (Vec). On a GetPage@LSN or +Currently, the layer map is just a resizable array (Vec). On a GetPage@LSN or other read request, the layer map scans through the array to find the right layer that contains the data for the requested page. The read-code in LayeredTimeline is aware of the ancestor, and returns data from the ancestor timeline if it's diff --git a/docs/pageserver-thread-mgmt.md b/docs/pageserver-thread-mgmt.md index c911d2c53d..5d862415eb 100644 --- a/docs/pageserver-thread-mgmt.md +++ b/docs/pageserver-thread-mgmt.md @@ -22,7 +22,7 @@ timeline to shutdown. It will also wait for them to finish. A task registered in the task registry can check if it has been requested to shut down, by calling `is_shutdown_requested()`. 
There's -also a `shudown_watcher()` Future that can be used with `tokio::select!` +also a `shutdown_watcher()` Future that can be used with `tokio::select!` or similar, to wake up on shutdown. diff --git a/docs/pageserver-walredo.md b/docs/pageserver-walredo.md index 1de9c177cc..7b366ff616 100644 --- a/docs/pageserver-walredo.md +++ b/docs/pageserver-walredo.md @@ -74,4 +74,4 @@ somewhat wasteful, but because most WAL records only affect one page, the overhead is acceptable. The WAL redo always happens for one particular page. If the WAL record -coantains changes to other pages, they are ignored. +contains changes to other pages, they are ignored. diff --git a/docs/rfcs/002-storage.md b/docs/rfcs/002-storage.md index f99683cf09..d11b750e73 100644 --- a/docs/rfcs/002-storage.md +++ b/docs/rfcs/002-storage.md @@ -1,4 +1,4 @@ -# Zenith storage node — alternative +# Neon storage node — alternative ## **Design considerations** diff --git a/docs/rfcs/003-laptop-cli.md b/docs/rfcs/003-laptop-cli.md index 1a549c2df5..003a05bd16 100644 --- a/docs/rfcs/003-laptop-cli.md +++ b/docs/rfcs/003-laptop-cli.md @@ -1,6 +1,6 @@ # Command line interface (end-user) -Zenith CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside zenith distribution at least at the start. +Neon CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside neon distribution at least at the start. This proposal is focused on managing local installations. For cluster operations, different tooling would be needed. The point of integration between the two is storage URL: no matter how complex cluster setup is it may provide an endpoint where the user may push snapshots. @@ -8,40 +8,40 @@ The most important concept here is a snapshot, which can be created/pushed/pulle # Possible usage scenarios -## Install zenith, run a postgres +## Install neon, run a postgres ``` -> brew install pg-zenith -> zenith pg create # creates pgdata with default pattern pgdata$i -> zenith pg list +> brew install pg-neon +> neon pg create # creates pgdata with default pattern pgdata$i +> neon pg list ID PGDATA USED STORAGE ENDPOINT -primary1 pgdata1 0G zenith-local localhost:5432 +primary1 pgdata1 0G neon-local localhost:5432 ``` -## Import standalone postgres to zenith +## Import standalone postgres to neon ``` -> zenith snapshot import --from=basebackup://replication@localhost:5432/ oldpg +> neon snapshot import --from=basebackup://replication@localhost:5432/ oldpg [====================------------] 60% | 20MB/s -> zenith snapshot list +> neon snapshot list ID SIZE PARENT oldpg 5G - -> zenith pg create --snapshot oldpg +> neon pg create --snapshot oldpg Started postgres on localhost:5432 -> zenith pg list +> neon pg list ID PGDATA USED STORAGE ENDPOINT -primary1 pgdata1 5G zenith-local localhost:5432 +primary1 pgdata1 5G neon-local localhost:5432 -> zenith snapshot destroy oldpg +> neon snapshot destroy oldpg Ok ``` Also, we may start snapshot import implicitly by looking at snapshot schema ``` -> zenith pg create --snapshot basebackup://replication@localhost:5432/ +> neon pg create --snapshot basebackup://replication@localhost:5432/ Downloading snapshot... Done. Started postgres on localhost:5432 Destroying snapshot... Done. 
@@ -52,39 +52,39 @@ Destroying snapshot... Done. Since we may export the whole snapshot as one big file (tar of basebackup, maybe with some manifest) it may be shared over conventional means: http, ssh, [git+lfs](https://docs.github.com/en/github/managing-large-files/about-git-large-file-storage). ``` -> zenith pg create --snapshot http://learn-postgres.com/movies_db.zenith movies +> neon pg create --snapshot http://learn-postgres.com/movies_db.neon movies ``` ## Create snapshot and push it to the cloud ``` -> zenith snapshot create pgdata1@snap1 -> zenith snapshot push --to ssh://stas@zenith.tech pgdata1@snap1 +> neon snapshot create pgdata1@snap1 +> neon snapshot push --to ssh://stas@neon.tech pgdata1@snap1 ``` ## Rollback database to the snapshot -One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `zenith pg checkout`. +One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `neon pg checkout`. ``` -> zenith pg list +> neon pg list ID PGDATA USED STORAGE ENDPOINT -primary1 pgdata1 5G zenith-local localhost:5432 +primary1 pgdata1 5G neon-local localhost:5432 -> zenith snapshot create pgdata1@snap1 +> neon snapshot create pgdata1@snap1 -> zenith snapshot list +> neon snapshot list ID SIZE PARENT oldpg 5G - pgdata1@snap1 6G - pgdata1@CURRENT 6G - -> zenith pg checkout pgdata1@snap1 +> neon pg checkout pgdata1@snap1 Stopping postgres on pgdata1. Rolling back pgdata1@CURRENT to pgdata1@snap1. Starting postgres on pgdata1. -> zenith snapshot list +> neon snapshot list ID SIZE PARENT oldpg 5G - pgdata1@snap1 6G - @@ -99,7 +99,7 @@ Some notes: pgdata1@CURRENT -- implicit snapshot representing the current state PITR area acts like a continuous snapshot where you can reset the database to any point in time within this area (by area I mean some TTL period or some size limit, both possibly infinite). ``` -> zenith pitr create --storage s3tank --ttl 30d --name pitr_last_month +> neon pitr create --storage s3tank --ttl 30d --name pitr_last_month ``` Resetting the database to some state in past would require creating a snapshot on some lsn / time in this pirt area. @@ -108,29 +108,29 @@ Resetting the database to some state in past would require creating a snapshot o ## storage -Storage is either zenith pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default. +Storage is either neon pagestore or s3. 
Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default. -**zenith storage attach** -t [native|s3] -c key=value -n name +**neon storage attach** -t [native|s3] -c key=value -n name -Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=zenith.tech/stas/mystore. Other possible term for native is 'zstore'. +Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=neon.tech/stas/mystore. Other possible term for native is 'zstore'. -**zenith storage list** +**neon storage list** Show currently attached storages. For example: ``` -> zenith storage list +> neon storage list NAME USED TYPE OPTIONS PATH -local 5.1G zenith-local /opt/zenith/store/local -local.compr 20.4G zenith-local compression=on /opt/zenith/store/local.compr -zcloud 60G zenith-remote zenith.tech/stas/mystore +local 5.1G neon-local /opt/neon/store/local +local.compr 20.4G neon-local compression=on /opt/neon/store/local.compr +zcloud 60G neon-remote neon.tech/stas/mystore s3tank 80G S3 ``` -**zenith storage detach** +**neon storage detach** -**zenith storage show** +**neon storage show** @@ -140,29 +140,29 @@ Manages postgres data directories and can start postgres instances with proper c Pg is a term for a single postgres running on some data. I'm trying to avoid separation of datadir management and postgres instance management -- both that concepts bundled here together. -**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata +**neon pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata Creates (initializes) new data directory in given storage and starts postgres. I imagine that storage for this operation may be only local and data movement to remote location happens through snapshots/pitr. --no-start: just init datadir without creating ---snapshot snap: init from the snapshot. Snap is a name or URL (zenith.tech/stas/mystore/snap1) +--snapshot snap: init from the snapshot. Snap is a name or URL (neon.tech/stas/mystore/snap1) --cow: initialize Copy-on-Write data directory on top of some snapshot (makes sense if it is a snapshot of currently running a database) -**zenith pg destroy** +**neon pg destroy** -**zenith pg start** [--replica] pgdata +**neon pg start** [--replica] pgdata Start postgres with proper extensions preloaded/installed. -**zenith pg checkout** +**neon pg checkout** Rollback data directory to some previous snapshot. -**zenith pg stop** pg_id +**neon pg stop** pg_id -**zenith pg list** +**neon pg list** ``` ROLE PGDATA USED STORAGE ENDPOINT @@ -173,7 +173,7 @@ primary my_pg2 3.2G local.compr localhost:5435 - my_pg3 9.2G local.compr - ``` -**zenith pg show** +**neon pg show** ``` my_pg: @@ -194,7 +194,7 @@ my_pg: ``` -**zenith pg start-rest/graphql** pgdata +**neon pg start-rest/graphql** pgdata Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, just an idea. @@ -203,35 +203,35 @@ Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, Snapshot creation is cheap -- no actual data is copied, we just start retaining old pages. Snapshot size means the amount of retained data, not all data. Snapshot name looks like pgdata_name@tag_name. 
tag_name is set by the user during snapshot creation. There are some reserved tag names: CURRENT represents the current state of the data directory; HEAD{i} represents the data directory state that resided in the database before i-th checkout. -**zenith snapshot create** pgdata_name@snap_name +**neon snapshot create** pgdata_name@snap_name Creates a new snapshot in the same storage where pgdata_name exists. -**zenith snapshot push** --to url pgdata_name@snap_name +**neon snapshot push** --to url pgdata_name@snap_name -Produces binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `zenith snapshot recv` before push happens. If url has some special schema like zenith:// receiving side may require auth start `zenith snapshot recv` on the go. +Produces binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `neon snapshot recv` before push happens. If url has some special schema like neon:// receiving side may require auth start `neon snapshot recv` on the go. -**zenith snapshot recv** +**neon snapshot recv** Starts a port listening for a basebackup stream, prints connection info to stdout (so that user may use that in push command), and expects data on that socket. -**zenith snapshot pull** --from url or path +**neon snapshot pull** --from url or path -Connects to a remote zenith/s3/file and pulls snapshot. The remote site should be zenith service or files in our format. +Connects to a remote neon/s3/file and pulls snapshot. The remote site should be neon service or files in our format. -**zenith snapshot import** --from basebackup://<...> or path +**neon snapshot import** --from basebackup://<...> or path Creates a new snapshot out of running postgres via basebackup protocol or basebackup files. -**zenith snapshot export** +**neon snapshot export** -Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be zenith own format which is handy for us (but I think just tar of basebackup would be okay). +Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be neon own format which is handy for us (but I think just tar of basebackup would be okay). -**zenith snapshot diff** snap1 snap2 +**neon snapshot diff** snap1 snap2 Shows size of data changed between two snapshots. We also may provide options to diff schema/data in tables. To do that start temp read-only postgreses. -**zenith snapshot destroy** +**neon snapshot destroy** ## pitr @@ -239,7 +239,7 @@ Pitr represents wal stream and ttl policy for that stream XXX: any suggestions on a better name? -**zenith pitr create** name +**neon pitr create** name --ttl = inf | period @@ -247,21 +247,21 @@ XXX: any suggestions on a better name? --storage = storage_name -**zenith pitr extract-snapshot** pitr_name --lsn xxx +**neon pitr extract-snapshot** pitr_name --lsn xxx Creates a snapshot out of some lsn in PITR area. The obtained snapshot may be managed with snapshot routines (move/send/export) -**zenith pitr gc** pitr_name +**neon pitr gc** pitr_name Force garbage collection on some PITR area. 
-**zenith pitr list** +**neon pitr list** -**zenith pitr destroy** +**neon pitr destroy** ## console -**zenith console** +**neon console** Opens browser targeted at web console with the more or less same functionality as described here. diff --git a/docs/rfcs/004-durability.md b/docs/rfcs/004-durability.md index d4716156d1..6b83c77403 100644 --- a/docs/rfcs/004-durability.md +++ b/docs/rfcs/004-durability.md @@ -6,7 +6,7 @@ When do we consider the WAL record as durable, so that we can acknowledge the commit to the client and be reasonably certain that we will not lose the transaction? -Zenith uses a group of WAL safekeeper nodes to hold the generated WAL. +Neon uses a group of WAL safekeeper nodes to hold the generated WAL. A WAL record is considered durable, when it has been written to a majority of WAL safekeeper nodes. In this document, I use 5 safekeepers, because I have five fingers. A WAL record is durable, diff --git a/docs/rfcs/005-zenith_local.md b/docs/rfcs/005-zenith_local.md index e36d0a9ae3..6c283d7a37 100644 --- a/docs/rfcs/005-zenith_local.md +++ b/docs/rfcs/005-zenith_local.md @@ -1,23 +1,23 @@ -# Zenith local +# Neon local -Here I list some objectives to keep in mind when discussing zenith-local design and a proposal that brings all components together. Your comments on both parts are very welcome. +Here I list some objectives to keep in mind when discussing neon-local design and a proposal that brings all components together. Your comments on both parts are very welcome. #### Why do we need it? - For distribution - this easy to use binary will help us to build adoption among developers. - For internal use - to test all components together. -In my understanding, we consider it to be just a mock-up version of zenith-cloud. +In my understanding, we consider it to be just a mock-up version of neon-cloud. > Question: How much should we care about durability and security issues for a local setup? #### Why is it better than a simple local postgres? -- Easy one-line setup. As simple as `cargo install zenith && zenith start` +- Easy one-line setup. As simple as `cargo install neon && neon start` - Quick and cheap creation of compute nodes over the same storage. > Question: How can we describe a use-case for this feature? -- Zenith-local can work with S3 directly. +- Neon-local can work with S3 directly. - Push and pull images (snapshots) to remote S3 to exchange data with other users. @@ -31,50 +31,50 @@ Ideally, just one binary that incorporates all elements we need. #### Components: -- **zenith-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way. -CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md -WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli +- **neon-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way. +CLI proposal is here https://github.com/neondatabase/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md +WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src/bin/cli -- **zenith-console** - WEB UI with same functionality as CLI. +- **neon-console** - WEB UI with same functionality as CLI. >Note: not for the first release. -- **zenith-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below. 
- > Idea: spawn all other components as child processes, so that we could shutdown everything by stopping zenith-local. +- **neon-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below. + > Idea: spawn all other components as child processes, so that we could shutdown everything by stopping neon-local. -- **zenith-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation). +- **neon-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation). > Question: Probably, for local setup we should be able to bypass page-storage and interact directly with S3 to avoid double caching in shared buffers and page-server? -WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src +WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src -- **zenith-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to zenith. +- **neon-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to neon. > Question: How should it operate in a local setup? Will we manage it ourselves or ask user to provide credentials for existing S3 object storage (i.e. minio)? > Question: Do we use it together with local page store or they are interchangeable? WIP code is ??? -- **zenith-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" is succeed. +- **neon-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" is succeed. > Question: How should it operate in a local setup? In my understanding it should push WAL directly to S3 (if we use it) or store all data locally (if we use local page storage). The latter option seems meaningless (extra overhead and no gain), but it is still good to test the system. -WIP code is here: https://github.com/libzenith/postgres/tree/main/src/bin/safekeeper +WIP code is here: https://github.com/neondatabase/postgres/tree/main/src/bin/safekeeper -- **zenith-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database. +- **neon-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database. - WIP code is in main branch and here: https://github.com/libzenith/postgres/commits/compute_node + WIP code is in main branch and here: https://github.com/neondatabase/postgres/commits/compute_node #### REST API: Service endpoint: `http://localhost:3000` Resources: -- /storages - Where data lives: zenith-pageserver or zenith-s3 -- /pgs - Postgres - zenith-computenode +- /storages - Where data lives: neon-pageserver or neon-s3 +- /pgs - Postgres - neon-computenode - /snapshots - snapshots **TODO** ->Question: Do we want to extend this API to manage zenith components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all? +>Question: Do we want to extend this API to manage neon components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all? 
Methods and their mapping to CLI: -- /storages - zenith-pageserver or zenith-s3 +- /storages - neon-pageserver or neon-s3 CLI | REST API ------------- | ------------- @@ -84,7 +84,7 @@ storage list | GET /storages storage show -n name | GET /storages/:storage_name -- /pgs - zenith-computenode +- /pgs - neon-computenode CLI | REST API ------------- | ------------- diff --git a/docs/rfcs/006-laptop-cli-v2-CLI.md b/docs/rfcs/006-laptop-cli-v2-CLI.md index 84dc932211..5030ecc7e7 100644 --- a/docs/rfcs/006-laptop-cli-v2-CLI.md +++ b/docs/rfcs/006-laptop-cli-v2-CLI.md @@ -1,45 +1,45 @@ -Zenith CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog". +Neon CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog". # CLI v2 (after chatting with Carl) -Zenith introduces the notion of a repository. +Neon introduces the notion of a repository. ```bash -zenith init -zenith clone zenith://zenith.tech/piedpiper/northwind -- clones a repo to the northwind directory +neon init +neon clone neon://neon.tech/piedpiper/northwind -- clones a repo to the northwind directory ``` Once you have a cluster catalog you can explore it ```bash -zenith log -- returns a list of commits -zenith status -- returns if there are changes in the catalog that can be committed -zenith commit -- commits the changes and generates a new commit hash -zenith branch experimental -- creates a branch called testdb based on a given commit hash +neon log -- returns a list of commits +neon status -- returns if there are changes in the catalog that can be committed +neon commit -- commits the changes and generates a new commit hash +neon branch experimental -- creates a branch called testdb based on a given commit hash ``` To make changes in the catalog you need to run compute nodes ```bash -- here is how you a compute node -zenith start /home/pipedpiper/northwind:main -- starts a compute instance -zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud +neon start /home/pipedpiper/northwind:main -- starts a compute instance +neon start neon://neon.tech/northwind:main -- starts a compute instance in the cloud -- you can start a compute node against any hash or branch -zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port) +neon start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port) -- you can start a compute node against any hash or branch -zenith start /home/pipedpiper/northwind: --port 8009 -- start another compute instance (on different port) +neon start /home/pipedpiper/northwind: --port 8009 -- start another compute instance (on different port) -- After running some DML you can run --- zenith status and see how there are two WAL streams one on top of +-- neon status and see how there are two WAL streams one on top of -- the main branch -zenith status +neon status -- and another on top of the experimental branch -zenith status -b experimental +neon status -b experimental -- you can commit each branch separately -zenith commit main +neon commit main -- or -zenith commit -c /home/pipedpiper/northwind:experimental +neon commit -c 
/home/pipedpiper/northwind:experimental ``` Starting compute instances against cloud environments @@ -47,18 +47,18 @@ Starting compute instances against cloud environments ```bash -- you can start a compute instance against the cloud environment -- in this case all of the changes will be streamed into the cloud -zenith start https://zenith:tech/pipedpiper/northwind:main -zenith start https://zenith:tech/pipedpiper/northwind:main -zenith status -c https://zenith:tech/pipedpiper/northwind:main -zenith commit -c https://zenith:tech/pipedpiper/northwind:main -zenith branch -c https://zenith:tech/pipedpiper/northwind: experimental +neon start https://neon:tech/pipedpiper/northwind:main +neon start https://neon:tech/pipedpiper/northwind:main +neon status -c https://neon:tech/pipedpiper/northwind:main +neon commit -c https://neon:tech/pipedpiper/northwind:main +neon branch -c https://neon:tech/pipedpiper/northwind: experimental ``` Pushing data into the cloud ```bash -- pull all the commits from the cloud -zenith pull +neon pull -- push all the commits to the cloud -zenith push +neon push ``` diff --git a/docs/rfcs/006-laptop-cli-v2-repository-structure.md b/docs/rfcs/006-laptop-cli-v2-repository-structure.md index e6e6e172ad..749a940313 100644 --- a/docs/rfcs/006-laptop-cli-v2-repository-structure.md +++ b/docs/rfcs/006-laptop-cli-v2-repository-structure.md @@ -1,14 +1,14 @@ # Repository format -A Zenith repository is similar to a traditional PostgreSQL backup +A Neon repository is similar to a traditional PostgreSQL backup archive, like a WAL-G bucket or pgbarman backup catalogue. It holds multiple versions of a PostgreSQL database cluster. -The distinguishing feature is that you can launch a Zenith Postgres +The distinguishing feature is that you can launch a Neon Postgres server directly against a branch in the repository, without having to -"restore" it first. Also, Zenith manages the storage automatically, +"restore" it first. Also, Neon manages the storage automatically, there is no separation between full and incremental backups nor WAL -archive. Zenith relies heavily on the WAL, and uses concepts similar +archive. Neon relies heavily on the WAL, and uses concepts similar to incremental backups and WAL archiving internally, but it is hidden from the user. @@ -19,15 +19,15 @@ efficient. Just something to get us started. The repository directory looks like this: - .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/ - .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots// - .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history + .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/ + .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots// + .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history - .zenith/refs/branches/mybranch - .zenith/refs/tags/foo - .zenith/refs/tags/bar + .neon/refs/branches/mybranch + .neon/refs/tags/foo + .neon/refs/tags/bar - .zenith/datadirs/ + .neon/datadirs/ ### Timelines @@ -39,7 +39,7 @@ All WAL is generated on a timeline. You can launch a read-only node against a tag or arbitrary LSN on a timeline, but in order to write, you need to create a timeline. -Each timeline is stored in a directory under .zenith/timelines. It +Each timeline is stored in a directory under .neon/timelines. It consists of a WAL archive, containing all the WAL in the standard PostgreSQL format, under the wal/ subdirectory. @@ -66,18 +66,18 @@ contains the UUID of the timeline (and LSN, for tags). 
### Datadirs -.zenith/datadirs contains PostgreSQL data directories. You can launch +.neon/datadirs contains PostgreSQL data directories. You can launch a Postgres instance on one of them with: ``` - postgres -D .zenith/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c + postgres -D .neon/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c ``` All the actual data is kept in the timeline directories, under -.zenith/timelines. The data directories are only needed for active +.neon/timelines. The data directories are only needed for active PostgreSQL instances. After an instance is stopped, the data directory -can be safely removed. "zenith start" will recreate it quickly from -the data in .zenith/timelines, if it's missing. +can be safely removed. "neon start" will recreate it quickly from +the data in .neon/timelines, if it's missing. ## Version 2 @@ -103,14 +103,14 @@ more advanced. The exact format is TODO. But it should support: ### Garbage collection -When you run "zenith gc", old timelines that are no longer needed are +When you run "neon gc", old timelines that are no longer needed are removed. That involves collecting the list of "unreachable" objects, starting from the named branches and tags. Also, if enough WAL has been generated on a timeline since last snapshot, a new snapshot or delta is created. -### zenith push/pull +### neon push/pull Compare the tags and branches on both servers, and copy missing ones. For each branch, compare the timeline it points to in both servers. If @@ -123,7 +123,7 @@ every time you start up an instance? Then you would detect that the timelines have diverged. That would match with the "epoch" concept that we have in the WAL safekeeper -### zenith checkout/commit +### neon checkout/commit In this format, there is no concept of a "working tree", and hence no concept of checking out or committing. All modifications are done on @@ -134,7 +134,7 @@ You can easily fork off a temporary timeline to emulate a "working tree". You can later remove it and have it garbage collected, or to "commit", re-point the branch to the new timeline. -If we want to have a worktree and "zenith checkout/commit" concept, we can +If we want to have a worktree and "neon checkout/commit" concept, we can emulate that with a temporary timeline. Create the temporary timeline at -"zenith checkout", and have "zenith commit" modify the branch to point to +"neon checkout", and have "neon commit" modify the branch to point to the new timeline. diff --git a/docs/rfcs/007-serverless-on-laptop.md b/docs/rfcs/007-serverless-on-laptop.md index e6355f4a03..96f117bfe9 100644 --- a/docs/rfcs/007-serverless-on-laptop.md +++ b/docs/rfcs/007-serverless-on-laptop.md @@ -4,27 +4,27 @@ How it works now 1. Create repository, start page server on it ``` -$ zenith init +$ neon init ... created main branch -new zenith repository was created in .zenith +new neon repository was created in .neon -$ zenith pageserver start -Starting pageserver at '127.0.0.1:64000' in .zenith +$ neon pageserver start +Starting pageserver at '127.0.0.1:64000' in .neon Page server started ``` 2. Create a branch, and start a Postgres instance on it ``` -$ zenith branch heikki main +$ neon branch heikki main branching at end of WAL: 0/15ECF68 -$ zenith pg create heikki +$ neon pg create heikki Initializing Postgres on timeline 76cf9279915be7797095241638e64644... 
-Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/pg1 port=55432 +Extracting base backup to create postgres instance: path=.neon/pgdatadirs/pg1 port=55432 -$ zenith pg start pg1 +$ neon pg start pg1 Starting postgres node at 'host=127.0.0.1 port=55432 user=heikki' waiting for server to start.... done server started @@ -52,20 +52,20 @@ serverless on your laptop, so that the workflow becomes just: 1. Create repository, start page server on it (same as before) ``` -$ zenith init +$ neon init ... created main branch -new zenith repository was created in .zenith +new neon repository was created in .neon -$ zenith pageserver start -Starting pageserver at '127.0.0.1:64000' in .zenith +$ neon pageserver start +Starting pageserver at '127.0.0.1:64000' in .neon Page server started ``` 2. Create branch ``` -$ zenith branch heikki main +$ neon branch heikki main branching at end of WAL: 0/15ECF68 ``` diff --git a/docs/rfcs/008-push-pull.md b/docs/rfcs/008-push-pull.md index 272628e1ce..a36932222a 100644 --- a/docs/rfcs/008-push-pull.md +++ b/docs/rfcs/008-push-pull.md @@ -7,22 +7,22 @@ Here is a proposal about implementing push/pull mechanics between pageservers. W The origin represents connection info for some remote pageserver. Let's use here same commands as git uses except using explicit list subcommand (git uses `origin -v` for that). ``` -zenith origin add -zenith origin list -zenith origin remove +neon origin add +neon origin list +neon origin remove ``` Connection URI is a string of form `postgresql://user:pass@hostname:port` (https://www.postgresql.org/docs/13/libpq-connect.html#id-1.7.3.8.3.6). We can start with libpq password auth and later add support for client certs or require ssh as transport or invent some other kind of transport. -Behind the scenes, this commands may update toml file inside .zenith directory. +Behind the scenes, these commands may update a toml file inside the .neon directory. ## Push ### Pushing branch ``` -zenith push mybranch cloudserver # push to eponymous branch in cloudserver -zenith push mybranch cloudserver:otherbranch # push to a different branch in cloudserver +neon push mybranch cloudserver # push to eponymous branch in cloudserver +neon push mybranch cloudserver:otherbranch # push to a different branch in cloudserver ``` Exact mechanics would be slightly different in the following situations: diff --git a/docs/rfcs/009-snapshot-first-storage-cli.md b/docs/rfcs/009-snapshot-first-storage-cli.md index 0acbd68f86..bbd0f75fe2 100644 --- a/docs/rfcs/009-snapshot-first-storage-cli.md +++ b/docs/rfcs/009-snapshot-first-storage-cli.md @@ -2,7 +2,7 @@ While working on export/import commands, I understood that they fit really well We may think about backups as snapshots in a different format (i.e. plain pgdata format, basebackup tar format, WAL-G format (if they want to support it) and so on). They use the same storage API, the only difference is the code that packs/unpacks files. -Even if zenith aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postgres to zenith. +Even if neon aims to maintain durability using its own snapshots, backups will be useful for uploading data from postgres to neon. So here is an attempt to design a consistent CLI for different usage scenarios: @@ -16,8 +16,8 @@ Save`storage_dest` and other parameters in config. Push snapshots to `storage_dest` in background. ``` -zenith init --storage_dest=S3_PREFIX -zenith start +neon init --storage_dest=S3_PREFIX +neon start ``` #### 2. 
Restart pageserver (manually or crash-recovery). @@ -25,7 +25,7 @@ Take `storage_dest` from pageserver config, start pageserver from latest snapsho Push snapshots to `storage_dest` in background. ``` -zenith start +neon start ``` #### 3. Import. @@ -35,22 +35,22 @@ Do not save `snapshot_path` and `snapshot_format` in config, as it is a one-time Save `storage_dest` parameters in config. Push snapshots to `storage_dest` in background. ``` -//I.e. we want to start zenith on top of existing $PGDATA and use s3 as a persistent storage. -zenith init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX -zenith start +//I.e. we want to start neon on top of existing $PGDATA and use s3 as persistent storage. +neon init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX +neon start ``` How to pass credentials needed for `snapshot_path`? #### 4. Export. Manually push snapshot to `snapshot_path` which differs from `storage_dest` -Optionally set `snapshot_format`, which can be plain pgdata format or zenith format. +Optionally set `snapshot_format`, which can be plain pgdata format or neon format. ``` -zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata +neon export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata ``` #### Notes and questions - safekeeper s3_offload should use the same (or similar) syntax for storage. How to set it in UI? -- Why do we need `zenith init` as a separate command? Can't we init everything at first start? +- Why do we need `neon init` as a separate command? Can't we init everything at first start? - We can think of better names for all options. - Export to plain postgres format will be useless if we are not 100% compatible on page level. I can recall at least one such difference - PD_WAL_LOGGED flag in pages. diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md index 7e815abf73..2f3ccbc09b 100644 --- a/docs/rfcs/013-term-history.md +++ b/docs/rfcs/013-term-history.md @@ -9,7 +9,7 @@ receival and this might lag behind `term`; safekeeper switches to epoch `n` when it has received all committed log records from all `< n` terms. This roughly corresponds to proposed in -https://github.com/zenithdb/rfcs/pull/3/files +https://github.com/neondatabase/rfcs/pull/3/files This is our biggest difference from Raft. In Raft, every log record is diff --git a/docs/rfcs/014-safekeepers-gossip.md b/docs/rfcs/014-safekeepers-gossip.md index 3d6cc04b94..ff38a0a0ef 100644 --- a/docs/rfcs/014-safekeepers-gossip.md +++ b/docs/rfcs/014-safekeepers-gossip.md @@ -1,6 +1,6 @@ # Safekeeper gossip -Extracted from this [PR](https://github.com/zenithdb/rfcs/pull/13) +Extracted from this [PR](https://github.com/neondatabase/rfcs/pull/13) ## Motivation diff --git a/docs/rfcs/015-storage-messaging.md b/docs/rfcs/015-storage-messaging.md index a415b90459..7702311d65 100644 --- a/docs/rfcs/015-storage-messaging.md +++ b/docs/rfcs/015-storage-messaging.md @@ -2,7 +2,7 @@ Created on 19.01.22 -Initially created [here](https://github.com/zenithdb/rfcs/pull/16) by @kelvich. +Initially created [here](https://github.com/neondatabase/rfcs/pull/16) by @kelvich. It is an alternative to (014-safekeeper-gossip)[] @@ -292,4 +292,4 @@ But with an etcd we are in a bit different situation: 1. We don't need persistency and strong consistency guarantees for the data we store in etcd 2. 
etcd uses Grpc as a protocol, and messages are pretty simple -So it looks like implementing in-mem store with etcd interface is straightforward thing _if we will want that in future_. At the same time, we can avoid implementing it right now, and we will be able to run local zenith installation with etcd running somewhere in the background (as opposed to building and running console, which in turn requires Postgres). +So it looks like implementing in-mem store with etcd interface is straightforward thing _if we will want that in future_. At the same time, we can avoid implementing it right now, and we will be able to run local neon installation with etcd running somewhere in the background (as opposed to building and running console, which in turn requires Postgres). diff --git a/docs/rfcs/017-console-split.md b/docs/rfcs/017-console-split.md new file mode 100644 index 0000000000..8036920610 --- /dev/null +++ b/docs/rfcs/017-console-split.md @@ -0,0 +1,420 @@ +# Splitting cloud console + +Created on 17.06.2022 + +## Summary + +Currently we have `cloud` repository that contains code implementing public API for our clients as well as code for managing storage and internal infrastructure services. We can split everything user-related from everything storage-related to make it easier to test and maintain. + +This RFC proposes to introduce a new control-plane service with HTTP API. The overall architecture will look like this: + +```markup +. x + external area x internal area + (our clients) x (our services) + x + x ┌───────────────────────┐ + x ┌───────────────┐ > ┌─────────────────────┐ │ Storage (EC2) │ + x │ console db │ > │ control-plane db │ │ │ + x └───────────────┘ > └─────────────────────┘ │ - safekeepers │ + x ▲ > ▲ │ - pageservers │ + x │ > │ │ │ +┌──────────────────┐ x ┌───────┴───────┐ > │ │ Dependencies │ +│ browser UI ├──►│ │ > ┌──────────┴──────────┐ │ │ +└──────────────────┘ x │ │ > │ │ │ - etcd │ + x │ console ├───────►│ control-plane ├────►│ - S3 │ +┌──────────────────┐ x │ │ > │ (deployed in k8s) │ │ - more? │ +│public API clients├──►│ │ > │ │ │ │ +└──────────────────┘ x └───────┬───────┘ > └──────────┬──────────┘ └───────────────────────┘ + x │ > ▲ │ ▲ + x │ > │ │ │ + x ┌───────┴───────┐ > │ │ ┌───────────┴───────────┐ + x │ dependencies │ > │ │ │ │ + x │- analytics │ > │ └───────────────►│ computes │ + x │- auth │ > │ │ (deployed in k8s) │ + x │- billing │ > │ │ │ + x └───────────────┘ > │ └───────────────────────┘ + x > │ ▲ + x > ┌─────┴───────────────┐ │ +┌──────────────────┐ x > │ │ │ +│ │ x > │ proxy ├─────────────────┘ +│ postgres ├───────────────────────────►│ (deployed in k8s) │ +│ users │ x > │ │ +│ │ x > └─────────────────────┘ +└──────────────────┘ x > + > + > + closed-source > open-source + > + > +``` + +Notes: + +- diagram is simplified in the less-important places +- directed arrows are strict and mean that connections in the reverse direction are forbidden + +This split is quite complex and this RFC proposes several smaller steps to achieve the larger goal: + +1. Start by refactoring the console code, the goal is to have console and control-plane code in the different directories without dependencies on each other. +2. Do similar refactoring for tables in the console database, remove queries selecting data from both console and control-plane; move control-plane tables to a separate database. +3. Implement control-plane HTTP API serving on a separate TCP port; make all console→control-plane calls to go through that HTTP API. +4. 
Move control-plane source code to the neon repo; start control-plane as a separate service. + +## Motivation + +These are the two most important problems we want to solve: + +- Publish open-source implementation of all our cloud/storage features +- Make a unified control-plane that is used in all cloud (serverless) and local (tests) setups + +Right now we have some closed-source code in the cloud repo. That code contains the implementation for running Neon computes in k8s, and without that code it’s impossible to automatically scale PostgreSQL computes. That means that we don’t have an open-source serverless PostgreSQL at the moment. + +After splitting out and open-sourcing the control-plane service we will have source code and Docker images for all storage services. That control-plane service should have an HTTP API for creating and managing tenants (including all our storage features), while proxy will listen for incoming connections and create computes on-demand. + +Improving our test suite is an important task, but requires a lot of prerequisites and may require a separate RFC. Possible implementation of that is described in the section [Next steps](#next-steps). + +Another piece of motivation is better involvement of the storage development team in the control-plane. By splitting control-plane from the console, it becomes more convenient to test and develop control-plane while paying less attention to “business” features, such as user management, billing and analytics. + +For example, console currently requires authentication providers such as GitHub OAuth to work at all, as well as nodejs to be able to build it locally. It will be more convenient to build and run it locally without these requirements. + +## Proposed implementation + +### Current state of things + +Let’s start with defining the current state of things at the moment of this proposal. We have three repositories containing source code: + +- open-source `postgres` — our fork of postgres +- open-source `neon` — our main repository for storage source code +- closed-source `cloud` — mostly console backend and UI frontend + +This proposal aims not to change anything in the existing code in the `neon` and `postgres` repositories, but to create a control-plane service and move its source code from `cloud` to the `neon` repository. That means that we need to split code in the `cloud` repo only, and will consider only this repository for exploring its source code. + +Let’s look at the miscellaneous things in the `cloud` repo which are NOT part of the console application, i.e. NOT the Go source code that is compiled to the `./console` binary. 
There we have: + +- command-line tools, such as cloudbench, neonadmin +- markdown documentation +- cloud operations scripts (helm, terraform, ansible) +- configs and other things +- e2e python tests +- incidents playbooks +- UI frontend +- Make build scripts, code generation scripts +- database migrations +- swagger definitions + +And also let’s take a look at what we have in the console source code, which is the service we’d like to split: + +- API Servers + - Public API v2 + - Management API v2 + - Public API v1 + - Admin API v1 (same port as Public API v1) + - Management API v1 +- Workers + - Monitor Compute Activity + - Watch Failed Operations + - Availability Checker + - Business Metrics Collector +- Internal Services + - Auth Middleware, UserIsAdmin, Cookies + - Cable Websocket Server + - Admin Services + - Global Settings, Operations, Pageservers, Platforms, Projects, Safekeepers, Users + - Authenticate Proxy + - API Keys + - App Controller, serving UI HTML + - Auth Controller + - Branches + - Projects + - Psql Connect + Passwordless login + - Users + - Cloud Metrics + - User Metrics + - Invites + - Pageserver/Safekeeper management + - Operations, k8s/docker/common logic + - Platforms, Regions + - Project State + - Projects Roles, SCRAM + - Global Settings +- Other things + - segment analytics integration + - sentry integration + - other common utilities packages + +### Drawing the splitting line + +The most challenging and the most important thing is to define the line that will split the new control-plane service from the existing cloud service. If we don’t get it right, then we can end up having a lot more issues without many benefits. + +We propose to define that line as follows: + +- everything user-related stays in the console service +- everything storage-related should be in the control-plane service +- anything that falls in between should be decided case by case, but most likely should stay in the console service +- some similar parts should be in both services, such as admin/management/db_migrations + +We call user-related all requests that can be connected to some user. The general idea is to not have any user_id in the control-plane service and to operate exclusively on tenant_id+timeline_id, the same way as the existing storage services work now (compute, safekeeper, pageserver). + +Storage-related things can be defined as doing any of the following: + +- using k8s API +- doing requests to any of the storage services (proxy, compute, safekeeper, pageserver, etc.) 
+- tracking current status of tenants/timelines, managing lifetime of computes + +Based on that idea, we can say that new control-plane service should have the following components: + +- single HTTP API for everything + - Create and manage tenants and timelines + - Manage global settings and storage configuration (regions, platforms, safekeepers, pageservers) + - Admin API for storage health inspection and debugging +- Workers + - Monitor Compute Activity + - Watch Failed Operations + - Availability Checker +- Internal Services + - Admin Services + - Global Settings, Operations, Pageservers, Platforms, Tenants, Safekeepers + - Authenticate Proxy + - Branches + - Psql Connect + - Cloud Metrics + - Pageserver/Safekeeper management + - Operations, k8s/docker/common logic + - Platforms, Regions + - Tenant State + - Compute Roles, SCRAM + - Global Settings + +--- + +And other components should probably stay in the console service: + +- API Servers (no changes here) + - Public API v2 + - Management API v2 + - Public API v1 + - Admin API v1 (same port as Public API v1) + - Management API v1 +- Workers + - Business Metrics Collector +- Internal Services + - Auth Middleware, UserIsAdmin, Cookies + - Cable Websocket Server + - Admin Services + - Users admin stays the same + - Other admin services can redirect requests to the control-plane + - API Keys + - App Controller, serving UI HTML + - Auth Controller + - Projects + - User Metrics + - Invites + - Users + - Passwordless login +- Other things + - segment analytics integration + - sentry integration + - other common utilities packages + +There are also miscellaneous things that are useful for all kinds of services. So we can say that these things can be in both services: + +- markdown documentation +- e2e python tests +- make build scripts, code generation scripts +- database migrations +- swagger definitions + +The single entrypoint to the storage should be control-plane API. After we define that API, we can have code-generated implementation for the client and for the server. The general idea is to move code implementing storage components from the console to the API implementation inside the new control-plane service. + +After the code is moved to the new service, we can fill the created void by making API calls to the new service: + +- authorization of the client +- mapping user_id + project_id to the tenant_id +- calling the control-plane API + +### control-plane API + +Currently we have the following projects API in the console: + +``` +GET /projects/{project_id} +PATCH /projects/{project_id} +POST /projects/{project_id}/branches +GET /projects/{project_id}/databases +POST /projects/{project_id}/databases +GET /projects/{project_id}/databases/{database_id} +PUT /projects/{project_id}/databases/{database_id} +DELETE /projects/{project_id}/databases/{database_id} +POST /projects/{project_id}/delete +GET /projects/{project_id}/issue_token +GET /projects/{project_id}/operations +GET /projects/{project_id}/operations/{operation_id} +POST /projects/{project_id}/query +GET /projects/{project_id}/roles +POST /projects/{project_id}/roles +GET /projects/{project_id}/roles/{role_name} +DELETE /projects/{project_id}/roles/{role_name} +POST /projects/{project_id}/roles/{role_name}/reset_password +POST /projects/{project_id}/start +POST /projects/{project_id}/stop +POST /psql_session/{psql_session_id} +``` + +It looks fine and we probably already have clients relying on it. So we should not change it, at least for now. 
But most of these endpoints (if not all) are related to storage, and they suggest what the control-plane API should look like: + +``` +GET /tenants/{tenant_id} +PATCH /tenants/{tenant_id} +POST /tenants/{tenant_id}/branches +GET /tenants/{tenant_id}/databases +POST /tenants/{tenant_id}/databases +GET /tenants/{tenant_id}/databases/{database_id} +PUT /tenants/{tenant_id}/databases/{database_id} +DELETE /tenants/{tenant_id}/databases/{database_id} +POST /tenants/{tenant_id}/delete +GET /tenants/{tenant_id}/issue_token +GET /tenants/{tenant_id}/operations +GET /tenants/{tenant_id}/operations/{operation_id} +POST /tenants/{tenant_id}/query +GET /tenants/{tenant_id}/roles +POST /tenants/{tenant_id}/roles +GET /tenants/{tenant_id}/roles/{role_name} +DELETE /tenants/{tenant_id}/roles/{role_name} +POST /tenants/{tenant_id}/roles/{role_name}/reset_password +POST /tenants/{tenant_id}/start +POST /tenants/{tenant_id}/stop +POST /psql_session/{psql_session_id} +``` + +One of the options here is to use gRPC instead of HTTP, which has some useful features, but there are some strong points in favor of using plain HTTP: + +- HTTP API is easier to use for the clients +- we already have HTTP API in pageserver/safekeeper/console +- we probably want the control-plane API to be similar to the console API available in the cloud + +### Getting updates from the storage + +There are valid cases when we would like to know what has changed in the storage. For example, the console might want to know when a user's query started a compute and when that compute was scaled to zero afterwards, to know how much the user should pay for the service. Another example is to get info about reaching the disk space limits. Yet another example is to do analytics, such as how many users had at least one active project in a month. + +All of the above cases can happen without using the console, just by accessing a compute through the proxy. + +To solve this, we can have a log of events occurring in the storage (event logs). That is very similar to the operations table we have right now; the only difference is that events are immutable and we cannot change them after saving to the database. For example, we might want to have events for the following activities: + +- We finished processing some HTTP API query, such as resetting the password +- We changed some state, such as started or stopped a compute +- Operation is created +- Operation is started for the first time +- Operation is failed for the first time +- Operation is finished + +Once we save these events to the database, we can create an HTTP API to subscribe to these events. That API can look like this: + +``` +GET /events/ + +{ + "events": [...], + "next_cursor": 123 +} +``` + +It should be possible to replay event logs from some point in time, to get the state of almost anything from the storage services. That means that if we maintain some state in the control-plane database and we have a reason to have the same state in the console database, that is possible by polling events from the control-plane API and changing the state in the console database according to the events; a minimal consumer sketch follows below. + +### Next steps + +After implementing the control-plane HTTP API and starting control-plane as a separate service, we might want to think of exploiting the benefits of the new architecture, such as reorganizing test infrastructure. Possible options are listed in the [Next steps](#next-steps-1). 
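+To make the event-log idea from “Getting updates from the storage” more
+concrete, here is a minimal Rust sketch of a console-side consumer. The
+types, field names and endpoint shape are hypothetical, mirroring only the
+`events`/`next_cursor` example above; they are not a committed API:
+
+```rust
+use serde::Deserialize;
+
+#[derive(Deserialize)]
+struct Event {
+    // Illustrative fields; the real event schema is left open by this RFC.
+    kind: String,
+    tenant_id: String,
+}
+
+#[derive(Deserialize)]
+struct EventsPage {
+    events: Vec<Event>,
+    next_cursor: u64,
+}
+
+fn poll_events(base_url: &str, mut cursor: u64) -> reqwest::Result<()> {
+    let client = reqwest::blocking::Client::new();
+    loop {
+        // Resume from the last seen cursor so no events are missed or replayed.
+        let page: EventsPage = client
+            .get(format!("{base_url}/events/?cursor={cursor}"))
+            .send()?
+            .json()?;
+        for event in &page.events {
+            // Here the console would update its own database from the event.
+            println!("{}: {}", event.tenant_id, event.kind);
+        }
+        cursor = page.next_cursor;
+    }
+}
+```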
+ +## Non Goals + +This RFC doesn’t cover the actual cloud deployment scripts and schemas, such as terraform, ansible, k8s yaml’s and so on. + +## Impacted components + +Mostly the console, but this can also affect some storage services. + +## Scalability + +We should support starting several instances of the new control-plane service at the same time. + +At the same time, it should be possible to use only a single instance of control-plane, which can be useful for local tests. + +## Security implications + +The new control-plane service is an internal service, so no external requests can reach it. But at the same time, it contains an API to do absolutely anything with any of the tenants. That means that a bad internal actor can potentially read and write all of the tenants. To make this safer, we can have one of these: + +- A simple option is to protect all requests with a single private key, so that no one can make requests without having that one key. +- Another option is to have a separate token for every tenant and store these tokens in another secure place. This way it’s harder to access all tenants at once, because they have different tokens. + +## Alternative implementation + +There was an idea to create a k8s operator for managing storage services and computes, but the author of this RFC is not really familiar with it. + +Regarding smaller decisions, there are other options for the name of the new control-plane service: + +- storage-ctl +- cloud +- cloud-ctl + +## Pros/cons of proposed approaches (TODO) + +Pros: + +- All storage features are completely open-source +- Better test coverage, less difference between cloud and local setups +- Easier to develop storage and cloud features, because there is no need to set up the console for that +- Easier to deploy storage-only services to any cloud + +Cons: + +- All storage features are completely open-source +- Distributed services mean more code to connect different services and potential network issues +- Console needs to have a dependency on the storage API; there can be complications with developing a new feature in a branch +- More code to JOIN data from different services (console and control-plane) + +## Definition of Done + +We have a new control-plane service running in k8s. Source code for that control-plane service is located in the open-source neon repo. + +## Next steps + +After we’ve reached DoD, we can make further improvements. + +The first thing that can benefit from the split is local testing. The same control-plane service can implement starting computes as local processes instead of k8s deployments. If it also supports starting pageservers/safekeepers/proxy for the local setup, then it can completely replace the `./neon_local` binary, which is currently used for testing. The local testing environment can look like this: + +``` +┌─────────────────────┐ ┌───────────────────────┐ +│ │ │ Storage (local) │ +│ control-plane db │ │ │ +│ (local process) │ │ - safekeepers │ +│ │ │ - pageservers │ +└──────────▲──────────┘ │ │ + │ │ Dependencies │ +┌──────────┴──────────┐ │ │ +│ │ │ - etcd │ +│ control-plane ├────►│ - S3 │ +│ (local process) │ │ - more? 
│ +│ │ │ +└──────────┬──────────┘ └───────────────────────┘ + ▲ │ ▲ + │ │ │ + │ │ ┌───────────┴───────────┐ + │ │ │ │ + │ └───────────────►│ computes │ + │ │ (local processes) │ + │ │ │ +┌──────┴──────────────┐ └───────────────────────┘ +│ │ ▲ +│ proxy │ │ +│ (local process) ├─────────────────┘ +│ │ +└─────────────────────┘ +``` + +The key thing here is that the local control-plane service has the same API and almost the same implementation as the one deployed in k8s. This allows running the same e2e tests against both cloud and local setups. + +For the python test_runner tests everything can stay mostly the same. To do that, we just need to replace `./neon_local` cli commands with API calls to the control-plane. + +The benefit here will be in having fast local tests that are really close to our cloud setup. Bugs in k8s queries still cannot be found when running computes as local processes, but it should be really easy to start k8s locally (for example in k3s) and run the same tests with control-plane connected to the local k8s. + +Talking about console and UI tests, after the split there should be a way to test these without spinning up all the storage locally. The new control-plane service has a well-defined API, allowing us to mock it. This way we can create UI tests to verify the right calls are issued after specific UI interactions and verify that we render correct messages when API returns errors. \ No newline at end of file diff --git a/docs/rfcs/018-storage-messaging-2.md b/docs/rfcs/018-storage-messaging-2.md index 364f62dd2e..2419dd5fc5 100644 --- a/docs/rfcs/018-storage-messaging-2.md +++ b/docs/rfcs/018-storage-messaging-2.md @@ -78,7 +78,7 @@ with grpc streams and tokio mpsc channels. The implementation description is at It is just 500 lines of code and core functionality is complete. 1-1 pub sub gives about 120k received messages per second; having multiple subscribers in -different connecitons quickly scales to 1 million received messages per second. +different connections quickly scales to 1 million received messages per second. I had concerns about many concurrent streams in single connection, but 2^20 subscribers still work (though eat memory, with 10 publishers 20GB are consumed; in this implementation each publisher holds full copy of all subscribers). There @@ -95,12 +95,12 @@ other members, with best-effort this is simple. ### Security implications Communication happens in a private network that is not exposed to users; -additionaly we can add auth to the broker. +additionally we can add auth to the broker. ## Alternative: get existing pub-sub We could take some existing pub sub solution, e.g. RabbitMQ, Redis. But in this -case IMV simplicity of our own outweights external dependency costs (RabbitMQ is +case IMV simplicity of our own outweighs external dependency costs (RabbitMQ is much more complicated and needs VM; Redis Rust client maintenance is not ideal...). Also note that projects like CockroachDB and TiDB are based on gRPC as well. diff --git a/docs/rfcs/019-tenant-timeline-lifecycles.md b/docs/rfcs/019-tenant-timeline-lifecycles.md index 2734bf17b9..558b5335e7 100644 --- a/docs/rfcs/019-tenant-timeline-lifecycles.md +++ b/docs/rfcs/019-tenant-timeline-lifecycles.md @@ -74,7 +74,7 @@ TenantMaintenanceGuard: Like ActiveTenantGuard, but can be held even when the tenant is not in Active state. Used for operations like attach/detach. Perhaps allow only one such guard on a Tenant at a time. -Similarly for Timelines. 
We don't currentl have a "state" on Timeline, but I think +Similarly for Timelines. We don't currently have a "state" on Timeline, but I think we need at least two states: Active and Stopping. The Stopping state is used at deletion, to prevent new TimelineActiveGuards from appearing, while you wait for existing TimelineActiveGuards to die out. @@ -85,7 +85,7 @@ have a TenantActiveGuard, and the tenant's state changes from Active to Stopping, the is_shutdown_requested() function should return true, and shutdown_watcher() future should return. -This signaling doesn't neessarily need to cover all cases. For example, if you +This signaling doesn't necessarily need to cover all cases. For example, if you have a block of code in spawn_blocking(), it might be acceptable if is_shutdown_requested() doesn't return true even though the tenant is in Stopping state, as long as the code finishes reasonably fast. diff --git a/docs/rfcs/020-pageserver-s3-coordination.md b/docs/rfcs/020-pageserver-s3-coordination.md index 5e2912ba99..90ba3a6f4d 100644 --- a/docs/rfcs/020-pageserver-s3-coordination.md +++ b/docs/rfcs/020-pageserver-s3-coordination.md @@ -37,7 +37,7 @@ sequenceDiagram ``` At this point it is not possible to restore from index, it contains L2 which -is no longer available in s3 and doesnt contain L3 added by compaction by the +is no longer available in s3 and doesn't contain L3 added by compaction by the first pageserver. So if any of the pageservers restart initial sync will fail (or in on-demand world it will fail a bit later during page request from missing layer) @@ -74,7 +74,7 @@ One possible solution for relocation case is to orchestrate background jobs from outside. The oracle who runs migration can turn off background jobs on PS1 before migration and then run migration -> enable them on PS2. The problem comes if migration fails. In this case in order to resume background jobs -oracle needs to guarantee that PS2 doesnt run background jobs and if it doesnt +oracle needs to guarantee that PS2 doesn't run background jobs and if it doesn't respond then PS1 is stuck unable to run compaction/gc. This cannot be solved without human ensuring that no upload from PS2 can happen. In order to be able to resolve this automatically CAS is required on S3 side so pageserver can @@ -128,7 +128,7 @@ During discussion it seems that we converged on the approach consisting of: whether we need to apply change to the index state or not. - Responsibility for running background jobs is assigned externally. Pageserver keeps locally persistent flag for each tenant that indicates whether this - pageserver is considered as primary one or not. TODO what happends if we + pageserver is considered as primary one or not. TODO what happens if we crash and cannot start for some extended period of time? Control plane can assign ownership to some other pageserver. Pageserver needs some way to check if its still the blessed one. 
Maybe by explicit request to control plane on @@ -138,7 +138,7 @@ Requirement for deterministic layer generation was considered overly strict because of two reasons: - It can limit possible optimizations e.g. when pageserver wants to reshuffle - some data locally and doesnt want to coordinate this + some data locally and doesn't want to coordinate this - The deterministic algorithm itself can change so during deployments for some time there will be two different versions running at the same time, which can cause non-determinism @@ -164,7 +164,7 @@ sequenceDiagram CP->>PS1: Yes deactivate CP PS1->>S3: Fetch PS1 index. - note over PS1: Continue operations, start backround jobs + note over PS1: Continue operations, start background jobs note over PS1,PS2: PS1 starts up and is not a leader anymore PS1->>CP: Am I still the leader for Tenant X? CP->>PS1: No @@ -203,7 +203,7 @@ sequenceDiagram ### Eviction When two pageservers operate on a tenant for an extended period of time the follower -doesnt perform write operations in s3. When layer is evicted follower relies +doesn't perform write operations in s3. When layer is evicted follower relies on updates from primary to get info about layers it needs to cover range for evicted layer. diff --git a/docs/rfcs/022-pageserver-delete-from-s3.md b/docs/rfcs/022-pageserver-delete-from-s3.md index 260e549670..c237a3edb8 100644 --- a/docs/rfcs/022-pageserver-delete-from-s3.md +++ b/docs/rfcs/022-pageserver-delete-from-s3.md @@ -4,7 +4,7 @@ Created on 08.03.23 ## Motivation -Currently we dont delete pageserver part of the data from s3 when project is deleted. (The same is true for safekeepers, but this outside of the scope of this RFC). +Currently we don't delete the pageserver part of the data from s3 when a project is deleted. (The same is true for safekeepers, but this is outside of the scope of this RFC). This RFC aims to spin a discussion to come to a robust deletion solution that won't put us into a corner for features like postponed deletion (when we keep data for the user to be able to restore a project if it was deleted by accident) @@ -75,9 +75,9 @@ Remote one is needed for cases when pageserver is lost during deletion so other Why is a local mark file needed? -If we dont have one, we have two choices, delete local data before deleting the remote part or do that after. +If we don't have one, we have two choices: delete local data before deleting the remote part, or do that after. -If we delete local data before remote then during restart pageserver wont pick up remote tenant at all because nothing is available locally (pageserver looks for remote conuterparts of locally available tenants). +If we delete local data before remote then during restart pageserver won't pick up the remote tenant at all because nothing is available locally (pageserver looks for remote counterparts of locally available tenants). If we delete local data after remote, then if a pageserver restart happens at the end of the sequence when the remote mark file is deleted, the state is the same as a pageserver simply missing data on remote, without knowing that this data is intended to be deleted. In this case the current behavior is to upload everything local-only to remote. @@ -145,7 +145,7 @@ sequenceDiagram CP->>PS: Retry delete tenant PS->>CP: Not modified else Mark is missing - note over PS: Continue to operate the tenant as if deletion didnt happen + note over PS: Continue to operate the tenant as if deletion didn't happen note over CP: Eventually console should
retry delete request @@ -168,7 +168,7 @@ sequenceDiagram PS->>CP: True ``` -Similar sequence applies when both local and remote marks were persisted but Control Plane still didnt receive a response. +A similar sequence applies when both local and remote marks were persisted but Control Plane still didn't receive a response. If pageserver crashes after both mark files were deleted then it will reply to control plane status poll request with 404 which should be treated by control plane as success. @@ -187,7 +187,7 @@ If pageserver is lost then the deleted tenant should be attached to different ##### Restrictions for a tenant that is in the process of being deleted -I propose to add another state to tenant/timeline - PendingDelete. This state shouldnt allow executing any operations aside from polling the deletion status. +I propose to add another state to tenant/timeline - PendingDelete. This state shouldn't allow executing any operations aside from polling the deletion status. #### Summary @@ -237,7 +237,7 @@ New branch gets created PS1 starts up (is it possible, or do we just recycle it?) PS1 is unaware of the new branch. It can either fall back to s3 ls, or ask control plane. -So here comes the dependency of storage on control plane. During restart storage needs to know which timelines are valid for operation. If there is nothing on s3 that can answer that question storage neeeds to ask control plane. +So here comes the dependency of storage on control plane. During restart storage needs to know which timelines are valid for operation. If there is nothing on s3 that can answer that question storage needs to ask control plane. ### Summary @@ -250,7 +250,7 @@ Cons: Pros: -- Easier to reason about if you dont have to account for pageserver restarts +- Easier to reason about if you don't have to account for pageserver restarts ### Extra notes @@ -262,7 +262,7 @@ Delayed deletion can be done with both approaches. As discussed with Anna (@step After discussion in comments I see that we settled on two options (though a bit different from the ones described in the rfc). The first one is the same - pageserver owns as much as possible. The second option is that pageserver owns the markers part, but actual deletion happens in control plane by repeatedly calling ls + delete. -To my mind the only benefit of the latter approach is possible code reuse between safekeepers and pageservers. Otherwise poking around integrating s3 library into control plane, configuring shared knowledge abouth paths in s3 - are the downsides. Another downside of relying on control plane is the testing process. Control plane resides in different repository so it is quite hard to test pageserver related changes there. e2e test suite there doesnt support shutting down pageservers, which are separate docker containers there instead of just processes. +To my mind the only benefit of the latter approach is possible code reuse between safekeepers and pageservers. Otherwise, integrating an s3 library into control plane and configuring shared knowledge about paths in s3 are downsides. Another downside of relying on control plane is the testing process. Control plane resides in a different repository, so it is quite hard to test pageserver-related changes there. The e2e test suite there doesn't support shutting down pageservers, which are separate docker containers there instead of just processes. With pageserver owning everything we still give the retry logic to control plane but it's easier to duplicate if needed compared to sharing inner s3 workings. 
We will have the needed tests for retry logic in the neon repo. diff --git a/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md index 836c91fb25..97e62bf8c6 100644 --- a/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md +++ b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md @@ -75,7 +75,7 @@ sequenceDiagram ``` At this point it is not possible to restore the state from index, it contains L2 which -is no longer available in s3 and doesnt contain L3 added by compaction by the +is no longer available in s3 and doesn't contain L3 added by compaction by the first pageserver. So if any of the pageservers restart, initial sync will fail (or in on-demand world it will fail a bit later during page request from missing layer) @@ -171,7 +171,7 @@ sequenceDiagram Another problem is a possibility of concurrent branch creation calls. -I e during migration create_branch can be called on old pageserver and newly created branch wont be seen on new pageserver. Prior art includes prototyping an approach of trying to mirror such branches, but currently it lost its importance, because now attach is fast because we dont need to download all data, and additionally to the best of my knowledge of control plane internals (cc @ololobus to confirm) operations on one project are executed sequentially, so it is not possible to have such case. So branch create operation will be executed only when relocation is completed. As a safety measure we can forbid branch creation for tenants that are in readonly remote state. +I.e. during migration create_branch can be called on the old pageserver and a newly created branch won't be seen on the new pageserver. Prior art includes prototyping an approach of trying to mirror such branches, but it has since lost its importance: attach is now fast because we don't need to download all data, and additionally to the best of my knowledge of control plane internals (cc @ololobus to confirm) operations on one project are executed sequentially, so it is not possible to have such a case. So the branch create operation will be executed only when relocation is completed. As a safety measure we can forbid branch creation for tenants that are in readonly remote state. ## Simplistic approach diff --git a/docs/rfcs/024-extension-loading.md b/docs/rfcs/024-extension-loading.md index 26ba4f7927..7e243b23e3 100644 --- a/docs/rfcs/024-extension-loading.md +++ b/docs/rfcs/024-extension-loading.md @@ -55,7 +55,7 @@ When PostgreSQL requests a file, `compute_ctl` downloads it. PostgreSQL requests files in the following cases: - When loading a preload library set in `local_preload_libraries` - When explicitly loading a library with `LOAD` -- Wnen creating extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files))) +- When creating an extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files) #### Summary diff --git a/docs/rfcs/025-generation-numbers.md b/docs/rfcs/025-generation-numbers.md index 6a0131c66a..dfc8529d2d 100644 --- a/docs/rfcs/025-generation-numbers.md +++ b/docs/rfcs/025-generation-numbers.md @@ -26,7 +26,7 @@ plane guarantee prevents robust response to failures, as if a pageserver is unre we may not detach from it. The mechanism in this RFC fixes this, by making it safe to attach to a new, different pageserver even if an unresponsive pageserver may be running. 
-Futher, lack of safety during split-brain conditions blocks two important features where occasional +Further, lack of safety during split-brain conditions blocks two important features where occasional split-brain conditions are part of the design assumptions: - seamless tenant migration ([RFC PR](https://github.com/neondatabase/neon/pull/5029)) @@ -490,11 +490,11 @@ The above makes it safe for control plane to change the assignment of tenant to pageserver in control plane while a timeline creation is ongoing. The reason is that the creation request against the new assigned pageserver uses a new generation number. However, care must be taken by control plane -to ensure that a "timeline creation successul" response from some pageserver +to ensure that a "timeline creation successful" response from some pageserver is checked for the pageserver's generation for that timeline's tenant still being the latest. If it is not the latest, the response does not constitute a successful timeline creation. It is acceptable to discard such responses, the scrubber will clean up the S3 state. -It is better to issue a timelien deletion request to the stale attachment. +It is better to issue a timeline deletion request to the stale attachment. #### Timeline Deletion @@ -633,7 +633,7 @@ As outlined in the Part 1 on correctness, it is critical that deletions are only executed once the key is not referenced anywhere in S3. This property is obviously upheld by the scheme above. -#### We Accept Object Leakage In Acceptable Circumcstances +#### We Accept Object Leakage In Acceptable Circumstances If we crash in the flow above between (2) and (3), we lose track of the unreferenced object. Further, enqueuing a single to the persistent queue may not be durable immediately to amortize cost of flush to disk. diff --git a/docs/rfcs/026-pageserver-s3-mvcc.md b/docs/rfcs/026-pageserver-s3-mvcc.md index 2a8c925781..473d5a2bd0 100644 --- a/docs/rfcs/026-pageserver-s3-mvcc.md +++ b/docs/rfcs/026-pageserver-s3-mvcc.md @@ -162,7 +162,7 @@ struct Tenant { ... txns: HashMap<TxnId, Transaction>, - // the most recently started txn's id; only most recently sarted can win + // the most recently started txn's id; only most recently started can win next_winner_txn: Option<TxnId>, } struct Transaction { @@ -186,7 +186,7 @@ A transaction T in state Committed has subsequent transactions that may or may n So, for garbage collection, we need to assess transactions in state Committed and RejectAcknowledged: -- Commited: delete objects on the deadlist. +- Committed: delete objects on the deadlist. - We don’t need a LIST request here, the deadlist is sufficient. So, it’s really cheap. - This is **not true MVCC garbage collection**; by deleting the objects on Committed transaction T ’s deadlist, we might delete data referenced by other transactions that were concurrent with T, i.e., they started while T was still open. However, the fact that T is committed means that the other transactions are RejectPending or RejectAcknowledged, so, they don’t matter. Pageservers executing these doomed RejectPending transactions must handle 404 for GETs gracefully, e.g., by trying to commit txn so they observe the rejection they’re destined to get anyways. 404’s for RejectAcknowledged are handled below. - RejectAcknowledged: delete all objects created in that txn, and discard deadlists. @@ -242,15 +242,15 @@ If a pageserver is unresponsive from Control Plane’s / Compute’s perspective At this point, availability is restored and user pain relieved. 
-What’s left is to somehow close the doomed transaction of the unresponsive pageserver, so that it beomes RejectAcknowledged, and GC can make progress. Since S3 is cheap, we can afford to wait a really long time here, especially if we put a soft bound on the amount of data a transaction may produce before it must commit. Procedure: +What’s left is to somehow close the doomed transaction of the unresponsive pageserver, so that it becomes RejectAcknowledged, and GC can make progress. Since S3 is cheap, we can afford to wait a really long time here, especially if we put a soft bound on the amount of data a transaction may produce before it must commit. Procedure: 1. Ensure the unresponsive pageserver is taken out of rotation for new attachments. That probably should happen as part of the routine above. 2. Make a human operator investigate and decide what to do (next morning, NO ONCALL ALERT): 1. Inspect the instance, investigate logs, understand root cause. 2. Try to re-establish connectivity between pageserver and Control Plane so that pageserver can retry commits, get rejected, ack rejection ⇒ enable GC. - 3. Use below procedure to decomission pageserver. + 3. Use below procedure to decommission pageserver. -### Decomissioning A Pageserver (Dead or Alive-but-Unrespsonive) +### Decommissioning A Pageserver (Dead or Alive-but-Unresponsive) The solution, enabled by this proposal: @@ -310,7 +310,7 @@ Issues that we discussed: 1. In abstract terms, this proposal provides a linearized history for a given S3 prefix. 2. In concrete terms, this proposal provides a linearized history per tenant. 3. There can be multiple writers at a given time, but only one of them will win to become part of the linearized history. -4. ************************************************************************************Alternative ideas mentioned during meetings that should be turned into a written prospoal like this one:************************************************************************************ +4. **Alternative ideas mentioned during meetings that should be turned into a written proposal like this one:** 1. @Dmitry Rodionov : having linearized storage of index_part.json in some database that allows serializable transactions / atomic compare-and-swap PUT 2. @Dmitry Rodionov : 3. @Stas : something like this scheme, but somehow find a way to equate attachment duration with transaction duration, without losing work if pageserver dies months after attachment. diff --git a/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md b/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md index 2c6b46eabe..e18b7c16c9 100644 --- a/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md +++ b/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md @@ -54,7 +54,7 @@ If the compaction algorithm doesn't change between the two compaction runs, is d *However*: 1. the file size of the overwritten L1s may not be identical, and 2. the bit pattern of the overwritten L1s may not be identical, and, -3. in the future, we may want to make the compaction code non-determinstic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite +3. 
in the future, we may want to make the compaction code non-deterministic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite The items above are a problem for the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919) because it assumes that layer files in S3 are only ever deleted, but never replaced (overPUTted). @@ -63,7 +63,7 @@ But node B based its world view on the version of node A's `index_part.json` fro That earlier `index_part.json` contained the file size of the pre-overwrite L1. If the overwritten L1 has a different file size, node B will refuse to read data from the overwritten L1. Effectively, the data in the L1 has become inaccessible to node B. -If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same probem. +If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same problem. If we ever introduce checksums instead of checking just the file size, then a mismatching bit pattern (2) will cause similar problems. @@ -121,7 +121,7 @@ Multi-object changes that previously created and removed files in timeline dir a * atomic `index_part.json` update in S3, as per guarantee that S3 PUT is atomic * local timeline dir state: * irrelevant for layer map content => irrelevant for atomic updates / crash consistency - * if we crash after index part PUT, local layer files will be used, so, no on-demand downloads neede for them + * if we crash after index part PUT, local layer files will be used, so, no on-demand downloads needed for them * if we crash before index part PUT, local layer files will be deleted ## Trade-Offs @@ -140,7 +140,7 @@ Assuming upload queue allows for unlimited queue depth (that's what it does toda * wal ingest: currently unbounded * L0 => L1 compaction: CPU time proportional to `O(sum(L0 size))` and upload work proportional to `O()` * Compaction threshold is 10 L0s and each L0 can be up to 256M in size. Target size for L1 is 128M. - * In practive, most L0s are tiny due to 10minute `DEFAULT_CHECKPOINT_TIMEOUT`. + * In practice, most L0s are tiny due to the 10-minute `DEFAULT_CHECKPOINT_TIMEOUT`. * image layer generation: CPU time `O(sum(input data))` + upload work `O(sum(new image layer size))` * I have no intuition how expensive / long-running it is in reality. * gc: `update_gc_info` work (not substantial, AFAIK) @@ -158,7 +158,7 @@ Pageserver crashes are very rare ; it would likely be acceptable to re-do the lo However, regular pageserver restarts happen frequently, e.g., during weekly deploys. In general, pageserver restart faces the problem of tenants that "take too long" to shut down. -They are a problem because other tenants that shut down quickly are unavailble while we wait for the slow tenants to shut down. +They are a problem because other tenants that shut down quickly are unavailable while we wait for the slow tenants to shut down. We currently allot 10 seconds for graceful shutdown until we SIGKILL the pageserver process (as per `pageserver.service` unit file). A longer budget would expose tenants that are done early to a longer downtime. A short budget would risk throwing away more work that'd have to be re-done after restart. 
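+As a side note on the mechanism discussed in the hunks above: the
+crash-consistency argument rests purely on upload ordering, which a short
+sketch can summarize (types and names here are stand-ins, not the actual
+pageserver code):
+
+```rust
+// Minimal stand-in for the remote storage client; a real S3 PUT of a
+// single object is atomic, which is the property the RFC relies on.
+struct RemoteStorage;
+
+impl RemoteStorage {
+    fn put(&self, _key: &str, _bytes: &[u8]) {
+        // PUT the object; readers see either the old or the new version.
+    }
+}
+
+fn publish(s3: &RemoteStorage, new_layers: &[(String, Vec<u8>)], index_part: &[u8]) {
+    // 1. Upload the new layer files first. A crash anywhere in this loop
+    //    leaves the old index_part.json authoritative, so the layer map
+    //    stays consistent and the new objects are merely unreferenced.
+    for (key, bytes) in new_layers {
+        s3.put(key, bytes);
+    }
+    // 2. Publish the whole new layer map in one atomic index_part PUT.
+    s3.put("index_part.json", index_part);
+}
+```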
@@ -236,7 +236,7 @@ tenants/$tenant/timelines/$timeline/$key_and_lsn_range tenants/$tenant/timelines/$timeline/$layer_file_id-$key_and_lsn_range ``` -To guarantee uniqueness, the unqiue number is a sequence number, stored in `index_part.json`. +To guarantee uniqueness, the unique number is a sequence number, stored in `index_part.json`. This alternative does not solve atomic layer map updates. In our crash-during-compaction scenario above, the compaction run after the crash will not overwrite the L1s, but write/PUT new files with new sequence numbers. @@ -246,11 +246,11 @@ We'd need to write a deduplication pass that checks if perfectly overlapping lay However, this alternative is appealing because it systematically prevents overwrites at a lower level than this RFC. So, this alternative is sufficient for the needs of the split-brain safety RFC (immutable layer files locally and in S3). -But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more accute. +But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more acute. The proposed design in this RFC addresses both. So, if this alternative sounds appealing, we should implement the proposal in this RFC first, then implement this alternative on top. -That way, we avoid a phase where the crash-during-compaction problem is accute. +That way, we avoid a phase where the crash-during-compaction problem is acute. ## Related issues diff --git a/docs/rfcs/028-pageserver-migration.md b/docs/rfcs/028-pageserver-migration.md index f708f641aa..17ef9aef52 100644 --- a/docs/rfcs/028-pageserver-migration.md +++ b/docs/rfcs/028-pageserver-migration.md @@ -596,4 +596,4 @@ pageservers are updated to be aware of it. As well as simplifying implementation, putting heatmaps in S3 will be useful for future analytics purposes -- gathering aggregated statistics on activity -pattersn across many tenants may be done directly from data in S3. +patterns across many tenants may be done directly from data in S3. diff --git a/docs/rfcs/029-pageserver-wal-disaster-recovery.md b/docs/rfcs/029-pageserver-wal-disaster-recovery.md index 15ebd72bfe..229e40100e 100644 --- a/docs/rfcs/029-pageserver-wal-disaster-recovery.md +++ b/docs/rfcs/029-pageserver-wal-disaster-recovery.md @@ -147,7 +147,7 @@ Separating corrupt writes from non-corrupt ones is a hard problem in general, and if the application was involved in making the corrupt write, a recovery would also involve the application. Therefore, corruption that has made it into the WAL is outside of the scope of this feature. However, the WAL replay can be -issued to right before the point in time where the corruption occured. Then the +issued to right before the point in time where the corruption occurred. Then the data loss is isolated to post-corruption writes only. ## Impacted components (e.g. pageserver, safekeeper, console, etc) @@ -161,7 +161,7 @@ limits and billing we apply to existing timelines. ## Proposed implementation -The first problem to keep in mind is the reproducability of `initdb`. +The first problem to keep in mind is the reproducibility of `initdb`. So an initial step would be to upload `initdb` snapshots to S3. 
After that, we'd have the endpoint spawn a background process which
diff --git a/docs/rfcs/030-vectored-timeline-get.md b/docs/rfcs/030-vectored-timeline-get.md
index d4017471b7..093a964f38 100644
--- a/docs/rfcs/030-vectored-timeline-get.md
+++ b/docs/rfcs/030-vectored-timeline-get.md
@@ -69,7 +69,7 @@ However, unlike above, an ideal solution will
  * This means, read each `DiskBtree` page at most once.
* Facilitate merging of the reads we issue to the OS and eventually NVMe.

-Each of these items above represents a signficant amount of work.
+Each of these items above represents a significant amount of work.

## Performance

diff --git a/docs/rfcs/031-sharding-static.md b/docs/rfcs/031-sharding-static.md
new file mode 100644
index 0000000000..fe009b8660
--- /dev/null
+++ b/docs/rfcs/031-sharding-static.md
@@ -0,0 +1,408 @@
+# Sharding Phase 1: Static Key-space Sharding
+
+## Summary
+
+To enable databases with sizes approaching the capacity of a pageserver's disk,
+it is necessary to break up the storage for the database, or _shard_ it.
+
+Sharding in general is a complex area. This RFC aims to define an initial
+capability that will permit creating large-capacity databases using a static configuration
+defined at time of Tenant creation.
+
+## Motivation
+
+Currently, all data for a Tenant, including all its timelines, is stored on a single
+pageserver. The local storage required may be several times larger than the actual
+database size, due to LSM write inflation.
+
+If a database is larger than what one pageserver can hold, then it becomes impossible
+for the pageserver to hold it in local storage, as it must do to provide service to
+clients.
+
+### Prior art
+
+In Neon:
+
+- Layer File Spreading (Konstantin): https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Konstantin-21fd9b11b618475da5f39c61dd8ab7a4
+- Layer File Spreading (Christian): https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843
+- Key Space Partitioning (Stas): https://www.notion.so/neondatabase/One-Pager-Key-Space-Partitioning-Stas-8e3a28a600a04a25a68523f42a170677
+
+Prior art in other distributed systems is too broad to capture here: pretty much
+any scale-out storage system does something like this.
+
+## Requirements
+
+- Enable creating a large (for example, 16TiB) database without requiring dedicated
+  pageserver nodes.
+- Share read/write bandwidth costs for large databases across pageservers, as well
+  as storage capacity, in order to avoid large capacity databases acting as I/O hotspots
+  that disrupt service to other tenants.
+- Our data distribution scheme should handle sparse/nonuniform keys well, since postgres
+  does not write out a single contiguous range of page numbers.
+
+_Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database
+that a user might create on a current-gen enterprise SSD should also work well on
+Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the
+pageserver backend is not the limiting factor in the database size_.
+
+## Non Goals
+
+- Independently distributing timelines within the same tenant. If a tenant has many
+  timelines, then sharding may be a less efficient mechanism for distributing load than
+  sharing out timelines between pageservers.
+- Distributing work in the LSN dimension: this RFC focuses on the Key dimension only,
+  based on the idea that separate mechanisms will make sense for each dimension.
+
+## Impacted Components
+
+pageserver, control plane, postgres/smgr
+
+## Terminology
+
+**Key**: a postgres page number, qualified by relation. In the sense that the pageserver is a versioned key-value store,
+the page number is the key in that store. `Key` is a literal data type in existing code.
+
+**LSN dimension**: this just means the range of LSNs (history), when talking about the range
+of keys and LSNs as a two dimensional space.
+
+## Implementation
+
+### Key sharding vs. LSN sharding
+
+When we think of sharding across the two dimensional key/lsn space, this is an
+opportunity to think about how the two dimensions differ:
+
+- Sharding the key space distributes the _write_ workload of ingesting data
+  and compacting. This work must be carefully managed so that exactly one
+  node owns a given key.
+- Sharding the LSN space distributes the _historical read_ workload. This work
+  can be done by anyone without any special coordination, as long as they can
+  see the remote index and layers.
+
+The key sharding is the harder part, and also the more urgent one, to support larger
+capacity databases. Because distributing historical LSN read work is a relatively
+simpler problem that most users don't have, we defer it to future work. It is anticipated
+that some quite simple P2P offload model will enable distributing work for historical
+reads: a node which is low on space can call out to a peer to ask it to download and
+serve reads from a historical layer.
+
+### Key mapping scheme
+
+Having decided to focus on key sharding, we must next decide how we will map
+keys to shards. It is proposed to use a "wide striping" approach, to obtain a good compromise
+between data locality and avoiding entire large relations mapping to the same shard.
+
+We will define two spaces:
+
+- Key space: unsigned integer
+- Shard space: integer from 0 to N-1, where we have N shards.
+
+### Key -> Shard mapping
+
+Keys are currently defined in the pageserver's getpage@lsn interface as follows:
+
+```
+pub struct Key {
+    pub field1: u8,
+    pub field2: u32,
+    pub field3: u32,
+    pub field4: u32,
+    pub field5: u8,
+    pub field6: u32,
+}
+
+fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+    Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: blknum,
+    }
+}
+```
+
+_Note: keys for relation metadata are ignored here, as this data will be mirrored to all
+shards. For distribution purposes, we only care about user data keys_
+
+The properties we want from our Key->Shard mapping are:
+
+- Locality in `blknum`, such that adjacent `blknum` will usually map to
+  the same stripe and consequently land on the same shard, even though the overall
+  collection of blocks in a relation will be spread over many stripes and therefore
+  many shards.
+- Avoid the same blknum on different relations landing on the same stripe, so that
+  with many small relations we do not end up aliasing data to the same stripe/shard.
+- Avoid vulnerability to aliasing in the values of relation identity fields, such that
+  if there are patterns in the value of `relnode`, these do not manifest as patterns
+  in data placement.
+
+To accomplish this, the blknum is used to select a stripe, and stripes are
+assigned to shards in a pseudorandom order via a hash.
The motivation for
+pseudo-random distribution (rather than sequential mapping of stripe to shard)
+is to avoid I/O hotspots when sequentially reading multiple relations: we don't want
+all relations' stripes to touch pageservers in the same order.
+
+To map a `Key` to a shard:
+
+- Hash the `Key` field 4 (relNode).
+- Divide field 6 (`blknum`) by the stripe size in pages, and combine the
+  hash of this with the hash from the previous step.
+- The total hash modulo the shard count gives the shard holding this key.
+
+Why don't we use the other fields in the Key?
+
+- We ignore `forknum` for key mapping, because it distinguishes different classes of data
+  in the same relation, and we would like to keep the data in a relation together.
+- We would like to use spcNode and dbNode, but cannot. Postgres database creation operations can refer to an existing database as a template, such that the created
+  database's blocks differ only by spcNode and dbNode from the original. To enable running
+  this type of creation without cross-pageserver communication, we must ensure that these
+  blocks map to the same shard -- we do this by excluding spcNode and dbNode from the hash.
+
+### Data placement examples
+
+For example, consider the following extreme cases of postgres data layout for large databases, in a system with 8 shards
+and a stripe size of 32k pages:
+
+- A single large relation: `blknum` division will break the data up into 4096
+  stripes, which will be scattered across the shards.
+- 4096 relations of 32k pages each: each relation will map to exactly one stripe,
+  and that stripe will be placed according to the hash of key field 4. The
+  data placement will be statistically uniform across shards.
+
+Data placement will be more uneven on smaller databases:
+
+- A tenant with 2 shards and 2 relations of one stripe size each: there is a 50% chance
+  that both relations land on the same shard and no data lands on the other shard.
+- A tenant with 8 shards and one relation of size 12 stripes: 4 shards will have double
+  the data of the other four shards.
+
+These uneven cases for small amounts of data do not matter, as long as the stripe size
+is an order of magnitude smaller than the amount of data we are comfortable holding
+in a single shard: if our system handles shard sizes up to 10-100GB, then it is not an issue if
+a tenant has some shards with 256MB size and some shards with 512MB size, even though
+the standard deviation of shard size within the tenant is very high. Our key mapping
+scheme provides a statistical guarantee that as the tenant's overall data size increases,
+uniformity of placement will improve.
+
+### Important Types
+
+#### `ShardIdentity`
+
+Provides the information needed to know whether a particular key belongs
+to a particular shard (a code sketch follows below):
+
+- Layout version
+- Stripe size
+- Shard count
+- Shard index
+
+This structure's size is constant. Note that if we had used a different key
+mapping scheme such as consistent hashing with explicit hash ranges assigned
+to each shard, then the ShardIdentity's size would grow with the shard count: the simpler
+key mapping scheme used here enables a small fixed-size ShardIdentity.
+
+### Pageserver changes
+
+#### Structural
+
+Everywhere the Pageserver currently deals with Tenants, it will move to dealing with
+`TenantShard`s, which are just a `Tenant` plus a `ShardIdentity` telling it which part
+of the keyspace it owns. An un-sharded tenant is just a `TenantShard` whose `ShardIdentity`
+covers the whole keyspace.
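+
+To make the mapping above concrete, here is a minimal sketch. The `Key` struct is
+trimmed to the two fields that participate in the mapping, `shard_for` is a
+hypothetical helper name, and the standard-library hasher is a stand-in: a real
+implementation would need a hash that is stable across processes and releases.
+
+```rust
+use std::collections::hash_map::DefaultHasher;
+use std::hash::{Hash, Hasher};
+
+// Trimmed-down Key: only the fields that participate in the mapping.
+struct Key {
+    field4: u32, // relnode
+    field6: u32, // blknum
+}
+
+struct ShardIdentity {
+    stripe_size: u32, // in pages
+    shard_count: u8,
+    shard_index: u8,
+}
+
+impl ShardIdentity {
+    /// Hypothetical helper: which shard owns this key?
+    /// Assumes shard_count >= 1 (shard_count == 0 is reserved for the
+    /// legacy unsharded format described below).
+    fn shard_for(&self, key: &Key) -> u8 {
+        let mut h = DefaultHasher::new(); // stand-in for a stable hash
+        key.field4.hash(&mut h); // relnode
+        (key.field6 / self.stripe_size).hash(&mut h); // stripe number
+        (h.finish() % self.shard_count as u64) as u8
+    }
+
+    /// The ownership check that per-shard filtering relies on (see the WAL
+    /// ingest section below and the shard splitting RFC).
+    fn is_key_local(&self, key: &Key) -> bool {
+        self.shard_count <= 1 || self.shard_for(key) == self.shard_index
+    }
+}
+```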
+
+When the pageserver writes layers and index_part.json to remote storage, it must
+include the shard index & count in the name, to avoid collisions (the count is
+necessary for future-proofing: the count will vary over time). These keys
+will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work
+exactly the same for TenantShards as it does for Tenants today: each shard will have
+its own generation number.
+
+#### Storage Format: Keys
+
+For tenants with >1 shard, layer files implicitly become sparse: within the key
+range described in the layer name, the layer file for a shard will only hold the
+content relevant to stripes assigned to the shard.
+
+For this reason, the LayerFileName within a tenant is no longer unique: different shards
+may use the same LayerFileName to refer to different data. We may solve this simply
+by including the shard number in the keys used for layers.
+
+The shard number will be included as a prefix (as part of the tenant ID), like this:
+
+`pageserver/v1/tenants/<tenant_id>-<shard_index>/timelines/<timeline_id>/<layer_file_name>-<generation>`
+
+`pageserver/v1/tenants/<tenant_id>-<shard_index>/timelines/<timeline_id>/index_part.json-<generation>`
+
+Reasons for this particular format:
+
+- Use of a prefix is convenient for implementation (no need to carry the shard ID everywhere
+  we construct a layer file name), and enables efficient listing of index_parts within
+  a particular shard-timeline prefix.
+- Including the shard _count_ as well as shard number means that in future when we implement
+  shard splitting, it will be possible for a parent shard and one of its children to write
+  the same layer file without a name collision. For example, a parent shard 0_1 might split
+  into two (0_2, 1_2), and in the process of splitting shard 0_2 could write a layer or index_part
+  that is distinct from what shard 0_1 would have written at the same place.
+
+In practice, we expect shard counts to be relatively small, so a `u8` will be sufficient,
+and therefore the shard part of the path can be a fixed-length hex string like `{:02X}{:02X}`,
+for example a single-shard tenant's prefix will be `0001`.
+
+For backward compatibility, we may define a special `ShardIdentity` that has shard_count==0,
+and use this as a cue to construct paths with no prefix at all.
+
+#### Storage Format: Indices
+
+In the phase 1 described in this RFC, shards only reference layers they write themselves. However,
+when we implement shard splitting in future, it will be useful to enable shards to reference layers
+written by other shards (specifically the parent shard during a split), so that shards don't
+have to exhaustively copy all data into their own shard-prefixed keys.
+
+To enable this, the `IndexPart` structure will be extended to store the (shard number, shard count)
+tuple on each layer, such that it can construct paths for layers written by other shards. This
+naturally raises the question of who "owns" such layers written by ancestral shards: this problem
+will be addressed in phase 2.
+
+For backward compatibility, any index entry without shard information will be assumed to be
+in the legacy `ShardIdentity`.
+
+#### WAL Ingest
+
+In Phase 1, all shards will subscribe to the safekeeper to download WAL content. They will filter
+it down to the pages relevant to their shard (see the sketch after this list):
+
+- For ordinary user data writes, only retain a write if it matches the ShardIdentity
+- For metadata describing relations etc., all shards retain these writes.
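+
+A sketch of the resulting per-shard ingest filter, reusing the `Key`,
+`ShardIdentity` and `is_key_local` definitions from the earlier sketch;
+`DecodedRecord` is an illustrative stand-in for the pageserver's actual WAL
+decoding types.
+
+```rust
+// Illustrative decoded-WAL-record shape, not the real ingest types.
+enum DecodedRecord {
+    Page { key: Key }, // an ordinary user data page write
+    RelMetadata,       // relation sizes, create/drop, etc.
+}
+
+fn shard_ingests(shard: &ShardIdentity, rec: &DecodedRecord) -> bool {
+    match rec {
+        // User data: keep only keys owned by this shard.
+        DecodedRecord::Page { key } => shard.is_key_local(key),
+        // Relation metadata is mirrored to all shards.
+        DecodedRecord::RelMetadata => true,
+    }
+}
+```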
+
+The pageservers must somehow give the safekeeper correct feedback on remote_consistent_lsn:
+one solution here is for the 0th shard to periodically peek at the IndexParts for all the other shards,
+and have only the 0th shard populate remote_consistent_lsn. However, this is relatively
+expensive: if the safekeeper can be made shard-aware then it could be taught to use
+the max() of all shards' remote_consistent_lsns to decide when to trim the WAL.
+
+#### Compaction/GC
+
+No changes needed.
+
+The pageserver doesn't have to do anything special during compaction
+or GC. It is implicitly operating on the subset of keys that map to its ShardIdentity.
+This will result in sparse layer files, containing keys only in the stripes that this
+shard owns. Where optimizations currently exist in compaction for spotting "gaps" in
+the key range, these should be updated to ignore gaps that are due to sharding, to
+avoid spuriously splitting up layers into stripe-sized pieces.
+
+### Compute Endpoints
+
+Compute endpoints will need to:
+
+- Accept a vector of connection strings as part of their configuration from the control plane
+- Route pageserver requests by mapping the hash of each key to the correct
+  entry in the vector of connection strings.
+
+Doing this in compute rather than routing requests via a single pageserver is
+necessary to enable sharding tenants without adding latency from extra hops.
+
+### Control Plane
+
+Tenants, or _Projects_ in the control plane, will each own a set of TenantShards (this will
+be 1 for small tenants). Logic for placement of tenant shards is just the same as the current logic for placing
+tenants.
+
+Tenant lifecycle operations like deletion will require fanning out to all the shards
+in the tenant. The same goes for timeline creation and deletion: a timeline should
+not be considered created until it has been created in all shards.
+
+#### Selectively enabling sharding for large tenants
+
+Initially, we will explicitly enable sharding for large tenants only.
+
+In future, this explicit enablement will become unnecessary, once we implement automatic
+re-sharding of tenants.
+
+## Future Phases
+
+This section exists to indicate what will likely come next after this phase.
+
+Phases 2a and 2b are amenable to execution in parallel.
+
+### Phase 2a: WAL fan-out
+
+**Problem**: when all shards consume the whole WAL, the network bandwidth used
+for transmitting the WAL from safekeeper to pageservers is multiplied by a factor
+of the shard count.
+
+Network bandwidth is not our most pressing bottleneck, but it is likely to become
+a problem if we set a modest shard count (~8) on a significant number of tenants,
+especially as those larger tenants which we shard are also likely to have higher
+write bandwidth than average.
+
+### Phase 2b: Shard Splitting
+
+**Problem**: the number of shards in a tenant is defined at creation time and cannot
+be changed. This causes excessive sharding for most small tenants, and an upper
+bound on scale for very large tenants.
+
+To address this, a _splitting_ feature will later be added. One shard can split its
+data into a number of children by doing a special compaction operation to generate
+image layers broken up child-shard-wise, and then writing out an `index_part.json` for
+each child. This will then require external coordination (by the control plane) to
+safely attach these new child shards and then move them around to distribute work.
+The opposite _merging_ operation can also be imagined, but is unlikely to be implemented:
+once a Tenant has been sharded, the marginal efficiency benefit of merging is unlikely to justify
+the risk/complexity of implementing such a rarely-encountered scenario.
+
+### Phase N (future): distributed historical reads
+
+**Problem**: while sharding based on key is good for handling changes in overall
+database size, it is less suitable for spiky/unpredictable changes in the read
+workload to historical layers. Sudden increases in historical reads could result
+in sudden increases in local disk capacity required for a TenantShard.
+
+Example: the extreme case of this would be to run a tenant for a year, then create branches
+with ancestors at monthly intervals. This could lead to a sudden 12x inflation in
+the on-disk capacity footprint of a TenantShard, since it would be serving reads
+from all those disparate historical layers.
+
+If we can respond fast enough, then key-sharding a tenant more finely can help with
+this, but splitting may be a relatively expensive operation and the increased historical
+read load may be transient.
+
+A separate mechanism for handling heavy historical reads could be something like
+a gossip mechanism for pageservers to communicate
+about their workload, and then a getpageatlsn offload mechanism where one pageserver can
+ask another to go read the necessary layers from remote storage to serve the read. This
+requires relatively little coordination because it is read-only: any node can service any
+read. All reads to a particular shard would still flow through one node, but the
+disk capacity & I/O impact of servicing the read would be distributed.
+
+## FAQ/Alternatives
+
+### Why stripe the data, rather than using contiguous ranges of keyspace for each shard?
+
+When a database is growing under a write workload, writes may predominantly hit the
+end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user
+is intensively re-writing a particular relation and that relation lived in a particular
+shard, we would not achieve our goal of distributing the write work across shards.
+
+### Why not proxy read requests through one pageserver, so that endpoints don't have to change?
+
+1. This would not achieve scale-out of network bandwidth: a busy tenant with a large
+   database would still cause a load hotspot on the pageserver routing its read requests.
+2. The additional hop through the "proxy" pageserver would add latency and overall
+   resource cost (CPU, network bandwidth).
+
+### Layer File Spreading: use one pageserver as the owner of a tenant, and have it spread out work on a per-layer basis to peers
+
+In this model, there would be no explicit sharding of work, but the pageserver to which
+a tenant is attached would not hold all layers on its disk: instead, it would call out
+to peers to have them store some layers, and call out to those peers to request reads
+in those layers.
+
+This mechanism will work well for distributing work in the LSN dimension, but in the key
+space dimension it has the major limitation of requiring one node to handle all
+incoming writes, and compactions. Even if the write workload for a large database
+fits in one pageserver, it will still be a hotspot and such tenants may still
+de-facto require their own pageserver.
diff --git a/docs/rfcs/032-shard-splitting.md b/docs/rfcs/032-shard-splitting.md
new file mode 100644
index 0000000000..d5fbda8415
--- /dev/null
+++ b/docs/rfcs/032-shard-splitting.md
@@ -0,0 +1,479 @@
+# Shard splitting
+
+## Summary
+
+This RFC describes a new pageserver API for splitting an existing tenant shard into
+multiple shards, and describes how to use this API to safely increase the total
+shard count of a tenant.
+
+## Motivation
+
+In the [sharding RFC](031-sharding-static.md), a mechanism was introduced to scale
+tenants beyond the capacity of a single pageserver by breaking up the key space
+into stripes, and distributing these stripes across many pageservers. However,
+the shard count was defined once at tenant creation time and not varied thereafter.
+
+In practice, the expected size of a database is rarely known at creation time, and
+it is inefficient to enable sharding for very small tenants: we need to be
+able to create a tenant with a small number of shards (such as 1), and later expand
+when it becomes clear that the tenant has grown in size to a point where sharding
+is beneficial.
+
+### Prior art
+
+Many distributed systems have the problem of choosing how many shards to create for
+tenants that do not specify an expected size up-front. There are a couple of general
+approaches:
+
+- Write to a key space in order, and start a new shard when the highest key advances
+  past some point. This doesn't work well for Neon, because we write to our key space
+  in many different contiguous ranges (per relation), rather than in one contiguous
+  range. To adapt to this kind of model, we would need a sharding scheme where each
+  relation had its own range of shards, which would be inefficient for the common
+  case of databases with many small relations.
+- Monitor the system, and automatically re-shard at some size threshold. For
+  example in Ceph, the [pg_autoscaler](https://github.com/ceph/ceph/blob/49c27499af4ee9a90f69fcc6bf3597999d6efc7b/src/pybind/mgr/pg_autoscaler/module.py)
+  component monitors the size of each RADOS Pool, and adjusts the number of Placement
+  Groups (Ceph's shard equivalent).
+
+## Requirements
+
+- A configurable capacity limit per-shard is enforced.
+- Changes in shard count do not interrupt service beyond requiring postgres
+  to reconnect (i.e. milliseconds).
+- A human being does not have to choose the shard count.
+
+## Non Goals
+
+- Shard splitting is always a tenant-global operation: we will not enable splitting
+  one shard while leaving others intact.
+- The inverse operation (shard merging) is not described in this RFC. This is a lower
+  priority than splitting, because databases grow more often than they shrink, and
+  a database with many shards will still work properly if the stored data shrinks, just
+  with slightly more overhead (e.g. redundant WAL replication).
+- Shard splitting is only initiated based on capacity bounds, not load. Splitting
+  a tenant based on load will make sense for some medium-capacity, high-load workloads,
+  but is more complex to reason about and likely is not desirable until we have
+  shard merging to reduce the shard count again if the database becomes less busy.
+
+## Impacted Components
+
+pageserver, storage controller
+
+(the _storage controller_ is the evolution of what was called `attachment_service` in our test environment)
+
+## Terminology
+
+**Parent** shards are the shards that exist before a split. **Child** shards are
+the new shards created during a split.
+
+**Shard** is synonymous with _tenant shard_.
+
+**Shard Index** is the 2-tuple of shard number and shard count, written in
+paths as {:02x}{:02x}, e.g. `0001`.
+
+## Background
+
+In the implementation section, a couple of existing aspects of sharding are important
+to remember:
+
+- Shard identifiers contain the shard number and count, so that "shard 0 of 1" (`0001`) is
+  a distinct shard from "shard 0 of 2" (`0002`). This is the case in key paths, local
+  storage paths, and remote index metadata.
+- Remote layer file paths contain the shard index of the shard that created them, and
+  remote indices contain the same index to enable building the layer file path. A shard's
+  index may reference layers that were created by another shard.
+- Local tenant shard directories include the shard index. All layers downloaded by
+  a tenant shard are stored in this shard-prefixed path, even if those layers were
+  initially created by another shard: tenant shards do not read and write one another's
+  paths.
+- The `Tenant` pageserver type represents one tenant _shard_, not the whole tenant.
+  This is for historical reasons and will be cleaned up in future, but the existing
+  name is used here to help comprehension when reading code.
+
+## Implementation
+
+Note: this section focuses on the correctness of the core split process. This will
+be fairly inefficient in a naive implementation, and several important optimizations
+are described in a later section.
+
+There are broadly two parts to the implementation:
+
+1. The pageserver split API, which splits one shard on one pageserver
+2. The overall tenant split process, which is coordinated by the storage controller
+   and calls into the pageserver split API as needed.
+
+### Pageserver Split API
+
+The pageserver will expose a new API endpoint at `/v1/tenant/:tenant_shard_id/shard_split`
+that takes the new total shard count in the body.
+
+The pageserver split API operates on one tenant shard, on one pageserver. External
+coordination is required to use it safely; this is described in the later
+'Split procedure' section.
+
+#### Preparation
+
+First identify the shard indices for the new child shards. These are deterministic,
+calculated from the parent shard's index, and the number of children being created (this
+is an input to the API, and validated to be a power of two). In a trivial example, splitting
+0001 in two always results in 0002 and 0102.
+
+Child shard indices are chosen such that the children's parts of the keyspace will
+be subsets of the parent's parts of the keyspace.
+
+#### Step 1: write new remote indices
+
+In remote storage, splitting is very simple: we may just write new index_part.json
+objects for each child shard, containing exactly the same layers as the parent shard.
+
+The children will have more data than they need, but this avoids any exhaustive
+re-writing or copying of layer files.
+
+The index key path includes a generation number: the parent shard's current
+attached generation number will also be used for the child shards' indices. This
+makes the operation safely retryable: if everything crashes and restarts, we may
+call the split API again on the parent shard, and the result will be some new remote
+indices for the child shards, under a higher generation number.
+
+#### Step 2: start new `Tenant` objects
+
+A new `Tenant` object may be instantiated for each child shard, while the parent
+shard still exists.
When calling the tenant_spawn function for this object,
+the remote index from step 1 will be read, and the child shard will start
+to ingest WAL to catch up from whatever was in the remote storage at step 1.
+
+We now wait for child shards' WAL ingestion to catch up with the parent shard,
+so that we can safely tear down the parent shard without risking an availability
+gap to clients reading recent LSNs.
+
+#### Step 3: tear down parent `Tenant` object
+
+Once child shards are running and have caught up with WAL ingest, we no longer
+need the parent shard. Note that clients may still be using it -- when we
+shut it down, any page_service handlers will also shut down, causing clients
+to disconnect. When the client reconnects, it will re-lookup the tenant,
+and hit the child shard instead of the parent (shard lookup from page_service
+should bias toward higher ShardCount shards).
+
+Note that at this stage the page service client has not yet been notified of
+any split. In the trivial single split example:
+
+- Shard 0001 is gone: Tenant object torn down
+- Shards 0002 and 0102 are running on the same pageserver where Shard 0001 used to live.
+- Clients will continue to connect to that server thinking that shard 0001 is there,
+  and all requests will work, because any key that was in shard 0001 is definitely
+  available in either shard 0002 or shard 0102.
+- Eventually, the storage controller (not the pageserver) will decide to migrate
+  some child shards away: at that point it will do a live migration, ensuring
+  that the client has an updated configuration before it detaches anything
+  from the original server.
+
+#### Complete
+
+When we send a 200 response to the split request, we are promising the caller:
+
+- That the child shards are persistent in remote storage
+- That the parent shard has been shut down
+
+This enables the caller to proceed with the overall shard split operation, which
+may involve other shards on other pageservers.
+
+### Storage Controller Split procedure
+
+Splitting a tenant requires calling the pageserver split API, and tracking
+enough state to ensure recovery + completion in the event of any component (pageserver
+or storage controller) crashing (or request timing out) during the split.
+
+1. Call the split API on all existing shards. Ensure that the resulting
+   child shards are pinned to their pageservers until _all_ the split calls are done.
+   This pinning may be implemented as a "split bit" on the tenant shards that
+   blocks any migrations, and also acts as a sign that if we restart, we must go
+   through some recovery steps to resume the split.
+2. Once all the split calls are done, we may unpin the child shards (clear
+   the split bit). The split is now complete: subsequent steps are just migrations,
+   not strictly part of the split.
+3. Try to schedule new pageserver locations for the child shards, using
+   a soft anti-affinity constraint to place shards from the same tenant onto different
+   pageservers.
+
+Updating computes about the new shard count is not necessary until we migrate
+any of the child shards away from the parent's location.
+
+### Recovering from failures
+
+#### Rolling back an incomplete split
+
+An incomplete shard split may be rolled back quite simply, by attaching the parent shards to pageservers,
+and detaching child shards. This will lose any WAL ingested into the children after the parents
+were detached earlier, but the parents will catch up.
+
+No special pageserver API is needed for this.
From the storage controller's point of view, the
+procedure is:
+
+1. For all parent shards in the tenant, ensure they are attached
+2. For all child shards, ensure they are not attached
+3. Drop child shards from the storage controller's database, and clear the split bit on the parent shards.
+
+Any remote storage content for child shards is left behind. This is similar to other cases where
+we may leave garbage objects in S3 (e.g. when we upload a layer but crash before uploading an
+index that references it). Future online scrub/cleanup functionality can remove these objects, or
+they will be removed when the tenant is deleted, as tenant deletion lists all objects in the prefix,
+which would include any child shards that were rolled back.
+
+If any timelines had been created on child shards, they will be lost when rolling back. To mitigate
+this, we will **block timeline creation during splitting**, so that we can safely roll back until
+the split is complete, without risking losing timelines.
+
+Rolling back an incomplete split will happen automatically if a split fails for some fatal
+reason, and will not be accessible via an API:
+
+- A pageserver fails to complete its split API request after too many retries
+- A pageserver returns a fatal unexpected error such as 400 or 500
+- The storage controller database returns a non-retryable error
+- Some internal invariant is violated in the storage controller split code
+
+#### Rolling back a complete split
+
+A complete shard split may be rolled back similarly to an incomplete split, with the following
+modifications:
+
+- The parent shards will no longer exist in the storage controller database, so these must
+  be re-synthesized somehow: the hard part of this is figuring out the parent shards' generations. This
+  may be accomplished either by probing in S3, or by retaining some tombstone state for deleted
+  shards in the storage controller database.
+- Any timelines that were created after the split completed will disappear when rolling back
+  to the parent shards. For this reason, rolling back after a complete split should only
+  be done due to serious issues where loss of recently created timelines is acceptable, or
+  in cases where we have confirmed that no timelines were created in the intervening period.
+- Parent shards' layers must not have been deleted: this property will come "for free" when
+  we first roll out sharding, by simply not implementing deletion of parent layers after
+  a split. When we do implement such deletion (see "Cleaning up parent-shard layers" in the
+  Optimizations section), it should apply a TTL to layers such that we have a
+  defined walltime window in which rollback will be possible.
+
+The storage controller will expose an API for rolling back a complete split, for use
+in the field if we encounter some critical bug with a post-split tenant.
+
+#### Retrying API calls during Pageserver Restart
+
+When a pageserver restarts during a split API call, it may witness on-disk content for both parent and
+child shards from an ongoing split. This does not intrinsically break anything, and the
+pageserver may include all these shards in its `/re-attach` request to the storage controller.
+
+In order to support such restarts, it is important that the storage controller stores
+persistent records of each child shard before it calls into a pageserver, as these child shards
+may require generation increments via a `/re-attach` request.
+
+The pageserver restart will also result in a failed API call from the storage controller's point
+of view. Recall that if _any_ pageserver fails to split, the overall split operation may not
+complete, and all shards must remain pinned to their current pageserver locations until the
+split is done.
+
+The pageserver API calls during splitting will retry on transient errors, so that
+short availability gaps do not result in a failure of the overall operation. The
+split in progress will be automatically rolled back if the threshold for API
+retries is reached (e.g. if a pageserver stays offline for longer than a typical
+restart).
+
+#### Rollback on Storage Controller Restart
+
+On startup, the storage controller will inspect the split bit for tenant shards that
+it loads from the database. If any splits are in progress:
+
+- Database content will be reverted to the parent shards
+- Child shards will be dropped from memory
+- The parent and child shards will be included in the general startup reconciliation that
+  the storage controller does: any child shards will be detached from pageservers because
+  they don't exist in the storage controller's expected set of shards, and parent shards
+  will be attached if they aren't already.
+
+#### Storage controller API request failures/retries
+
+The split request handler will implement idempotency: if the [`Tenant`] requested to split
+doesn't exist, we will check for the would-be child shards, and if they already exist,
+we consider the request complete.
+
+If a request is retried while the original request is still underway, then the split
+request handler will notice an InProgress marker in TenantManager, and return 503
+to encourage the client to back off and retry. This is the same as the general pageserver
+API handling for calls that try to act on an InProgress shard.
+
+#### Compute start/restart during a split
+
+If a compute starts up during a split, it will be configured with the old sharding
+configuration. This will work for reads irrespective of the progress of the split
+as long as no child shards have been migrated away from their original location, and
+this is guaranteed in the split procedure (see earlier section).
+
+#### Pageserver fails permanently during a split
+
+If a pageserver permanently fails (i.e. the storage controller availability state for it
+goes to Offline) while a split is in progress, the splitting operation will roll back, and
+during the roll back it will skip any API calls to the offline pageserver. If the offline
+pageserver becomes available again, any stale locations will be cleaned up via the normal reconciliation process (the `/re-attach` API).
+
+### Handling secondary locations
+
+For correctness, it is not necessary to split secondary locations. We can simply detach
+the secondary locations for parent shards, and then attach new secondary locations
+for child shards.
+
+Clearly this is not optimal, as it will result in re-downloads of layer files that
+were already present on disk. See "Splitting secondary locations" below.
+
+### Conditions to trigger a split
+
+The pageserver will expose a new API for reporting on shards that are candidates
+for a split: this will return a top-N report of the largest tenant shards by
+physical size (remote size). This should exclude any tenants that are already
+at the maximum configured shard count.
+ +The API would look something like: +`/v1/top_n_tenant?shard_count_lt=8&sort_by=resident_size` + +The storage controller will poll that API across all pageservers it manages at some appropriate interval (e.g. 60 seconds). + +A split operation will be started when the tenant exceeds some threshold. This threshold +should be _less than_ how large we actually want shards to be, perhaps much less. That's to +minimize the amount of work involved in splitting -- if we want 100GiB shards, we shouldn't +wait for a tenant to exceed 100GiB before we split anything. Some data analysis of existing +tenant size distribution may be useful here: if we can make a statement like "usually, if +a tenant has exceeded 20GiB they're probably going to exceed 100GiB later", then we might +make our policy to split a tenant at 20GiB. + +The finest split we can do is by factors of two, but we can do higher-cardinality splits +too, and this will help to reduce the overhead of repeatedly re-splitting a tenant +as it grows. An example of a very simple heuristic for early deployment of the splitting +feature would be: "Split tenants into 8 shards when their physical size exceeds 64GiB": that +would give us two kinds of tenant (1 shard and 8 shards), and the confidence that once we had +split a tenant, it will not need re-splitting soon after. + +## Optimizations + +### Flush parent shard to remote storage during split + +Any data that is in WAL but not remote storage at time of split will need +to be replayed by child shards when they start for the first time. To minimize +this work, we may flush the parent shard to remote storage before writing the +remote indices for child shards. + +It is important that this flush is subject to some time bounds: we may be splitting +in response to a surge of write ingest, so it may be time-critical to split. A +few seconds to flush latest data should be sufficient to optimize common cases without +running the risk of holding up a split for a harmful length of time when a parent +shard is being written heavily. If the flush doesn't complete in time, we may proceed +to shut down the parent shard and carry on with the split. + +### Hard linking parent layers into child shard directories + +Before we start the Tenant objects for child shards, we may pre-populate their +local storage directories with hard links to the layer files already present +in the parent shard's local directory. When the child shard starts and downloads +its remote index, it will find all those layer files already present on local disk. + +This avoids wasting download capacity and makes splitting faster, but more importantly +it avoids taking up a factor of N more disk space when splitting 1 shard into N. + +This mechanism will work well in typical flows where shards are migrated away +promptly after a split, but for the general case including what happens when +layers are evicted and re-downloaded after a split, see the 'Proactive compaction' +section below. + +### Filtering during compaction + +Compaction, especially image layer generation, should skip any keys that are +present in a shard's layer files, but do not match the shard's ShardIdentity's +is_key_local() check. This avoids carrying around data for longer than necessary +in post-split compactions. 
+
+This was already implemented in https://github.com/neondatabase/neon/pull/6246
+
+### Proactive compaction
+
+In remote storage, there is little reason to rewrite any data on a shard split:
+all the children can reference parent layers via the very cheap write of the child
+index_part.json.
+
+In local storage, things are more nuanced. During the initial split there is no
+capacity cost to duplicating parent layers, if we implement the hard linking
+optimization described above. However, as soon as any layers are evicted from
+local disk and re-downloaded, the downloaded layers will not be hard links any more:
+they'll have a real capacity footprint. That isn't a problem if we migrate child shards
+away from the parent node swiftly, but it risks a significant over-use of local disk
+space if we do not.
+
+For example, if we did an 8-way split of a shard, _didn't_ migrate 7 of
+the shards elsewhere, and then churned all the layers in all the shards via eviction,
+we would blow up the storage capacity used on the node by 8x. If we're splitting
+a 100GB shard, that could take the pageserver to the point of exhausting disk space.
+
+To avoid this scenario, we could implement a special compaction mode where we just
+read historic layers, drop unwanted keys, and write back the layer file. This
+is pretty expensive, but useful if we have split a large shard and are not going to
+migrate the child shards away.
+
+The heuristic conditions for triggering such a compaction are:
+
+- A) eviction plus time: if a child shard
+  has existed for more than a time threshold, and has been requested to perform at least one eviction, then it becomes urgent for this child shard to execute a proactive compaction to reduce its storage footprint, at the cost of I/O load.
+- B) resident size plus time: we may inspect the resident layers and calculate how
+  many of them include the overhead of storing pre-split keys. If, after some time
+  threshold (different to the one in case A), we still have such layers occupying
+  local disk space, then we should proactively compact them.
+
+### Cleaning up parent-shard layers
+
+It is functionally harmless to leave parent shard layers in remote storage indefinitely.
+They would be cleaned up in the event of the tenant's deletion.
+
+As an optimization to avoid leaking remote storage capacity (which costs money), we may
+lazily clean up parent shard layers once no child shards reference them.
+
+This may be done _very_ lazily: e.g. check every PITR interval. The cleanup procedure is
+as follows (a code sketch follows below):
+
+- list all the key prefixes beginning with the tenant ID, and select those shard prefixes
+  which do not belong to the most-recently-split set of shards (_ancestral shards_, i.e.
+  `shard_count < max(shard_count)` over all shards), and those shard prefixes which do have the latest shard count (_current shards_)
+- If there are no _ancestral shard_ prefixes found, we have nothing to clean up and
+  may drop out now.
+- find the latest-generation index for each _current shard_, read them all, and accumulate the set of layers belonging to ancestral shards referenced by these indices.
+- for all ancestral shards, list objects in the prefix and delete any layer which was not
+  referenced by a current shard.
+
+If this cleanup is scheduled for 1-2 PITR periods after the split, there is a good chance that child shards will have written their own image layers covering the whole keyspace, such that all parent shard layers will be deletable.
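+
+A sketch of that selection logic, with stubbed-out storage access; the types and
+helpers are illustrative, not actual pageserver or scrubber code.
+
+```rust
+use std::collections::HashSet;
+
+// Stand-in types: the real implementation would use the remote storage
+// client and IndexPart deserialization.
+struct LayerRef { shard_count: u8, path: String }
+struct ShardPrefix { shard_count: u8 }
+
+// Stubbed S3 access, for illustration only.
+fn read_latest_index(_shard: &ShardPrefix) -> Vec<LayerRef> { Vec::new() }
+fn list_layer_objects(_shard: &ShardPrefix) -> Vec<String> { Vec::new() }
+fn delete_object(_path: &str) {}
+
+fn cleanup_ancestral_layers(prefixes: Vec<ShardPrefix>) {
+    // "Current" shards have the highest shard count; all others are ancestral.
+    let max_count = prefixes.iter().map(|p| p.shard_count).max().unwrap_or(0);
+    let (current, ancestral): (Vec<_>, Vec<_>) =
+        prefixes.into_iter().partition(|p| p.shard_count == max_count);
+    if ancestral.is_empty() {
+        return; // nothing to clean up
+    }
+    // Keep every ancestral layer that a current shard's latest-generation
+    // index still references.
+    let mut referenced: HashSet<String> = HashSet::new();
+    for shard in &current {
+        for layer in read_latest_index(shard) {
+            if layer.shard_count < max_count {
+                referenced.insert(layer.path);
+            }
+        }
+    }
+    // Delete ancestral objects that nothing references any more.
+    for shard in &ancestral {
+        for path in list_layer_objects(shard) {
+            if !referenced.contains(&path) {
+                delete_object(&path);
+            }
+        }
+    }
+}
+```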
+
+The cleanup may be done by the scrubber (external process), or we may choose to have
+the zeroth shard in the latest generation do the work -- there is no obstacle to one shard
+reading the other shards' indices at runtime, and we do not require visibility of the
+latest index writes.
+
+Cleanup should be artificially delayed by some period (for example 24 hours) to ensure
+that we retain the option to roll back a split in case of bugs.
+
+### Splitting secondary locations
+
+We may implement a pageserver API similar to the main splitting API, which does a simpler
+operation for secondary locations: it would not write anything to S3; instead, it would simply
+create the child shard directory on local disk, hard-link in the files from the parent's directory,
+and set up the in-memory (TenantSlot) state for the children.
+
+Similar to attached locations, a subset of secondary locations will probably need re-locating
+after the split is complete, to avoid leaving multiple child shards on the same pageservers,
+where they may use excessive space for the tenant.
+
+## FAQ/Alternatives
+
+### What should the thresholds be set to?
+
+Shard size limit: the pre-sharding default capacity quota for databases was 200GiB, so this could be a starting point for the per-shard size limit.
+
+Max shard count:
+
+- The safekeeper overhead to sharding is currently O(N) network bandwidth because
+  the un-filtered WAL is sent to all shards. To avoid this growing out of control,
+  a limit of 8 shards should be temporarily imposed until WAL filtering is implemented
+  on the safekeeper.
+- There is also little benefit to increasing the shard count beyond the number
+  of pageservers in a region.
+
+### Is it worth just rewriting all the data during a split to simplify reasoning about space?
diff --git a/docs/rfcs/033-storage-controller-drain-and-fill.md b/docs/rfcs/033-storage-controller-drain-and-fill.md
new file mode 100644
index 0000000000..733f7c0bd8
--- /dev/null
+++ b/docs/rfcs/033-storage-controller-drain-and-fill.md
@@ -0,0 +1,345 @@
+# Graceful Restarts of Storage Controller Managed Clusters
+
+## Summary
+This RFC describes new storage controller APIs for draining and filling tenant shards from/on pageserver nodes.
+It also covers how these new APIs should be used by an orchestrator (e.g. Ansible) in order to implement
+graceful cluster restarts.
+
+## Motivation
+
+Pageserver restarts cause read availability downtime for tenants.
+
+For example, pageserver-3 @ us-east-1 was unavailable for a randomly
+picked tenant (which requested on-demand activation) for around 30 seconds
+during the restart at 2024-04-03 16:37 UTC.
+
+Note that lots of shutdowns on loaded pageservers do not finish within the
+[10 second systemd enforced timeout](https://github.com/neondatabase/infra/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
+and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.
+
+This problem is not yet very acutely felt in storage controller managed pageservers since
+tenant density is much lower there. However, we are planning on eventually migrating all
+pageservers to storage controller management, so it makes sense to solve the issue proactively.
+
+## Requirements
+
+- Pageserver re-deployments cause minimal downtime for tenants
+- The storage controller exposes HTTP API hooks for draining and filling tenant shards
+  from/on a given pageserver. Said hooks can be used by an orchestrator process or a human operator.
+- The storage controller exposes some HTTP API to cancel draining and filling background operations.
+- Failures to drain or fill the node should not be fatal. In such cases, cluster restarts should proceed
+  as usual (with downtime).
+- Progress of draining/filling is visible through metrics
+
+## Non Goals
+
+- Integration with the control plane
+- Graceful restarts for large non-HA tenants.
+
+## Impacted Components
+
+- storage controller
+- deployment orchestrator (i.e. Ansible)
+- pageserver (indirectly)
+
+## Terminology
+
+**Draining** is the process through which all tenant shards that can be migrated from a given pageserver
+are distributed across the rest of the cluster.
+
+**Filling** is the symmetric opposite of draining. In this process, tenant shards are migrated onto a given
+pageserver until the cluster reaches a reasonable, quiescent distribution of tenant shards across pageservers.
+
+**Node scheduling policies** act as constraints to the scheduler. For instance, when a
+node is set in the `Pause` policy, no further shards will be scheduled on it.
+
+**Node** is a pageserver. The terms are used interchangeably in this RFC.
+
+**Deployment orchestrator** is a generic term for whatever drives our deployments.
+Currently, it's an Ansible playbook.
+
+## Background
+
+### Storage Controller Basics (skip if already familiar)
+
+Fundamentally, the storage controller is a reconciler which aims to move from the observed mapping between pageservers and tenant shards to an intended mapping. Pageserver node and tenant shard metadata is durably persisted in a database, but note that the mapping between the two entities is not durably persisted. Instead, this mapping (*observed state*) is constructed at startup by sending `GET location_config` requests to registered pageservers.
+
+An internal scheduler maps tenant shards to pageservers while respecting certain constraints. The result of scheduling is the *intent state*. When the intent state changes, a *reconciliation* will inform pageservers about the new assignment via `PUT location_config` requests and will notify the compute via the configured hook.
+
+### Background Optimizations
+
+The storage controller performs scheduling optimizations in the background. It will
+migrate attachments to warm secondaries and replace secondaries in order to balance
+the cluster out.
+
+### Reconciliations Concurrency Limiting
+
+There's a hard limit on the number of reconciles that the storage controller
+can have in flight at any given time. To get an idea of scales, the limit is
+128 at the time of writing.
+
+## Implementation
+
+Note: this section focuses on the core functionality of the graceful restart process.
+It doesn't necessarily describe the most efficient approach. Optimizations are described
+separately in a later section.
+
+### Overall Flow
+
+This section describes how to implement graceful restarts from the perspective
+of Ansible, the deployment orchestrator. Pageservers are already restarted sequentially.
+The orchestrator shall implement the following prologue and epilogue steps for each
+pageserver restart:
+
+#### Prologue
+
+The orchestrator shall first fetch the pageserver node id from the control plane or
+directly from the pageserver it aims to restart.
Next, it issues an HTTP request
+to the storage controller in order to start the drain of said pageserver node.
+All error responses are retried with a short back-off. When a 202 (Accepted)
+HTTP code is returned, the drain has started. Now the orchestrator polls the
+node status endpoint exposed by the storage controller in order to await the
+end of the drain process. When the `policy` field of the node status response
+becomes `PauseForRestart`, the drain has completed and the orchestrator can
+proceed with restarting the pageserver.
+
+The prologue is subject to an overall timeout. It will have a value in the ballpark
+of minutes. As storage controller managed pageservers become more loaded, this timeout
+will likely have to increase.
+
+#### Epilogue
+
+After restarting the pageserver, the orchestrator issues an HTTP request
+to the storage controller to kick off the filling process. This API call
+may be retried for all error codes with a short backoff. This also serves
+as a synchronization primitive, as the fill will be refused if the pageserver
+has not yet re-attached to the storage controller. When a 202 (Accepted) HTTP
+code is returned, the fill has started. Now the orchestrator polls the node
+status endpoint exposed by the storage controller in order to await the end of
+the filling process. When the `policy` field of the node status response becomes
+`Active`, the fill has completed and the orchestrator may proceed to the next pageserver.
+
+Again, the epilogue is subject to an overall timeout. We can start off with
+using the same timeout as for the prologue, but can also consider relying on
+the storage controller's background optimizations with a shorter timeout.
+
+In the case that the deployment orchestrator times out, it attempts to cancel
+the fill. This operation shall be retried with a short back-off. If it ultimately
+fails, it will require manual intervention to set the node's scheduling policy to
+`NodeSchedulingPolicy::Active`. Not doing that is not immediately problematic,
+but it constrains the scheduler as mentioned previously.
+
+### Node Scheduling Policy State Machine
+
+The state machine below encodes the behaviours discussed above and
+the various failover situations described in a later section.
+
+Assuming no failures and/or timeouts, the flow should be:
+`Active -> Draining -> PauseForRestart -> Active -> Filling -> Active`
+
+```
+              Operator requested drain
+        +---------------------------------------+
+        |                                       |
++-------+-------+                       +-------v-------+
+|               |                       |               |
+|     Pause     |    +------------------>   Draining    +-------------------+
+|               |    |                  |               |                   |
++---------------+    |                  +-------+-------+                   |
+                     |                          |                           |
+      Drain requested|            Drain complete|         Drain failed /    |
+                     |                          |         Cancelled /       |
+                     |                          |         PS reattach /     |
+                     |                  +-------v-------+ Storcon restart   |
+                     |                  |PauseForRestart|                   |
+                     |                  +-------+-------+                   |
+                     |                          |                           |
+                     |              PS reattach |                           |
+                     |            after restart |                           |
+              +-------+-------+                 |                           |
+              |               <-----------------+                           |
+              |     Active    <---------------------------------------------+
+              |               +-------+
+              +------^--------+       |
+        +------------+                |
+        | Fill completed              | Fill requested
++-------+-------+                     |
+|               <---------------------+
+|    Filling    |
+|               |
++---------------+
+```
+
+### Draining/Filling APIs
+
+The storage controller API to trigger the draining or filling of a given node is:
+`PUT /v1/control/node/:node_id/{drain,fill}`.
+
+The following HTTP non-success return codes are used.
+All of them are safely retriable from the perspective of the storage controller.
+- 404: Requested node was not found
+- 503: Requested node is known to the storage controller, but unavailable
+- 412: Drain precondition failed: there is no other node to drain to, or the node's scheduling policy forbids draining
+- 409: A {drain, fill} is already in progress. Only one such background operation
+is allowed per node.
+
+When the drain is accepted and commenced, a 202 HTTP code is returned.
+
+Drains and fills shall be cancellable by the deployment orchestrator or a
+human operator via: `DELETE /v1/control/node/:node_id/{drain,fill}`. A 200
+response is returned when the cancellation is successful. Errors are retriable.
+
+### Drain Process
+
+Before accepting a drain request, the following validations are applied:
+* Ensure that the node is known to the storage controller
+* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active` or `NodeSchedulingPolicy::Pause`
+* Ensure that another drain or fill is not already running on the node
+* Ensure that a drain is possible (i.e. check that there is at least one
+schedulable node to drain to)
+
+After accepting the drain, the scheduling policy of the node is set to
+`NodeSchedulingPolicy::Draining` and persisted in both memory and the database.
+This disallows the optimizer from adding or removing shards from the node, which
+is desirable to avoid the two racing.
+
+Next, a separate Tokio task is spawned to manage the draining. For each tenant
+shard attached to the node being drained, demote the node to a secondary for that
+shard and attempt to schedule the shard away. Scheduling might fail due to unsatisfiable
+constraints, but that is fine. Draining is a best-effort process, since it might
+not always be possible to cut over all shards.
+
+Importantly, this task manages the concurrency of issued reconciles in order to
+avoid drowning out the target pageservers and to allow other important reconciles
+to proceed.
+
+Once the triggered reconciles have finished or timed out, set the node's scheduling
+policy to `NodeSchedulingPolicy::PauseForRestart` to signal the end of the drain.
+
+A note on non-HA tenants: These tenants do not have secondaries, so by the description
+above, they would not be migrated. It makes sense to skip them (especially the large ones):
+depending on tenant size, migration might be more disruptive than the restart itself, since the
+pageserver we move them to will need to on-demand download the entire working set for the tenant.
+We can consider expanding to small non-HA tenants in the future.
+
+### Fill Process
+
+Before accepting a fill request, the following validations are applied:
+* Ensure that the node is known to the storage controller
+* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active`.
+This is the only acceptable policy for the fill starting state. When a node re-attaches,
+it sets the scheduling policy to `NodeSchedulingPolicy::Active` if it was equal to
+`NodeSchedulingPolicy::PauseForRestart` or `NodeSchedulingPolicy::Draining` (possible end states for a node drain).
+* Ensure that another drain or fill is not already running on the node
+
+After accepting the fill, the scheduling policy of the node is set to
+`NodeSchedulingPolicy::Filling` and persisted in both memory and the database.
+This disallows the optimizer from adding or removing shards from the node, which
+is desirable to avoid the two racing.
+
+Next, a separate Tokio task is spawned to manage the filling. For each tenant
+shard where the filled node is a secondary, promote the secondary.
+
+Like for draining, the concurrency of spawned reconciles is limited.
+
+### Failure Modes & Handling
+
+Failures are generally handled by transitioning back into the `Active`
+(neutral) state. This simplifies the implementation greatly at the
+cost of adding transitions to the state machine. For example, we
+could detect the `Draining` state upon restart and proceed with a drain,
+but how would the storage controller know that this is still what the
+orchestrator needs?
+
+#### Storage Controller Crash
+
+When the storage controller starts up, it resets the node scheduling policy
+of all nodes in states `Draining`, `Filling` or `PauseForRestart` to
+`Active`. The rationale is that when the storage controller restarts,
+we have lost the context of what the deployment orchestrator wants. It also
+has the benefit of making things easier to reason about.
+
+#### Pageserver Crash During Drain
+
+The pageserver will attempt to re-attach during restart, at which
+point the node scheduling policy will be set back to `Active`, thus
+re-enabling the scheduler to use the node.
+
+#### Non-drained Pageserver Crash During Drain
+
+What should happen when a pageserver we are draining to crashes during the
+process? Two reasonable options are: cancel the drain and focus on the failover,
+*or* do both, but prioritise failover. Since the number of concurrent reconciles
+produced by drains/fills is limited, we get the latter behaviour for free.
+My suggestion is that we take this approach, but the cancellation option is trivial
+to implement as well.
+
+#### Pageserver Crash During Fill
+
+The pageserver will attempt to re-attach during restart, at which
+point the node scheduling policy will be set back to `Active`, thus
+re-enabling the scheduler to use the node.
+
+#### Pageserver Goes Unavailable During Drain/Fill
+
+The drain and fill jobs handle this by stopping early. When the pageserver
+is detected as online by storage controller heartbeats, reset its scheduling
+policy to `Active`. If a restart happens instead, see the pageserver crash
+failure modes.
+
+#### Orchestrator Drain Times Out
+
+The orchestrator will still proceed with the restart.
+When the pageserver re-attaches, the scheduling policy is set back to
+`Active`.
+
+#### Orchestrator Fill Times Out
+
+The orchestrator will attempt to cancel the fill operation. If that fails,
+the fill will continue until it quiesces and the node will be left
+in the `Filling` scheduling policy. This hinders the scheduler, but is
+otherwise harmless. A human operator can handle this by setting the scheduling
+policy to `Active`, or we can bake a fill timeout into the storage controller.
+
+## Optimizations
+
+### Location Warmth
+
+When cutting over to a secondary, the storage controller will wait for it to
+become "warm" (i.e. download enough of the tenant's data). This means that some
+reconciliations can take significantly longer than others and hold up precious
+reconciliation units. As an optimization, the drain stage can cut over only
+tenants that are already "warm". Similarly, the fill stage can prioritise the
+"warmest" tenants in the fill.
+
+Given that the number of tenants managed by the storage controller will be fairly low
+for the foreseeable future, the first implementation could simply query the tenants
+for secondary status. This doesn't scale well with increasing tenant counts, so
+eventually we will need new pageserver API endpoints to report the sets of
+"warm" and "cold" locations.
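+
+For illustration, a sketch of how a drain might order its cutovers by warmth
+(the `SecondaryStatus` shape and the 80% threshold are assumptions for the
+sketch, not an existing API):
+
+```rust
+/// Assumed shape of a secondary-location status report.
+struct SecondaryStatus {
+    shard: u128,
+    bytes_total: u64,
+    bytes_downloaded: u64,
+}
+
+/// Sketch: cut over warm secondaries first and skip cold ones entirely, so
+/// the drain doesn't stall on long warm-up downloads.
+fn drain_order(mut secondaries: Vec<SecondaryStatus>) -> Vec<u128> {
+    // Keep only locations that are at least ~80% downloaded.
+    secondaries.retain(|s| s.bytes_downloaded * 10 >= s.bytes_total * 8);
+    // Warmest first.
+    secondaries.sort_by_key(|s| std::cmp::Reverse(s.bytes_downloaded));
+    secondaries.into_iter().map(|s| s.shard).collect()
+}
+```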
+
+## Alternatives Considered
+
+### Draining and Filling Purely as Scheduling Constraints
+
+At its core, the storage controller is a big background loop that detects changes
+in the environment and reacts to them. One could express draining and filling
+of nodes purely in terms of constraining the scheduler (as opposed to having
+such background tasks).
+
+While theoretically nice, I think that's harder to implement and, more importantly, to operate and reason about.
+Consider cancellation of a drain/fill operation. We would have to update the scheduler state, create
+an entirely new schedule (intent state) and start work on applying that. It gets trickier if we wish
+to cancel the reconciliation tasks spawned by node drains/fills. How would we know which ones belong
+to the conceptual drain/fill? One could add labels to reconciliations, but it gets messy in my opinion.
+
+It would also mean that reconciliations themselves have side effects that persist in the database
+(persist something to the database when the drain is done), which I'm not conceptually fond of.
+
+## Proof of Concept
+
+This RFC is accompanied by a POC which implements nearly everything mentioned here
+apart from the optimizations and some of the failure handling:
+https://github.com/neondatabase/neon/pull/7682
diff --git a/docs/rfcs/034-ancestor-deletion.md b/docs/rfcs/034-ancestor-deletion.md
new file mode 100644
index 0000000000..7341d930e2
--- /dev/null
+++ b/docs/rfcs/034-ancestor-deletion.md
@@ -0,0 +1,252 @@
+# Ancestor Timeline Deletion
+
+Created on: 2024-02-23
+
+Author: John Spray
+
+# Summary
+
+When a tenant creates a new timeline that they will treat as their 'main' history,
+it is awkward to permanently retain an 'old main' timeline as its ancestor. Currently
+this is necessary because it is forbidden to delete a timeline which has descendants.
+
+A new pageserver API is proposed to 'adopt' data from a parent timeline into
+one of its children, such that the link between ancestor and child can be severed,
+leaving the parent in a state where it may then be deleted.
+
+# Motivation
+
+Retaining parent timelines currently has two costs:
+
+- Cognitive load on users, who have to remember which is the "real" main timeline.
+- Storage capacity cost, as the parent timeline will retain layers up to the
+  child's timeline point, even if the child fully covers its keyspace with image
+  layers and will never actually read from the parent.
+
+# Solution
+
+A new pageserver API `PUT /v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor`
+will be added. The `timeline_id` in this URL is that of the _child_ timeline that we
+wish to detach from its parent.
+
+On success, this API will leave the following state:
+
+- The detached child timeline will no longer have an ancestor, and will contain all
+  the data needed to service reads without recursing into an ancestor.
+- Any other children of the parent whose timeline points were at a lower LSN than
+  the detached child timeline will be modified to have the child timeline as their
+  new parent.
+- The parent timeline will still exist, but the child will no longer have it as an
+  ancestor. If this was the last timeline that depended on the parent, then the
+  parent will become deletable.
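+
+For concreteness, a hypothetical invocation might look as follows (host, port
+and IDs are placeholders; as described below, the call is retried until it
+succeeds):
+
+```
+# Detach "new main" (the child) from "old main" (its parent).
+curl -X PUT \
+  "http://pageserver:9898/v1/tenant/<tenant_id>/timeline/<new_main_timeline_id>/detach_ancestor"
+```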
+ +This API's implementation will consist of a series of retryable steps, such that +on failures/timeout it can safely be called again to reach the target state. + +## Example + +### Before + +The user has "rolled back" their project to LSN X, resulting in a "new main" +timeline. The parent "old main" timeline still exists, and they would like +to clean it up. + +They have two other timelines A and B. A is from before the rollback point, +and B is from after the rollback point. + +``` +----"old main" timeline-------X--------------------------------------------> + | | | + |-> child A | | + |-> "new main" timeline | + -> child B + +``` + +### After calling detach ancestor API + +The "new main" timeline is no longer dependent on old main, and neither +is child A, because it had a branch point before X. + +The user may now choose to delete child B and "old main" to get to +a pristine state. Child B is likely to be unwanted since the user +chose to roll back to X, and it branches from after X. However, we +don't assume this in the API; it is up to the user to delete it. + +``` +|----"old main" timeline----------------------------------------------------> + | + | + | + -> child B + +|----"new main" timeline---------> + | + |-> child A + + +``` + +### After removing timelines + +We end up with a totally clean state that leaves no trace that a rollback +ever happened: there is only one root timeline. + +``` +| ----"new main" timeline-----------> + | + |-> child A + + +``` + +## Caveats + +Important things for API users to bear in mind: + +- this API does not delete the parent timeline: you must still do that explicitly. +- if there are other child timelines ahead of the branch point of the detached + child, the parent won't be deletable: you must either delete or detach those + children. +- do _not_ simply loop over all children and detach them all: this can have an + extremely high storage cost. The detach ancestor API is intended for use on a single + timeline to make it the new "main". +- The detach ancestor API should also not be + exposed directly to the user as button/API, because they might decide + to click it for all the children and thereby generate many copies of the + parent's data -- the detach ancestor API should be used as part + of a high level "clean up after rollback" feature. + +## `detach_ancestor` API implementation + +Terms used in the following sections: + +- "the child": the timeline whose ID is specified in the detach ancestor API URL, also + called "new main" in the example. +- "the parent": the parent of "the child". Also called "old main" in the example. +- "the branch point" the ancestor_lsn of "the child" + +### Phase 1: write out adopted layers to S3 + +The child will "adopt" layers from the parent, such that its end state contains +all the parent's history as well as its own. + +For all layers in the parent's layer map whose high LSN is below the branch +point, issue S3 CopyObject requests to duplicate them into the child timeline's +prefix. Do not add them to the child's layer map yet. + +For delta layers in the parent's layer map which straddle the branch point, read them +and write out only content up to the branch point into new layer objects. + +This is a long running operation if the parent has many layers: it should be +implemented in a way that resumes rather than restarting from scratch, if the API +times out and is called again. 
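+
+A sketch of the layer classification in this phase (the layer-map types here
+are simplified stand-ins for the real ones):
+
+```rust
+/// Simplified stand-in for a layer-map entry.
+struct Layer {
+    lsn_start: u64,
+    lsn_end: u64,
+    is_delta: bool,
+}
+
+enum Phase1Action {
+    CopyObject,      // server-side S3 copy into the child's prefix
+    TruncateRewrite, // stream-read, keep only content below the branch point
+    Skip,            // not needed by the child
+}
+
+/// Sketch: decide what to do with each parent layer relative to the branch point.
+fn classify(layer: &Layer, branch_point: u64) -> Phase1Action {
+    if layer.lsn_end <= branch_point {
+        // Entirely below the branch point: adopt it verbatim.
+        Phase1Action::CopyObject
+    } else if layer.is_delta && layer.lsn_start < branch_point {
+        // A delta straddling the branch point: write a truncated copy.
+        Phase1Action::TruncateRewrite
+    } else {
+        // Entirely above the branch point: the child never reads it.
+        Phase1Action::Skip
+    }
+}
+```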
+
+As an optimization, if there are no other timelines that will be adopted into
+the child, _and_ the child's image layers already fully cover the branch LSN,
+then we may skip adopting layers.
+
+### Phase 2: update the child's index
+
+Having written out all needed layers in phase 1, atomically link them all
+into the child's IndexPart and upload to S3. This may be done while the
+child Timeline is still running.
+
+### Phase 3: modify timeline ancestry
+
+Set the child's ancestor to None, and upload its IndexPart to persist the change.
+
+For all timelines which have the same parent as the child, and have a branch
+point lower than our branch point, switch their ancestor_timeline to the child,
+and upload their IndexPart to persist the change.
+
+## Alternatives considered
+
+### Generate full image layer on child, rather than adopting parent deltas
+
+This would work for the case of a single child, but would prevent re-targeting
+other timelines that depended on the parent. If we detached many children this
+way, the storage cost would become prohibitive (consider a 1TB database with
+100 child timelines: it would cost 100TiB if they all generated their own image layers).
+
+### Don't rewrite anything: just fake it in the API
+
+We could add a layer of indirection that let a child "pretend" that it had no
+ancestor, when in reality it still had the parent. The pageserver API could
+accept deletion of ancestor timelines, and just update child metadata to make
+them look like they have no ancestor.
+
+This would not achieve the desired reduction in storage cost, and may well be more
+complex to maintain than simply implementing the API described in this RFC.
+
+### Avoid copying objects: enable child index to use parent layers directly
+
+We could teach IndexPart to store a TimelineId for each layer, such that a child
+timeline could reference a parent's layers directly, rather than copying them
+into the child's prefix.
+
+This would impose a cost for the normal case of indices that only target the
+timeline's own layers, add complexity, and break the useful simplifying
+invariant that timelines "own" their own path. If child timelines were
+referencing layers from the parent, we would have to ensure that the parent
+never runs GC/compaction again, which would make the API less flexible (the
+proposal in this RFC enables deletion of the parent but doesn't require it.)
+
+## Performance
+
+### Adopting layers
+
+- CopyObject is a relatively cheap operation, but we may need to issue tens of thousands
+  of such requests: this can take up to tens of seconds and will compete for RemoteStorage
+  semaphore units with other activity on the pageserver.
+- If we are running on a storage backend that doesn't implement CopyObject, then
+  this part will be much more expensive, as we would stream all layer content
+  through the pageserver. This is no different to issuing a lot
+  of reads to a timeline that does not have a warm local cache: it will move
+  a lot of gigabytes, but that shouldn't break anything.
+- Generating truncated layers for deltas that straddle the branch point will
+  require streaming read/write of all the layers in question.
+
+### Updating timeline ancestry
+
+The simplest way to update timeline ancestry will probably be to stop and start
+all the Timeline objects: this is preferable to the complexity of making their
+ancestry mutable at runtime.
+
+There will be a corresponding "stutter" in the availability of the timelines,
+on the order of 10-100ms, which is the time taken to upload their IndexPart and
+restart the Timeline.
+
+# Interaction with other features
+
+## Concurrent timeline creation
+
+If new historic timelines are created using the parent as an ancestor while the
+detach ancestor API is running, they will not be re-parented to the child. This
+doesn't break anything, but it leaves the parent in a state where it might not
+be possible to delete it.
+
+Since timeline creations are an explicit user action, this is not something we need to
+worry about as the storage layer: a user who wants to delete their parent timeline will not create
+new children, and if they do, they can choose to delete those children to
+enable deleting the parent.
+
+For the least surprise to the user, before starting the detach ancestor
+operation, the control plane should wait until all branches are created, and should not
+allow any branches to be created before the branch point on the ancestor branch
+while the operation is ongoing.
+
+## WAL based disaster recovery
+
+WAL based disaster recovery currently supports only restoring of the main
+branch. Enabling WAL based disaster recovery in the future requires that we
+keep a record of which timeline generated the WAL and at which LSN a parent was
+detached. Keep a list of timeline ids and the LSN at which they were detached in
+the `index_part.json`. Limit the size of the list to the first 100 entries, after
+which WAL disaster recovery will not be possible.
+
+## Sharded tenants
+
+For sharded tenants, calls to the detach ancestor API will pass through the storage
+controller, which will handle them the same as timeline creations: invoke first
+on shard zero, and then on all the other shards.
diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
new file mode 100644
index 0000000000..239ec58186
--- /dev/null
+++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
@@ -0,0 +1,495 @@
+# Safekeeper dynamic membership change
+
+To quickly recover from safekeeper node failures and do rebalancing, we need to
+be able to change the set of safekeepers a timeline resides on. The procedure must
+be safe (never lose committed log entries) regardless of the state of the safekeepers
+and the compute. It should be able to make progress if any majority of the old
+safekeeper set, any majority of the new safekeeper set and the compute are up and
+connected. This is known as a consensus membership change. It always involves two
+phases: 1) switch the old majority to the joint old + new configuration, preventing
+commits that are not acknowledged by the new set 2) bootstrap the new set by ensuring
+a majority of the new set has all data which could possibly have been committed before
+the first phase completed; after that, finishing the switch is safe. Without the two
+phases, a direct switch to a new set whose quorums might not intersect with quorums of
+the old set is unsafe (the typical case of an ABC -> ABD switch is an example of that,
+because quorums AC and BD don't intersect). Furthermore, the procedure is typically
+carried out by the consensus leader, and so the enumeration of configurations which
+establishes an order between them is done through the consensus log.
+
+In our case the consensus leader is the compute (walproposer), and we don't want to wake
+up all computes for the change. Nor do we want to fully reimplement the leader
+logic a second time outside the compute. Because of that, for issuing configurations
+the proposed algorithm relies on an external fault-tolerant (distributed) strongly
+consistent storage with a simple API: CAS (compare-and-swap) on a single key.
+Properly configured postgres suits this.
+
+In the system, consensus is implemented at the timeline level, so the algorithm below
+applies to a single timeline.
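+
+To make the storage contract concrete, here is an illustrative in-memory model
+of the CAS semantics the algorithm assumes (`Conf` stands for the
+`Configuration` struct defined in the Definitions section below; a postgres
+implementation would run the same check-and-update as a single transaction):
+
+```rust
+use std::collections::HashMap;
+use std::sync::Mutex;
+
+/// In-memory model of the configuration storage: the only operation the
+/// algorithm needs is a per-timeline compare-and-swap.
+struct ConfigStorage<Conf> {
+    // (tenant_id, timeline_id) -> (generation, configuration)
+    by_timeline: Mutex<HashMap<(u128, u128), (u64, Conf)>>,
+}
+
+impl<Conf> ConfigStorage<Conf> {
+    /// Install `new` (with generation `expected + 1`) iff the stored
+    /// generation still equals `expected`. Returns whether the CAS won.
+    fn cas(&self, ttid: (u128, u128), expected: u64, new: Conf) -> bool {
+        let mut map = self.by_timeline.lock().unwrap();
+        match map.get(&ttid) {
+            Some((g, _)) if *g == expected => {
+                map.insert(ttid, (expected + 1, new));
+                true
+            }
+            // Generation moved on (or the timeline is unknown): caller
+            // must re-read and retry or abort.
+            _ => false,
+        }
+    }
+}
+```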
+
+## Algorithm
+
+### Definitions
+
+A configuration is
+
+```
+struct Configuration {
+    generation: Generation, // a number uniquely identifying the configuration
+    sk_set: Vec<NodeId>,    // the current safekeeper set
+    new_sk_set: Option<Vec<NodeId>>,
+}
+```
+
+A configuration with `new_sk_set` present is used for the intermediate step during
+the change and is called a joint configuration. Generations establish an order on
+configurations: we say `c1` is higher than `c2` if `c1.generation` >
+`c2.generation`.
+
+### Persistently stored data changes
+
+The safekeeper starts storing its current configuration in the control file. Updates
+of it are atomic, so the in-memory value always matches the persistent one.
+
+The external CAS-providing storage (let's call it configuration storage here) also
+stores the configuration for each timeline. It is initialized with generation 1 and
+the initial set of safekeepers during timeline creation. An executed CAS on it must
+never be lost.
+
+### Compute <-> safekeeper protocol changes
+
+The `ProposerGreeting` message carries the walproposer's configuration if it is already
+established (see below), else null. The `AcceptorGreeting` message carries the
+safekeeper's current `Configuration`. All further messages (`VoteRequest`,
+`VoteResponse`, `ProposerElected`, `AppendRequest`, `AppendResponse`) carry a
+generation number: the walproposer's in the case of a wp->sk message, or the
+safekeeper's in the case of an sk->wp message.
+
+### Safekeeper changes
+
+Basic rule: once a safekeeper observes a configuration higher than its own, it
+immediately switches to it. It must refuse all messages with a generation lower
+than its own. It also refuses messages if it is not a member of the current
+configuration (that is, of either `sk_set` or `new_sk_set`), though it is likely
+not unsafe to process them (walproposer should ignore the result anyway).
+
+If there is a non-null configuration in `ProposerGreeting` and it is higher than
+the current safekeeper one, the safekeeper switches to it.
+
+The safekeeper sends its current configuration in its first message to walproposer,
+`AcceptorGreeting`. It refuses all other walproposer messages if the
+configuration generation in them is less than its current one. Namely, it
+refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In
+response it sends its current configuration generation to let walproposer know.
+
+The safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration`
+accepting a `Configuration`. The safekeeper switches to the given conf if it is higher
+than its current one and ignores it otherwise. In any case it replies with
+```
+struct ConfigurationSwitchResponse {
+    conf: Configuration,
+    term: Term,
+    last_log_term: Term,
+    flush_lsn: Lsn,
+}
+```
+
+### Compute (walproposer) changes
+
+The basic rule is that a joint configuration requires votes from majorities in
+both `sk_set` and `new_sk_set`.
+
+The compute receives the list of safekeepers to connect to from the control plane as
+it does currently, and tries to communicate with all of them. However, the list does not
+define the consensus members. Instead, on start walproposer tracks the highest
+configuration it receives from `AcceptorGreeting`s.
Once it assembles greetings
+from a majority of `sk_set` and a majority of `new_sk_set` (if it is present), it
+establishes this configuration as its own and moves to voting.
+
+It should stop talking to safekeepers not listed in the configuration at this
+point, though it is not unsafe to continue doing so.
+
+To be elected it must receive votes from both majorities if `new_sk_set` is present.
+Similarly, to commit WAL it must receive flush acknowledgements from both majorities.
+
+If walproposer hears from a safekeeper a configuration higher than its own (i.e. a
+refusal to accept due to a configuration change) it simply restarts.
+
+### Change algorithm
+
+The following algorithm can be executed anywhere with access to the configuration
+storage and the safekeepers. It is safe to interrupt / restart it and to run multiple
+instances of it concurrently, though likely one of them won't make
+progress then. It accepts `desired_set: Vec<NodeId>` as input.
+
+The algorithm will refuse to make the change if it encounters a previously interrupted
+change attempt, but in this case it will try to finish that change.
+
+It will eventually converge if the old majority, the new majority and the configuration
+storage are reachable.
+
+1) Fetch the current timeline configuration from the configuration storage.
+2) If it is already a joint one and `new_sk_set` is different from `desired_set`,
+   refuse to change. However, assign the joint conf to the (in memory) var
+   `joint_conf` and proceed to step 4 to finish the ongoing change.
+3) Else, create the joint `joint_conf: Configuration`: increment the current conf
+   generation `n` and put `desired_set` to `new_sk_set`. Persist it in the configuration
+   storage by doing a CAS on the current generation: the change happens only if the
+   current configuration generation is still `n`. Apart from guaranteeing uniqueness
+   of configurations, CAS linearizes them, ensuring that a new configuration is
+   created only following the previous one, when we know that the transition is
+   safe. A failed CAS aborts the procedure.
+4) Call `PUT` `configuration` on the safekeepers from the current set,
+   delivering them `joint_conf`. Collecting responses from a majority is required
+   to proceed. If any response returned a generation higher than
+   `joint_conf.generation`, abort (another switch raced us). Otherwise, choose the
+   max `<last_log_term, flush_lsn>` among the responses and establish it as the
+   (in memory) `sync_position`. Also choose the max `term` and establish it as the (in
+   memory) `sync_term`. We can't finish the switch until a majority of the new set
+   catches up to this `sync_position`, because data before it could be committed
+   without acks from the new set. Similarly, we'll bump the term on the new majority
+   to `sync_term` so that two computes with the same term are never elected.
+5) Initialize the timeline on the safekeeper(s) from `new_sk_set` where it
+   doesn't exist yet by doing `pull_timeline` from a majority of the
+   current set. Doing that on a majority of `new_sk_set` is enough to
+   proceed, but it is reasonable to ensure that all `new_sk_set` members
+   are initialized -- if some of them are down, why are we migrating there?
+6) Call `POST` `bump_term(sync_term)` on the safekeepers from the new set.
+   Success on a majority is enough.
+7) Repeatedly call `PUT` `configuration` on the safekeepers from the new set,
+   delivering them `joint_conf` and collecting their positions. This will
+   switch them to `joint_conf`, which generally won't be needed,
+   because `pull_timeline` already includes it and it would additionally be
+   broadcast by the compute. More importantly, we may proceed to the next step
+   only when `<last_log_term, flush_lsn>` on a majority of the new set reaches
+   `sync_position`. Similarly, on the happy path no waiting is needed, because
+   `pull_timeline` already includes it. However, we should double
+   check to be safe. For example, the timeline could have been created earlier, e.g.
+   manually or after a try-to-migrate, abort, try-to-migrate-again sequence.
+8) Create `new_conf: Configuration`, incrementing `joint_conf`'s generation and
+   having the new safekeeper set as `sk_set` and `new_sk_set` as None. Write it
+   to the configuration storage under one more CAS.
+9) Call `PUT` `configuration` on the safekeepers from the new set,
+   delivering them `new_conf`. It is enough to deliver it to a majority
+   of the new set; the rest can be updated by the compute.
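+
+As a sketch of the driver's generation bookkeeping (the two CAS steps), with
+the safekeeper RPCs of steps 4-7 and 9 reduced to comments. All names here are
+illustrative, not real storcon types, and the `anyhow` crate is assumed:
+
+```rust
+#[derive(Clone)]
+struct Configuration {
+    generation: u64,
+    sk_set: Vec<u64>,             // safekeeper node ids
+    new_sk_set: Option<Vec<u64>>, // Some(_) == joint configuration
+}
+
+trait ConfigStorage {
+    fn fetch(&self) -> anyhow::Result<Configuration>;
+    /// CAS: install `new` iff the stored generation still equals `expected`.
+    fn cas(&self, expected: u64, new: Configuration) -> anyhow::Result<()>;
+}
+
+fn migrate(store: &dyn ConfigStorage, desired_set: Vec<u64>) -> anyhow::Result<()> {
+    // Step 1: fetch the current configuration.
+    let cur = store.fetch()?;
+    // Step 2: refuse on a conflicting interrupted change; finish a matching one.
+    let joint = match &cur.new_sk_set {
+        Some(s) if *s != desired_set => anyhow::bail!("another change in progress"),
+        Some(_) => cur.clone(),
+        // Step 3: otherwise create the joint conf under the first CAS.
+        None => {
+            let joint = Configuration {
+                generation: cur.generation + 1,
+                sk_set: cur.sk_set.clone(),
+                new_sk_set: Some(desired_set.clone()),
+            };
+            store.cas(cur.generation, joint.clone())?;
+            joint
+        }
+    };
+    // Steps 4-7 (safekeeper RPCs, elided here): deliver `joint` to a majority
+    // of the old set, record sync_term / sync_position from the responses,
+    // run pull_timeline and bump_term(sync_term) on the new set, then wait
+    // until a majority of the new set reaches sync_position.
+    //
+    // Step 8: switch to the final configuration under a second CAS.
+    let new_conf = Configuration {
+        generation: joint.generation + 1,
+        sk_set: desired_set,
+        new_sk_set: None,
+    };
+    store.cas(joint.generation, new_conf)?;
+    // Step 9 (elided): deliver `new_conf` to a majority of the new set.
+    Ok(())
+}
+```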
+
+I haven't put huge effort into making the description above very precise, because it
+is natural language, prone to interpretation anyway. Instead I'd like to make a TLA+
+spec of it.
+
+The description above focuses on safety. To make the flow practical and live, here are
+a few more considerations.
+1) It makes sense to ping the new set to ensure we are migrating to live node(s) before
+   step 3.
+2) If e.g. an accidentally wrong new sk set has been specified, it is safe to roll back
+   to the old conf with one more CAS, as long as the CAS in step 8 has not completed.
+3) At step 5 the timeline might already be created on members of the new set for
+   various reasons; the simplest is a restart of the procedure. There are more
+   complicated scenarios like the one mentioned in step 7. Deleting and re-doing
+   `pull_timeline` is generally unsafe without involving generations, so it seems
+   simpler to treat an existing timeline as success. However, this also
+   has a disadvantage: you might imagine a surpassingly unlikely schedule where the
+   condition in step 7 is never reached until the compute is (re)awakened to
+   synchronize the new member(s). I don't think we'll observe this in practice,
+   but we can add waking up the compute if needed.
+4) In the end the timeline should be locally deleted on the safekeeper(s) which are
+   in the old set but not in the new one, unless they are unreachable. To be
+   safe this also should be done under the generation number (deletion proceeds only
+   if the current configuration is <= the one in the request and the safekeeper is
+   not a member of it).
+5) If the current conf fetched at step 1 is already not joint and its members equal
+   `desired_set`, jump to step 9, using it as `new_conf`.
+
+## Implementation
+
+The procedure ought to be driven from somewhere. Obvious candidates are the control
+plane and storage_controller; and as each of them already has a db, we don't want
+yet another storage. I propose to manage safekeepers in storage_controller,
+because 1) it is in rust, which simplifies simulation testing (more on this
+below) 2) it already manages pageservers.
+
+This assumes that migration will be fully usable only after we migrate all
+tenants/timelines to storage_controller. It is debatable whether we also want
+to manage pageserver attachments for all of these, but likely we do.
+
+This requires us to define the storcon <-> cplane interface.
+
+### storage_controller <-> control plane interface
+
+First of all, the control plane should
+[change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829) to
+storing safekeepers per timeline instead of per tenant, because we can't migrate
+tenants atomically.
+
+The important question is how an updated configuration is delivered from
+storage_controller to the control plane to provide it to computes. As always, there
+are two options, pull and push. Let's use the same push approach as with pageserver
+`/notify-attach`, because 1) it keeps storage_controller out of the critical compute
+start path 2) it provides an easier upgrade: there won't be such a thing as a 'timeline
+managed by control plane / storcon', cplane just takes the value out of its db
+when needed 3) uniformity. It makes storage_controller responsible for retrying the
+notification of the control plane until it succeeds.
+
+So, cplane `/notify-safekeepers` for the timeline accepts a `Configuration` and
+updates it in the db if the provided conf generation is higher (the cplane db
+should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it
+should update the db, which makes the call successful, and then try to schedule
+`apply_config` if possible; it is ok if that doesn't happen. storage_controller
+should rate limit calling the endpoint, but likely this won't be needed, as migration
+throughput is limited by `pull_timeline`.
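+
+For illustration, a `/notify-safekeepers` request body might look as follows
+(the field names here are assumptions; the authoritative shape is whatever
+cplane and storcon agree on):
+
+```
+{
+    "tenant_id": "...",
+    "timeline_id": "...",
+    "generation": 5,
+    "sk_set": [1, 3, 4],
+    "new_sk_set": null
+}
+```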
+
+Timeline (branch) creation in cplane should call storage_controller POST
+`tenant/:tenant_id/timeline` like it currently does for sharded tenants. The
+response should be augmented with `safekeeper_conf: Configuration`. The call
+should be retried until it succeeds.
+
+Timeline deletion and tenant deletion in cplane should call the appropriate
+storage_controller endpoints like they currently do for sharded tenants. The
+calls should be retried until they succeed.
+
+### storage_controller implementation
+
+The current 'load everything on startup and keep in memory' easy design is fine.
+A single timeline shouldn't take more than 100 bytes (it's a 16 byte tenant_id, a 16
+byte timeline_id, an int generation, a vec of ~3 safekeeper ids plus some flags), so
+10^6 timelines shouldn't take more than 100MB.
+
+Similar to pageserver attachment Intents, storage_controller would have an in-memory
+`MigrationRequest` (or its absence) for each timeline and a pool of tasks trying
+to make these requests reality; this ensures one instance of storage_controller
+won't do several migrations on the same timeline concurrently. In the first
+version it is simpler to have more manual control and no retries, i.e. a migration
+failure removes the request. Later we can build retries and automatic
+scheduling/migration. `MigrationRequest` is
+```
+enum MigrationRequest {
+    To(Vec<NodeId>),
+    FinishPending,
+}
+```
+
+`FinishPending` requests to run the procedure to ensure the state is clean: the
+current configuration is not joint and a majority of safekeepers are aware of it,
+but it does not attempt to migrate anywhere. If the current configuration fetched
+at step 1 is not joint, it jumps to step 9. It should be run at startup for all
+timelines (but similarly, in the first version it is ok to trigger it manually).
+
+#### Schema
+
+A `safekeepers` table mirroring the current `nodes` should be added, except for
+the `scheduling_policy` field (`status` seems like a better name for it): it is
+enough to have, at least in the beginning, only 3 values: 1) `active` 2) `offline`
+3) `decommissioned`.
+
+`timelines` table:
+```
+table! {
+    // (tenant_id, timeline_id) is the primary key
+    timelines (tenant_id, timeline_id) {
+        timeline_id -> Varchar,
+        tenant_id -> Varchar,
+        generation -> Int4,
+        sk_set -> Array<Int8>, // list of safekeeper ids
+        new_sk_set -> Nullable<Array<Int8>>, // list of safekeeper ids, null if not a joint conf
+        cplane_notified_generation -> Int4,
+    }
+}
+```
+
+#### API
+
+Node management is similar to pageserver:
+1) POST `/control/v1/safekeepers` upserts a safekeeper.
+2) GET `/control/v1/safekeepers` lists safekeepers.
+3) GET `/control/v1/safekeepers/:node_id` gets a safekeeper.
+4) PUT `/control/v1/safekeepers/:node_id/status` changes the status to e.g.
+   `offline` or `decommissioned`. Initially it is simpler not to schedule any
+   migrations here.
+
+Safekeeper deploy scripts should register safekeepers at storage_controller as
+they currently do with cplane, under the same id.
+
+Timeline creation/deletion: the already existing POST `tenant/:tenant_id/timeline`
+would 1) choose the initial set of safekeepers; 2) write to the db the initial
+`Configuration` with `INSERT ON CONFLICT DO NOTHING`, returning the existing row in
+case of conflict; 3) create the timeline on a majority of the safekeepers (already
+created is ok).
+
+We don't want to block timeline creation when one safekeeper is down. Currently
+this is solved by the compute implicitly creating the timeline on any safekeeper it is
+connected to. This creates an ugly timeline state on the safekeeper where the timeline
+is created, but the start LSN is not defined yet. It would be nice to remove this; to
+do that, the controller can in the background retry creating the timeline on the
+safekeeper(s) which missed it during the initial creation call. It can do that
+through `pull_timeline` from the majority, so it doesn't need to remember
+`parent_lsn` in its db.
+
+Timeline deletion removes the row from the db and forwards the deletion to the
+current configuration members. Without additional actions deletions might leak,
+see below on this; initially let's ignore these, reporting success to cplane if
+at least one safekeeper deleted the timeline (this will remove s3 data).
+
+Tenant deletion repeats timeline deletion for all timelines.
+
+Migration API: the first version is the simplest and the most imperative:
+1) PUT `/control/v1/safekeepers/migrate` schedules `MigrationRequest`s to move
+all timelines from one safekeeper to another. It accepts json
+```
+{
+    "src_sk": u32,
+    "dst_sk": u32,
+    "limit": Option<u32>,
+}
+```
+
+Returns the list of scheduled requests.
+
+2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate`
+   schedules a `MigrationRequest` to move a single timeline to the given set of
+   safekeepers:
+```
+{
+    "desired_set": Vec<NodeId>,
+}
+```
+
+Returns the scheduled request.
+
+A similar call should be added for the tenant.
+
+It would be great to have some way of subscribing to the results (apart from
+looking at logs/metrics).
+
+Migration is executed as described above. One subtlety is that the (local) deletion
+on the source safekeeper might fail, which is not a problem if we are going to
+decommission the node, but leaves garbage otherwise. I'd propose in the first version to
+1) not attempt the deletion at all if the node status is `offline`;
+2) just issue a warning if it failed.
+Additionally, add a PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would
+find and remove garbage timelines, for manual use. It will 1) list all timelines on the
+safekeeper 2) compare each one against the configuration storage: if the timeline
+doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can
+be deleted under the generation number if the node is not a member of the current
+generation.
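+
+A sketch of that scrub decision per timeline found on the safekeeper
+(illustrative stand-ins, not real storcon types):
+
+```rust
+// Illustrative stand-ins.
+type NodeId = u64;
+struct Configuration {
+    sk_set: Vec<NodeId>,
+    new_sk_set: Option<Vec<NodeId>>,
+}
+
+/// Sketch: is a timeline found on safekeeper `node` garbage? `stored` is the
+/// configuration fetched from the configuration storage, if any.
+fn is_garbage(stored: Option<&Configuration>, node: NodeId) -> bool {
+    match stored {
+        // The timeline no longer exists anywhere: it was deleted, so the
+        // local copy is garbage.
+        None => true,
+        // Otherwise deletable only if the node is not a member of the current
+        // configuration (the actual deletion runs under its generation number).
+        Some(conf) => {
+            !(conf.sk_set.contains(&node)
+                || conf.new_sk_set.as_ref().is_some_and(|s| s.contains(&node)))
+        }
+    }
+}
+```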
+
+Automating this is non-trivial; we'd need to register all potentially missing
+deletions in the same transaction which switches configurations. Similarly, when a
+timeline is fully deleted, the deletion should also be registered, to prevent the
+cplane operation from blocking when some safekeeper is not available.
+
+One more task pool should infinitely retry notifying the control plane about changed
+safekeeper sets.
+
+3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return
+   the current in-memory state of the timeline and the pending `MigrationRequest`,
+   if any.
+
+4) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate_abort`
+   tries to abort the migration by switching the configuration from the joint one to
+   the one with the (previous) `sk_set` under CAS (incrementing the generation as
+   always).
+
+#### Dealing with multiple instances of storage_controller
+
+The operations described above, executed concurrently, might create some errors but do
+not prevent progress, so while we normally don't want to run multiple instances
+of storage_controller, it is fine to have that temporarily, e.g. during a redeploy.
+
+Any interaction with the db updates the in-memory controller state, e.g. if a migration
+request failed because a different one is in progress, the controller remembers that
+and tries to finish it.
+
+## Testing
+
+`neon_local` should be switched to use storage_controller, playing the role of the
+control plane.
+
+There should be the following layers of tests:
+1) A model checked TLA+ spec specifies the algorithm and verifies its basic safety.
+
+2) To cover the real code and at the same time test many schedules we should have
+   simulation tests. For that, the configuration storage, storage_controller <->
+   safekeeper communication and pull_timeline need to be mocked, and the main switch
+   procedure wrapped as a node (thread) in the simulation tests, using these
+   mocks. The test would inject migrations like it currently injects
+   safekeeper/walproposer restarts. The main assert is the same -- committed WAL must
+   not be lost.
+
+3) Since simulation testing injects at relatively high level points (not
+   syscalls), it omits some code, in particular `pull_timeline`. Thus it is
+   better to have basic tests covering the whole system as well. An extended version
+   of `test_restarts_under_load` would do: start background load and do a migration
+   under it, then restart the endpoint and check that no reported commits
+   have been lost. I'd also add one more creating the classic network split scenario,
+   with one compute talking to AC and another to BD while migration from nodes ABC
+   to ABD happens.
+
+4) A simple e2e test should ensure that the full flow including cplane notification
+   works.
+
+## Order of implementation and rollout
+
+Note that
+- Control plane parts and integration with it are fully independent from everything
+  else (tests would use simulation and neon_local).
+- There is a lot of infra work making storage_controller aware of timelines and
+  safekeepers, and its impl/rollout should be separate from migration itself.
+- Initially walproposer can just stop working while it observes a joint configuration.
+  Such a window would typically be very short anyway.
+
+To roll out smoothly, both walproposer and safekeeper should have a flag
+`configurations_enabled`; when set to false, they would work as currently, i.e.
+walproposer is able to commit on whatever safekeeper set it is provided.
Until all timelines are managed by storcon, we'd need to use the current script to
+migrate and update/drop entries in the storage_controller database if it has any.
+
+Safekeepers would need to be able to talk both the current and the new protocol
+version with the compute, to reduce the number of computes restarted in prod once the
+new (v3) protocol is deployed (though before completely switching we'd need to force
+this).
+
+Let's have the following rollout order:
+- storage_controller becomes aware of safekeepers;
+- storage_controller gets timeline creation for new timelines and deletion requests,
+  but doesn't manage all timelines yet. Migration can be tested on these new timelines.
+  To keep the control plane and storage_controller databases in sync while the control
+  plane still chooses the safekeepers initially (until all timelines are imported
+  it can choose better), `TimelineCreateRequest` can get an optional safekeepers
+  field with the safekeepers chosen by cplane.
+- Then we can import all existing timelines from the control plane to
+  storage_controller and gradually enable configurations region by region.
+
+
+A very rough implementation order:
+- Add the concept of configurations to safekeepers (including the control file),
+  implement the v3 protocol.
+- Implement walproposer changes, including the protocol.
+- Implement the storcon part. Use it in neon_local (and pytest).
+- Make cplane store safekeepers per timeline instead of per tenant.
+- Implement cplane/storcon integration. Route branch creation/deletion
+  through storcon. Then we can test migration of new branches.
+- Finally, import existing branches. Then we can drop the cplane
+  safekeeper selection code. Gradually enable configurations at
+  computes and safekeepers. Before that, all computes must talk only
+  the v3 protocol version.
+
+## Integration with evicted timelines
+
+Currently, `pull_timeline` doesn't work correctly with evicted timelines, because the
+copy would point to the original partial file. To fix this, let's just do an s3 copy
+of the file. It is a bit stupid, as it is generally unnecessary work, but it makes
+sense to implement proper migration before doing smarter timeline archival. [Issue](https://github.com/neondatabase/neon/issues/8542)
+
+## Possible optimizations
+
+The steps above imply walproposer restart (with re-election) and thus reconnection
+to safekeepers. Since by bumping the term on the new majority we ensure that leader
+terms are unique even across generation switches, it is possible to preserve
+connections. However, it is more complicated; reconnection is very fast, and it
+is much more important to avoid compute restarts than a millisecond order of write
+stall.
+
+Multiple joint consensus: the algorithm above rejects an attempt to change membership
+while another attempt is in progress. It is possible to overlay them, and AFAIK
+Aurora does this, but similarly I don't think this is needed.
+
+## Misc
+
+We should use the compute <-> safekeeper protocol change to include other (long
+yearned for) modifications:
+- send data in network order to make arm work.
+- remove term_start_lsn from AppendRequest +- add horizon to TermHistory +- add to ProposerGreeting number of connection from this wp to sk diff --git a/docs/rfcs/035-timeline-archive.md b/docs/rfcs/035-timeline-archive.md new file mode 100644 index 0000000000..c834216962 --- /dev/null +++ b/docs/rfcs/035-timeline-archive.md @@ -0,0 +1,507 @@ +# Timeline Archival + +## Summary + +This RFC describes a mechanism for pageservers to eliminate local storage + compute work +for timelines which are not in use, in response to external API calls to "archive" a timeline. + +The archived state roughly corresponds to fully offloading a timeline to object storage, such +that its cost is purely the cost of that object storage. + +## Motivation + +Archived timelines serve multiple purposes: +- Act as a 'snapshot' for workloads that would like to retain restorable copies of their + database from longer ago than their PITR window. +- Enable users to create huge numbers of branches (e.g. one per github PR) without having + to diligently clean them up later to avoid overloading the pageserver (currently we support + up to ~500 branches per tenant). + +### Prior art + +Most storage and database systems have some form of snapshot, which can be implemented several ways: +1. full copies of data (e.g. an EBS snapshot to S3) +2. shallow snapshots which are CoW relative to the original version of the data, e.g. on a typical NFS appliance, or a filesystem like CephFS. +3. a series of snapshots which are CoW or de-duplicated relative to one another. + +Today's Neon branches are approximately like `2.`, although due to implementation details branches +often end up storing much more data than they really need, as parent branches assume that all data +at the branch point is needed. The layers pinned in the parent branch may have a much larger size +than the physical size of a compressed image layer representing the data at the branch point. + +## Requirements + +- Enter & exit the archived state in response to external admin API calls +- API calls to modify the archived state are atomic and durable +- An archived timeline should eventually (once out of PITR window) use an efficient compressed + representation, and avoid retaining arbitrarily large data in its parent branch. +- Remote object GETs during tenant start may be O(N) with the number of _active_ branches, + but must not scale with the number of _archived_ branches. +- Background I/O for archived branches should only be done a limited number of times to evolve them + to a long-term-efficient state (e.g. rewriting to image layers). There should be no ongoing "housekeeping" + overhead for archived branches, including operations related to calculating sizes for billing. +- The pageserver should put no load on the safekeeper for archived branches. +- Performance of un-archiving a branch must make good use of S3/disk bandwidth to restore the branch + to a performant state in a short time (linear with the branch's logical size) + +## Non Goals + +- Archived branches are not a literal `fullbackup` postgres snapshot: they are still stored + in Neon's internal format. +- Compute cold starts after activating an archived branch will not have comparable performance to + cold starts on an active branch. +- Archived branches will not use any new/additional compression or de-duplication beyond what + is already implemented for image layers (zstd per page). 
+- The pageserver will not "auto start" archived branches in response to page_service API requests: they + are only activated explicitly via the HTTP API. +- We will not implement a total offload of archived timelines from safekeepers: their control file (small) will + remain on local disk, although existing eviction mechanisms will remove any segments from local disk. +- We will not expose any prometheus metrics for archived timelines, or make them visible in any + detailed HTTP APIs other than the specific API for listing archived timelines. +- A parent branch may not be archived unless all its children are. + +## Impacted Components + +pageserver, storage controller + +## Terminology + +**Archived**: a branch is _archived_ when an HTTP API request to archive it has succeeded: the caller +may assume that this branch is now very cheap to store, although this may not be physically so until the +branch proceeds to the offloaded state. + +**Active** branches are branches which are available for use by page_service clients, and have a relatively +high cost due to consuming local storage. + +**Offloaded** branches are a subset of _archived_ branches, which have had their local state removed such +that they now consume minimal runtime resources and have a cost similar to the cost of object storage. + +**Activate** (verb): transition from Archived to Active + +**Archive** (verb): transition from Active to Archived + +**Offload** (verb): transition from Archived to Offloaded + +**Offload manifest**: an object stored in S3 that describes timelines which pageservers do not load. + +**Warm up** (verb): operation done on an active branch, by downloading its active layers. Once a branch is +warmed up, good performance will be available to page_service clients. + +## Implementation + +### High level flow + +We may think of a timeline which is archived and then activated as proceeding through a series of states: + +```mermaid +stateDiagram + [*] --> Active(warm) + Active(warm) --> Archived + Archived --> Offloaded + Archived --> Active(warm) + Offloaded --> Active(cold) + Active(cold) --> Active(warm) +``` + +Note that the transition from Archived to Active(warm) is expected to be fairly rare: the most common lifecycles +of branches will be: +- Very frequent: Short lived branches: Active -> Deleted +- Frequent: Long-lived branches: Active -> Archived -> Offloaded -> Deleted +- Rare: Branches used to restore old state: Active ->Archived -> Offloaded -> Active + +These states are _not_ all stored as a single physical state on the timeline, but rather represent the combination +of: +- the timeline's lifecycle state: active or archived, stored in the timeline's index +- its offload state: whether pageserver has chosen to drop local storage of the timeline and write it into the + manifest of offloaded timelines. +- cache state (whether it's warm or cold). + +### Storage format changes + +There are two storage format changes: +1. `index_part.json` gets a new attribute `state` that describes whether the timeline is to + be considered active or archived. +2. A new tenant-level _manifest_ object `tenant_manifest-v1.json` describes which timelines a tenant does not need to load + at startup (and is available for storing other small, rarely changing tenant-wide attributes in future) + +The manifest object will have a format like this: +``` +{ + "offload_timelines": [ + { + "timeline_id": ... + "last_record_lsn": ... + "last_record_lsn_time": ... + "pitr_interval": ... + "last_gc_lsn": ... 
# equal to last_record_lsn if this branch has no history (i.e. a snapshot)
+      "logical_size": ...  # The size at last_record_lsn
+      "physical_size": ...
+      "parent": Option<{
+        "timeline_id": ...
+        "lsn": ...  # Branch point LSN on the parent
+        "requires_data": bool  # True if this branch depends on layers in its parent
+      }>
+    }
+  ]
+}
+```
+
+The information about a timeline in its offload state is intentionally minimal: just enough to decide:
+- Whether it requires [archive optimization](#archive-branch-optimization-flattening) by rewriting as a set of image layers: we may infer this
+  by checking if now > last_record_lsn_time + pitr_interval, and pitr_lsn < last_record_lsn.
+- Whether a parent branch should include this offloaded branch in its GC inputs to avoid removing
+  layers that the archived branch depends on
+- Whether requests to delete this `timeline_id` should be executed (i.e. if a deletion request
+  is received for a timeline_id that isn't in the set of live `Timelines` or in the manifest, then
+  we don't need to go to S3 for the deletion.)
+- How much archived space to report in consumption metrics
+
+The contents of the manifest's offload list will also be stored as an attribute of `Tenant`, such that the total
+set of timelines may be found by the union of `Tenant::timelines` (non-offloaded timelines) and `Tenant::offloaded`
+(offloaded timelines).
+
+For split-brain protection, the manifest object will be written with a generation suffix, in the same way as
+index_part objects are (see [generation numbers RFC](025-generation-numbers.md)). This will add some complexity, but
+give us total safety against two pageservers with the same tenant attached fighting over the object. Existing code
+for finding the latest generation and for cleaning up old generations (in the scrubber) will be generalized to cover
+the manifest file.
+
+### API & Timeline state
+
+Timelines will store a lifecycle state (enum of Active or Archived) in their IndexPart. This will
+be controlled by a new per-timeline `configure` endpoint. This is intentionally generic naming, which
+may be used in future to control other per-timeline attributes (e.g. in future we may make PITR interval
+a per-timeline configuration).
+
+`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configure`
+```
+{
+    'state': 'active|archive'
+}
+```
+
+When archiving a timeline, this API will complete as soon as the timeline's state has been set in index_part, and that index has been uploaded.
+
+When activating a timeline, this API will complete as soon as the timeline's state has been set in index_part,
+**and** the `Timeline` object has been instantiated and activated. This will require reading the timeline's
+index, but not any data: it should be about as fast as a couple of small S3 requests.
+
+The API will be available with an identical path via the storage controller: calling this on a sharded tenant
+will simply map the API call to all the shards.
+
+Archived timelines may never have descendant timelines which are active. This will be enforced at the API level,
+such that activating a timeline requires that all its ancestors are active, and archiving a timeline requires
+that all its descendants are archived. It is the caller's responsibility to walk the hierarchy of timelines
+in the proper order if they would like to archive whole trees of branches.
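+
+For example, a caller archiving a whole subtree could walk it depth-first,
+archiving children before parents. A sketch, with illustrative stand-ins for
+the client types and the `anyhow` crate assumed:
+
+```rust
+// Illustrative stand-ins.
+type TimelineId = u128;
+enum State { Active, Archived }
+trait Client {
+    fn children_of(&self, tl: TimelineId) -> anyhow::Result<Vec<TimelineId>>;
+    fn configure(&self, tl: TimelineId, state: State) -> anyhow::Result<()>;
+}
+
+/// Sketch: archive `root` and its whole subtree, deepest-first, so the API
+/// rule "a timeline may be archived only once all its descendants are
+/// archived" holds at each call.
+fn archive_subtree(client: &dyn Client, root: TimelineId) -> anyhow::Result<()> {
+    for child in client.children_of(root)? {
+        archive_subtree(client, child)?;
+    }
+    // All descendants are archived now; this call is therefore legal.
+    client.configure(root, State::Archived)
+}
+```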
+
+Because archived timelines will be excluded from the usual timeline listing APIs, a new API specifically
+for archived timelines will be added: this is for use in support/debug:
+
+```
+GET /v1/tenants/{tenant_id}/archived_timelines
+
+{
+    ...same per-timeline content as the tenant manifest...
+}
+
+```
+
+### Tenant attach changes
+
+Currently, during Tenant::spawn we list all the timelines in the S3 bucket, and then for each timeline
+we load its index_part.json. To avoid the number of GETs scaling linearly with the number of archived
+timelines, we must have a single object that tells us which timelines do not need to be loaded. The
+number of ListObjects requests while listing timelines will still scale O(N), but this is less problematic
+because each request covers 1000 timelines.
+
+This is **not** literally the same as the set of timelines that have state=archived. Rather, it is
+the set of timelines which have been offloaded in the background after their state was set to archived.
+
+We may simply skip loading these timelines: there will be no special state of `Timeline`, they just won't
+exist from the perspective of an active `Tenant`, apart from in deletion: timeline deletion will need
+to check for offloaded timelines as well as active timelines, to avoid wrongly returning 404 on trying
+to delete an offloaded timeline.
+
+### Warm-up API
+
+`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/download?wait_ms=1234`
+
+This API will be similar to the existing `download_remote_layers` API, but smarter:
+- It will not download _all_ remote layers, just the visible set (i.e. layers needed for a read)
+- It will download layers in the visible set until reaching `wait_ms`, then return a struct describing progress
+  of downloads, so that the caller can poll.
+
+The _visible set_ mentioned above will be calculated by the pageserver in the background, by taking the set
+of readable LSNs (i.e. branch points and heads of branches), and walking the layer map to work out which layers
+can possibly be read from these LSNs. This concept of layer visibility is more generally useful for cache
+eviction and heatmaps, as well as in this specific case of warming up a timeline.
+
+The caller does not have to wait for the warm-up API, or call it at all. But it is strongly advised
+to call it, because otherwise populating local contents for a timeline can take a long time when waiting
+for SQL queries to coincidentally hit all the layers, and during that time query latency remains quite
+volatile.
+
+### Background work
+
+Archived branches are not subject to normal compaction. Instead, when the compaction loop encounters
+an archived branch, it will consider rewriting the branch to just image layers if the branch has no history
+([archive branch optimization](#archive-branch-optimization-flattening)), or offloading the timeline from local disk
+if its state permits that.
+
+Additionally, the tenant compaction task will walk the state of already offloaded timelines to consider
+optimizing their storage, e.g. if a timeline had some history when offloaded, but since then its PITR
+has elapsed and it can now be rewritten to image layers.
+
+#### Archive branch offload
+
+Recall that when we archive a timeline via the HTTP API, this only sets a state: it doesn't do
+any actual work.
+
+This work is done in the background compaction loop. It makes sense to tack this work onto the compaction
+loop, because it is spiritually aligned: offloading data for archived branches improves storage efficiency.
+
+The condition for offload is simple:
+ - a `Timeline` object exists with state `Archived`
+ - the timeline does not have any non-offloaded children.
+
+Regarding the condition that children must be offloaded, this will always eventually be true, because
+we enforce at the API level that children of archived timelines must themselves be archived, and all
+archived timelines will eventually be offloaded.
+
+Offloading a timeline is simple:
+- Read the timeline's attributes that we will store in its offloaded state (especially its logical size)
+- Call `shutdown()` on the timeline and remove it from the `Tenant` (as if we were about to delete it)
+- Erase all the timeline's content from local storage (`remove_dir_all` on its path)
+- Write the tenant manifest to S3 to prevent this timeline being loaded on next start.
+
+#### Archive branch optimization (flattening)
+
+When we offloaded a branch, it might have had some history that prevented rewriting it to a single
+point-in-time set of image layers. For example, a branch might have several days of writes and a 7
+day PITR: when we archive it, it still has those days of history.
+
+Once the PITR has expired, we have an opportunity to reduce the physical footprint of the branch by:
+- Writing compressed image layers within the archived branch, as these are more efficient as a way of storing
+  a point in time compared with delta layers
+- Updating the branch's offload metadata to indicate that this branch no longer depends on its ancestor
+  for data, i.e. the ancestor is free to GC layer files at+below the branch point
+
+Fully compacting an archived branch into image layers at a single LSN may be thought of as *flattening* the
+branch, such that it is now a one-dimensional keyspace rather than a two-dimensional key/lsn space. It becomes
+a true snapshot at that LSN.
+
+It is not always more efficient to flatten a branch than to keep some extra history on the parent: this
+is described in more detail in [optimizations](#delaying-storage-optimization-if-retaining-parent-layers-is-cheaper).
+
+Archive branch optimization should be done _before_ background offloads during compaction, because there may
+be timelines which are ready to be offloaded but would also benefit from the optimization step before
+being offloaded. For example, a branch which has already fallen out of the PITR window and has no history
+of its own may be immediately re-written as a series of image layers before being offloaded.
+
+### Consumption metrics
+
+Archived timelines and offloaded timelines will be excluded from the synthetic size calculation, in anticipation
+that billing structures based on consumption metrics are highly likely to apply different $/GB rates to archived
+vs. ordinary content.
+
+Archived and offloaded timelines' logical size will be reported under the existing `timeline_logical_size`
+variant of `MetricsKey`: receivers are then free to bill on this metric as they please.
+
+### Secondary locations
+
+Archived timelines (including offloaded timelines) will be excluded from heatmaps, and thereby,
+when a timeline is archived, after the next cycle of heatmap upload & secondary download, its contents
+will be dropped from secondary locations.
+
+### Sharding
+
+Archiving or activating a timeline will be done symmetrically across all shards in a tenant, in
+the same way that timeline creation and deletion are done.
There are no special rules about ordering: +the storage controller may dispatch concurrent calls to all shards when archiving or activating a timeline. + +Since consumption metrics are only transmitted from shard zero, the state of archival on this shard +will be authoritative for consumption metrics. + +## Error cases + +### Errors in sharded tenants + +If one shard in a tenant fails an operation but others succeed, the tenant may end up in a mixed +state, where a timeline is archived on some shards but not on others. + +We will not bother implementing a rollback mechanism for this: errors in archiving/activating a timeline +are either transient (e.g. S3 unavailable, shutting down), or the fault of the caller (NotFound, BadRequest). +In the transient case callers are expected to retry until success, or to make appropriate API calls to clear +up their mistake. We rely on this good behavior of callers to eventually get timelines into a consistent +state across all shards. If callers do leave a timeline in an inconsistent state across shards, this doesn't +break anything, it's just "weird". + +This is similar to the status quo for timeline creation and deletion: callers are expected to retry +these operations until they succeed. + +### Archiving/activating + +Archiving/activating a timeline can fail in a limited number of ways: +1. I/O error storing/reading the timeline's updated index + - These errors are always retryable: a fundamental design assumption of the pageserver is that remote + storage errors are always transient. +2. NotFound if the timeline doesn't exist + - Callers of the API are expected to avoid calling deletion and archival APIs concurrently. + - The storage controller has runtime locking to prevent races such as deleting a timeline while + archiving it. +3. BadRequest if the rules around ancestors/descendents of archived timelines would be violated + - Callers are expected to do their own checks to avoid hitting this case. If they make + a mistake and encounter this error, they should give up. + +### Offloading + +Offloading can only fail if remote storage is unavailable, which would prevent us from writing the +tenant manifest. In such error cases, we give up in the expectation that offloading will be tried +again at the next iteration of the compaction loop. + +### Archive branch optimization + +Optimization is a special form of compaction, so can encounter all the same errors as regular compaction +can: it should return Result<(), CompactionError>, and as with compaction it will be retried on +the next iteration of the compaction loop. + +## Optimizations + +### Delaying storage optimization if retaining parent layers is cheaper + +Optimizing archived branches to image layers and thereby enabling parent branch GC to progress +is a safe default: archived branches cannot over-fill a pageserver's local disk, and once they +are offloaded to S3 they're totally safe, inert things. + +However, in some cases it can be advantageous to retain extra history on their parent branch rather +than flattening the archived branch. For example, if a 1TB parent branch is rather slow-changing (1GB +of data per day), and archive branches are being created nightly, then writing out full 1TB image layers +for each nightly branch is inefficient compared with just keeping more history on the main branch. 
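+
+As a worked example, over a 30-day window (the window length is illustrative): flattening a
+nightly archived branch of that 1TB parent writes roughly 30 × 1TB = 30TB of image layers,
+while retaining the same 30 days of history on the parent costs roughly 30 × 1GB = 30GB of
+extra delta layers, plus whatever image layers the parent's compaction writes anyway.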
+
+Getting this right requires consideration of:
+- Compaction: if keeping more history on the main branch is going to prompt the main branch's compaction to
+  write out extra image layers, then it might make more sense to just write out the image layers on
+  the archived branch.
+- Metadata bloat: keeping extra history on a parent branch doesn't just cost GB of storage, it makes
+  the layer map (and index_part) bigger. There are practical limits beyond which writing an indefinitely
+  large layer map can cause problems elsewhere.
+
+This optimization can probably be implemented quite cheaply with some basic heuristics like:
+- don't bother doing optimization on an archive branch if the LSN distance between
+  its branch point and the end of the PITR window is <5% of the logical size of the archive branch
+- ...but don't keep more history on the main branch than double the PITR
+
+### Creating a timeline in archived state (a snapshot)
+
+Sometimes, one might want to create a branch with no history, which will not be written to
+before it is archived. This is a snapshot, although we do not require a special snapshot API,
+since a snapshot can be represented as a timeline with no history.
+
+This can be accomplished by simply creating a timeline and then immediately archiving it, but
+that is somewhat wasteful: the timeline will spin up various tasks and open a connection to the storage
+broker to try to ingest WAL, before being shut down in the subsequent archival call. To explicitly
+support this common special case, we may add a parameter to the timeline creation API which
+creates a timeline directly into the archived state.
+
+Such a timeline creation will do exactly two I/Os at creation time:
+- write the index_part object to record the timeline's existence
+- when the timeline is offloaded in the next iteration of the compaction loop (~20s later),
+  write the tenant manifest.
+
+Later, when the timeline falls off the end of the PITR interval, the usual offload logic will wake
+up the 'snapshot' branch and write out image layers.
+
+## Future Work
+
+### Enabling `fullbackup` dumps from archive branches
+
+It would be useful to be able to export an archive branch to another system, or for use in a local
+postgres database.
+
+This could be implemented as a general capability for all branches, in which case it would "just work"
+for archive branches by activating them. However, downloading all the layers in a branch just to generate
+a fullbackup is a bit inefficient: we could implement a special case for flattened archived branches
+which streams image layers from S3 and outputs the fullbackup stream without writing the layers out to disk.
+
+Implementing `fullbackup` is a bit more complicated than this because of sharding, but solving that problem
+is unrelated to the topic of archived branches (it probably involves having each shard write out a fullbackup
+stream to S3 in an intermediate format, and then having one node stitch them together).
+
+### Tagging layers from archived branches
+
+When we know a layer is an image layer written for an archived branch that has fallen off the PITR window,
+we may add tags to the S3 objects to enable writing lifecycle policies that transition such layers to even
+cheaper storage.
+
+This could be done for all archived layers, or it could be driven by the archival API, to give the pageserver
+external hints on which branches are likely to be reactivated, and which branches are good candidates for
+tagging for low-performance storage.
+
+Tagging+lifecycles is just one mechanism: one might also directly use S3 storage classes. Other clouds' object
+stores have similar mechanisms.
+
+### Storing sequences of archive branches as deltas
+
+When archived branches are used as scheduled snapshots, we could store them even more efficiently
+by encoding them as deltas relative to each other (i.e. for nightly snapshots, when we do the
+storage optimization for Tuesday's snapshot, we would read Monday's snapshot and store only the modified
+pages). This is the kind of encoding that many backup storage systems use.
+
+The utility of this depends a lot on the churn rate of the data, and the cost of doing the delta encoding
+vs. just writing out a simple stream of the entire database. For smaller databases, writing out a full
+copy is pretty trivial (e.g. writing a compressed copy of a 10GiB database to S3 can take under 10 seconds,
+so the complexity tradeoff of diff-encoding it is dubious).
+
+One does not necessarily have to read back the previous snapshot in order to encode the next one: if the
+pageserver knows about the schedule, it can intentionally retain extra history on the main branch so that
+we can say: "A branch exists from Monday night. I have Monday night's data still active in the main branch,
+so now I can read at the Monday LSN and the Tuesday LSN, calculate the delta, and store it as Tuesday's
+delta snapshot".
+
+Clearly this all requires careful housekeeping to retain the relationship between branches that depend on
+each other: perhaps this would be done by making the archive branches have child/parent relationships with
+each other, or perhaps we would permit them to remain children of their original parent, but additionally
+have a relationship with the snapshot they're encoded relative to.
+
+Activating a branch that is diff-encoded may require activating several earlier branches too, so figuring
+out how frequently to write a full copy is important. This is essentially a zoomed-out version of what
+we do with delta layers and image layers within a timeline, except each "layer" is a whole timeline.
+
+
+## FAQ/Alternatives
+
+### Store all timelines in the tenant manifest
+
+Rather than special-casing offloaded timelines in the offload manifest, we could store a total
+manifest of all timelines, eliminating the need for the pageserver to list timelines in S3 on
+startup.
+
+That would be a more invasive change (it would require hooking into timeline creation), and would
+generate much more I/O to this manifest for tenants that had many branches _and_ frequent
+create/delete cycles for short-lived branches. Restricting the manifest to offloaded timelines
+means that we only have to cope with the rate at which long-lived timelines are archived, rather
+than the rate at which short-lived timelines are created & destroyed.
+
+### Automatically archiving/activating timelines without external API calls
+
+We could implement TTL-driven offload of timelines, waking them up when a page request
+arrives.
+
+This has downsides:
+- Opacity: if we do TTL-driven offload inside the pageserver, then the end user doesn't
+  know which of their branches are in this state, and might get a surprise when they try
+  to use such a branch.
+- Price fluctuation: if the archival of a branch is used in end user pricing, then users
+  prefer clarity & consistency. Ideally a branch's storage should cost the same from the moment it
+  is created, rather than having a usage-dependent storage price.
+- Complexity: enabling the page service to call up into the Tenant to activate a timeline + would be awkward, compared with an external entry point. + +### Make offloaded a state of Timeline + +To reduce the operator-facing complexity of having some timelines APIs that only return +non-offloaded timelines, we could build the offloaded state into the Timeline type. + +`timeline.rs` is already one of the most egregiously long source files in the tree, so +this is rejected on the basis that we need to avoid making that complexity worse. \ No newline at end of file diff --git a/docs/rfcs/036-physical-replication.md b/docs/rfcs/036-physical-replication.md new file mode 100644 index 0000000000..41aced0545 --- /dev/null +++ b/docs/rfcs/036-physical-replication.md @@ -0,0 +1,265 @@ +# Physical Replication + +This RFC is a bit special in that we have already implemented physical +replication a long time ago. However, we never properly wrote down all +the decisions and assumptions, and in the last months when more users +have started to use the feature, numerous issues have surfaced. + +This RFC documents the design decisions that have been made. + +## Summary + +PostgreSQL has a feature called streaming replication, where a replica +streams WAL from the primary and continuously applies it. It is also +known as "physical replication", to distinguish it from logical +replication. In PostgreSQL, a replica is initialized by taking a +physical backup of the primary. In Neon, the replica is initialized +from a slim "base backup" from the pageserver, just like a primary, +and the primary and the replicas connect to the same pageserver, +sharing the storage. + +There are two kinds of read-only replicas in Neon: +- replicas that follow the primary, and +- "static" replicas that are pinned at a particular LSN. + +A static replica is useful e.g. for performing time-travel queries and +running one-off slow queries without affecting the primary. A replica +that follows the primary can be used e.g. to scale out read-only +workloads. + +## Motivation + +Read-only replicas allow offloading read-only queries. It's useful for +isolation, if you want to make sure that read-only queries don't +affect the primary, and it's also an easy way to provide guaranteed +read-only access to an application, without having to mess with access +controls. + +## Non Goals (if relevant) + +This RFC is all about WAL-based *physical* replication. Logical +replication is a different feature. + +Neon also has the capability to launch "static" read-only nodes which +do not follow the primary, but are pinned to a particular LSN. They +can be used for long-running one-off queries, or for Point-in-time +queries. They work similarly to read replicas that follow the primary, +but some things are simpler: there are no concerns about cache +invalidation when the data changes on the primary, or worrying about +transactions that are in-progress on the primary. + +## Impacted components (e.g. pageserver, safekeeper, console, etc) + +- Control plane launches the replica +- Replica Postgres instance connects to the safekeepers, to stream the WAL +- The primary does not know about the standby, except for the hot standby feedback +- The primary and replicas all connect to the same pageservers + + +# Context + +Some useful things to know about hot standby and replicas in +PostgreSQL. + +## PostgreSQL startup sequence + +"Running" and "start up" terms are little imprecise. PostgreSQL +replica startup goes through several stages: + +1. 
First, the process is started up, and various initialization steps
+   are performed, like initializing shared memory. If you try to
+   connect to the server in this stage, you get an error: ERROR: the
+   database system is starting up. This stage happens very quickly.
+
+2. Then the server reads the checkpoint record from the WAL and starts
+   the WAL replay from the checkpoint. This works differently
+   in Neon: we start the WAL replay at the basebackup LSN, not from a
+   checkpoint! If you connect to the server in this state, you get an
+   error: ERROR: the database system is not yet accepting
+   connections. We proceed to the next stage when the WAL replay sees
+   a running-xacts record. Or in Neon, the "CLOG scanning" mechanism
+   can allow us to move directly to the next stage, with all the caveats
+   listed in this RFC.
+
+3. When the running-xacts information is established, the server
+   starts to accept connections normally.
+
+From PostgreSQL's point of view, the server is already running in
+stage 2, even though it's not accepting connections yet. Our
+`compute_ctl` does not consider it as running until stage 3. If the
+transition from stage 2 to 3 doesn't happen fast enough, the control
+plane will mark the start operation as failed.
+
+
+## Decisions, Issues
+
+### Cache invalidation in replica
+
+When a read replica follows the primary in PostgreSQL, it needs to
+stream all the WAL from the primary and apply all the records, to keep
+the local copy of the data consistent with the primary. In Neon, the
+replica can fetch the updated page versions from the pageserver, so
+it's not necessary to apply all the WAL. However, it needs to ensure
+that any pages that are currently in the Postgres buffer cache, or the
+Local File Cache, are either updated or thrown away, so that the next
+read of the page will fetch the latest version.
+
+We choose to apply the WAL records for pages that are already in the
+buffer cache, and skip records for other pages. Somewhat arbitrarily,
+we also apply records affecting catalog relations, fetching the old
+page version from the pageserver first if necessary. See the
+`neon_redo_read_buffer_filter()` function.
+
+The replica wouldn't necessarily need to see all the WAL records, only
+the records that apply to cached pages. For simplicity, we do stream
+all the WAL to the replica, and the replica simply ignores WAL records
+that require no action.
+
+Like in PostgreSQL, the read replica maintains a "replay LSN", which
+is the LSN up to which the replica has received and replayed the
+WAL. The replica can lag behind the primary if it cannot quite keep
+up, or if a long-running query conflicts with changes
+that are about to be applied, or even intentionally if the user wishes
+to see delayed data (see recovery_min_apply_delay). It's important
+that the replica sees a consistent view of the whole cluster at the
+replay LSN when it's lagging behind.
+
+In Neon, the replica connects to a safekeeper to get the WAL
+stream. That means that the safekeepers must be able to regurgitate
+the original WAL as far back as the replay LSN of any running read
+replica. (A static read-only node that does not follow the primary
+does not require a WAL stream, however.) The primary does not need to
+be running, and when it is, the replicas don't incur any extra
+overhead on the primary (see hot standby feedback though).
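+
+A minimal sketch of that filtering decision (the real implementation is the C function
+`neon_redo_read_buffer_filter()`; the parameters here are stand-ins):
+
+```rust
+/// Sketch only: must a replica apply this WAL record to a given page?
+/// `in_cache` covers both the Postgres buffer cache and the Local File Cache.
+fn must_apply(in_cache: bool, is_catalog_relation: bool) -> bool {
+    // Cached pages must be kept consistent; catalog pages are applied
+    // unconditionally, fetching the old version from the pageserver if needed.
+    in_cache || is_catalog_relation
+}
+```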
+
+### In-progress transactions
+
+In PostgreSQL, when a hot standby server starts up, it cannot
+immediately open up for queries (see [PostgreSQL startup
+sequence]). It first needs to establish a complete list of in-progress
+transactions, including subtransactions, that are running on the
+primary at the current replay LSN. Normally that happens quickly,
+when the replica sees a "running-xacts" WAL record, because the
+primary writes a running-xacts WAL record at every checkpoint, and in
+PostgreSQL the replica always starts the WAL replay from a checkpoint
+REDO point. (A shutdown checkpoint WAL record also implies that all
+the non-prepared transactions have ended.) If there are a lot of
+subtransactions in progress, however, the standby might need to wait
+for old transactions to complete before it can open up for queries.
+
+In Neon that problem is worse: a replica can start at any LSN, so
+there's no guarantee that it will see a running-xacts record any time
+soon. In particular, if the primary is not running when the replica is
+started, it might never see a running-xacts record.
+
+To make things worse, we initially missed this issue, and always
+started accepting queries at replica startup, even if it didn't have
+the transaction information. That could lead to incorrect query
+results and data corruption later. However, as we fixed that, we
+introduced a new problem compared to what we had before: previously
+the replica would always start up, but after fixing that bug, it might
+not. In a superficial way, the old behavior was better (but could lead
+to serious issues later!). That made fixing the bug very hard,
+because as we fixed it, we made things (superficially) worse for
+others.
+
+See https://github.com/neondatabase/neon/pull/7288 which fixed the
+bug, and follow-up PRs https://github.com/neondatabase/neon/pull/8323
+and https://github.com/neondatabase/neon/pull/8484 which try to claw back
+the cases that the fix made worse. As of this
+writing, there are still cases where a replica might not immediately
+start up, causing the control plane operation to fail; the remaining
+issues are tracked in https://github.com/neondatabase/neon/issues/6211.
+
+One long-term fix for this is to switch to using so-called CSN
+snapshots in the read replica. That would make it unnecessary to have the
+full in-progress transaction list in the replica at startup time. See
+https://commitfest.postgresql.org/48/4912/ for a work-in-progress
+patch to implement that upstream.
+
+Another thing we could do is to teach the control plane about the
+distinction between "starting up" and "running but haven't received
+running-xacts information yet", so that we could keep the replica
+waiting longer in that stage, and also give any client connections the
+same `ERROR: the database system is not yet accepting connections`
+error that you get in standalone PostgreSQL in that state.
+
+
+### Recovery conflicts and Hot standby feedback
+
+It's possible that a tuple version is vacuumed away in the primary,
+even though it is still needed by a running transaction in the
+replica. This is called a "recovery conflict", and PostgreSQL provides
+various options for dealing with it. By default, the WAL replay will
+wait up to 30 s for the conflicting query to finish. After that, it
+will kill the running query so that the WAL replay can proceed.
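+
+For reference, that timeout is the standard PostgreSQL setting shown below (30 s is the
+upstream default):
+
+```
+# postgresql.conf on the replica
+max_standby_streaming_delay = 30s  # how long WAL replay waits before cancelling conflicting queries
+```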
+
+Another way to avoid the situation is to enable the
+[`hot_standby_feedback`](https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-HOT-STANDBY-FEEDBACK)
+option. When it is enabled, the primary will refrain from vacuuming
+tuples that are still needed in the replica. That means potentially
+bloating the primary, which violates the usual rule that read replicas
+don't affect the operations on the primary, which is why it's off by
+default. We leave it to users to decide if they want to turn it on,
+same as PostgreSQL.
+
+Neon supports `hot_standby_feedback` by passing the feedback messages
+from the replica to the safekeepers, and from the safekeepers to the
+primary.
+
+### Relationship of settings between primary and replica
+
+In order to enter hot standby mode, some configuration options need to
+be set to the same or larger values in the standby, compared to the
+primary. See the [explanation in the PostgreSQL
+docs](https://www.postgresql.org/docs/current/hot-standby.html#HOT-STANDBY-ADMIN).
+
+In Neon, we have this problem too. To prevent customers from hitting
+it, the control plane automatically adjusts the settings of a replica
+so that they match or exceed the primary's settings (see
+https://github.com/neondatabase/cloud/issues/14903). However, you
+can still hit the issue if the primary is restarted with larger
+settings while the replica is running.
+
+
+### Interaction with Pageserver GC
+
+The read replica can lag behind the primary. If there are recovery
+conflicts or the replica cannot keep up for some reason, the lag can
+in principle grow indefinitely. The replica will issue all GetPage
+requests to the pageservers at the current replay LSN, and needs to
+see the old page versions.
+
+If the retention period in the pageserver is set to be small, it may
+have already garbage collected away the old page versions. That will
+cause read errors in the compute, and can mean that the replica cannot
+make progress with the replication anymore.
+
+There is a mechanism for the replica to pass information about its replay
+LSN to the pageserver, so that the pageserver refrains from GC'ing
+data that is still needed by the standby. It's called
+'standby_horizon' in the pageserver code, see
+https://github.com/neondatabase/neon/pull/7368. A separate "lease"
+mechanism is also in the works, where the replica could hold a lease
+on the old LSN, preventing the pageserver from advancing the GC
+horizon past that point. The difference is that the standby_horizon
+mechanism relies on a feedback message from replica to safekeeper,
+while the lease API is exposed directly from the pageserver. A static
+read-only node is not connected to safekeepers, so it cannot use the
+standby_horizon mechanism.
+
+
+### Synchronous replication
+
+We haven't put any effort into synchronous replication yet.
+
+PostgreSQL provides multiple levels of synchronicity. In the weaker
+levels, a transaction is not acknowledged as committed to the client
+on the primary until the WAL has been streamed to a replica or flushed
+to disk there. Those modes don't make sense in Neon, because the
+safekeepers handle durability.
+
+`synchronous_commit=remote_apply` mode would make sense. In that mode,
+the commit is not acknowledged to the client until it has been
+replayed in the replica. That ensures that after commit, you can see
+the commit in the replica too (aka read-your-writes consistency).
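+
+For illustration, in standalone PostgreSQL that mode is configured on the primary roughly
+like this (the standby name is made up):
+
+```
+synchronous_commit = remote_apply
+synchronous_standby_names = 'replica-1'
+```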
diff --git a/docs/rfcs/037-storage-controller-restarts.md b/docs/rfcs/037-storage-controller-restarts.md
new file mode 100644
index 0000000000..bad422344f
--- /dev/null
+++ b/docs/rfcs/037-storage-controller-restarts.md
@@ -0,0 +1,259 @@
+# Rolling Storage Controller Restarts
+
+## Summary
+
+This RFC describes the issues around the current storage controller restart procedure
+and describes an implementation which reduces downtime to a few milliseconds on the happy path.
+
+## Motivation
+
+Storage controller upgrades (restarts, more generally) can cause multi-second availability gaps.
+While the storage controller does not sit on the main data path, it's generally not acceptable
+to block management requests for extended periods of time (e.g. https://github.com/neondatabase/neon/issues/8034).
+
+### Current Implementation
+
+The storage controller runs in a Kubernetes Deployment configured for one replica and strategy set to [Recreate](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#recreate-deployment).
+In non-Kubernetes terms, during an upgrade the currently running storage controller is stopped and, only after,
+a new instance is created.
+
+At start-up, the storage controller calls into all the pageservers it manages (retrieved from the DB) to learn the
+latest locations of all tenant shards present on them. This is usually fast, but can push into tens of seconds
+under unfavourable circumstances, e.g. when pageservers are heavily loaded or unavailable.
+
+## Prior Art
+
+There are probably as many ways of handling restarts gracefully as there are distributed systems. Some examples include:
+* Active/Standby architectures: Two or more instances of the same service run, but traffic is only routed to one of them.
+For fail-over, traffic is routed to one of the standbys (which becomes active).
+* Consensus Algorithms (Raft, Paxos and friends): The part of consensus we care about here is leader election: peers communicate with each other
+and use a voting scheme that ensures the existence of a single leader (e.g. Raft terms).
+
+## Requirements
+
+* Reduce storage controller unavailability during upgrades to milliseconds
+* Minimize the interval in which it's possible for more than one storage controller
+to issue reconciles.
+* Have one uniform implementation for restarts and upgrades
+* Fit in with the current Kubernetes deployment scheme
+
+## Non Goals
+
+* Implement our own consensus algorithm from scratch
+* Completely eliminate storage controller downtime. Instead we aim to reduce it to the point where it looks
+like a transient error to the control plane
+
+## Impacted Components
+
+* storage controller
+* deployment orchestration (i.e. Ansible)
+* helm charts
+
+## Terminology
+
+* Observed State: in-memory mapping between tenant shards and their current pageserver locations - currently built up
+at start-up by querying pageservers
+* Deployment: Kubernetes [primitive](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) that models
+a set of replicas
+
+## Implementation
+
+### High Level Flow
+
+At a very high level, the proposed idea is to start a new storage controller instance while
+the previous one is still running, and cut over to it when it becomes ready. The new instance
+should coordinate with the existing one and transition responsibility gracefully.
While the controller
+has built-in safety against split-brain situations (via generation numbers), we'd like to avoid such
+scenarios since they can lead to availability issues for tenants that underwent changes while two controllers
+were operating at the same time, and require operator intervention to remedy.
+
+### Kubernetes Deployment Configuration
+
+On the Kubernetes configuration side, the proposal is to update the storage controller `Deployment`
+to use `spec.strategy.type = RollingUpdate`, `spec.strategy.rollingUpdate.maxSurge=1` and `spec.strategy.rollingUpdate.maxUnavailable=0`.
+Under the hood, Kubernetes creates a new replica set and adds one pod to it (`maxSurge=1`). The old replica set does not
+scale down until the new replica set has one replica in the ready state (`maxUnavailable=0`).
+
+The various possible failure scenarios are investigated in the [Handling Failures](#handling-failures) section.
+
+### Storage Controller Start-Up
+
+This section describes the primitives required on the storage controller side and the flow of the happy path.
+
+#### Database Table For Leader Synchronization
+
+A new table should be added to the storage controller database for leader synchronization during startup.
+This table will always contain at most one row. The proposed name for the table is `leader` and the schema
+contains two elements:
+* `hostname`: represents the hostname for the current storage controller leader - should be addressable
+from other pods in the deployment
+* `start_timestamp`: holds the start timestamp for the current storage controller leader (UTC timezone) - only required
+for failure case handling: see [Previous Leader Crashes Before New Leader Readiness](#previous-leader-crashes-before-new-leader-readiness)
+
+Storage controllers will read the leader row at start-up and then update it to mark themselves as the leader
+at the end of the start-up sequence. We want compare-and-exchange semantics for the update: avoid the
+situation where two concurrent updates succeed and overwrite each other. The default Postgres isolation
+level is `READ COMMITTED`, which isn't strict enough here. This update transaction should use at least the `REPEATABLE
+READ` isolation level in order to [prevent lost updates](https://www.interdb.jp/pg/pgsql05/08.html). Currently,
+the storage controller uses the stricter `SERIALIZABLE` isolation level for all transactions. This more than suits
+our needs here.
+
+```
+START TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+UPDATE leader SET hostname = <new hostname>, start_timestamp = <new start timestamp>
+WHERE hostname = <previous hostname> AND start_timestamp = <previous start timestamp>;
+COMMIT;
+```
+
+If the transaction fails, or if no rows have been updated, then the compare-and-exchange is regarded as a failure.
+
+#### Step Down API
+
+A new HTTP endpoint should be added to the storage controller: `POST /control/v1/step_down`. Upon receiving this
+request the leader cancels any pending reconciles and goes into a mode where it replies with 503 to all other APIs
+and does not issue any location configurations to its pageservers. The successful HTTP response will return a serialized
+snapshot of the observed state.
+
+If other step down requests come in after the initial one, the request is handled and the observed state is returned (required
+for failure scenario handling - see [Handling Failures](#handling-failures)).
+
+#### Graceful Restart Happy Path
+
+At start-up, the first thing the storage controller does is retrieve the sole row from the new
+`leader` table. If such an entry exists, send a `/step_down` POST API call to the current leader.
+This should be retried a few times with a short backoff (see [1]).
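+
+A sketch of that call (the endpoint path is the one defined above; the retry count, backoff,
+and string return type are illustrative, not final):
+
+```rust
+/// Ask the current leader to step down, returning its serialized
+/// observed-state snapshot if it responded.
+fn request_step_down(leader_hostname: &str) -> Option<String> {
+    let client = reqwest::blocking::Client::new();
+    let url = format!("http://{leader_hostname}/control/v1/step_down");
+    for _ in 0..3 {
+        match client.post(&url).send() {
+            Ok(resp) if resp.status().is_success() => return resp.text().ok(),
+            // The leader may be gone entirely (plain restart): back off briefly and retry.
+            _ => std::thread::sleep(std::time::Duration::from_millis(250)),
+        }
+    }
+    None
+}
+```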
+The aspiring leader loads the
+observed state into memory, and the start-up sequence proceeds as usual, but *without* querying the
+pageservers in order to build up the observed state.
+
+Before doing any reconciliations or persistence changes, update the `leader` database table as described in the [Database Table For Leader Synchronization](#database-table-for-leader-synchronization)
+section. If this step fails, the storage controller process exits.
+
+Note that no row will exist in the `leader` table for the first graceful restart. In that case, force-update the `leader` table
+(without the WHERE clause) and proceed with the pre-existing start-up procedure (i.e. build the observed state by querying pageservers).
+
+Summary of the proposed new start-up sequence:
+1. Call `/step_down`
+2. Perform any pending database migrations
+3. Load state from the database
+4. Load the observed state returned in step (1) into memory
+5. Do the initial heartbeat round (may be moved after step 6)
+6. Mark self as leader by updating the database
+7. Reschedule and reconcile everything
+
+Some things to note from the steps above:
+* The storage controller makes no changes to the cluster state before step (5) (i.e. no location config
+calls to the pageserver and no compute notifications)
+* Ask the current leader to step down before loading state from the database so we don't get a lost update
+if the transactions overlap.
+* Before loading the observed state at step (4), cross-validate it against the database. If validation fails,
+fall back to asking the pageservers about their current locations.
+* Database migrations should only run **after** the previous instance steps down (or the step down times out).
+
+
+[1] The API call might fail because there's no storage controller running (i.e. [restart](#storage-controller-crash-or-restart)),
+so we don't want to extend the unavailability period by much. We still want to retry since that's not the common case.
+
+### Handling Failures
+
+#### Storage Controller Crash Or Restart
+
+The storage controller may crash or be restarted outside of roll-outs. When a new pod is created, its call to
+`/step_down` will fail since the previous leader is no longer reachable. In this case perform the pre-existing
+start-up procedure and update the leader table (with the WHERE clause). If the update fails, the storage controller
+exits and consistency is maintained.
+
+#### Previous Leader Crashes Before New Leader Readiness
+
+When the previous leader (P1) crashes before the new leader (P2) passes the readiness check, Kubernetes will
+reconcile the old replica set and create a new pod for it (P1'). The `/step_down` API call will fail for P1'
+(see [2]).
+
+Now we have two cases to consider:
+* P2 updates the `leader` table first: The database update from P1' will fail and P1' will exit, or be terminated
+by Kubernetes, depending on timings.
+* P1' updates the `leader` table first: The `hostname` field of the `leader` row stays the same, but the `start_timestamp` field changes.
+The database update from P2 will fail (since `start_timestamp` does not match). P2 will exit and Kubernetes will
+create a new replacement pod for it (P2'). Now the entire dance starts again, but with P1' as the leader and P2' as the aspiring leader.
+
+[2] P1 and P1' may (more likely than not) be the same pod and have the same hostname.
The implementation
+should avoid this self-reference and fail the API call at the client if the persisted hostname matches
+the current one.
+
+#### Previous Leader Crashes After New Leader Readiness
+
+The deployment's replica sets already satisfy the deployment's replica count requirements, and the
+Kubernetes deployment rollout will just clean up the dead pod.
+
+#### New Leader Crashes Before Passing Readiness Check
+
+The deployment controller scales the new replica set back up by creating a new pod. The entire procedure is repeated
+with the new pod.
+
+#### Network Partition Between New Pod and Previous Leader
+
+This feels very unlikely, but should be considered in any case. P2 (the new aspiring leader) fails the `/step_down`
+API call into P1 (the current leader). P2 proceeds with the pre-existing startup procedure and updates the `leader` table.
+Kubernetes will terminate P1, but there may be a brief period where both storage controllers can drive reconciles.
+
+### Dealing With Split Brain Scenarios
+
+As we've seen in the previous section, we can end up with two storage controllers running at the same time. The split brain
+duration is not bounded, since the Kubernetes controller might become partitioned from the pods (unlikely though). While these
+scenarios are not fatal, they can cause tenant unavailability, so we'd like to reduce the chances of this happening.
+The rest of this section sketches some safety measures; it's likely overkill to implement all of them, however.
+
+### Ensure Leadership Before Producing Side Effects
+
+The storage controller has two types of side effects: location config requests into pageservers and compute notifications into the control plane.
+Before issuing either, the storage controller could check that it is indeed still the leader by querying the database. Side effects might still be
+applied if they race with the database update, but the situation will eventually be detected. The storage controller process should terminate in these cases.
+
+### Leadership Lease
+
+Up until now, the leadership defined by this RFC is static. In order to bound the length of the split brain scenario, we could require the leadership
+to be renewed periodically. Two new columns would be added to the `leader` table:
+1. `last_renewed` - timestamp indicating when the lease was last renewed
+2. `lease_duration` - duration indicating the amount of time after which the lease expires
+
+The leader periodically attempts to renew the lease by checking that it is in fact still the legitimate leader and updating `last_renewed` in the
+same transaction. If the update fails, the process exits. New storage controller instances wishing to become leaders must wait for the current lease
+to expire before acquiring leadership if they have not successfully received a response to the `/step_down` request.
+
+### Notify Pageserver Of Storage Controller Term
+
+Each time that leadership changes, we can bump a `term` integer column in the `leader` table. This term uniquely identifies a leader.
+Location config requests and re-attach responses can include this term. On the pageserver side, keep the latest term in memory and refuse
+anything which contains a stale term (i.e. smaller than the current one).
+
+### Observability
+
+* The storage controller should expose a metric which describes its state (`Active | WarmingUp | SteppedDown`).
+Per-region alerts should be added on this metric which trigger when:
+  + no storage controller has been in the `Active` state for an extended period of time
+  + more than one storage controller is in the `Active` state
+
+* An alert that periodically verifies that the `leader` table is in sync with the metric above would be very useful.
+We'd have to expose the storage controller's read-only database to Grafana (perhaps this is already done).
+
+## Alternatives
+
+### Kubernetes Leases
+
+Kubernetes has a [lease primitive](https://kubernetes.io/docs/concepts/architecture/leases/) which can be used to implement leader election.
+Only one instance may hold a lease at any given time. This lease needs to be periodically renewed and has an expiration period.
+
+In our case, it would work something like this:
+* `/step_down` deletes the lease or stops it from renewing
+* lease acquisition becomes part of the start-up procedure
+
+The kubert crate implements a [lightweight lease API](https://docs.rs/kubert/latest/kubert/lease/struct.LeaseManager.html), but it's still
+not exactly trivial to implement.
+
+This approach has the benefit of baked-in observability (`kubectl describe lease`), but:
+* We offload the responsibility to Kubernetes, which makes it harder to debug when things go wrong.
+* More code surface than the simple "row in database" approach. Also, most of this code would be in
+a dependency not subject to code review, etc.
+* Hard to test. Our testing infra does not run the storage controller in Kubernetes, and changing it to do
+so is not simple and complicates the test set-up.
+
+To my mind, the "row in database" approach is straightforward enough that we don't have to offload this
+to something external.
diff --git a/docs/settings.md b/docs/settings.md
index 817f97d8ba..12a6a4c171 100644
--- a/docs/settings.md
+++ b/docs/settings.md
@@ -134,7 +134,7 @@ depends on that, so if you change it, bad things will happen.
 
 #### page_cache_size
 
-Size of the page cache, to hold materialized page versions. Unit is
+Size of the page cache. Unit is
 number of 8 kB blocks. The default is 8192, which means 64 MB.
 
 #### max_file_descriptors
diff --git a/docs/sourcetree.md b/docs/sourcetree.md
index 12fa80349e..3732bfdab2 100644
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -7,6 +7,11 @@ Below you will find a brief overview of each subdir in the source tree in alphab
 
 Neon storage broker, providing messaging between safekeepers and pageservers.
 [storage_broker.md](./storage_broker.md)
 
+`storage_controller`:
+
+Neon storage controller, manages a cluster of pageservers and exposes an API that enables
+managing a many-sharded tenant as a single entity.
+
 `/control_plane`:
 
 Local control plane.
diff --git a/docs/storage_controller.md b/docs/storage_controller.md
new file mode 100644
index 0000000000..6d2ef929a4
--- /dev/null
+++ b/docs/storage_controller.md
@@ -0,0 +1,150 @@
+# Storage Controller
+
+## Concepts
+
+The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller,
+which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations).
+
+It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers.
Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding
+the underlying details of how data is spread across multiple nodes.
+
+The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent.
+
+## APIs
+
+The storage controller’s HTTP server implements four logically separate APIs:
+
+- `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that’s where clients expect to find it on a pageserver.
+- `/control/v1/...` path is the storage controller’s API, which enables operations such as registering and managing pageservers, or executing shard splits.
+- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system.
+- `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers
+  to ensure data safety with generation numbers.
+
+The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as pageservers’ APIs).
+
+See the `http.rs` file in the source for where the HTTP APIs are implemented.
+
+## Database
+
+The storage controller uses a postgres database to persist a subset of its state. Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not
+persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
+rebuilt on startup.
+
+The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
+
+The `diesel` crate is used for defining models & migrations.
+
+Running a local cluster with `cargo neon` automatically starts a vanilla postgres process to host the storage controller’s database.
+
+### Diesel tip: migrations
+
+If you need to modify the database schema, here’s how to create a migration:
+
+- Install the diesel CLI with `cargo install diesel_cli`
+- Use `diesel migration generate <migration_name>` to create a new migration
+- Populate the SQL files in the `migrations/` subdirectory (see the example after this list)
+- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically.
+  - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/storage_controller`
+- Commit the migration files and the changes to `schema.rs`
+- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
+- The migrations are built into the storage controller binary and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed.
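+
+For illustration, a migration is just a pair of SQL files; the table and file names here are
+made up:
+
+```
+-- migrations/2024-01-01-000000_create_example/up.sql
+CREATE TABLE example (id BIGINT PRIMARY KEY);
+
+-- migrations/2024-01-01-000000_create_example/down.sql
+DROP TABLE example;
+```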
+
+## storcon_cli
+
+The `storcon_cli` tool enables interactive management of the storage controller. This is usually
+only necessary for debug, but may also be used to manage nodes (e.g. marking a node as offline).
+
+`storcon_cli --help` includes details on commands.
+
+# Deploying
+
+This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as
+part of a self-hosted system.
+
+_General note: since the default `neon_local` environment includes a storage controller, this is a useful
+reference when figuring out deployment._
+
+## Database
+
+It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral
+local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver.
+
+The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte.
+
+Set the URL to the database using the `--database-url` CLI option.
+
+There is no need to run migrations manually: the storage controller automatically applies migrations
+when it starts up.
+
+## Configure pageservers to use the storage controller
+
+1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should
+   point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters.
+2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself
+   with the storage controller when it starts up. See the example below for the format of this file.
+
+### Example `metadata.json`
+
+```
+{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000}
+```
+
+- `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever
+  postgres runs.
+- `http_port` and `http_host` refer to the pageserver's HTTP API; this must be accessible from where
+  the storage controller runs.
+
+## Handle compute notifications
+
+The storage controller independently moves tenant attachments between pageservers in response to
+changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable
+postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver
+location changes.
+
+The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires
+JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request.
+
+In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems
+the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling
+the compute hook.
+
+When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated:
+the request body has the format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
+
+```
+struct ComputeHookNotifyRequestShard {
+    node_id: NodeId,
+    shard_number: ShardNumber,
+}
+
+struct ComputeHookNotifyRequest {
+    tenant_id: TenantId,
+    stripe_size: Option<ShardStripeSize>,
+    shards: Vec<ComputeHookNotifyRequestShard>,
+}
+```
+
+When a notification is received:
+1. Modify postgres configuration for this tenant:
+
+   - set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
+     shards identified by `NodeId` must be converted to the address+port of the node.
+   - if stripe_size is not None, set `neon.stripe_size` to this value
+
+2. Send SIGHUP to postgres to reload configuration
+3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
+   will retry the notification until it succeeds.
+
+### Example notification body
+
+```
+{
+  "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
+  "stripe_size": 32768,
+  "shards": [
+    {"node_id": 344, "shard_number": 0},
+    {"node_id": 722, "shard_number": 1}
+  ]
+}
+```
diff --git a/docs/synthetic-size.md b/docs/synthetic-size.md
index 407d7b525a..b6b90d90c2 100644
--- a/docs/synthetic-size.md
+++ b/docs/synthetic-size.md
@@ -21,9 +21,9 @@ implementation where we keep more data than we would need to, do not
 change the synthetic size or incur any costs to the user.
 
 The synthetic size is calculated for the whole project. It is not
-straighforward to attribute size to individual branches. See "What is
-the size of an individual branch?" for discussion on those
-difficulties.
+straightforward to attribute size to individual branches. See [What is
+the size of an individual branch?](#what-is-the-size-of-an-individual-branch)
+for a discussion of those difficulties.
 
 The synthetic size is designed to:
 
@@ -40,8 +40,9 @@ The synthetic size is designed to:
 - logical size is the size of a branch *at a given point in time*.
   It's the total size of all tables in all databases, as you see with
   "\l+" in psql for example, plus the Postgres SLRUs and some
-  small amount of metadata. NOTE that currently, Neon does not include
-  the SLRUs and metadata in the logical size. See comment to `get_current_logical_size_non_incremental()`.
+  small amount of metadata. Note that currently, Neon does not include
+  the SLRUs and metadata in the logical size. Refer to the comment in
+  [`get_current_logical_size_non_incremental()`](/pageserver/src/pgdatadir_mapping.rs#L813-L814).
 
 - a "point in time" is defined as an LSN value. You can convert a
   timestamp to an LSN, but the storage internally works with LSNs.
@@ -248,7 +249,7 @@ and truncate the WAL.
 
 Synthetic size is calculated for the whole project, and includes all
 branches. There is no such thing as the size of a branch, because it
-is not straighforward to attribute the parts of size to individual
+is not straightforward to attribute the parts of size to individual
 branches.
 
 ## Example: attributing size to branches
diff --git a/docs/updating-postgres.md b/docs/updating-postgres.md
index 1868bbf5f7..7913b0a9e2 100644
--- a/docs/updating-postgres.md
+++ b/docs/updating-postgres.md
@@ -21,30 +21,21 @@ _Example: 15.4 is the new minor version to upgrade to from 15.3._
 1. Create a new branch based on the stable branch you are updating.
 
    ```shell
-   git checkout -b my-branch REL_15_STABLE_neon
+   git checkout -b my-branch-15 REL_15_STABLE_neon
   ```
 
-1. Tag the last commit on the stable branch you are updating.
+1. Find the upstream release tags you're looking for. They are of the form `REL_X_Y`.
 
-   ```shell
-   git tag REL_15_3_neon
-   ```
-
-1. Push the new tag to the Neon Postgres repository.
-
-   ```shell
-   git push origin REL_15_3_neon
-   ```
-
-1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
-
-1. Rebase the branch you created on the tag and resolve any conflicts.
+1. Merge the upstream tag into the branch you created, and resolve any conflicts.
 
    ```shell
    git fetch upstream REL_15_4
-   git rebase REL_15_4
+   git merge REL_15_4
    ```
 
+   In the commit message of the merge commit, mention if there were
+   any non-trivial conflicts or other issues.
+
 1. Run the Postgres test suite to make sure our commits have not affected
 Postgres in a negative way.
 
@@ -57,7 +48,7 @@ Postgres in a negative way.
 1. Push your branch to the Neon Postgres repository.
 
    ```shell
-   git push origin my-branch
+   git push origin my-branch-15
   ```
 
 1. Clone the Neon repository if you have not done so already.
@@ -74,7 +65,7 @@ branch.
 1. Update the Git submodule.
 
    ```shell
-   git submodule set-branch --branch my-branch vendor/postgres-v15
+   git submodule set-branch --branch my-branch-15 vendor/postgres-v15
    git submodule update --remote vendor/postgres-v15
   ```
 
@@ -89,14 +80,12 @@ minor Postgres release.
 
 1. Create a pull request, and wait for CI to go green.
 
-1. Force push the rebased Postgres branches into the Neon Postgres repository.
+1. Push the Postgres branches with the merge commits into the Neon Postgres repository.
 
    ```shell
-   git push --force origin my-branch:REL_15_STABLE_neon
+   git push origin my-branch-15:REL_15_STABLE_neon
   ```
 
-   It may require disabling various branch protections.
-
 1. Update your Neon PR to point at the branches.
 
    ```shell
diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml
index b377bd2cce..8aaa481f8c 100644
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -14,5 +14,3 @@ regex.workspace = true
 
 utils = { path = "../utils" }
 remote_storage = { version = "0.1", path = "../remote_storage/" }
-
-workspace_hack.workspace = true
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
index 92bbf79cd4..d05d625b0a 100644
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -3,7 +3,7 @@
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize, Serializer};
 
-use crate::spec::ComputeSpec;
+use crate::spec::{ComputeSpec, Database, Role};
 
 #[derive(Serialize, Debug, Deserialize)]
 pub struct GenericAPIError {
@@ -52,6 +52,10 @@ pub enum ComputeStatus {
     // compute will exit soon or is waiting for
     // control-plane to terminate it.
     Failed,
+    // Termination requested
+    TerminationPending,
+    // Terminated Postgres
+    Terminated,
 }
 
 fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
@@ -109,6 +113,12 @@ pub struct ComputeMetrics {
     pub total_ext_download_size: u64,
 }
 
+#[derive(Clone, Debug, Default, Serialize)]
+pub struct CatalogObjects {
+    pub roles: Vec<Role>,
+    pub databases: Vec<Database>,
+}
+
 /// Response of the `/computes/{compute_id}/spec` control-plane API.
 /// This is not actually a compute API response, so consider moving
 /// to a different place.
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 5361d14004..883c624f71 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -33,6 +33,23 @@ pub struct ComputeSpec {
     #[serde(default)]
     pub features: Vec<ComputeFeature>,
 
+    /// If compute_ctl was passed `--resize-swap-on-bind`, a value of `Some(_)` instructs
+    /// compute_ctl to `/neonvm/bin/resize-swap` with the given size, when the spec is first
+    /// received.
+    ///
+    /// Both this field and `--resize-swap-on-bind` are required, so that the control plane's
+    /// spec generation doesn't need to be aware of the actual compute it's running on, while
+    /// guaranteeing gradual rollout of swap. Otherwise, without `--resize-swap-on-bind`, we could
+    /// end up trying to resize swap in VMs without it -- or end up *not* resizing swap, thus
+    /// giving every VM much more swap than it should have (32GiB).
+    ///
+    /// Eventually we may remove `--resize-swap-on-bind` and exclusively use `swap_size_bytes` for
+    /// enabling the swap resizing behavior once rollout is complete.
+    ///
+    /// See neondatabase/cloud#12047 for more.
+    #[serde(default)]
+    pub swap_size_bytes: Option<u64>,
+
     /// Expected cluster state at the end of transition process.
     pub cluster: Cluster,
     pub delta_operations: Option<Vec<DeltaOp>>,
@@ -90,8 +107,8 @@ pub enum ComputeFeature {
     /// track short-lived connections as user activity.
     ActivityMonitorExperimental,
 
-    /// Enable running migrations
-    Migrations,
+    /// Pre-install and initialize anon extension for every database in the cluster
+    AnonExtension,
 
     /// This is a special feature flag that is used to represent unknown feature flags.
     /// Basically all unknown to enum flags are represented as this one. See unit test
diff --git a/libs/consumption_metrics/Cargo.toml b/libs/consumption_metrics/Cargo.toml
index 3f290821c2..a40b74b952 100644
--- a/libs/consumption_metrics/Cargo.toml
+++ b/libs/consumption_metrics/Cargo.toml
@@ -6,10 +6,8 @@ license = "Apache-2.0"
 
 [dependencies]
 anyhow.workspace = true
-chrono.workspace = true
+chrono = { workspace = true, features = ["serde"] }
 rand.workspace = true
 serde.workspace = true
 serde_with.workspace = true
 utils.workspace = true
-
-workspace_hack.workspace = true
diff --git a/libs/desim/Cargo.toml b/libs/desim/Cargo.toml
new file mode 100644
index 0000000000..0c4be90267
--- /dev/null
+++ b/libs/desim/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "desim"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+rand.workspace = true
+tracing.workspace = true
+bytes.workspace = true
+utils.workspace = true
+parking_lot.workspace = true
+hex.workspace = true
+scopeguard.workspace = true
+smallvec = { workspace = true, features = ["write"] }
diff --git a/libs/desim/README.md b/libs/desim/README.md
new file mode 100644
index 0000000000..80568ebb1b
--- /dev/null
+++ b/libs/desim/README.md
@@ -0,0 +1,7 @@
+# Discrete Event SIMulator
+
+This is a library for running simulations of distributed systems. The main idea is borrowed from [FoundationDB](https://www.youtube.com/watch?v=4fFDFbi3toc).
+
+Each node runs as a separate thread. This library has not been optimized for speed yet, but it is already much faster than running the usual integration tests in real time, because it uses virtual simulation time and can fast-forward time to skip intervals where all nodes are doing nothing but sleeping or waiting for something.
+
+The original purpose of this library is to test the walproposer and safekeeper implementations working together, in scenarios close to the real-world environment. This simulator is deterministic and can inject failures in networking without waiting minutes of wall-time to trigger a timeout, which makes it easier to find bugs in our consensus implementation compared to using integration tests.
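+
+A minimal usage sketch (`Runtime` and `Chan` are the APIs defined in this crate; the
+`Timing::new()` constructor is assumed):
+
+```rust
+use std::sync::Arc;
+
+use desim::{chan::Chan, executor::Runtime, time::Timing};
+
+fn main() {
+    let mut rt = Runtime::new(Arc::new(Timing::new()));
+    let chan: Chan<u32> = Chan::new();
+
+    let tx = chan.clone();
+    rt.spawn(move || tx.send(42)); // simulated node: send one message
+
+    let rx = chan.clone();
+    rt.spawn(move || assert_eq!(rx.recv(), 42)); // simulated node: receive it
+
+    while rt.step() {} // run until all threads finish or block forever
+}
+```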
diff --git a/libs/desim/src/chan.rs b/libs/desim/src/chan.rs
new file mode 100644
index 0000000000..6661d59871
--- /dev/null
+++ b/libs/desim/src/chan.rs
@@ -0,0 +1,108 @@
+use std::{collections::VecDeque, sync::Arc};
+
+use parking_lot::{Mutex, MutexGuard};
+
+use crate::executor::{self, PollSome, Waker};
+
+/// FIFO channel with blocking send and receive. Can be cloned and shared between threads.
+/// Blocking functions should be used only from threads that are managed by the executor.
+pub struct Chan<T> {
+    shared: Arc<State<T>>,
+}
+
+impl<T> Clone for Chan<T> {
+    fn clone(&self) -> Self {
+        Chan {
+            shared: self.shared.clone(),
+        }
+    }
+}
+
+impl<T> Default for Chan<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl<T> Chan<T> {
+    pub fn new() -> Chan<T> {
+        Chan {
+            shared: Arc::new(State {
+                queue: Mutex::new(VecDeque::new()),
+                waker: Waker::new(),
+            }),
+        }
+    }
+
+    /// Get a message from the front of the queue, block if the queue is empty.
+    /// If not called from the executor thread, it can block forever.
+    pub fn recv(&self) -> T {
+        self.shared.recv()
+    }
+
+    /// Get a message from the front of the queue; panics if the queue is empty.
+    pub fn must_recv(&self) -> T {
+        self.shared
+            .try_recv()
+            .expect("message should've been ready")
+    }
+
+    /// Get a message from the front of the queue, return None if the queue is empty.
+    /// Never blocks.
+    pub fn try_recv(&self) -> Option<T> {
+        self.shared.try_recv()
+    }
+
+    /// Send a message to the back of the queue.
+    pub fn send(&self, t: T) {
+        self.shared.send(t);
+    }
+}
+
+struct State<T> {
+    queue: Mutex<VecDeque<T>>,
+    waker: Waker,
+}
+
+impl<T> State<T> {
+    fn send(&self, t: T) {
+        self.queue.lock().push_back(t);
+        self.waker.wake_all();
+    }
+
+    fn try_recv(&self) -> Option<T> {
+        let mut q = self.queue.lock();
+        q.pop_front()
+    }
+
+    fn recv(&self) -> T {
+        // interrupt the receiver to prevent consuming everything at once
+        executor::yield_me(0);
+
+        let mut queue = self.queue.lock();
+        if let Some(t) = queue.pop_front() {
+            return t;
+        }
+        loop {
+            self.waker.wake_me_later();
+            if let Some(t) = queue.pop_front() {
+                return t;
+            }
+            MutexGuard::unlocked(&mut queue, || {
+                executor::yield_me(-1);
+            });
+        }
+    }
+}
+
+impl<T> PollSome for Chan<T> {
+    /// Schedules a wakeup for the current thread.
+    fn wake_me(&self) {
+        self.shared.waker.wake_me_later();
+    }
+
+    /// Checks if chan has any pending messages.
+    fn has_some(&self) -> bool {
+        !self.shared.queue.lock().is_empty()
+    }
+}
diff --git a/libs/desim/src/executor.rs b/libs/desim/src/executor.rs
new file mode 100644
index 0000000000..9d44bd7741
--- /dev/null
+++ b/libs/desim/src/executor.rs
@@ -0,0 +1,483 @@
+use std::{
+    panic::AssertUnwindSafe,
+    sync::{
+        atomic::{AtomicBool, AtomicU32, AtomicU8, Ordering},
+        mpsc, Arc, OnceLock,
+    },
+    thread::JoinHandle,
+};
+
+use tracing::{debug, error, trace};
+
+use crate::time::Timing;
+
+/// Stores status of the running threads. Threads are registered in the runtime upon creation
+/// and deregistered upon termination.
+pub struct Runtime {
+    // stores handles to all threads that are currently running
+    threads: Vec<ThreadHandle>,
+    // stores current time and pending wakeups
+    clock: Arc<Timing>,
+    // thread counter
+    thread_counter: AtomicU32,
+    // Thread step counter -- how many times all threads have actually been
+    // stepped (note that world/time/executor/thread each have a slightly different
+    // notion of a step). For observability.
+    pub step_counter: u64,
+}
+
+impl Runtime {
+    /// Initialize a new runtime with no running threads.
+    pub fn new(clock: Arc<Timing>) -> Self {
+        Self {
+            threads: Vec::new(),
+            clock,
+            thread_counter: AtomicU32::new(0),
+            step_counter: 0,
+        }
+    }
+
+    /// Spawn a new thread and register it in the runtime.
+    pub fn spawn<F>(&mut self, f: F) -> ExternalHandle
+    where
+        F: FnOnce() + Send + 'static,
+    {
+        let (tx, rx) = mpsc::channel();
+
+        let clock = self.clock.clone();
+        let tid = self.thread_counter.fetch_add(1, Ordering::SeqCst);
+        debug!("spawning thread-{}", tid);
+
+        let join = std::thread::spawn(move || {
+            let _guard = tracing::info_span!("", tid).entered();
+
+            let res = std::panic::catch_unwind(AssertUnwindSafe(|| {
+                with_thread_context(|ctx| {
+                    assert!(ctx.clock.set(clock).is_ok());
+                    ctx.id.store(tid, Ordering::SeqCst);
+                    tx.send(ctx.clone()).expect("failed to send thread context");
+                    // suspend thread to put it to `threads` in sleeping state
+                    ctx.yield_me(0);
+                });
+
+                // start user-provided function
+                f();
+            }));
+            debug!("thread finished");
+
+            if let Err(e) = res {
+                with_thread_context(|ctx| {
+                    if !ctx.allow_panic.load(std::sync::atomic::Ordering::SeqCst) {
+                        error!("thread panicked, terminating the process: {:?}", e);
+                        std::process::exit(1);
+                    }
+
+                    debug!("thread panicked: {:?}", e);
+                    let mut result = ctx.result.lock();
+                    if result.0 == -1 {
+                        *result = (256, format!("thread panicked: {:?}", e));
+                    }
+                });
+            }
+
+            with_thread_context(|ctx| {
+                ctx.finish_me();
+            });
+        });
+
+        let ctx = rx.recv().expect("failed to receive thread context");
+        let handle = ThreadHandle::new(ctx.clone(), join);
+
+        self.threads.push(handle);
+
+        ExternalHandle { ctx }
+    }
+
+    /// Returns true if there is any unfinished activity, such as a running thread or pending events.
+    /// Otherwise returns false, which means all threads are blocked forever.
+    pub fn step(&mut self) -> bool {
+        trace!("runtime step");
+
+        // have we run any thread?
+        let mut ran = false;
+
+        self.threads.retain(|thread: &ThreadHandle| {
+            let res = thread.ctx.wakeup.compare_exchange(
+                PENDING_WAKEUP,
+                NO_WAKEUP,
+                Ordering::SeqCst,
+                Ordering::SeqCst,
+            );
+            if res.is_err() {
+                // thread has no pending wakeups, leaving as is
+                return true;
+            }
+            ran = true;
+
+            trace!("entering thread-{}", thread.ctx.tid());
+            let status = thread.step();
+            self.step_counter += 1;
+            trace!(
+                "out of thread-{} with status {:?}",
+                thread.ctx.tid(),
+                status
+            );
+
+            if status == Status::Sleep {
+                true
+            } else {
+                trace!("thread has finished");
+                // removing the thread from the list
+                false
+            }
+        });
+
+        if !ran {
+            trace!("no threads were run, stepping clock");
+            if let Some(ctx_to_wake) = self.clock.step() {
+                trace!("waking up thread-{}", ctx_to_wake.tid());
+                ctx_to_wake.inc_wake();
+            } else {
+                return false;
+            }
+        }
+
+        true
+    }
+
+    /// Kill all threads. This is done by setting a flag in each thread context and waking it up.
+    pub fn crash_all_threads(&mut self) {
+        for thread in self.threads.iter() {
+            thread.ctx.crash_stop();
+        }
+
+        // all threads should be finished after a few steps
+        while !self.threads.is_empty() {
+            self.step();
+        }
+    }
+}
+
+impl Drop for Runtime {
+    fn drop(&mut self) {
+        debug!("dropping the runtime");
+        self.crash_all_threads();
+    }
+}
+
+#[derive(Clone)]
+pub struct ExternalHandle {
+    ctx: Arc<ThreadContext>,
+}
+
+impl ExternalHandle {
+    /// Returns true if thread has finished execution.
+    pub fn is_finished(&self) -> bool {
+        let status = self.ctx.mutex.lock();
+        *status == Status::Finished
+    }
+
+    /// Returns the exit code and message, which are available after the thread has finished execution.
+    pub fn result(&self) -> (i32, String) {
+        let result = self.ctx.result.lock();
+        result.clone()
+    }
+
+    /// Returns thread id.
+    pub fn id(&self) -> u32 {
+        self.ctx.id.load(Ordering::SeqCst)
+    }
+
+    /// Sets a flag to crash thread on the next wakeup.
+    pub fn crash_stop(&self) {
+        self.ctx.crash_stop();
+    }
+}
+
+struct ThreadHandle {
+    ctx: Arc<ThreadContext>,
+    _join: JoinHandle<()>,
+}
+
+impl ThreadHandle {
+    /// Create a new [`ThreadHandle`] and wait until the thread enters the [`Status::Sleep`] state.
+    fn new(ctx: Arc<ThreadContext>, join: JoinHandle<()>) -> Self {
+        let mut status = ctx.mutex.lock();
+        // wait until the thread goes into its first yield
+        while *status != Status::Sleep {
+            ctx.condvar.wait(&mut status);
+        }
+        drop(status);
+
+        Self { ctx, _join: join }
+    }
+
+    /// Allows the thread to execute one step of its execution.
+    /// Returns the [`Status`] of the thread after the step.
+    fn step(&self) -> Status {
+        let mut status = self.ctx.mutex.lock();
+        assert!(matches!(*status, Status::Sleep));
+
+        *status = Status::Running;
+        self.ctx.condvar.notify_all();
+
+        while *status == Status::Running {
+            self.ctx.condvar.wait(&mut status);
+        }
+
+        *status
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum Status {
+    /// Thread is running.
+    Running,
+    /// Waiting for an event to complete; will be resumed by an executor step once the wakeup flag is set.
+    Sleep,
+    /// Thread finished execution.
+    Finished,
+}
+
+const NO_WAKEUP: u8 = 0;
+const PENDING_WAKEUP: u8 = 1;
+
+pub struct ThreadContext {
+    id: AtomicU32,
+    // used to block the thread until it is woken up
+    mutex: parking_lot::Mutex<Status>,
+    condvar: parking_lot::Condvar,
+    // used as a flag to indicate to the runtime that the thread is ready to be woken up
+    wakeup: AtomicU8,
+    clock: OnceLock<Arc<Timing>>,
+    // execution result, set by exit() call
+    result: parking_lot::Mutex<(i32, String)>,
+    // determines if the process should be killed on receiving a panic
+    allow_panic: AtomicBool,
+    // acts as a signal that the thread should crash itself on the next wakeup
+    crash_request: AtomicBool,
+}
+
+impl ThreadContext {
+    pub(crate) fn new() -> Self {
+        Self {
+            id: AtomicU32::new(0),
+            mutex: parking_lot::Mutex::new(Status::Running),
+            condvar: parking_lot::Condvar::new(),
+            wakeup: AtomicU8::new(NO_WAKEUP),
+            clock: OnceLock::new(),
+            result: parking_lot::Mutex::new((-1, String::new())),
+            allow_panic: AtomicBool::new(false),
+            crash_request: AtomicBool::new(false),
+        }
+    }
+}
+
+// Functions for executor to control thread execution.
+impl ThreadContext {
+    /// Set atomic flag to indicate that thread is ready to be woken up.
+    fn inc_wake(&self) {
+        self.wakeup.store(PENDING_WAKEUP, Ordering::SeqCst);
+    }
+
+    /// Internal function used for event queues.
+    pub(crate) fn schedule_wakeup(self: &Arc<Self>, after_ms: u64) {
+        self.clock
+            .get()
+            .unwrap()
+            .schedule_wakeup(after_ms, self.clone());
+    }
+
+    fn tid(&self) -> u32 {
+        self.id.load(Ordering::SeqCst)
+    }
+
+    fn crash_stop(&self) {
+        let status = self.mutex.lock();
+        if *status == Status::Finished {
+            debug!(
+                "trying to crash thread-{}, which is already finished",
+                self.tid()
+            );
+            return;
+        }
+        assert!(matches!(*status, Status::Sleep));
+        drop(status);
+
+        self.allow_panic.store(true, Ordering::SeqCst);
+        self.crash_request.store(true, Ordering::SeqCst);
+        // set a wakeup
+        self.inc_wake();
+        // it will panic on the next wakeup
+    }
+}
+
+// Internal functions.
+impl ThreadContext {
+    /// Blocks the thread until it's woken up by the executor. If `after_ms` is 0, it will be
+    /// woken on the next step. If `after_ms` > 0, wakeup is scheduled after that time.
+    /// Otherwise wakeup is not scheduled inside `yield_me`, and should be arranged before
+    /// calling this function.
+    fn yield_me(self: &Arc<Self>, after_ms: i64) {
+        let mut status = self.mutex.lock();
+        assert!(matches!(*status, Status::Running));
+
+        match after_ms.cmp(&0) {
+            std::cmp::Ordering::Less => {
+                // block until something wakes us up
+            }
+            std::cmp::Ordering::Equal => {
+                // tell executor that we are ready to be woken up
+                self.inc_wake();
+            }
+            std::cmp::Ordering::Greater => {
+                // schedule wakeup
+                self.clock
+                    .get()
+                    .unwrap()
+                    .schedule_wakeup(after_ms as u64, self.clone());
+            }
+        }
+
+        *status = Status::Sleep;
+        self.condvar.notify_all();
+
+        // wait until executor wakes us up
+        while *status != Status::Running {
+            self.condvar.wait(&mut status);
+        }
+
+        if self.crash_request.load(Ordering::SeqCst) {
+            panic!("crashed by request");
+        }
+    }
+
+    /// Called only once, exactly before thread finishes execution.
+    fn finish_me(&self) {
+        let mut status = self.mutex.lock();
+        assert!(matches!(*status, Status::Running));
+
+        *status = Status::Finished;
+        {
+            let mut result = self.result.lock();
+            if result.0 == -1 {
+                *result = (0, "finished normally".to_owned());
+            }
+        }
+        self.condvar.notify_all();
+    }
+}
+
+/// Invokes the given closure with a reference to the current thread's [`ThreadContext`].
+#[inline(always)]
+fn with_thread_context<T>(f: impl FnOnce(&Arc<ThreadContext>) -> T) -> T {
+    thread_local!(static THREAD_DATA: Arc<ThreadContext> = Arc::new(ThreadContext::new()));
+    THREAD_DATA.with(f)
+}
+
+/// Waker is used to wake up threads that are blocked on a condition.
+/// It keeps track of contexts ([`Arc<ThreadContext>`]) and can increment the counter
+/// of several contexts to send a notification.
+pub struct Waker {
+    // contexts that are waiting for a notification
+    contexts: parking_lot::Mutex<smallvec::SmallVec<[Arc<ThreadContext>; 8]>>,
+}
+
+impl Default for Waker {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Waker {
+    pub fn new() -> Self {
+        Self {
+            contexts: parking_lot::Mutex::new(smallvec::SmallVec::new()),
+        }
+    }
+
+    /// Subscribe current thread to receive a wake notification later.
+    pub fn wake_me_later(&self) {
+        with_thread_context(|ctx| {
+            self.contexts.lock().push(ctx.clone());
+        });
+    }
+
+    /// Wake up all threads that are waiting for a notification and clear the list.
+    pub fn wake_all(&self) {
+        let mut v = self.contexts.lock();
+        for ctx in v.iter() {
+            ctx.inc_wake();
+        }
+        v.clear();
+    }
+}
+
+/// See [`ThreadContext::yield_me`].
+pub fn yield_me(after_ms: i64) {
+    with_thread_context(|ctx| ctx.yield_me(after_ms))
+}
+
+/// Get current time.
+pub fn now() -> u64 {
+    with_thread_context(|ctx| ctx.clock.get().unwrap().now())
+}
+
+pub fn exit(code: i32, msg: String) {
+    with_thread_context(|ctx| {
+        ctx.allow_panic.store(true, Ordering::SeqCst);
+        let mut result = ctx.result.lock();
+        *result = (code, msg);
+        panic!("exit");
+    });
+}
+
+pub(crate) fn get_thread_ctx() -> Arc<ThreadContext> {
+    with_thread_context(|ctx| ctx.clone())
+}
+
+/// Trait for polling channels until they have something.
+pub trait PollSome {
+    /// Schedule wakeup for message arrival.
+    fn wake_me(&self);
+
+    /// Check if channel has a ready message.
+    fn has_some(&self) -> bool;
+}
+
+/// Blocks the current thread until one of the channels has a ready message. Returns the
+/// index of the channel that has a message. If the timeout is reached, returns None.
+///
+/// Negative timeout means block forever. Zero timeout means check channels and return
+/// immediately. Positive timeout means block until timeout is reached.
+pub fn epoll_chans(chans: &[Box<dyn PollSome>], timeout: i64) -> Option<usize> {
+    let deadline = if timeout < 0 {
+        0
+    } else {
+        now() + timeout as u64
+    };
+
+    loop {
+        for chan in chans {
+            chan.wake_me()
+        }
+
+        for (i, chan) in chans.iter().enumerate() {
+            if chan.has_some() {
+                return Some(i);
+            }
+        }
+
+        if timeout < 0 {
+            // block until wakeup
+            yield_me(-1);
+        } else {
+            let current_time = now();
+            if current_time >= deadline {
+                return None;
+            }
+
+            yield_me((deadline - current_time) as i64);
+        }
+    }
+}
diff --git a/libs/desim/src/lib.rs b/libs/desim/src/lib.rs
new file mode 100644
index 0000000000..14f5a885c5
--- /dev/null
+++ b/libs/desim/src/lib.rs
@@ -0,0 +1,8 @@
+pub mod chan;
+pub mod executor;
+pub mod network;
+pub mod node_os;
+pub mod options;
+pub mod proto;
+pub mod time;
+pub mod world;
diff --git a/libs/desim/src/network.rs b/libs/desim/src/network.rs
new file mode 100644
index 0000000000..e15a714daa
--- /dev/null
+++ b/libs/desim/src/network.rs
@@ -0,0 +1,451 @@
+use std::{
+    cmp::Ordering,
+    collections::{BinaryHeap, VecDeque},
+    fmt::{self, Debug},
+    ops::DerefMut,
+    sync::{mpsc, Arc},
+};
+
+use parking_lot::{
+    lock_api::{MappedMutexGuard, MutexGuard},
+    Mutex, RawMutex,
+};
+use rand::rngs::StdRng;
+use tracing::debug;
+
+use crate::{
+    executor::{self, ThreadContext},
+    options::NetworkOptions,
+    proto::NetEvent,
+    proto::NodeEvent,
+};
+
+use super::{chan::Chan, proto::AnyMessage};
+
+pub struct NetworkTask {
+    options: Arc<NetworkOptions>,
+    connections: Mutex<Vec<VirtualConnection>>,
+    /// min-heap of connections having something to deliver.
+    events: Mutex<BinaryHeap<Event>>,
+    task_context: Arc<ThreadContext>,
+}
+
+impl NetworkTask {
+    pub fn start_new(options: Arc<NetworkOptions>, tx: mpsc::Sender<Arc<NetworkTask>>) {
+        let ctx = executor::get_thread_ctx();
+        let task = Arc::new(Self {
+            options,
+            connections: Mutex::new(Vec::new()),
+            events: Mutex::new(BinaryHeap::new()),
+            task_context: ctx,
+        });
+
+        // send the task upstream
+        tx.send(task.clone()).unwrap();
+
+        // start the task
+        task.start();
+    }
+
+    pub fn start_new_connection(self: &Arc<Self>, rng: StdRng, dst_accept: Chan<NodeEvent>) -> TCP {
+        let now = executor::now();
+        let connection_id = self.connections.lock().len();
+
+        let vc = VirtualConnection {
+            connection_id,
+            dst_accept,
+            dst_sockets: [Chan::new(), Chan::new()],
+            state: Mutex::new(ConnectionState {
+                buffers: [NetworkBuffer::new(None), NetworkBuffer::new(Some(now))],
+                rng,
+            }),
+        };
+        vc.schedule_timeout(self);
+        vc.send_connect(self);
+
+        let recv_chan = vc.dst_sockets[0].clone();
+        self.connections.lock().push(vc);
+
+        TCP {
+            net: self.clone(),
+            conn_id: connection_id,
+            dir: 0,
+            recv_chan,
+        }
+    }
+}
+
+// private functions
+impl NetworkTask {
+    /// Schedule a wakeup for the network task (self) `after_ms` later, to deliver
+    /// messages of connection `id`.
+    fn schedule(&self, id: usize, after_ms: u64) {
+        self.events.lock().push(Event {
+            time: executor::now() + after_ms,
+            conn_id: id,
+        });
+        self.task_context.schedule_wakeup(after_ms);
+    }
+
+    /// Get locked connection `id`.
+    fn get(&self, id: usize) -> MappedMutexGuard<'_, RawMutex, VirtualConnection> {
+        MutexGuard::map(self.connections.lock(), |connections| {
+            connections.get_mut(id).unwrap()
+        })
+    }
+
+    fn collect_pending_events(&self, now: u64, vec: &mut Vec<Event>) {
+        vec.clear();
+        let mut events = self.events.lock();
+        while let Some(event) = events.peek() {
+            if event.time > now {
+                break;
+            }
+            let event = events.pop().unwrap();
+            vec.push(event);
+        }
+    }
+
+    fn start(self: &Arc<Self>) {
+        debug!("started network task");
+
+        let mut events = Vec::new();
+        loop {
+            let now = executor::now();
+            self.collect_pending_events(now, &mut events);
+
+            for event in events.drain(..) {
+                let conn = self.get(event.conn_id);
+                conn.process(self);
+            }
+
+            // block until wakeup
+            executor::yield_me(-1);
+        }
+    }
+}
+
+// 0 - from node(0) to node(1)
+// 1 - from node(1) to node(0)
+type MessageDirection = u8;
+
+fn sender_str(dir: MessageDirection) -> &'static str {
+    match dir {
+        0 => "client",
+        1 => "server",
+        _ => unreachable!(),
+    }
+}
+
+fn receiver_str(dir: MessageDirection) -> &'static str {
+    match dir {
+        0 => "server",
+        1 => "client",
+        _ => unreachable!(),
+    }
+}
+
+/// Virtual connection between two nodes.
+/// Node 0 is the creator of the connection (client),
+/// and node 1 is the acceptor (server).
+struct VirtualConnection {
+    connection_id: usize,
+    /// one-off chan, used to deliver Accept message to dst
+    dst_accept: Chan<NodeEvent>,
+    /// message sinks
+    dst_sockets: [Chan<NetEvent>; 2],
+    state: Mutex<ConnectionState>,
+}
+
+struct ConnectionState {
+    buffers: [NetworkBuffer; 2],
+    rng: StdRng,
+}
+
+impl VirtualConnection {
+    /// Notify the future about the possible timeout.
+    fn schedule_timeout(&self, net: &NetworkTask) {
+        if let Some(timeout) = net.options.keepalive_timeout {
+            net.schedule(self.connection_id, timeout);
+        }
+    }
+
+    /// Send the handshake (Accept) to the server.
+    fn send_connect(&self, net: &NetworkTask) {
+        let now = executor::now();
+        let mut state = self.state.lock();
+        let delay = net.options.connect_delay.delay(&mut state.rng);
+        let buffer = &mut state.buffers[0];
+        assert!(buffer.buf.is_empty());
+        assert!(!buffer.recv_closed);
+        assert!(!buffer.send_closed);
+        assert!(buffer.last_recv.is_none());
+
+        let delay = if let Some(ms) = delay {
+            ms
+        } else {
+            debug!("NET: TCP #{} dropped connect", self.connection_id);
+            buffer.send_closed = true;
+            return;
+        };
+
+        // Send a message into the future.
+        buffer
+            .buf
+            .push_back((now + delay, AnyMessage::InternalConnect));
+        net.schedule(self.connection_id, delay);
+    }
+
+    /// Transmit some of the messages from the buffer to the nodes.
+    fn process(&self, net: &Arc<NetworkTask>) {
+        let now = executor::now();
+
+        let mut state = self.state.lock();
+
+        for direction in 0..2 {
+            self.process_direction(
+                net,
+                state.deref_mut(),
+                now,
+                direction as MessageDirection,
+                &self.dst_sockets[direction ^ 1],
+            );
+        }
+
+        // Close the one side of the connection by timeout if the node
+        // has not received any messages for a long time.
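+        // Each direction has its own buffer and its own `last_recv`, so the two
+        // halves of the duplex stream can time out and close independently.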
+        if let Some(timeout) = net.options.keepalive_timeout {
+            let mut to_close = [false, false];
+            for direction in 0..2 {
+                let buffer = &mut state.buffers[direction];
+                if buffer.recv_closed {
+                    continue;
+                }
+                if let Some(last_recv) = buffer.last_recv {
+                    if now - last_recv >= timeout {
+                        debug!(
+                            "NET: connection {} timed out at {}",
+                            self.connection_id,
+                            receiver_str(direction as MessageDirection)
+                        );
+                        let node_idx = direction ^ 1;
+                        to_close[node_idx] = true;
+                    }
+                }
+            }
+            drop(state);
+
+            for (node_idx, should_close) in to_close.iter().enumerate() {
+                if *should_close {
+                    self.close(node_idx);
+                }
+            }
+        }
+    }
+
+    /// Process messages in the buffer in the given direction.
+    fn process_direction(
+        &self,
+        net: &Arc<NetworkTask>,
+        state: &mut ConnectionState,
+        now: u64,
+        direction: MessageDirection,
+        to_socket: &Chan<NetEvent>,
+    ) {
+        let buffer = &mut state.buffers[direction as usize];
+        if buffer.recv_closed {
+            assert!(buffer.buf.is_empty());
+        }
+
+        while !buffer.buf.is_empty() && buffer.buf.front().unwrap().0 <= now {
+            let msg = buffer.buf.pop_front().unwrap().1;
+
+            buffer.last_recv = Some(now);
+            self.schedule_timeout(net);
+
+            if let AnyMessage::InternalConnect = msg {
+                // TODO: assert to_socket is the server
+                let server_to_client = TCP {
+                    net: net.clone(),
+                    conn_id: self.connection_id,
+                    dir: direction ^ 1,
+                    recv_chan: to_socket.clone(),
+                };
+                // special case, we need to deliver new connection to a separate channel
+                self.dst_accept.send(NodeEvent::Accept(server_to_client));
+            } else {
+                to_socket.send(NetEvent::Message(msg));
+            }
+        }
+    }
+
+    /// Try to send a message to the buffer, optionally dropping it and
+    /// determining the delivery timestamp.
+    fn send(&self, net: &NetworkTask, direction: MessageDirection, msg: AnyMessage) {
+        let now = executor::now();
+        let mut state = self.state.lock();
+
+        let (delay, close) = if let Some(ms) = net.options.send_delay.delay(&mut state.rng) {
+            (ms, false)
+        } else {
+            (0, true)
+        };
+
+        let buffer = &mut state.buffers[direction as usize];
+        if buffer.send_closed {
+            debug!(
+                "NET: TCP #{} dropped message {:?} (broken pipe)",
+                self.connection_id, msg
+            );
+            return;
+        }
+
+        if close {
+            debug!(
+                "NET: TCP #{} dropped message {:?} (pipe just broke)",
+                self.connection_id, msg
+            );
+            buffer.send_closed = true;
+            return;
+        }
+
+        if buffer.recv_closed {
+            debug!(
+                "NET: TCP #{} dropped message {:?} (recv closed)",
+                self.connection_id, msg
+            );
+            return;
+        }
+
+        // Send a message into the future.
+        buffer.buf.push_back((now + delay, msg));
+        net.schedule(self.connection_id, delay);
+    }
+
+    /// Close the connection. Only one side of the connection will be closed,
+    /// and no further messages will be delivered. The other side will not be notified.
+    fn close(&self, node_idx: usize) {
+        let mut state = self.state.lock();
+        let recv_buffer = &mut state.buffers[1 ^ node_idx];
+        if recv_buffer.recv_closed {
+            debug!(
+                "NET: TCP #{} closed twice at {}",
+                self.connection_id,
+                sender_str(node_idx as MessageDirection),
+            );
+            return;
+        }
+
+        debug!(
+            "NET: TCP #{} closed at {}",
+            self.connection_id,
+            sender_str(node_idx as MessageDirection),
+        );
+        recv_buffer.recv_closed = true;
+        for msg in recv_buffer.buf.drain(..) {
+            debug!(
+                "NET: TCP #{} dropped message {:?} (closed)",
+                self.connection_id, msg
+            );
+        }
+
+        let send_buffer = &mut state.buffers[node_idx];
+        send_buffer.send_closed = true;
+        drop(state);
+
+        // TODO: notify the other side?
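+        // Only this side learns about the close: a `Closed` event is delivered to
+        // its own receive channel below, while the peer's subsequent sends are
+        // silently dropped into the closed buffer.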
+
+        self.dst_sockets[node_idx].send(NetEvent::Closed);
+    }
+}
+
+struct NetworkBuffer {
+    /// Messages paired with time of delivery
+    buf: VecDeque<(u64, AnyMessage)>,
+    /// True if the connection is closed on the receiving side,
+    /// i.e. no more messages from the buffer will be delivered.
+    recv_closed: bool,
+    /// True if the connection is closed on the sending side,
+    /// i.e. no more messages will be added to the buffer.
+    send_closed: bool,
+    /// Last time a message was delivered from the buffer.
+    /// If None, it means that the server is the receiver and
+    /// it is not yet aware of this connection (i.e. it has not
+    /// received the Accept).
+    last_recv: Option<u64>,
+}
+
+impl NetworkBuffer {
+    fn new(last_recv: Option<u64>) -> Self {
+        Self {
+            buf: VecDeque::new(),
+            recv_closed: false,
+            send_closed: false,
+            last_recv,
+        }
+    }
+}
+
+/// Single end of a bidirectional network stream without reordering (TCP-like).
+/// Reads are implemented using channels, writes go to the buffer inside VirtualConnection.
+pub struct TCP {
+    net: Arc<NetworkTask>,
+    conn_id: usize,
+    dir: MessageDirection,
+    recv_chan: Chan<NetEvent>,
+}
+
+impl Debug for TCP {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "TCP #{} ({})", self.conn_id, sender_str(self.dir),)
+    }
+}
+
+impl TCP {
+    /// Send a message to the other side. It's guaranteed that it will not arrive
+    /// before the arrival of all messages sent earlier.
+    pub fn send(&self, msg: AnyMessage) {
+        let conn = self.net.get(self.conn_id);
+        conn.send(&self.net, self.dir, msg);
+    }
+
+    /// Get a channel to receive incoming messages.
+    pub fn recv_chan(&self) -> Chan<NetEvent> {
+        self.recv_chan.clone()
+    }
+
+    pub fn connection_id(&self) -> usize {
+        self.conn_id
+    }
+
+    pub fn close(&self) {
+        let conn = self.net.get(self.conn_id);
+        conn.close(self.dir as usize);
+    }
+}
+
+struct Event {
+    time: u64,
+    conn_id: usize,
+}
+
+// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
+// to get that.
+impl PartialOrd for Event {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for Event {
+    fn cmp(&self, other: &Self) -> Ordering {
+        (other.time, other.conn_id).cmp(&(self.time, self.conn_id))
+    }
+}
+
+impl PartialEq for Event {
+    fn eq(&self, other: &Self) -> bool {
+        (other.time, other.conn_id) == (self.time, self.conn_id)
+    }
+}
+
+impl Eq for Event {}
diff --git a/libs/desim/src/node_os.rs b/libs/desim/src/node_os.rs
new file mode 100644
index 0000000000..7744a9f5e1
--- /dev/null
+++ b/libs/desim/src/node_os.rs
@@ -0,0 +1,54 @@
+use std::sync::Arc;
+
+use rand::Rng;
+
+use crate::proto::NodeEvent;
+
+use super::{
+    chan::Chan,
+    network::TCP,
+    world::{Node, NodeId, World},
+};
+
+/// Abstraction with all functions (aka syscalls) available to the node.
+#[derive(Clone)]
+pub struct NodeOs {
+    world: Arc<World>,
+    internal: Arc<Node>,
+}
+
+impl NodeOs {
+    pub fn new(world: Arc<World>, internal: Arc<Node>) -> NodeOs {
+        NodeOs { world, internal }
+    }
+
+    /// Get the node id.
+    pub fn id(&self) -> NodeId {
+        self.internal.id
+    }
+
+    /// Opens a bidirectional connection with the other node. Always successful.
+    pub fn open_tcp(&self, dst: NodeId) -> TCP {
+        self.world.open_tcp(dst)
+    }
+
+    /// Returns a channel to receive node events (socket Accept and internal messages).
+    pub fn node_events(&self) -> Chan<NodeEvent> {
+        self.internal.node_events()
+    }
+
+    /// Get current time.
+    pub fn now(&self) -> u64 {
+        self.world.now()
+    }
+
+    /// Generate a random number in range [0, max).
+    pub fn random(&self, max: u64) -> u64 {
+        self.internal.rng.lock().gen_range(0..max)
+    }
+
+    /// Append a new event to the world event log.
+    pub fn log_event(&self, data: String) {
+        self.internal.log_event(data)
+    }
+}
diff --git a/libs/desim/src/options.rs b/libs/desim/src/options.rs
new file mode 100644
index 0000000000..5da7c2c482
--- /dev/null
+++ b/libs/desim/src/options.rs
@@ -0,0 +1,50 @@
+use rand::{rngs::StdRng, Rng};
+
+/// Describes random delays and failures. Delay will be uniformly distributed in [min, max].
+/// Connection failure will occur with the probability fail_prob.
+#[derive(Clone, Debug)]
+pub struct Delay {
+    pub min: u64,
+    pub max: u64,
+    pub fail_prob: f64, // [0; 1]
+}
+
+impl Delay {
+    /// Create a struct with no delay, no failures.
+    pub fn empty() -> Delay {
+        Delay {
+            min: 0,
+            max: 0,
+            fail_prob: 0.0,
+        }
+    }
+
+    /// Create a struct with a fixed delay.
+    pub fn fixed(ms: u64) -> Delay {
+        Delay {
+            min: ms,
+            max: ms,
+            fail_prob: 0.0,
+        }
+    }
+
+    /// Generate a random delay in range [min, max]. Return None if the
+    /// message should be dropped.
+    pub fn delay(&self, rng: &mut StdRng) -> Option<u64> {
+        if rng.gen_bool(self.fail_prob) {
+            return None;
+        }
+        Some(rng.gen_range(self.min..=self.max))
+    }
+}
+
+/// Describes network settings. All network packets will be subjected to the same delays and failures.
+#[derive(Clone, Debug)]
+pub struct NetworkOptions {
+    /// Connection will be automatically closed after this timeout if no data is received.
+    pub keepalive_timeout: Option<u64>,
+    /// New connections will be delayed by this amount of time.
+    pub connect_delay: Delay,
+    /// Each message will be delayed by this amount of time.
+    pub send_delay: Delay,
+}
diff --git a/libs/desim/src/proto.rs b/libs/desim/src/proto.rs
new file mode 100644
index 0000000000..92a7e8a27d
--- /dev/null
+++ b/libs/desim/src/proto.rs
@@ -0,0 +1,63 @@
+use std::fmt::Debug;
+
+use bytes::Bytes;
+use utils::lsn::Lsn;
+
+use crate::{network::TCP, world::NodeId};
+
+/// Internal node events.
+#[derive(Debug)]
+pub enum NodeEvent {
+    Accept(TCP),
+    Internal(AnyMessage),
+}
+
+/// Events that are coming from a network socket.
+#[derive(Clone, Debug)]
+pub enum NetEvent {
+    Message(AnyMessage),
+    Closed,
+}
+
+/// Custom events generated throughout the simulation. Can be used by the test to verify the correctness.
+#[derive(Debug)]
+pub struct SimEvent {
+    pub time: u64,
+    pub node: NodeId,
+    pub data: String,
+}
+
+/// Umbrella type for all possible flavours of messages. These events can be sent over the network
+/// or to an internal node events channel.
+#[derive(Clone)]
+pub enum AnyMessage {
+    /// Not used, empty placeholder.
+    None,
+    /// Used internally for notifying a node about a new incoming connection.
+ InternalConnect, + Just32(u32), + ReplCell(ReplCell), + Bytes(Bytes), + LSN(u64), +} + +impl Debug for AnyMessage { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + AnyMessage::None => write!(f, "None"), + AnyMessage::InternalConnect => write!(f, "InternalConnect"), + AnyMessage::Just32(v) => write!(f, "Just32({})", v), + AnyMessage::ReplCell(v) => write!(f, "ReplCell({:?})", v), + AnyMessage::Bytes(v) => write!(f, "Bytes({})", hex::encode(v)), + AnyMessage::LSN(v) => write!(f, "LSN({})", Lsn(*v)), + } + } +} + +/// Used in reliable_copy_test.rs +#[derive(Clone, Debug)] +pub struct ReplCell { + pub value: u32, + pub client_id: u32, + pub seqno: u32, +} diff --git a/libs/desim/src/time.rs b/libs/desim/src/time.rs new file mode 100644 index 0000000000..7bb71db95c --- /dev/null +++ b/libs/desim/src/time.rs @@ -0,0 +1,129 @@ +use std::{ + cmp::Ordering, + collections::BinaryHeap, + ops::DerefMut, + sync::{ + atomic::{AtomicU32, AtomicU64}, + Arc, + }, +}; + +use parking_lot::Mutex; +use tracing::trace; + +use crate::executor::ThreadContext; + +/// Holds current time and all pending wakeup events. +pub struct Timing { + /// Current world's time. + current_time: AtomicU64, + /// Pending timers. + queue: Mutex>, + /// Global nonce. Makes picking events from binary heap queue deterministic + /// by appending a number to events with the same timestamp. + nonce: AtomicU32, + /// Used to schedule fake events. + fake_context: Arc, +} + +impl Default for Timing { + fn default() -> Self { + Self::new() + } +} + +impl Timing { + /// Create a new empty clock with time set to 0. + pub fn new() -> Timing { + Timing { + current_time: AtomicU64::new(0), + queue: Mutex::new(BinaryHeap::new()), + nonce: AtomicU32::new(0), + fake_context: Arc::new(ThreadContext::new()), + } + } + + /// Return the current world's time. + pub fn now(&self) -> u64 { + self.current_time.load(std::sync::atomic::Ordering::SeqCst) + } + + /// Tick-tock the global clock. Return the event ready to be processed + /// or move the clock forward and then return the event. + pub(crate) fn step(&self) -> Option> { + let mut queue = self.queue.lock(); + + if queue.is_empty() { + // no future events + return None; + } + + if !self.is_event_ready(queue.deref_mut()) { + let next_time = queue.peek().unwrap().time; + self.current_time + .store(next_time, std::sync::atomic::Ordering::SeqCst); + trace!("rewind time to {}", next_time); + assert!(self.is_event_ready(queue.deref_mut())); + } + + Some(queue.pop().unwrap().wake_context) + } + + /// Append an event to the queue, to wakeup the thread in `ms` milliseconds. + pub(crate) fn schedule_wakeup(&self, ms: u64, wake_context: Arc) { + self.nonce.fetch_add(1, std::sync::atomic::Ordering::SeqCst); + let nonce = self.nonce.load(std::sync::atomic::Ordering::SeqCst); + self.queue.lock().push(Pending { + time: self.now() + ms, + nonce, + wake_context, + }) + } + + /// Append a fake event to the queue, to prevent clocks from skipping this time. + pub fn schedule_fake(&self, ms: u64) { + self.queue.lock().push(Pending { + time: self.now() + ms, + nonce: 0, + wake_context: self.fake_context.clone(), + }); + } + + /// Return true if there is a ready event. + fn is_event_ready(&self, queue: &mut BinaryHeap) -> bool { + queue.peek().map_or(false, |x| x.time <= self.now()) + } + + /// Clear all pending events. 
+ pub(crate) fn clear(&self) { + self.queue.lock().clear(); + } +} + +struct Pending { + time: u64, + nonce: u32, + wake_context: Arc, +} + +// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here +// to get that. +impl PartialOrd for Pending { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Pending { + fn cmp(&self, other: &Self) -> Ordering { + (other.time, other.nonce).cmp(&(self.time, self.nonce)) + } +} + +impl PartialEq for Pending { + fn eq(&self, other: &Self) -> bool { + (other.time, other.nonce) == (self.time, self.nonce) + } +} + +impl Eq for Pending {} diff --git a/libs/desim/src/world.rs b/libs/desim/src/world.rs new file mode 100644 index 0000000000..7d60be04b5 --- /dev/null +++ b/libs/desim/src/world.rs @@ -0,0 +1,180 @@ +use parking_lot::Mutex; +use rand::{rngs::StdRng, SeedableRng}; +use std::{ + ops::DerefMut, + sync::{mpsc, Arc}, +}; + +use crate::{ + executor::{ExternalHandle, Runtime}, + network::NetworkTask, + options::NetworkOptions, + proto::{NodeEvent, SimEvent}, + time::Timing, +}; + +use super::{chan::Chan, network::TCP, node_os::NodeOs}; + +pub type NodeId = u32; + +/// World contains simulation state. +pub struct World { + nodes: Mutex>>, + /// Random number generator. + rng: Mutex, + /// Internal event log. + events: Mutex>, + /// Separate task that processes all network messages. + network_task: Arc, + /// Runtime for running threads and moving time. + runtime: Mutex, + /// To get current time. + timing: Arc, +} + +impl World { + pub fn new(seed: u64, options: Arc) -> World { + let timing = Arc::new(Timing::new()); + let mut runtime = Runtime::new(timing.clone()); + + let (tx, rx) = mpsc::channel(); + + runtime.spawn(move || { + // create and start network background thread, and send it back via the channel + NetworkTask::start_new(options, tx) + }); + + // wait for the network task to start + while runtime.step() {} + + let network_task = rx.recv().unwrap(); + + World { + nodes: Mutex::new(Vec::new()), + rng: Mutex::new(StdRng::seed_from_u64(seed)), + events: Mutex::new(Vec::new()), + network_task, + runtime: Mutex::new(runtime), + timing, + } + } + + pub fn step(&self) -> bool { + self.runtime.lock().step() + } + + pub fn get_thread_step_count(&self) -> u64 { + self.runtime.lock().step_counter + } + + /// Create a new random number generator. + pub fn new_rng(&self) -> StdRng { + let mut rng = self.rng.lock(); + StdRng::from_rng(rng.deref_mut()).unwrap() + } + + /// Create a new node. + pub fn new_node(self: &Arc) -> Arc { + let mut nodes = self.nodes.lock(); + let id = nodes.len() as NodeId; + let node = Arc::new(Node::new(id, self.clone(), self.new_rng())); + nodes.push(node.clone()); + node + } + + /// Get an internal node state by id. + fn get_node(&self, id: NodeId) -> Option> { + let nodes = self.nodes.lock(); + let num = id as usize; + if num < nodes.len() { + Some(nodes[num].clone()) + } else { + None + } + } + + pub fn stop_all(&self) { + self.runtime.lock().crash_all_threads(); + } + + /// Returns a writable end of a TCP connection, to send src->dst messages. + pub fn open_tcp(self: &Arc, dst: NodeId) -> TCP { + // TODO: replace unwrap() with /dev/null socket. + let dst = self.get_node(dst).unwrap(); + let dst_accept = dst.node_events.lock().clone(); + + let rng = self.new_rng(); + self.network_task.start_new_connection(rng, dst_accept) + } + + /// Get current time. + pub fn now(&self) -> u64 { + self.timing.now() + } + + /// Get a copy of the internal clock. 
+ pub fn clock(&self) -> Arc { + self.timing.clone() + } + + pub fn add_event(&self, node: NodeId, data: String) { + let time = self.now(); + self.events.lock().push(SimEvent { time, node, data }); + } + + pub fn take_events(&self) -> Vec { + let mut events = self.events.lock(); + let mut res = Vec::new(); + std::mem::swap(&mut res, &mut events); + res + } + + pub fn deallocate(&self) { + self.stop_all(); + self.timing.clear(); + self.nodes.lock().clear(); + } +} + +/// Internal node state. +pub struct Node { + pub id: NodeId, + node_events: Mutex>, + world: Arc, + pub(crate) rng: Mutex, +} + +impl Node { + pub fn new(id: NodeId, world: Arc, rng: StdRng) -> Node { + Node { + id, + node_events: Mutex::new(Chan::new()), + world, + rng: Mutex::new(rng), + } + } + + /// Spawn a new thread with this node context. + pub fn launch(self: &Arc, f: impl FnOnce(NodeOs) + Send + 'static) -> ExternalHandle { + let node = self.clone(); + let world = self.world.clone(); + self.world.runtime.lock().spawn(move || { + f(NodeOs::new(world, node.clone())); + }) + } + + /// Returns a channel to receive Accepts and internal messages. + pub fn node_events(&self) -> Chan { + self.node_events.lock().clone() + } + + /// This will drop all in-flight Accept messages. + pub fn replug_node_events(&self, chan: Chan) { + *self.node_events.lock() = chan; + } + + /// Append event to the world's log. + pub fn log_event(&self, data: String) { + self.world.add_event(self.id, data) + } +} diff --git a/libs/desim/tests/reliable_copy_test.rs b/libs/desim/tests/reliable_copy_test.rs new file mode 100644 index 0000000000..cf7bff8f5a --- /dev/null +++ b/libs/desim/tests/reliable_copy_test.rs @@ -0,0 +1,244 @@ +//! Simple test to verify that simulator is working. +#[cfg(test)] +mod reliable_copy_test { + use anyhow::Result; + use desim::executor::{self, PollSome}; + use desim::options::{Delay, NetworkOptions}; + use desim::proto::{NetEvent, NodeEvent, ReplCell}; + use desim::world::{NodeId, World}; + use desim::{node_os::NodeOs, proto::AnyMessage}; + use parking_lot::Mutex; + use std::sync::Arc; + use tracing::info; + + /// Disk storage trait and implementation. + pub trait Storage { + fn flush_pos(&self) -> u32; + fn flush(&mut self) -> Result<()>; + fn write(&mut self, t: T); + } + + #[derive(Clone)] + pub struct SharedStorage { + pub state: Arc>>, + } + + impl SharedStorage { + pub fn new() -> Self { + Self { + state: Arc::new(Mutex::new(InMemoryStorage::new())), + } + } + } + + impl Storage for SharedStorage { + fn flush_pos(&self) -> u32 { + self.state.lock().flush_pos + } + + fn flush(&mut self) -> Result<()> { + executor::yield_me(0); + self.state.lock().flush() + } + + fn write(&mut self, t: T) { + executor::yield_me(0); + self.state.lock().write(t); + } + } + + pub struct InMemoryStorage { + pub data: Vec, + pub flush_pos: u32, + } + + impl InMemoryStorage { + pub fn new() -> Self { + Self { + data: Vec::new(), + flush_pos: 0, + } + } + + pub fn flush(&mut self) -> Result<()> { + self.flush_pos = self.data.len() as u32; + Ok(()) + } + + pub fn write(&mut self, t: T) { + self.data.push(t); + } + } + + /// Server implementation. 
+ pub fn run_server(os: NodeOs, mut storage: Box>) { + info!("started server"); + + let node_events = os.node_events(); + let mut epoll_vec: Vec> = vec![Box::new(node_events.clone())]; + let mut sockets = vec![]; + + loop { + let index = executor::epoll_chans(&epoll_vec, -1).unwrap(); + + if index == 0 { + let node_event = node_events.must_recv(); + info!("got node event: {:?}", node_event); + if let NodeEvent::Accept(tcp) = node_event { + tcp.send(AnyMessage::Just32(storage.flush_pos())); + epoll_vec.push(Box::new(tcp.recv_chan())); + sockets.push(tcp); + } + continue; + } + + let recv_chan = sockets[index - 1].recv_chan(); + let socket = &sockets[index - 1]; + + let event = recv_chan.must_recv(); + info!("got event: {:?}", event); + if let NetEvent::Message(AnyMessage::ReplCell(cell)) = event { + if cell.seqno != storage.flush_pos() { + info!("got out of order data: {:?}", cell); + continue; + } + storage.write(cell.value); + storage.flush().unwrap(); + socket.send(AnyMessage::Just32(storage.flush_pos())); + } + } + } + + /// Client copies all data from array to the remote node. + pub fn run_client(os: NodeOs, data: &[ReplCell], dst: NodeId) { + info!("started client"); + + let mut delivered = 0; + + let mut sock = os.open_tcp(dst); + let mut recv_chan = sock.recv_chan(); + + while delivered < data.len() { + let num = &data[delivered]; + info!("sending data: {:?}", num.clone()); + sock.send(AnyMessage::ReplCell(num.clone())); + + // loop { + let event = recv_chan.recv(); + match event { + NetEvent::Message(AnyMessage::Just32(flush_pos)) => { + if flush_pos == 1 + delivered as u32 { + delivered += 1; + } + } + NetEvent::Closed => { + info!("connection closed, reestablishing"); + sock = os.open_tcp(dst); + recv_chan = sock.recv_chan(); + } + _ => {} + } + + // } + } + + let sock = os.open_tcp(dst); + for num in data { + info!("sending data: {:?}", num.clone()); + sock.send(AnyMessage::ReplCell(num.clone())); + } + + info!("sent all data and finished client"); + } + + /// Run test simulations. 
+ #[test] + fn sim_example_reliable_copy() { + utils::logging::init( + utils::logging::LogFormat::Test, + utils::logging::TracingErrorLayerEnablement::Disabled, + utils::logging::Output::Stdout, + ) + .expect("logging init failed"); + + let delay = Delay { + min: 1, + max: 60, + fail_prob: 0.4, + }; + + let network = NetworkOptions { + keepalive_timeout: Some(50), + connect_delay: delay.clone(), + send_delay: delay.clone(), + }; + + for seed in 0..20 { + let u32_data: [u32; 5] = [1, 2, 3, 4, 5]; + let data = u32_to_cells(&u32_data, 1); + let world = Arc::new(World::new(seed, Arc::new(network.clone()))); + + start_simulation(Options { + world, + time_limit: 1_000_000, + client_fn: Box::new(move |os, server_id| run_client(os, &data, server_id)), + u32_data, + }); + } + } + + pub struct Options { + pub world: Arc, + pub time_limit: u64, + pub u32_data: [u32; 5], + pub client_fn: Box, + } + + pub fn start_simulation(options: Options) { + let world = options.world; + + let client_node = world.new_node(); + let server_node = world.new_node(); + let server_id = server_node.id; + + // start the client thread + client_node.launch(move |os| { + let client_fn = options.client_fn; + client_fn(os, server_id); + }); + + // start the server thread + let shared_storage = SharedStorage::new(); + let server_storage = shared_storage.clone(); + server_node.launch(move |os| run_server(os, Box::new(server_storage))); + + while world.step() && world.now() < options.time_limit {} + + let disk_data = shared_storage.state.lock().data.clone(); + assert!(verify_data(&disk_data, &options.u32_data[..])); + } + + pub fn u32_to_cells(data: &[u32], client_id: u32) -> Vec { + let mut res = Vec::new(); + for (i, _) in data.iter().enumerate() { + res.push(ReplCell { + client_id, + seqno: i as u32, + value: data[i], + }); + } + res + } + + fn verify_data(disk_data: &[u32], data: &[u32]) -> bool { + if disk_data.len() != data.len() { + return false; + } + for i in 0..data.len() { + if disk_data[i] != data[i] { + return false; + } + } + true + } +} diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index d4323ae766..f87e7b8e3a 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -9,5 +9,13 @@ prometheus.workspace = true libc.workspace = true once_cell.workspace = true chrono.workspace = true +twox-hash.workspace = true +measured.workspace = true -workspace_hack.workspace = true +[target.'cfg(target_os = "linux")'.dependencies] +procfs.workspace = true +measured-process.workspace = true + +[dev-dependencies] +rand = "0.8" +rand_distr = "0.4.3" diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs new file mode 100644 index 0000000000..723916a742 --- /dev/null +++ b/libs/metrics/src/hll.rs @@ -0,0 +1,324 @@ +//! HyperLogLog is an algorithm for the count-distinct problem, +//! approximating the number of distinct elements in a multiset. +//! Calculating the exact cardinality of the distinct elements +//! of a multiset requires an amount of memory proportional to +//! the cardinality, which is impractical for very large data sets. +//! Probabilistic cardinality estimators, such as the HyperLogLog algorithm, +//! use significantly less memory than this, but can only approximate the cardinality. 
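Not part of the patch, but a self-contained sketch of the arithmetic this file implements may help. It is written against the constants used below (N = 32 shards selected by the low bits of the hash, alpha ≈ 0.697 as cited in the doc comment); the function names are illustrative and mirror `HyperLogLogState::record` and the PromQL estimate described in the documentation:

```rust
// Illustrative only: update one of N = 32 shards from a 64-bit hash.
fn record(shards: &mut [u8; 32], hash: u64) {
    let p = 32u64.ilog2() as u8; // 5 bits are used to pick a shard
    let j = (hash & 31) as usize; // low bits: shard index
    let rho = (hash >> p).leading_zeros() as u8 + 1 - p; // rank of the rest
    shards[j] = shards[j].max(rho);
}

// Harmonic-mean estimate over the shards, scaled by alpha * N^2. This is the
// same computation the PromQL queries in the doc comment perform server-side.
fn estimate(shards: &[u8; 32]) -> f64 {
    let denom: f64 = shards.iter().map(|&r| 2.0f64.powi(-(r as i32))).sum();
    0.697 * 32.0 * 32.0 / denom
}
```

Note that with every shard still at zero the raw estimate is alpha * N ≈ 22, not 0, which is why the doc comment below also gives a linear-counting query for the low-cardinality regime.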
+ +use std::{ + hash::{BuildHasher, BuildHasherDefault, Hash}, + sync::atomic::AtomicU8, +}; + +use measured::{ + label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, + metric::{counter::CounterState, name::MetricNameEncoder, Metric, MetricType, MetricVec}, + text::TextEncoder, + LabelGroup, +}; +use twox_hash::xxh3; + +/// Create an [`HyperLogLogVec`] and registers to default registry. +#[macro_export(local_inner_macros)] +macro_rules! register_hll_vec { + ($N:literal, $OPTS:expr, $LABELS_NAMES:expr $(,)?) => {{ + let hll_vec = $crate::HyperLogLogVec::<$N>::new($OPTS, $LABELS_NAMES).unwrap(); + $crate::register(Box::new(hll_vec.clone())).map(|_| hll_vec) + }}; + + ($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{ + $crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES) + }}; +} + +/// Create an [`HyperLogLog`] and registers to default registry. +#[macro_export(local_inner_macros)] +macro_rules! register_hll { + ($N:literal, $OPTS:expr $(,)?) => {{ + let hll = $crate::HyperLogLog::<$N>::with_opts($OPTS).unwrap(); + $crate::register(Box::new(hll.clone())).map(|_| hll) + }}; + + ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{ + $crate::register_hll!($N, $crate::opts!($NAME, $HELP)) + }}; +} + +/// HLL is a probabilistic cardinality measure. +/// +/// How to use this time-series for a metric name `my_metrics_total_hll`: +/// +/// ```promql +/// # harmonic mean +/// 1 / ( +/// sum ( +/// 2 ^ -( +/// # HLL merge operation +/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...) +/// ) +/// ) without (hll_shard) +/// ) +/// * alpha +/// * shards_count +/// * shards_count +/// ``` +/// +/// If you want an estimate over time, you can use the following query: +/// +/// ```promql +/// # harmonic mean +/// 1 / ( +/// sum ( +/// 2 ^ -( +/// # HLL merge operation +/// max ( +/// max_over_time(my_metrics_total_hll{}[$__rate_interval]) +/// ) by (hll_shard, other_labels...) +/// ) +/// ) without (hll_shard) +/// ) +/// * alpha +/// * shards_count +/// * shards_count +/// ``` +/// +/// In the case of low cardinality, you might want to use the linear counting approximation: +/// +/// ```promql +/// # LinearCounting(m, V) = m log (m / V) +/// shards_count * ln(shards_count / +/// # calculate V = how many shards contain a 0 +/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard) +/// ) +/// ``` +/// +/// See for estimates on alpha +pub type HyperLogLogVec = MetricVec, L>; +pub type HyperLogLog = Metric>; + +pub struct HyperLogLogState { + shards: [AtomicU8; N], +} +impl Default for HyperLogLogState { + fn default() -> Self { + #[allow(clippy::declare_interior_mutable_const)] + const ZERO: AtomicU8 = AtomicU8::new(0); + Self { shards: [ZERO; N] } + } +} + +impl MetricType for HyperLogLogState { + type Metadata = (); +} + +impl HyperLogLogState { + pub fn measure(&self, item: &impl Hash) { + // changing the hasher will break compatibility with previous measurements. + self.record(BuildHasherDefault::::default().hash_one(item)); + } + + fn record(&self, hash: u64) { + let p = N.ilog2() as u8; + let j = hash & (N as u64 - 1); + let rho = (hash >> p).leading_zeros() as u8 + 1 - p; + self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); + } + + fn take_sample(&self) -> [u8; N] { + self.shards.each_ref().map(|x| { + // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus. 
+ + // This seems like it would be a race condition, + // but HLL is not impacted by a write in one shard happening in between. + // This is because in PromQL we will be implementing a harmonic mean of all buckets. + // we will also merge samples in a time series using `max by (hll_shard)`. + + // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window. + // this would mean that a dev port-forwarding the metrics url won't break the sampling. + x.swap(0, std::sync::atomic::Ordering::Relaxed) + }) + } +} + +impl measured::metric::MetricEncoding> + for HyperLogLogState +{ + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + enc.write_type(&name, measured::text::MetricType::Gauge) + } + fn collect_into( + &self, + _: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + struct I64(i64); + impl LabelValue for I64 { + fn visit(&self, v: V) -> V::Output { + v.write_int(self.0) + } + } + + struct HllShardLabel { + hll_shard: i64, + } + + impl LabelGroup for HllShardLabel { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const LE: &LabelName = LabelName::from_str("hll_shard"); + v.write_value(LE, &I64(self.hll_shard)); + } + } + + self.take_sample() + .into_iter() + .enumerate() + .try_for_each(|(hll_shard, val)| { + CounterState::new(val as u64).collect_into( + &(), + labels.by_ref().compose_with(HllShardLabel { + hll_shard: hll_shard as i64, + }), + name.by_ref(), + enc, + ) + }) + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + use measured::{label::StaticLabelSet, FixedCardinalityLabel}; + use rand::{rngs::StdRng, Rng, SeedableRng}; + use rand_distr::{Distribution, Zipf}; + + use crate::HyperLogLogVec; + + #[derive(FixedCardinalityLabel, Clone, Copy)] + #[label(singleton = "x")] + enum Label { + A, + B, + } + + fn collect(hll: &HyperLogLogVec, 32>) -> ([u8; 32], [u8; 32]) { + // cannot go through the `hll.collect_family_into` interface yet... + // need to see if I can fix the conflicting impls problem in measured. 
+ ( + hll.get_metric(hll.with_labels(Label::A)).take_sample(), + hll.get_metric(hll.with_labels(Label::B)).take_sample(), + ) + } + + fn get_cardinality(samples: &[[u8; 32]]) -> f64 { + let mut buckets = [0.0; 32]; + for &sample in samples { + for (i, m) in sample.into_iter().enumerate() { + buckets[i] = f64::max(buckets[i], m as f64); + } + } + + buckets + .into_iter() + .map(|f| 2.0f64.powf(-f)) + .sum::() + .recip() + * 0.697 + * 32.0 + * 32.0 + } + + fn test_cardinality(n: usize, dist: impl Distribution) -> ([usize; 3], [f64; 3]) { + let hll = HyperLogLogVec::, 32>::new(); + + let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist); + let mut set_a = HashSet::new(); + let mut set_b = HashSet::new(); + + for x in iter.by_ref().take(n) { + set_a.insert(x.to_bits()); + hll.get_metric(hll.with_labels(Label::A)) + .measure(&x.to_bits()); + } + for x in iter.by_ref().take(n) { + set_b.insert(x.to_bits()); + hll.get_metric(hll.with_labels(Label::B)) + .measure(&x.to_bits()); + } + let merge = &set_a | &set_b; + + let (a, b) = collect(&hll); + let len = get_cardinality(&[a, b]); + let len_a = get_cardinality(&[a]); + let len_b = get_cardinality(&[b]); + + ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b]) + } + + #[test] + fn test_cardinality_small() { + let (actual, estimate) = test_cardinality(100, Zipf::new(100, 1.2f64).unwrap()); + + assert_eq!(actual, [46, 30, 32]); + assert!(51.3 < estimate[0] && estimate[0] < 51.4); + assert!(44.0 < estimate[1] && estimate[1] < 44.1); + assert!(39.0 < estimate[2] && estimate[2] < 39.1); + } + + #[test] + fn test_cardinality_medium() { + let (actual, estimate) = test_cardinality(10000, Zipf::new(10000, 1.2f64).unwrap()); + + assert_eq!(actual, [2529, 1618, 1629]); + assert!(2309.1 < estimate[0] && estimate[0] < 2309.2); + assert!(1566.6 < estimate[1] && estimate[1] < 1566.7); + assert!(1629.5 < estimate[2] && estimate[2] < 1629.6); + } + + #[test] + fn test_cardinality_large() { + let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(1_000_000, 1.2f64).unwrap()); + + assert_eq!(actual, [129077, 79579, 79630]); + assert!(126067.2 < estimate[0] && estimate[0] < 126067.3); + assert!(83076.8 < estimate[1] && estimate[1] < 83076.9); + assert!(64251.2 < estimate[2] && estimate[2] < 64251.3); + } + + #[test] + fn test_cardinality_small2() { + let (actual, estimate) = test_cardinality(100, Zipf::new(200, 0.8f64).unwrap()); + + assert_eq!(actual, [92, 58, 60]); + assert!(116.1 < estimate[0] && estimate[0] < 116.2); + assert!(81.7 < estimate[1] && estimate[1] < 81.8); + assert!(69.3 < estimate[2] && estimate[2] < 69.4); + } + + #[test] + fn test_cardinality_medium2() { + let (actual, estimate) = test_cardinality(10000, Zipf::new(20000, 0.8f64).unwrap()); + + assert_eq!(actual, [8201, 5131, 5051]); + assert!(6846.4 < estimate[0] && estimate[0] < 6846.5); + assert!(5239.1 < estimate[1] && estimate[1] < 5239.2); + assert!(4292.8 < estimate[2] && estimate[2] < 4292.9); + } + + #[test] + fn test_cardinality_large2() { + let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(2_000_000, 0.8f64).unwrap()); + + assert_eq!(actual, [777847, 482069, 482246]); + assert!(699437.4 < estimate[0] && estimate[0] < 699437.5); + assert!(374948.9 < estimate[1] && estimate[1] < 374949.0); + assert!(434609.7 < estimate[2] && estimate[2] < 434609.8); + } +} diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index d09ba11344..cd4526c089 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -4,6 +4,17 @@ //! 
a default registry. #![deny(clippy::undocumented_unsafe_blocks)] +use measured::{ + label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels}, + metric::{ + counter::CounterState, + gauge::GaugeState, + group::Encoding, + name::{MetricName, MetricNameEncoder}, + MetricEncoding, MetricFamilyEncoding, + }, + FixedCardinalityLabel, LabelGroup, MetricGroup, +}; use once_cell::sync::Lazy; use prometheus::core::{ Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, @@ -11,6 +22,7 @@ use prometheus::core::{ pub use prometheus::opts; pub use prometheus::register; pub use prometheus::Error; +use prometheus::Registry; pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_counter_vec, Counter, CounterVec}; @@ -23,12 +35,14 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec}; pub use prometheus::{register_int_gauge, IntGauge}; pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; pub use prometheus::{Encoder, TextEncoder}; -use prometheus::{Registry, Result}; pub mod launch_timestamp; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; -pub mod metric_vec_duration; +mod hll; +pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec}; +#[cfg(target_os = "linux")] +pub mod more_process_metrics; pub type UIntGauge = GenericGauge; pub type UIntGaugeVec = GenericGaugeVec; @@ -54,9 +68,10 @@ macro_rules! register_uint_gauge { static INTERNAL_REGISTRY: Lazy = Lazy::new(Registry::new); /// Register a collector in the internal registry. MUST be called before the first call to `gather()`. +/// /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector /// while holding the lock. -pub fn register_internal(c: Box) -> Result<()> { +pub fn register_internal(c: Box) -> prometheus::Result<()> { INTERNAL_REGISTRY.register(c) } @@ -89,9 +104,134 @@ static MAXRSS_KB: Lazy = Lazy::new(|| { .expect("Failed to register maxrss_kb int gauge") }); -pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[ - 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, -]; +/// Most common fsync latency is 50 µs - 100 µs, but it can be much higher, +/// especially during many concurrent disk operations. 
+pub const DISK_FSYNC_SECONDS_BUCKETS: &[f64] = + &[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 30.0]; + +pub struct BuildInfo { + pub revision: &'static str, + pub build_tag: &'static str, +} + +// todo: allow label group without the set +impl LabelGroup for BuildInfo { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const REVISION: &LabelName = LabelName::from_str("revision"); + v.write_value(REVISION, &self.revision); + const BUILD_TAG: &LabelName = LabelName::from_str("build_tag"); + v.write_value(BUILD_TAG, &self.build_tag); + } +} + +impl MetricFamilyEncoding for BuildInfo +where + GaugeState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + enc.write_help(&name, "Build/version information")?; + GaugeState::write_type(&name, enc)?; + GaugeState { + count: std::sync::atomic::AtomicI64::new(1), + } + .collect_into(&(), self, name, enc) + } +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct NeonMetrics { + #[cfg(target_os = "linux")] + #[metric(namespace = "process")] + #[metric(init = measured_process::ProcessCollector::for_self())] + process: measured_process::ProcessCollector, + + #[metric(namespace = "libmetrics")] + #[metric(init = LibMetrics::new(build_info))] + libmetrics: LibMetrics, +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct LibMetrics { + #[metric(init = build_info)] + build_info: BuildInfo, + + #[metric(flatten)] + rusage: Rusage, + + serve_count: CollectionCounter, +} + +fn write_gauge( + x: i64, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Enc, +) -> Result<(), Enc::Err> +where + GaugeState: MetricEncoding, +{ + GaugeState::new(x).collect_into(&(), labels, name, enc) +} + +#[derive(Default)] +struct Rusage; + +#[derive(FixedCardinalityLabel, Clone, Copy)] +#[label(singleton = "io_operation")] +enum IoOp { + Read, + Write, +} + +impl MetricGroup for Rusage +where + GaugeState: MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total"); + const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb"); + + let ru = get_rusage_stats(); + + enc.write_help( + DISK_IO, + "Bytes written and read from disk, grouped by the operation (read|write)", + )?; + GaugeState::write_type(DISK_IO, enc)?; + write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?; + write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?; + + enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?; + GaugeState::write_type(MAXRSS, enc)?; + write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?; + + Ok(()) + } +} + +#[derive(Default)] +struct CollectionCounter(CounterState); + +impl MetricFamilyEncoding for CollectionCounter +where + CounterState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + self.0.inc(); + enc.write_help(&name, "Number of metric requests made")?; + self.0.collect_into(&(), NoLabels, name, enc) + } +} pub fn set_build_info_metric(revision: &str, build_tag: &str) { let metric = register_int_gauge_vec!( @@ -102,6 +242,7 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) { .expect("Failed to register build info metric"); metric.with_label_values(&[revision, build_tag]).set(1); } +const BYTES_IN_BLOCK: i64 = 512; // Records 
I/O stats in a "cross-platform" way.
 // Compiles both on macOS and Linux, but the current macOS implementation always returns 0 as values for I/O stats.
@@ -111,18 +252,25 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
 // performed by the process.
 // We know the size of the block, so we can determine the I/O bytes out of it.
 // The value might not be 100% exact, but it should be fine for Prometheus metrics in this case.
-#[allow(clippy::unnecessary_cast)]
 fn update_rusage_metrics() {
     let rusage_stats = get_rusage_stats();
-    const BYTES_IN_BLOCK: i64 = 512;
     DISK_IO_BYTES
         .with_label_values(&["read"])
         .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
     DISK_IO_BYTES
         .with_label_values(&["write"])
         .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
-    MAXRSS_KB.set(rusage_stats.ru_maxrss);
+
+    // On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669
+    #[cfg(target_os = "macos")]
+    {
+        MAXRSS_KB.set(rusage_stats.ru_maxrss / 1024);
+    }
+    #[cfg(not(target_os = "macos"))]
+    {
+        MAXRSS_KB.set(rusage_stats.ru_maxrss);
+    }
 }
 
 fn get_rusage_stats() -> libc::rusage {
@@ -149,6 +297,7 @@ macro_rules! register_int_counter_pair_vec {
         }
     }};
 }
+
 /// Create an [`IntCounterPair`] and register it to the default registry.
 #[macro_export(local_inner_macros)]
 macro_rules! register_int_counter_pair {
@@ -186,7 +335,10 @@ impl<P: Atomic> GenericCounterPairVec<P> {
     ///
     /// An error is returned if the number of label values is not the same as the
     /// number of VariableLabels in Desc.
-    pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
+    pub fn get_metric_with_label_values(
+        &self,
+        vals: &[&str],
+    ) -> prometheus::Result<GenericCounterPair<P>> {
         Ok(GenericCounterPair {
             inc: self.inc.get_metric_with_label_values(vals)?,
             dec: self.dec.get_metric_with_label_values(vals)?,
@@ -198,6 +350,11 @@ impl<P: Atomic> GenericCounterPairVec<P> {
     pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> {
         self.get_metric_with_label_values(vals).unwrap()
     }
+
+    pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) {
+        res[0] = self.inc.remove_label_values(vals);
+        res[1] = self.dec.remove_label_values(vals);
+    }
 }
 
 impl<P: Atomic> GenericCounterPair<P> {
@@ -244,6 +401,15 @@ impl<P: Atomic> GenericCounterPair<P> {
     }
 }
 
+impl<P: Atomic> Clone for GenericCounterPair<P> {
+    fn clone(&self) -> Self {
+        Self {
+            inc: self.inc.clone(),
+            dec: self.dec.clone(),
+        }
+    }
+}
+
 /// Guard returned by [`GenericCounterPair::guard`]
 pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>
); @@ -269,3 +435,162 @@ pub type IntCounterPair = GenericCounterPair; /// A guard for [`IntCounterPair`] that will decrement the gauge on drop pub type IntCounterPairGuard = GenericCounterPairGuard; + +pub trait CounterPairAssoc { + const INC_NAME: &'static MetricName; + const DEC_NAME: &'static MetricName; + + const INC_HELP: &'static str; + const DEC_HELP: &'static str; + + type LabelGroupSet: LabelGroupSet; +} + +pub struct CounterPairVec { + vec: measured::metric::MetricVec, +} + +impl Default for CounterPairVec +where + A::LabelGroupSet: Default, +{ + fn default() -> Self { + Self { + vec: Default::default(), + } + } +} + +impl CounterPairVec { + pub fn guard( + &self, + labels: ::Group<'_>, + ) -> MeasuredCounterPairGuard<'_, A> { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + MeasuredCounterPairGuard { vec: &self.vec, id } + } + pub fn inc(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + } + pub fn dec(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).dec.inc(); + } + pub fn remove_metric( + &self, + labels: ::Group<'_>, + ) -> Option { + let id = self.vec.with_labels(labels); + self.vec.remove_metric(id) + } + + pub fn sample(&self, labels: ::Group<'_>) -> u64 { + let id = self.vec.with_labels(labels); + let metric = self.vec.get_metric(id); + + let inc = metric.inc.count.load(std::sync::atomic::Ordering::Relaxed); + let dec = metric.dec.count.load(std::sync::atomic::Ordering::Relaxed); + inc.saturating_sub(dec) + } +} + +impl ::measured::metric::group::MetricGroup for CounterPairVec +where + T: ::measured::metric::group::Encoding, + A: CounterPairAssoc, + ::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + // write decrement first to avoid a race condition where inc - dec < 0 + T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?; + self.vec + .collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?; + + T::write_help(enc, A::INC_NAME, A::INC_HELP)?; + self.vec + .collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?; + + Ok(()) + } +} + +#[derive(MetricGroup, Default)] +pub struct MeasuredCounterPairState { + pub inc: CounterState, + pub dec: CounterState, +} + +impl measured::metric::MetricType for MeasuredCounterPairState { + type Metadata = (); +} + +pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> { + vec: &'a measured::metric::MetricVec, + id: measured::metric::LabelId, +} + +impl Drop for MeasuredCounterPairGuard<'_, A> { + fn drop(&mut self) { + self.vec.get_metric(self.id).dec.inc(); + } +} + +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder. +struct Inc(T); +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder. 
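For illustration, a minimal sketch of how a caller might wire up the `CounterPairAssoc`/`CounterPairVec` API above. The `ConnectionsAssoc` type, the label enum, and the metric names here are hypothetical (not part of this patch), and `StaticLabelSet` is assumed to be measured's label set for fixed-cardinality labels:

    use measured::label::StaticLabelSet;

    #[derive(FixedCardinalityLabel, Clone, Copy)]
    #[label(singleton = "protocol")]
    enum Protocol {
        Http,
        Ws,
    }

    // Hypothetical association tying the paired counter names together.
    struct ConnectionsAssoc;

    impl CounterPairAssoc for ConnectionsAssoc {
        const INC_NAME: &'static MetricName = MetricName::from_str("opened_connections_total");
        const DEC_NAME: &'static MetricName = MetricName::from_str("closed_connections_total");
        const INC_HELP: &'static str = "Number of connections opened";
        const DEC_HELP: &'static str = "Number of connections closed";
        type LabelGroupSet = StaticLabelSet<Protocol>;
    }

    fn serve(metrics: &CounterPairVec<ConnectionsAssoc>) {
        // Increments the inc counter now and the dec counter on drop, so
        // sample() reports the number of currently live connections.
        let _guard = metrics.guard(Protocol::Http);
        // ... handle the connection ...
    }

Since `StaticLabelSet` is `Default`, the vec itself can be created with `CounterPairVec::default()` and embedded in a `MetricGroup` via the blanket `collect_group_into` impl above.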
+struct Dec(T); + +impl Encoding for Inc { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } +} + +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Inc) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Inc, + ) -> Result<(), T::Err> { + self.inc.collect_into(metadata, labels, name, &mut enc.0) + } +} + +impl Encoding for Dec { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } +} + +/// Write the dec counter to the encoder +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Dec) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Dec, + ) -> Result<(), T::Err> { + self.dec.collect_into(metadata, labels, name, &mut enc.0) + } +} diff --git a/libs/metrics/src/metric_vec_duration.rs b/libs/metrics/src/metric_vec_duration.rs deleted file mode 100644 index e9a0a65570..0000000000 --- a/libs/metrics/src/metric_vec_duration.rs +++ /dev/null @@ -1,23 +0,0 @@ -//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec`. - -use std::{future::Future, time::Instant}; - -pub trait DurationResultObserver { - fn observe_result(&self, res: &Result, duration: std::time::Duration); -} - -pub async fn observe_async_block_duration_by_result< - T, - E, - F: Future>, - O: DurationResultObserver, ->( - observer: &O, - block: F, -) -> Result { - let start = Instant::now(); - let result = block.await; - let duration = start.elapsed(); - observer.observe_result(&result, duration); - result -} diff --git a/libs/metrics/src/more_process_metrics.rs b/libs/metrics/src/more_process_metrics.rs new file mode 100644 index 0000000000..920724fdec --- /dev/null +++ b/libs/metrics/src/more_process_metrics.rs @@ -0,0 +1,54 @@ +//! process metrics that the [`::prometheus`] crate doesn't provide. + +// This module has heavy inspiration from the prometheus crate's `process_collector.rs`. 
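Because the new collector implements `prometheus::core::Collector`, it can be registered like any other collector. A sketch of a Linux-only helper (the wrapper function name is illustrative; `register_internal` and `Collector::new` are from this patch):

    #[cfg(target_os = "linux")]
    pub fn register_more_process_metrics() -> prometheus::Result<()> {
        // Per the note on register_internal(), this must run before the
        // first gather() call to avoid deadlocking on the registry lock.
        crate::register_internal(Box::new(crate::more_process_metrics::Collector::new()))
    }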
+
+use crate::UIntGauge;
+
+pub struct Collector {
+    descs: Vec<prometheus::core::Desc>,
+    vmlck: crate::UIntGauge,
+}
+
+const NMETRICS: usize = 1;
+
+impl prometheus::core::Collector for Collector {
+    fn desc(&self) -> Vec<&prometheus::core::Desc> {
+        self.descs.iter().collect()
+    }
+
+    fn collect(&self) -> Vec<prometheus::proto::MetricFamily> {
+        let Ok(myself) = procfs::process::Process::myself() else {
+            return vec![];
+        };
+        let mut mfs = Vec::with_capacity(NMETRICS);
+        if let Ok(status) = myself.status() {
+            if let Some(vmlck) = status.vmlck {
+                self.vmlck.set(vmlck);
+                mfs.extend(self.vmlck.collect())
+            }
+        }
+        mfs
+    }
+}
+
+impl Collector {
+    pub fn new() -> Self {
+        let mut descs = Vec::new();
+
+        let vmlck =
+            UIntGauge::new("libmetrics_process_status_vmlck", "/proc/self/status vmlck").unwrap();
+        descs.extend(
+            prometheus::core::Collector::desc(&vmlck)
+                .into_iter()
+                .cloned(),
+        );
+
+        Self { descs, vmlck }
+    }
+}
+
+impl Default for Collector {
+    fn default() -> Self {
+        Self::new()
+    }
+}
diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml
index 96c6c10d3e..8710904cec 100644
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -4,6 +4,10 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true
 
+[features]
+# See pageserver/Cargo.toml
+testing = ["dep:nix"]
+
 [dependencies]
 serde.workspace = true
 serde_with.workspace = true
@@ -18,10 +22,17 @@ enum-map.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 hex.workspace = true
+humantime.workspace = true
 thiserror.workspace = true
 humantime-serde.workspace = true
-
-workspace_hack.workspace = true
+chrono = { workspace = true, features = ["serde"] }
+itertools.workspace = true
+storage_broker.workspace = true
+camino = {workspace = true, features = ["serde1"]}
+remote_storage.workspace = true
+postgres_backend.workspace = true
+nix = {workspace = true, optional = true}
+reqwest.workspace = true
 
 [dev-dependencies]
 bincode.workspace = true
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
new file mode 100644
index 0000000000..1194ee93ef
--- /dev/null
+++ b/libs/pageserver_api/src/config.rs
@@ -0,0 +1,517 @@
+use camino::Utf8PathBuf;
+
+#[cfg(test)]
+mod tests;
+
+use const_format::formatcp;
+pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
+pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
+pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
+pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
+
+use postgres_backend::AuthType;
+use remote_storage::RemoteStorageConfig;
+use serde_with::serde_as;
+use std::{
+    collections::HashMap,
+    num::{NonZeroU64, NonZeroUsize},
+    str::FromStr,
+    time::Duration,
+};
+use utils::logging::LogFormat;
+
+use crate::models::ImageCompressionAlgorithm;
+use crate::models::LsnLease;
+
+// Certain metadata (e.g. externally-addressable name, AZ) is delivered
+// as a separate structure. This information is not needed by the pageserver
+// itself; it is only used for registering the pageserver with the control
+// plane and/or storage controller.
+//
+#[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
+pub struct NodeMetadata {
+    #[serde(rename = "host")]
+    pub postgres_host: String,
+    #[serde(rename = "port")]
+    pub postgres_port: u16,
+    pub http_host: String,
+    pub http_port: u16,
+
+    // Deployment tools may write fields to the metadata file beyond what we
+    // use in this type: this type intentionally only names fields that it requires.
+ #[serde(flatten)] + pub other: HashMap, +} + +/// `pageserver.toml` +/// +/// We use serde derive with `#[serde(default)]` to generate a deserializer +/// that fills in the default values for each config field. +/// +/// If there cannot be a static default value because we need to make runtime +/// checks to determine the default, make it an `Option` (which defaults to None). +/// The runtime check should be done in the consuming crate, i.e., `pageserver`. +#[serde_as] +#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct ConfigToml { + // types mapped 1:1 into the runtime PageServerConfig type + pub listen_pg_addr: String, + pub listen_http_addr: String, + pub availability_zone: Option, + #[serde(with = "humantime_serde")] + pub wait_lsn_timeout: Duration, + #[serde(with = "humantime_serde")] + pub wal_redo_timeout: Duration, + pub superuser: String, + pub page_cache_size: usize, + pub max_file_descriptors: usize, + pub pg_distrib_dir: Option, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub http_auth_type: AuthType, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub pg_auth_type: AuthType, + pub auth_validation_public_key_path: Option, + pub remote_storage: Option, + pub tenant_config: TenantConfigToml, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub broker_endpoint: storage_broker::Uri, + #[serde(with = "humantime_serde")] + pub broker_keepalive_interval: Duration, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub log_format: LogFormat, + pub concurrent_tenant_warmup: NonZeroUsize, + pub concurrent_tenant_size_logical_size_queries: NonZeroUsize, + #[serde(with = "humantime_serde")] + pub metric_collection_interval: Duration, + pub metric_collection_endpoint: Option, + pub metric_collection_bucket: Option, + #[serde(with = "humantime_serde")] + pub synthetic_size_calculation_interval: Duration, + pub disk_usage_based_eviction: Option, + pub test_remote_failures: u64, + pub ondemand_download_behavior_treat_error_as_warn: bool, + #[serde(with = "humantime_serde")] + pub background_task_maximum_delay: Duration, + pub control_plane_api: Option, + pub control_plane_api_token: Option, + pub control_plane_emergency_mode: bool, + pub heatmap_upload_concurrency: usize, + pub secondary_download_concurrency: usize, + pub virtual_file_io_engine: Option, + pub ingest_batch_size: u64, + pub max_vectored_read_bytes: MaxVectoredReadBytes, + pub image_compression: ImageCompressionAlgorithm, + pub ephemeral_bytes_per_memory_kb: usize, + pub l0_flush: Option, + #[serde(skip_serializing)] + // TODO(https://github.com/neondatabase/neon/issues/8184): remove after this field is removed from all pageserver.toml's + pub compact_level0_phase1_value_access: serde::de::IgnoredAny, + pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode, + pub io_buffer_alignment: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(deny_unknown_fields)] +pub struct DiskUsageEvictionTaskConfig { + pub max_usage_pct: utils::serde_percent::Percent, + pub min_avail_bytes: u64, + #[serde(with = "humantime_serde")] + pub period: Duration, + #[cfg(feature = "testing")] + pub mock_statvfs: Option, + /// Select sorting for evicted layers + #[serde(default)] + pub eviction_order: EvictionOrder, +} + +pub mod statvfs { + pub mod mock { + #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + #[serde(tag = "type")] + pub enum Behavior { + Success { + blocksize: u64, + 
total_blocks: u64,
+                name_filter: Option,
+            },
+            #[cfg(feature = "testing")]
+            Failure { mocked_error: MockedError },
+        }
+
+        #[cfg(feature = "testing")]
+        #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+        #[allow(clippy::upper_case_acronyms)]
+        pub enum MockedError {
+            EIO,
+        }
+
+        #[cfg(feature = "testing")]
+        impl From<MockedError> for nix::Error {
+            fn from(e: MockedError) -> Self {
+                match e {
+                    MockedError::EIO => nix::Error::EIO,
+                }
+            }
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(tag = "type", content = "args")]
+pub enum EvictionOrder {
+    RelativeAccessed {
+        highest_layer_count_loses_first: bool,
+    },
+}
+
+impl Default for EvictionOrder {
+    fn default() -> Self {
+        Self::RelativeAccessed {
+            highest_layer_count_loses_first: true,
+        }
+    }
+}
+
+#[derive(
+    Eq,
+    PartialEq,
+    Debug,
+    Copy,
+    Clone,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+)]
+#[strum(serialize_all = "kebab-case")]
+pub enum GetVectoredImpl {
+    Sequential,
+    Vectored,
+}
+
+#[derive(
+    Eq,
+    PartialEq,
+    Debug,
+    Copy,
+    Clone,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+)]
+#[strum(serialize_all = "kebab-case")]
+pub enum GetImpl {
+    Legacy,
+    Vectored,
+}
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(transparent)]
+pub struct MaxVectoredReadBytes(pub NonZeroUsize);
+
+/// A tenant's calculated configuration, which is the result of merging a
+/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
+///
+/// For storing and transmitting individual tenant's configuration, see
+/// TenantConfOpt.
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(deny_unknown_fields, default)]
+pub struct TenantConfigToml {
+    // Flush out an inmemory layer, if it's holding WAL older than this.
+    // This puts a backstop on how much WAL needs to be re-digested if the
+    // page server crashes.
+    // This parameter actually determines L0 layer file size.
+    pub checkpoint_distance: u64,
+    // Inmemory layer is also flushed at least once in checkpoint_timeout to
+    // eventually upload WAL after activity is stopped.
+    #[serde(with = "humantime_serde")]
+    pub checkpoint_timeout: Duration,
+    // Target file size, when creating image and delta layers.
+    // This parameter determines L1 layer file size.
+    pub compaction_target_size: u64,
+    // How often to check if there's compaction work to be done.
+    // Duration::ZERO means automatic compaction is disabled.
+    #[serde(with = "humantime_serde")]
+    pub compaction_period: Duration,
+    // Level0 delta layer threshold for compaction.
+    pub compaction_threshold: usize,
+    pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
+    // Determines how much history is retained, to allow
+    // branching and read replicas at an older point in time.
+    // The unit is # of bytes of WAL.
+    // Page versions older than this are garbage collected away.
+    pub gc_horizon: u64,
+    // Interval at which garbage collection is triggered.
+    // Duration::ZERO means automatic GC is disabled.
+    #[serde(with = "humantime_serde")]
+    pub gc_period: Duration,
+    // Delta layer churn threshold to create L1 image layers.
+    pub image_creation_threshold: usize,
+    // Determines how much history is retained, to allow
+    // branching and read replicas at an older point in time.
+    // The unit is time.
+    // Page versions older than this are garbage collected away.
+    #[serde(with = "humantime_serde")]
+    pub pitr_interval: Duration,
+    /// Maximum amount of time to wait while opening a connection to receive WAL, before erroring.
+    #[serde(with = "humantime_serde")]
+    pub walreceiver_connect_timeout: Duration,
+    /// Considers safekeepers stalled after no WAL updates were received longer than this threshold.
+    /// A stalled safekeeper will be changed to a newer one when it appears.
+    #[serde(with = "humantime_serde")]
+    pub lagging_wal_timeout: Duration,
+    /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold.
+    /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
+    /// to avoid eager reconnects.
+    pub max_lsn_wal_lag: NonZeroU64,
+    pub eviction_policy: crate::models::EvictionPolicy,
+    pub min_resident_size_override: Option<u64>,
+    // See the corresponding metric's help string.
+    #[serde(with = "humantime_serde")]
+    pub evictions_low_residence_duration_metric_threshold: Duration,
+
+    /// If non-zero, the period between uploads of a heatmap from attached tenants. This
+    /// may be disabled if a Tenant will not have secondary locations: only secondary
+    /// locations will use the heatmap uploaded by attached locations.
+    #[serde(with = "humantime_serde")]
+    pub heatmap_period: Duration,
+
+    /// If true, SLRU segments are downloaded on demand; if false, SLRU segments are included in the basebackup.
+    pub lazy_slru_download: bool,
+
+    pub timeline_get_throttle: crate::models::ThrottleConfig,
+
+    // How much WAL must be ingested before checking again whether a new image layer is required.
+    // Expressed in multiples of checkpoint distance.
+    pub image_layer_creation_check_threshold: u8,
+
+    /// Switch to a new aux file policy. Switching this flag requires that the user has not written any aux file into
+    /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
+    /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
+    /// file is written.
+    pub switch_aux_file_policy: crate::models::AuxFilePolicy,
+
+    /// The length for an explicit LSN lease request.
+    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
+    #[serde(with = "humantime_serde")]
+    pub lsn_lease_length: Duration,
+
+    /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request.
+    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
+    #[serde(with = "humantime_serde")]
+    pub lsn_lease_length_for_ts: Duration,
+}
+
+pub mod defaults {
+    use crate::models::ImageCompressionAlgorithm;
+
+    pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
+
+    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
+    pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
+
+    pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
+
+    pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
+    pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
+
+    pub const DEFAULT_LOG_FORMAT: &str = "plain";
+
+    pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8;
+
+    pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = 1;
+
+    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
+    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
+    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
+    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
+
+    pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
+    pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;
+
+    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
+
+    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
+
+    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
+        ImageCompressionAlgorithm::Zstd { level: Some(1) };
+
+    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
+
+    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
+
+    pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
+}
+
+impl Default for ConfigToml {
+    fn default() -> Self {
+        use defaults::*;
+
+        Self {
+            listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()),
+            listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()),
+            availability_zone: (None),
+            wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
+                .expect("cannot parse default wait lsn timeout")),
+            wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
+                .expect("cannot parse default wal redo timeout")),
+            superuser: (DEFAULT_SUPERUSER.to_string()),
+            page_cache_size: (DEFAULT_PAGE_CACHE_SIZE),
+            max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS),
+            pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formerly, this was std::env::current_dir()
+            http_auth_type: (AuthType::Trust),
+            pg_auth_type: (AuthType::Trust),
+            auth_validation_public_key_path: (None),
+            remote_storage: None,
+            broker_endpoint: (storage_broker::DEFAULT_ENDPOINT
+                .parse()
+                .expect("failed to parse default broker endpoint")),
+            broker_keepalive_interval: (humantime::parse_duration(
+                storage_broker::DEFAULT_KEEPALIVE_INTERVAL,
+            )
+            .expect("cannot parse default keepalive interval")),
+            log_format: (LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
+
+            concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
+                .expect("Invalid default constant")),
+            concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(1).unwrap(),
+            metric_collection_interval: (humantime::parse_duration(
+                DEFAULT_METRIC_COLLECTION_INTERVAL,
+            )
+            .expect("cannot parse default metric collection interval")),
+            synthetic_size_calculation_interval: (humantime::parse_duration(
+                DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
+            )
+            .expect("cannot parse default synthetic size calculation interval")),
+            metric_collection_endpoint: (DEFAULT_METRIC_COLLECTION_ENDPOINT),
+
+            metric_collection_bucket: (None),
+
+            disk_usage_based_eviction: (None),
+
+            test_remote_failures: (0),
+
+            ondemand_download_behavior_treat_error_as_warn: (false),
+
+
background_task_maximum_delay: (humantime::parse_duration( + DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY, + ) + .unwrap()), + + control_plane_api: (None), + control_plane_api_token: (None), + control_plane_emergency_mode: (false), + + heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), + secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), + + ingest_batch_size: (DEFAULT_INGEST_BATCH_SIZE), + + virtual_file_io_engine: None, + + max_vectored_read_bytes: (MaxVectoredReadBytes( + NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), + )), + image_compression: (DEFAULT_IMAGE_COMPRESSION), + ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), + l0_flush: None, + compact_level0_phase1_value_access: Default::default(), + virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(), + + io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT, + + tenant_config: TenantConfigToml::default(), + } + } +} + +pub mod tenant_conf_defaults { + + // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB + // would be more appropriate. But a low value forces the code to be exercised more, + // which is good for now to trigger bugs. + // This parameter actually determines L0 layer file size. + pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; + pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m"; + + // FIXME the below configs are only used by legacy algorithm. The new algorithm + // has different parameters. + + // Target file size, when creating image and delta layers. + // This parameter determines L1 layer file size. + pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; + + pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; + pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; + pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = + crate::models::CompactionAlgorithm::Legacy; + + pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; + + // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. + // If there's a need to decrease this value, first make sure that GC + // doesn't hold a layer map write lock for non-trivial operations. + // Relevant: https://github.com/neondatabase/neon/issues/3394 + pub const DEFAULT_GC_PERIOD: &str = "1 hr"; + pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; + pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; + pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; + pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; + // The default limit on WAL lag should be set to avoid causing disconnects under high throughput + // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for + // throughputs up to 1GiB/s per timeline. + pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024; + pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; + // By default ingest enough WAL for two new L0 layers before checking if new image + // image layers should be created. 
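Because `ConfigToml` is `#[serde(default, deny_unknown_fields)]`, deserializing an empty or partial `pageserver.toml` fills the remaining fields from the `Default` impl above, while misspelled keys are rejected outright. A sketch of what a consuming crate might do (the `toml` crate and the `parse_pageserver_toml` helper are assumptions, not part of this patch):

    fn parse_pageserver_toml(text: &str) -> anyhow::Result<ConfigToml> {
        // An empty string yields ConfigToml::default(); a misspelled key
        // (e.g. `listen_pg_adr`) fails with an "unknown field" error.
        let config: ConfigToml = toml::from_str(text)?;
        Ok(config)
    }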
+ pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; + + pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; +} + +impl Default for TenantConfigToml { + fn default() -> Self { + use tenant_conf_defaults::*; + Self { + checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, + checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT) + .expect("cannot parse default checkpoint timeout"), + compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE, + compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) + .expect("cannot parse default compaction period"), + compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, + compaction_algorithm: crate::models::CompactionAlgorithmSettings { + kind: DEFAULT_COMPACTION_ALGORITHM, + }, + gc_horizon: DEFAULT_GC_HORIZON, + gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) + .expect("cannot parse default gc period"), + image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD, + pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL) + .expect("cannot parse default PITR interval"), + walreceiver_connect_timeout: humantime::parse_duration( + DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, + ) + .expect("cannot parse default walreceiver connect timeout"), + lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT) + .expect("cannot parse default walreceiver lagging wal timeout"), + max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) + .expect("cannot parse default max walreceiver Lsn wal lag"), + eviction_policy: crate::models::EvictionPolicy::NoEviction, + min_resident_size_override: None, + evictions_low_residence_duration_metric_threshold: humantime::parse_duration( + DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, + ) + .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), + heatmap_period: Duration::ZERO, + lazy_slru_download: false, + timeline_get_throttle: crate::models::ThrottleConfig::disabled(), + image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, + switch_aux_file_policy: crate::models::AuxFilePolicy::default_tenant_config(), + lsn_lease_length: LsnLease::DEFAULT_LENGTH, + lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, + } + } +} diff --git a/libs/pageserver_api/src/config/tests.rs b/libs/pageserver_api/src/config/tests.rs new file mode 100644 index 0000000000..edeefc156e --- /dev/null +++ b/libs/pageserver_api/src/config/tests.rs @@ -0,0 +1,22 @@ +use super::*; + +#[test] +fn test_node_metadata_v1_backward_compatibilty() { + let v1 = serde_json::to_vec(&serde_json::json!({ + "host": "localhost", + "port": 23, + "http_host": "localhost", + "http_port": 42, + })); + + assert_eq!( + serde_json::from_slice::(&v1.unwrap()).unwrap(), + NodeMetadata { + postgres_host: "localhost".to_string(), + postgres_port: 23, + http_host: "localhost".to_string(), + http_port: 42, + other: HashMap::new(), + } + ) +} diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs new file mode 100644 index 0000000000..5c8dcbf571 --- /dev/null +++ b/libs/pageserver_api/src/controller_api.rs @@ -0,0 +1,373 @@ +use std::collections::{HashMap, HashSet}; +use std::str::FromStr; +use std::time::{Duration, Instant}; + +/// Request/response types for the storage controller +/// API (`/control/v1` prefix). 
Implemented by the server +/// in [`storage_controller::http`] +use serde::{Deserialize, Serialize}; +use utils::id::{NodeId, TenantId}; + +use crate::models::PageserverUtilization; +use crate::{ + models::{ShardParameters, TenantConfig}, + shard::{ShardStripeSize, TenantShardId}, +}; + +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantCreateRequest { + pub new_tenant_id: TenantShardId, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub generation: Option, + + // If omitted, create a single shard with TenantShardId::unsharded() + #[serde(default)] + #[serde(skip_serializing_if = "ShardParameters::is_unsharded")] + pub shard_parameters: ShardParameters, + + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub placement_policy: Option, + + #[serde(flatten)] + pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it +} + +#[derive(Serialize, Deserialize)] +pub struct TenantCreateResponseShard { + pub shard_id: TenantShardId, + pub node_id: NodeId, + pub generation: u32, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantCreateResponse { + pub shards: Vec, +} + +#[derive(Serialize, Deserialize)] +pub struct NodeRegisterRequest { + pub node_id: NodeId, + + pub listen_pg_addr: String, + pub listen_pg_port: u16, + + pub listen_http_addr: String, + pub listen_http_port: u16, + + pub availability_zone_id: String, +} + +#[derive(Serialize, Deserialize)] +pub struct NodeConfigureRequest { + pub node_id: NodeId, + + pub availability: Option, + pub scheduling: Option, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantPolicyRequest { + pub placement: Option, + pub scheduling: Option, +} + +#[derive(Serialize, Deserialize)] +pub struct ShardsPreferredAzsRequest { + #[serde(flatten)] + pub preferred_az_ids: HashMap, +} + +#[derive(Serialize, Deserialize)] +pub struct ShardsPreferredAzsResponse { + pub updated: Vec, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantLocateResponseShard { + pub shard_id: TenantShardId, + pub node_id: NodeId, + + pub listen_pg_addr: String, + pub listen_pg_port: u16, + + pub listen_http_addr: String, + pub listen_http_port: u16, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantLocateResponse { + pub shards: Vec, + pub shard_params: ShardParameters, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantDescribeResponse { + pub tenant_id: TenantId, + pub shards: Vec, + pub stripe_size: ShardStripeSize, + pub policy: PlacementPolicy, + pub config: TenantConfig, +} + +#[derive(Serialize, Deserialize)] +pub struct NodeDescribeResponse { + pub id: NodeId, + + pub availability: NodeAvailabilityWrapper, + pub scheduling: NodeSchedulingPolicy, + + pub listen_http_addr: String, + pub listen_http_port: u16, + + pub listen_pg_addr: String, + pub listen_pg_port: u16, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantDescribeResponseShard { + pub tenant_shard_id: TenantShardId, + + pub node_attached: Option, + pub node_secondary: Vec, + + pub last_error: String, + + /// A task is currently running to reconcile this tenant's intent state with the state on pageservers + pub is_reconciling: bool, + /// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending. 
+ pub is_pending_compute_notification: bool, + /// A shard split is currently underway + pub is_splitting: bool, + + pub scheduling_policy: ShardSchedulingPolicy, + + pub preferred_az_id: Option, +} + +/// Migration request for a given tenant shard to a given node. +/// +/// Explicitly migrating a particular shard is a low level operation +/// TODO: higher level "Reschedule tenant" operation where the request +/// specifies some constraints, e.g. asking it to get off particular node(s) +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantShardMigrateRequest { + pub tenant_shard_id: TenantShardId, + pub node_id: NodeId, +} + +#[derive(Serialize, Clone, Debug)] +#[serde(into = "NodeAvailabilityWrapper")] +pub enum NodeAvailability { + // Normal, happy state + Active(PageserverUtilization), + // Node is warming up, but we expect it to become available soon. Covers + // the time span between the re-attach response being composed on the storage controller + // and the first successful heartbeat after the processing of the re-attach response + // finishes on the pageserver. + WarmingUp(Instant), + // Offline: Tenants shouldn't try to attach here, but they may assume that their + // secondary locations on this node still exist. Newly added nodes are in this + // state until we successfully contact them. + Offline, +} + +impl PartialEq for NodeAvailability { + fn eq(&self, other: &Self) -> bool { + use NodeAvailability::*; + matches!( + (self, other), + (Active(_), Active(_)) | (Offline, Offline) | (WarmingUp(_), WarmingUp(_)) + ) + } +} + +impl Eq for NodeAvailability {} + +// This wrapper provides serde functionality and it should only be used to +// communicate with external callers which don't know or care about the +// utilisation score of the pageserver it is targeting. +#[derive(Serialize, Deserialize, Clone, Copy, Debug)] +pub enum NodeAvailabilityWrapper { + Active, + WarmingUp, + Offline, +} + +impl From for NodeAvailability { + fn from(val: NodeAvailabilityWrapper) -> Self { + match val { + // Assume the worst utilisation score to begin with. It will later be updated by + // the heartbeats. + NodeAvailabilityWrapper::Active => { + NodeAvailability::Active(PageserverUtilization::full()) + } + NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()), + NodeAvailabilityWrapper::Offline => NodeAvailability::Offline, + } + } +} + +impl From for NodeAvailabilityWrapper { + fn from(val: NodeAvailability) -> Self { + match val { + NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active, + NodeAvailability::WarmingUp(_) => NodeAvailabilityWrapper::WarmingUp, + NodeAvailability::Offline => NodeAvailabilityWrapper::Offline, + } + } +} + +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] +pub enum ShardSchedulingPolicy { + // Normal mode: the tenant's scheduled locations may be updated at will, including + // for non-essential optimization. + Active, + + // Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy. + // For example, this still permits a node's attachment location to change to a secondary in + // response to a node failure, or to assign a new secondary if a node was removed. + Essential, + + // No scheduling: leave the shard running wherever it currently is. Even if the shard is + // unavailable, it will not be rescheduled to another node. + Pause, + + // No reconciling: we will make no location_conf API calls to pageservers at all. If the + // shard is unavailable, it stays that way. 
If a node fails, this shard doesn't get failed over.
+    Stop,
+}
+
+impl Default for ShardSchedulingPolicy {
+    fn default() -> Self {
+        Self::Active
+    }
+}
+
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
+pub enum NodeSchedulingPolicy {
+    Active,
+    Filling,
+    Pause,
+    PauseForRestart,
+    Draining,
+}
+
+impl FromStr for NodeSchedulingPolicy {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "active" => Ok(Self::Active),
+            "filling" => Ok(Self::Filling),
+            "pause" => Ok(Self::Pause),
+            "pause_for_restart" => Ok(Self::PauseForRestart),
+            "draining" => Ok(Self::Draining),
+            _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
+        }
+    }
+}
+
+impl From<NodeSchedulingPolicy> for String {
+    fn from(value: NodeSchedulingPolicy) -> String {
+        use NodeSchedulingPolicy::*;
+        match value {
+            Active => "active",
+            Filling => "filling",
+            Pause => "pause",
+            PauseForRestart => "pause_for_restart",
+            Draining => "draining",
+        }
+        .to_string()
+    }
+}
+
+/// Controls how tenant shards are mapped to locations on pageservers, e.g. whether
+/// to create secondary locations.
+#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
+pub enum PlacementPolicy {
+    /// Normal live state: one attached pageserver and zero or more secondaries.
+    Attached(usize),
+    /// Create one secondary mode location. This is useful when onboarding
+    /// a tenant, or for an idle tenant that we might want to bring online quickly.
+    Secondary,
+
+    /// Do not attach to any pageservers. This is appropriate for tenants that
+    /// have been idle for a long time, where we do not mind some delay in making
+    /// them available in the future.
+    Detached,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct TenantShardMigrateResponse {}
+
+/// Metadata health record posted from scrubber.
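The string round-trip for `NodeSchedulingPolicy` is the exact inverse pair of `from_str` and `From<NodeSchedulingPolicy> for String`; a quick illustrative check (the demo function is not part of this patch):

    fn demo_scheduling_policy_roundtrip() -> anyhow::Result<()> {
        let policy = NodeSchedulingPolicy::from_str("pause_for_restart")?;
        assert!(matches!(policy, NodeSchedulingPolicy::PauseForRestart));
        // String::from is the inverse of from_str for every variant.
        assert_eq!(String::from(policy), "pause_for_restart");
        Ok(())
    }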
+#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthRecord { + pub tenant_shard_id: TenantShardId, + pub healthy: bool, + pub last_scrubbed_at: chrono::DateTime, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthUpdateRequest { + pub healthy_tenant_shards: HashSet, + pub unhealthy_tenant_shards: HashSet, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthUpdateResponse {} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthListUnhealthyResponse { + pub unhealthy_tenant_shards: Vec, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthListOutdatedRequest { + #[serde(with = "humantime_serde")] + pub not_scrubbed_for: Duration, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MetadataHealthListOutdatedResponse { + pub health_records: Vec, +} + +#[cfg(test)] +mod test { + use super::*; + use serde_json; + + /// Check stability of PlacementPolicy's serialization + #[test] + fn placement_policy_encoding() -> anyhow::Result<()> { + let v = PlacementPolicy::Attached(1); + let encoded = serde_json::to_string(&v)?; + assert_eq!(encoded, "{\"Attached\":1}"); + assert_eq!(serde_json::from_str::(&encoded)?, v); + + let v = PlacementPolicy::Detached; + let encoded = serde_json::to_string(&v)?; + assert_eq!(encoded, "\"Detached\""); + assert_eq!(serde_json::from_str::(&encoded)?, v); + Ok(()) + } + + #[test] + fn test_reject_unknown_field() { + let id = TenantId::generate(); + let create_request = serde_json::json!({ + "new_tenant_id": id.to_string(), + "unknown_field": "unknown_value".to_string(), + }); + let err = serde_json::from_value::(create_request).unwrap_err(); + assert!( + err.to_string().contains("unknown field `unknown_field`"), + "expect unknown field `unknown_field` error, got: {}", + err + ); + } +} diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 852670af2c..77d744e4da 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,6 +1,7 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::RepOriginId; use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; use std::{fmt, ops::Range}; @@ -21,15 +22,128 @@ pub struct Key { pub field6: u32, } +/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as +/// a struct of fields. +#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd)] +pub struct CompactKey(i128); + +/// The storage key size. pub const KEY_SIZE: usize = 18; +/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized. +/// See [`Key::to_i128`] for more information on the encoding. +pub const METADATA_KEY_SIZE: usize = 16; + +/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x60 is a metadata key. +pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60; +pub const METADATA_KEY_END_PREFIX: u8 = 0x7F; + +/// The (reserved) key prefix of relation sizes. +pub const RELATION_SIZE_PREFIX: u8 = 0x61; + +/// The key prefix of AUX file keys. +pub const AUX_KEY_PREFIX: u8 = 0x62; + +/// The key prefix of ReplOrigin keys. +pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63; + +/// Check if the key falls in the range of metadata keys. 
+pub const fn is_metadata_key_slice(key: &[u8]) -> bool { + key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX +} + impl Key { + /// Check if the key falls in the range of metadata keys. + pub const fn is_metadata_key(&self) -> bool { + self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX + } + + /// Encode a metadata key to a storage key. + pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self { + assert!(is_metadata_key_slice(key), "key not in metadata key range"); + // Metadata key space ends at 0x7F so it's fine to directly convert it to i128. + Self::from_i128(i128::from_be_bytes(*key)) + } + + /// Encode a metadata key to a storage key. + pub fn from_metadata_key(key: &[u8]) -> Self { + Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key")) + } + + /// Get the range of metadata keys. + pub const fn metadata_key_range() -> Range { + Key { + field1: METADATA_KEY_BEGIN_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: METADATA_KEY_END_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } + } + + /// Get the range of aux keys. + pub fn metadata_aux_key_range() -> Range { + Key { + field1: AUX_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: AUX_KEY_PREFIX + 1, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } + } + + /// This function checks more extensively what keys we can take on the write path. + /// If a key beginning with 00 does not have a global/default tablespace OID, it + /// will be rejected on the write path. + #[allow(dead_code)] + pub fn is_valid_key_on_write_path_strong(&self) -> bool { + use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; + if !self.is_i128_representable() { + return false; + } + if self.field1 == 0 + && !(self.field2 == GLOBALTABLESPACE_OID + || self.field2 == DEFAULTTABLESPACE_OID + || self.field2 == 0) + { + return false; // User defined tablespaces are not supported + } + true + } + + /// This is a weaker version of `is_valid_key_on_write_path_strong` that simply + /// checks if the key is i128 representable. Note that some keys can be successfully + /// ingested into the pageserver, but will cause errors on generating basebackup. + pub fn is_valid_key_on_write_path(&self) -> bool { + self.is_i128_representable() + } + + pub fn is_i128_representable(&self) -> bool { + self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222 + } + /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish. 
/// As long as Neon does not support tablespace (because of lack of access to local file system), /// we can assume that only some predefined namespace OIDs are used which can fit in u16 pub fn to_i128(&self) -> i128 { - assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); - (((self.field1 & 0xf) as i128) << 120) + assert!(self.is_i128_representable(), "invalid key: {self}"); + (((self.field1 & 0x7F) as i128) << 120) | (((self.field2 & 0xFFFF) as i128) << 104) | ((self.field3 as i128) << 72) | ((self.field4 as i128) << 40) @@ -39,7 +153,7 @@ impl Key { pub const fn from_i128(x: i128) -> Self { Key { - field1: ((x >> 120) & 0xf) as u8, + field1: ((x >> 120) & 0x7F) as u8, field2: ((x >> 104) & 0xFFFF) as u32, field3: (x >> 72) as u32, field4: (x >> 40) as u32, @@ -48,11 +162,19 @@ impl Key { } } - pub fn next(&self) -> Key { + pub fn to_compact(&self) -> CompactKey { + CompactKey(self.to_i128()) + } + + pub fn from_compact(k: CompactKey) -> Self { + Self::from_i128(k.0) + } + + pub const fn next(&self) -> Key { self.add(1) } - pub fn add(&self, x: u32) -> Key { + pub const fn add(&self, x: u32) -> Key { let mut key = *self; let r = key.field6.overflowing_add(x); @@ -81,6 +203,9 @@ impl Key { key } + /// Convert a 18B slice to a key. This function should not be used for 16B metadata keys because `field2` is handled differently. + /// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys). There are some restrictions on `field2`, + /// and therefore not all 18B slices are valid page server keys. pub fn from_slice(b: &[u8]) -> Self { Key { field1: b[0], @@ -92,6 +217,8 @@ impl Key { } } + /// Convert a key to a 18B slice. This function should not be used for getting a 16B metadata key because `field2` is handled differently. + /// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys). 
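An illustrative round-trip through the compact form, using a key whose `field2` fits the 16-bit budget described above (the field values are arbitrary; the demo function is not part of this patch):

    fn demo_compact_key_roundtrip() {
        let key = Key {
            field1: 0x00,
            field2: 1663, // pg_default tablespace OID, well within the u16 budget
            field3: 5,
            field4: 16384,
            field5: 0,
            field6: 42,
        };
        assert!(key.is_i128_representable());
        // to_compact/from_compact are inverses for representable keys.
        assert_eq!(Key::from_compact(key.to_compact()), key);
    }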
pub fn write_to_byte_slice(&self, buf: &mut [u8]) { buf[0] = self.field1; BE::write_u32(&mut buf[1..5], self.field2); @@ -112,6 +239,13 @@ impl fmt::Display for Key { } } +impl fmt::Display for CompactKey { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let k = Key::from_compact(*self); + k.fmt(f) + } +} + impl Key { pub const MIN: Key = Key { field1: u8::MIN, @@ -129,6 +263,15 @@ impl Key { field5: u8::MAX, field6: u32::MAX, }; + /// A key slightly smaller than [`Key::MAX`] for use in layer key ranges to avoid them to be confused with L0 layers + pub const NON_L0_MAX: Key = Key { + field1: u8::MAX, + field2: u32::MAX, + field3: u32::MAX, + field4: u32::MAX, + field5: u8::MAX, + field6: u32::MAX - 1, + }; pub fn from_hex(s: &str) -> Result { if s.len() != 36 { @@ -302,7 +445,14 @@ pub fn rel_size_to_key(rel: RelTag) -> Key { field3: rel.dbnode, field4: rel.relnode, field5: rel.forknum, - field6: 0xffffffff, + field6: 0xffff_ffff, + } +} + +impl Key { + #[inline(always)] + pub fn is_rel_size_key(&self) -> bool { + self.field1 == 0 && self.field6 == u32::MAX } } @@ -343,6 +493,25 @@ pub fn slru_dir_to_key(kind: SlruKind) -> Key { } } +#[inline(always)] +pub fn slru_dir_kind(key: &Key) -> Option> { + if key.field1 == 0x01 + && key.field3 == 0 + && key.field4 == 0 + && key.field5 == 0 + && key.field6 == 0 + { + match key.field2 { + 0 => Some(Ok(SlruKind::Clog)), + 1 => Some(Ok(SlruKind::MultiXactMembers)), + 2 => Some(Ok(SlruKind::MultiXactOffsets)), + x => Some(Err(x)), + } + } else { + None + } +} + #[inline(always)] pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key { Key { @@ -371,7 +540,17 @@ pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key { field3: 1, field4: segno, field5: 0, - field6: 0xffffffff, + field6: 0xffff_ffff, + } +} + +impl Key { + pub fn is_slru_segment_size_key(&self) -> bool { + self.field1 == 0x01 + && self.field2 < 0x03 + && self.field3 == 0x01 + && self.field5 == 0 + && self.field6 == u32::MAX } } @@ -472,76 +651,117 @@ pub const AUX_FILES_KEY: Key = Key { field6: 2, }; +#[inline(always)] +pub fn repl_origin_key(origin_id: RepOriginId) -> Key { + Key { + field1: REPL_ORIGIN_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: origin_id as u32, + } +} + +/// Get the range of replorigin keys. +pub fn repl_origin_key_range() -> Range { + Key { + field1: REPL_ORIGIN_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REPL_ORIGIN_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0x10000, + } +} + // Reverse mappings for a few Keys. // These are needed by WAL redo manager. -// AUX_FILES currently stores only data for logical replication (slots etc), and -// we don't preserve these on a branch because safekeepers can't follow timeline -// switch (and generally it likely should be optional), so ignore these. -#[inline(always)] -pub fn is_inherited_key(key: Key) -> bool { - key != AUX_FILES_KEY -} +/// Non inherited range for vectored get. +pub const NON_INHERITED_RANGE: Range = AUX_FILES_KEY..AUX_FILES_KEY.next(); +/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range. 
+pub const NON_INHERITED_SPARSE_RANGE: Range = Key::metadata_key_range(); -#[inline(always)] -pub fn is_rel_fsm_block_key(key: Key) -> bool { - key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff -} +impl Key { + // AUX_FILES currently stores only data for logical replication (slots etc), and + // we don't preserve these on a branch because safekeepers can't follow timeline + // switch (and generally it likely should be optional), so ignore these. + #[inline(always)] + pub fn is_inherited_key(self) -> bool { + !NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self) + } -#[inline(always)] -pub fn is_rel_vm_block_key(key: Key) -> bool { - key.field1 == 0x00 - && key.field4 != 0 - && key.field5 == VISIBILITYMAP_FORKNUM - && key.field6 != 0xffffffff -} + #[inline(always)] + pub fn is_rel_fsm_block_key(self) -> bool { + self.field1 == 0x00 + && self.field4 != 0 + && self.field5 == FSM_FORKNUM + && self.field6 != 0xffffffff + } -#[inline(always)] -pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> { - Ok(match key.field1 { - 0x01 => { - let kind = match key.field2 { - 0x00 => SlruKind::Clog, - 0x01 => SlruKind::MultiXactMembers, - 0x02 => SlruKind::MultiXactOffsets, - _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2), - }; - let segno = key.field4; - let blknum = key.field6; + #[inline(always)] + pub fn is_rel_vm_block_key(self) -> bool { + self.field1 == 0x00 + && self.field4 != 0 + && self.field5 == VISIBILITYMAP_FORKNUM + && self.field6 != 0xffffffff + } - (kind, segno, blknum) - } - _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), - }) -} + #[inline(always)] + pub fn to_slru_block(self) -> anyhow::Result<(SlruKind, u32, BlockNumber)> { + Ok(match self.field1 { + 0x01 => { + let kind = match self.field2 { + 0x00 => SlruKind::Clog, + 0x01 => SlruKind::MultiXactMembers, + 0x02 => SlruKind::MultiXactOffsets, + _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", self.field2), + }; + let segno = self.field4; + let blknum = self.field6; -#[inline(always)] -pub fn is_slru_block_key(key: Key) -> bool { - key.field1 == 0x01 // SLRU-related - && key.field3 == 0x00000001 // but not SlruDir - && key.field6 != 0xffffffff // and not SlruSegSize -} + (kind, segno, blknum) + } + _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1), + }) + } -#[inline(always)] -pub fn is_rel_block_key(key: &Key) -> bool { - key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff -} + #[inline(always)] + pub fn is_slru_block_key(self) -> bool { + self.field1 == 0x01 // SLRU-related + && self.field3 == 0x00000001 // but not SlruDir + && self.field6 != 0xffffffff // and not SlruSegSize + } -/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`. -#[inline(always)] -pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { - Ok(match key.field1 { - 0x00 => ( - RelTag { - spcnode: key.field2, - dbnode: key.field3, - relnode: key.field4, - forknum: key.field5, - }, - key.field6, - ), - _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), - }) + #[inline(always)] + pub fn is_rel_block_key(&self) -> bool { + self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff + } + + /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`. 
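As a sketch of the paired accessors around relation block keys, assuming the file's existing `rel_block_to_key` helper (not shown in this hunk) and `to_rel_block` defined just below:

    fn demo_rel_block_roundtrip() -> anyhow::Result<()> {
        let rel = RelTag {
            spcnode: 1663,
            dbnode: 5,
            relnode: 16384,
            forknum: 0,
        };
        let key = rel_block_to_key(rel, 7);
        assert!(key.is_rel_block_key());
        // to_rel_block recovers the original (RelTag, BlockNumber) pair.
        let (rel2, blknum) = key.to_rel_block()?;
        assert_eq!(blknum, 7);
        assert_eq!(rel2.relnode, rel.relnode);
        Ok(())
    }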
+ #[inline(always)] + pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> { + Ok(match self.field1 { + 0x00 => ( + RelTag { + spcnode: self.field2, + dbnode: self.field3, + relnode: self.field4, + forknum: self.field5, + }, + self.field6, + ), + _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1), + }) + } } impl std::str::FromStr for Key { @@ -556,11 +776,14 @@ impl std::str::FromStr for Key { mod tests { use std::str::FromStr; + use crate::key::is_metadata_key_slice; use crate::key::Key; use rand::Rng; use rand::SeedableRng; + use super::AUX_KEY_PREFIX; + #[test] fn display_fromstr_bijection() { let mut rng = rand::rngs::StdRng::seed_from_u64(42); @@ -576,4 +799,21 @@ mod tests { assert_eq!(key, Key::from_str(&format!("{key}")).unwrap()); } + + #[test] + fn test_metadata_keys() { + let mut metadata_key = vec![AUX_KEY_PREFIX]; + metadata_key.extend_from_slice(&[0xFF; 15]); + let encoded_key = Key::from_metadata_key(&metadata_key); + let output_key = encoded_key.to_i128().to_be_bytes(); + assert_eq!(metadata_key, output_key); + assert!(encoded_key.is_metadata_key()); + assert!(is_metadata_key_slice(&metadata_key)); + } + + #[test] + fn test_possible_largest_key() { + Key::from_i128(0x7FFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF); + // TODO: put this key into the system and see if anything breaks. + } } diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 2316acb616..401887d362 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -1,7 +1,11 @@ use postgres_ffi::BLCKSZ; use std::ops::Range; -use crate::key::Key; +use crate::{ + key::Key, + shard::{ShardCount, ShardIdentity}, +}; +use itertools::Itertools; /// /// Represents a set of Keys, in a compact form. @@ -13,44 +17,289 @@ pub struct KeySpace { pub ranges: Vec>, } -impl KeySpace { +impl std::fmt::Display for KeySpace { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[")?; + for range in &self.ranges { + write!(f, "{}..{},", range.start, range.end)?; + } + write!(f, "]") + } +} + +/// A wrapper type for sparse keyspaces. +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct SparseKeySpace(pub KeySpace); + +/// Represents a contiguous half-open range of the keyspace, masked according to a particular +/// ShardNumber's stripes: within this range of keys, only some "belong" to the current +/// shard. +/// +/// When we iterate over keys within this object, we will skip any keys that don't belong +/// to this shard. +/// +/// The start + end keys may not belong to the shard: these specify where layer files should +/// start + end, but we will never actually read/write those keys. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ShardedRange<'a> { + pub shard_identity: &'a ShardIdentity, + pub range: Range, +} + +// Calculate the size of a range within the blocks of the same relation, or spanning only the +// top page in the previous relation's space. +fn contiguous_range_len(range: &Range) -> u32 { + debug_assert!(is_contiguous_range(range)); + if range.start.field6 == 0xffffffff { + range.end.field6 + 1 + } else { + range.end.field6 - range.start.field6 + } +} + +/// Return true if this key range includes only keys in the same relation's data blocks, or +/// just spanning one relation and the logical size (0xffffffff) block of the relation before it. +/// +/// Contiguous in this context means we know the keys are in use _somewhere_, but it might not +/// be on our shard. 
Later in ShardedRange we do the extra work to figure out how much +/// of a given contiguous range is present on one shard. +/// +/// This matters, because: +/// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse. +/// - Within such ranges, we may calculate distances using simple subtraction of field6. +fn is_contiguous_range(range: &Range) -> bool { + range.start.field1 == range.end.field1 + && range.start.field2 == range.end.field2 + && range.start.field3 == range.end.field3 + && range.start.field4 == range.end.field4 + && (range.start.field5 == range.end.field5 + || (range.start.field6 == 0xffffffff && range.start.field5 + 1 == range.end.field5)) +} + +impl<'a> ShardedRange<'a> { + pub fn new(range: Range, shard_identity: &'a ShardIdentity) -> Self { + Self { + shard_identity, + range, + } + } + + /// Break up this range into chunks, each of which has at least one local key in it if the + /// total range has at least one local key. + pub fn fragment(self, target_nblocks: u32) -> Vec<(u32, Range)> { + // Optimization for single-key case (e.g. logical size keys) + if self.range.end == self.range.start.add(1) { + return vec![( + if self.shard_identity.is_key_disposable(&self.range.start) { + 0 + } else { + 1 + }, + self.range, + )]; + } + + if !is_contiguous_range(&self.range) { + // Ranges that span relations are not fragmented. We only get these ranges as a result + // of operations that act on existing layers, so we trust that the existing range is + // reasonably small. + return vec![(u32::MAX, self.range)]; + } + + let mut fragments: Vec<(u32, Range)> = Vec::new(); + + let mut cursor = self.range.start; + while cursor < self.range.end { + let advance_by = self.distance_to_next_boundary(cursor); + let is_fragment_disposable = self.shard_identity.is_key_disposable(&cursor); + + // If the previous fragment is undersized, then we seek to consume enough + // blocks to complete it. + let (want_blocks, merge_last_fragment) = match fragments.last_mut() { + Some(frag) if frag.0 < target_nblocks => (target_nblocks - frag.0, Some(frag)), + Some(frag) => { + // Prev block is complete, want the full number. + ( + target_nblocks, + if is_fragment_disposable { + // If this current range will be empty (not shard-local data), we will merge into previous + Some(frag) + } else { + None + }, + ) + } + None => { + // First iteration, want the full number + (target_nblocks, None) + } + }; + + let advance_by = if is_fragment_disposable { + advance_by + } else { + std::cmp::min(advance_by, want_blocks) + }; + + let next_cursor = cursor.add(advance_by); + + let this_frag = ( + if is_fragment_disposable { + 0 + } else { + advance_by + }, + cursor..next_cursor, + ); + cursor = next_cursor; + + if let Some(last_fragment) = merge_last_fragment { + // Previous fragment was short or this one is empty, merge into it + last_fragment.0 += this_frag.0; + last_fragment.1.end = this_frag.1.end; + } else { + fragments.push(this_frag); + } + } + + fragments + } + + /// Estimate the physical pages that are within this range, on this shard. This returns + /// u32::MAX if the range spans relations: this return value should be interpreted as "large". 
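+    ///
+    /// Illustrative use, mirroring the tests at the bottom of this file (a sketch, not new API):
+    /// ```ignore
+    /// let pages = ShardedRange::new(start..end, &shard_identity).page_count();
+    /// assert!(pages == u32::MAX || pages <= ShardedRange::raw_size(&(start..end)));
+    /// ```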
+ pub fn page_count(&self) -> u32 { + // Special cases for single keys like logical sizes + if self.range.end == self.range.start.add(1) { + return if self.shard_identity.is_key_disposable(&self.range.start) { + 0 + } else { + 1 + }; + } + + // We can only do an authentic calculation of contiguous key ranges + if !is_contiguous_range(&self.range) { + return u32::MAX; + } + + // Special case for single sharded tenants: our logical and physical sizes are the same + if self.shard_identity.count < ShardCount::new(2) { + return contiguous_range_len(&self.range); + } + + // Normal path: step through stripes and part-stripes in the range, evaluate whether each one belongs + // to Self, and add the stripe's block count to our total if so. + let mut result: u64 = 0; + let mut cursor = self.range.start; + while cursor < self.range.end { + // Count up to the next stripe_size boundary or end of range + let advance_by = self.distance_to_next_boundary(cursor); + + // If the blocks in this stripe belong to us, add them to our count + if !self.shard_identity.is_key_disposable(&cursor) { + result += advance_by as u64; + } + + cursor = cursor.add(advance_by); + } + + if result > u32::MAX as u64 { + u32::MAX + } else { + result as u32 + } + } + + /// Advance the cursor to the next potential fragment boundary: this is either + /// a stripe boundary, or the end of the range. + fn distance_to_next_boundary(&self, cursor: Key) -> u32 { + let distance_to_range_end = contiguous_range_len(&(cursor..self.range.end)); + + if self.shard_identity.count < ShardCount::new(2) { + // Optimization: don't bother stepping through stripes if the tenant isn't sharded. + return distance_to_range_end; + } + + if cursor.field6 == 0xffffffff { + // We are wrapping from one relation's logical size to the next relation's first data block + return 1; + } + + let stripe_index = cursor.field6 / self.shard_identity.stripe_size.0; + let stripe_remainder = self.shard_identity.stripe_size.0 + - (cursor.field6 - stripe_index * self.shard_identity.stripe_size.0); + + if cfg!(debug_assertions) { + // We should never overflow field5 and field6 -- our callers check this earlier + // and would have returned their u32::MAX cases if the input range violated this. + let next_cursor = cursor.add(stripe_remainder); + debug_assert!( + next_cursor.field1 == cursor.field1 + && next_cursor.field2 == cursor.field2 + && next_cursor.field3 == cursor.field3 + && next_cursor.field4 == cursor.field4 + && next_cursor.field5 == cursor.field5 + ) + } + + std::cmp::min(stripe_remainder, distance_to_range_end) + } + + /// Whereas `page_count` estimates the number of pages physically in this range on this shard, + /// this function simply calculates the number of pages in the space, without accounting for those + /// pages that would not actually be stored on this node. + /// + /// Don't use this function in code that works with physical entities like layer files. + pub fn raw_size(range: &Range<Key>) -> u32 { + if is_contiguous_range(range) { + contiguous_range_len(range) + } else { + u32::MAX + } + } +} + +impl KeySpace { + /// Create a key space with a single range. + pub fn single(key_range: Range<Key>) -> Self { + Self { + ranges: vec![key_range], + } + } + /// Partition a key space into chunks of roughly 'target_size' bytes /// in each partition. /// - pub fn partition(&self, target_size: u64) -> KeyPartitioning { + pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning { // Assume that each value is 8k in size.
- let target_nblocks = (target_size / BLCKSZ as u64) as usize; + let target_nblocks = (target_size / BLCKSZ as u64) as u32; let mut parts = Vec::new(); let mut current_part = Vec::new(); let mut current_part_size: usize = 0; for range in &self.ranges { - // If appending the next contiguous range in the keyspace to the current - // partition would cause it to be too large, start a new partition. - let this_size = key_range_size(range) as usize; - if current_part_size + this_size > target_nblocks && !current_part.is_empty() { - parts.push(KeySpace { - ranges: current_part, - }); - current_part = Vec::new(); - current_part_size = 0; - } + // While doing partitioning, wrap the range in ShardedRange so that our size calculations + // will respect shard striping rather than assuming all keys within a range are present. + let range = ShardedRange::new(range.clone(), shard_identity); - // If the next range is larger than 'target_size', split it into - // 'target_size' chunks. - let mut remain_size = this_size; - let mut start = range.start; - while remain_size > target_nblocks { - let next = start.add(target_nblocks as u32); - parts.push(KeySpace { - ranges: vec![start..next], - }); - start = next; - remain_size -= target_nblocks + // Chunk up the range into parts that each contain up to target_size local blocks + for (frag_on_shard_size, frag_range) in range.fragment(target_nblocks) { + // If appending the next contiguous range in the keyspace to the current + // partition would cause it to be too large, and our current partition + // covers at least one block that is physically present in this shard, + // then start a new partition + if current_part_size + frag_on_shard_size as usize > target_nblocks as usize + && current_part_size > 0 + { + parts.push(KeySpace { + ranges: current_part, + }); + current_part = Vec::new(); + current_part_size = 0; + } + current_part.push(frag_range.start..frag_range.end); + current_part_size += frag_on_shard_size as usize; } - current_part.push(start..range.end); - current_part_size += remain_size; } // add last partition that wasn't full yet. @@ -63,16 +312,128 @@ impl KeySpace { KeyPartitioning { parts } } + pub fn is_empty(&self) -> bool { + self.total_raw_size() == 0 + } + + /// Merge another keyspace into the current one. + /// Note: the keyspaces must not overlap (enforced via assertions). To merge overlapping key ranges, use `KeySpaceRandomAccum`. + pub fn merge(&mut self, other: &KeySpace) { + let all_ranges = self + .ranges + .iter() + .merge_by(other.ranges.iter(), |lhs, rhs| lhs.start < rhs.start); + + let mut accum = KeySpaceAccum::new(); + let mut prev: Option<&Range<Key>> = None; + for range in all_ranges { + if let Some(prev) = prev { + let overlap = + std::cmp::max(range.start, prev.start) < std::cmp::min(range.end, prev.end); + assert!( + !overlap, + "Attempt to merge overlapping keyspaces: {:?} overlaps {:?}", + prev, range + ); + } + + accum.add_range(range.clone()); + prev = Some(range); + } + + self.ranges = accum.to_keyspace().ranges; + } + + /// Remove all keys in `other` from `self`. + /// This can involve splitting or removing of existing ranges. + /// Returns the removed keyspace. + pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace { + let (self_start, self_end) = match (self.start(), self.end()) { + (Some(start), Some(end)) => (start, end), + _ => { + // self is empty + return KeySpace::default(); + } + }; + + // Key spaces are sorted by definition, so skip ahead to the first + // potentially intersecting range.
Similarly, ignore ranges that start + after the current keyspace ends. + let other_ranges = other + .ranges + .iter() + .skip_while(|range| self_start >= range.end) + .take_while(|range| self_end > range.start); + + let mut removed_accum = KeySpaceRandomAccum::new(); + for range in other_ranges { + while let Some(overlap_at) = self.overlaps_at(range) { + let overlapped = self.ranges[overlap_at].clone(); + + if overlapped.start < range.start && overlapped.end <= range.end { + // Higher part of the range is completely overlapped. + removed_accum.add_range(range.start..self.ranges[overlap_at].end); + self.ranges[overlap_at].end = range.start; + } + if overlapped.start >= range.start && overlapped.end > range.end { + // Lower part of the range is completely overlapped. + removed_accum.add_range(self.ranges[overlap_at].start..range.end); + self.ranges[overlap_at].start = range.end; + } + if overlapped.start < range.start && overlapped.end > range.end { + // Middle part of the range is overlapped. + removed_accum.add_range(range.clone()); + self.ranges[overlap_at].end = range.start; + self.ranges + .insert(overlap_at + 1, range.end..overlapped.end); + } + if overlapped.start >= range.start && overlapped.end <= range.end { + // Whole range is overlapped + removed_accum.add_range(self.ranges[overlap_at].clone()); + self.ranges.remove(overlap_at); + } + } + } + + removed_accum.to_keyspace() + } + + pub fn start(&self) -> Option<Key> { + self.ranges.first().map(|range| range.start) + } + + pub fn end(&self) -> Option<Key> { + self.ranges.last().map(|range| range.end) + } + + /// The size of the keyspace in pages, before accounting for sharding + pub fn total_raw_size(&self) -> usize { + self.ranges + .iter() + .map(|range| ShardedRange::raw_size(range) as usize) + .sum() + } + + fn overlaps_at(&self, range: &Range<Key>) -> Option<usize> { + match self.ranges.binary_search_by_key(&range.end, |r| r.start) { + Ok(0) => None, + Err(0) => None, + Ok(index) if self.ranges[index - 1].end > range.start => Some(index - 1), + Err(index) if self.ranges[index - 1].end > range.start => Some(index - 1), + _ => None, + } + } + /// /// Check if key space contains overlapping range /// pub fn overlaps(&self, range: &Range<Key>) -> bool { - match self.ranges.binary_search_by_key(&range.end, |r| r.start) { - Ok(0) => false, - Err(0) => false, - Ok(index) => self.ranges[index - 1].end > range.start, - Err(index) => self.ranges[index - 1].end > range.start, - } + self.overlaps_at(range).is_some() + } + + /// Check if the keyspace contains a key + pub fn contains(&self, key: &Key) -> bool { + self.overlaps(&(*key..key.next())) } } @@ -88,10 +449,33 @@ pub struct KeyPartitioning { pub parts: Vec<KeySpace>, } +/// Represents a partitioning of the sparse key space. +#[derive(Clone, Debug, Default)] +pub struct SparseKeyPartitioning { + pub parts: Vec<SparseKeySpace>, +} + impl KeyPartitioning { pub fn new() -> Self { KeyPartitioning { parts: Vec::new() } } + + /// Convert a key partitioning to a sparse partition. + pub fn into_sparse(self) -> SparseKeyPartitioning { + SparseKeyPartitioning { + parts: self.parts.into_iter().map(SparseKeySpace).collect(), + } + } +} + +impl SparseKeyPartitioning { + /// Note: use this function with caution. Attempting to handle a sparse keyspace in the same way as a dense keyspace will + /// cause long/dead loops.
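+    ///
+    /// A minimal round-trip sketch (names from this file):
+    /// ```ignore
+    /// let sparse = dense_partitioning.into_sparse();
+    /// let dense = sparse.into_dense(); // callers must still treat the parts as sparse
+    /// ```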
+ pub fn into_dense(self) -> KeyPartitioning { + KeyPartitioning { + parts: self.parts.into_iter().map(|x| x.0).collect(), + } + } } /// @@ -123,7 +507,7 @@ impl KeySpaceAccum { #[inline(always)] pub fn add_range(&mut self, range: Range) { - self.size += key_range_size(&range) as u64; + self.size += ShardedRange::raw_size(&range) as u64; match self.accum.as_mut() { Some(accum) => { @@ -152,19 +536,12 @@ impl KeySpaceAccum { } pub fn consume_keyspace(&mut self) -> KeySpace { - if let Some(accum) = self.accum.take() { - self.ranges.push(accum); - } - - let mut prev_accum = KeySpaceAccum::new(); - std::mem::swap(self, &mut prev_accum); - - KeySpace { - ranges: prev_accum.ranges, - } + std::mem::take(self).to_keyspace() } - pub fn size(&self) -> u64 { + // The total number of keys in this object, ignoring any sharding effects that might cause some of + // the keys to be omitted in storage on this shard. + pub fn raw_size(&self) -> u64 { self.size } } @@ -191,6 +568,12 @@ impl KeySpaceRandomAccum { self.ranges.push(range); } + pub fn add_keyspace(&mut self, keyspace: KeySpace) { + for range in keyspace.ranges { + self.add_range(range); + } + } + pub fn to_keyspace(mut self) -> KeySpace { let mut ranges = Vec::new(); if !self.ranges.is_empty() { @@ -211,28 +594,12 @@ impl KeySpaceRandomAccum { } KeySpace { ranges } } -} -pub fn key_range_size(key_range: &Range) -> u32 { - let start = key_range.start; - let end = key_range.end; + pub fn consume_keyspace(&mut self) -> KeySpace { + let mut prev_accum = KeySpaceRandomAccum::new(); + std::mem::swap(self, &mut prev_accum); - if end.field1 != start.field1 - || end.field2 != start.field2 - || end.field3 != start.field3 - || end.field4 != start.field4 - { - return u32::MAX; - } - - let start = (start.field5 as u64) << 32 | start.field6 as u64; - let end = (end.field5 as u64) << 32 | end.field6 as u64; - - let diff = end - start; - if diff > u32::MAX as u64 { - u32::MAX - } else { - diff as u32 + prev_accum.to_keyspace() } } @@ -242,6 +609,13 @@ pub fn singleton_range(key: Key) -> Range { #[cfg(test)] mod tests { + use rand::{RngCore, SeedableRng}; + + use crate::{ + models::ShardParameters, + shard::{ShardCount, ShardNumber}, + }; + use super::*; use std::fmt::Write; @@ -284,14 +658,17 @@ mod tests { accum.add_range(range.clone()); } - let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum(); - assert_eq!(accum.size(), expected_size); + let expected_size: u64 = ranges + .iter() + .map(|r| ShardedRange::raw_size(r) as u64) + .sum(); + assert_eq!(accum.raw_size(), expected_size); assert_ks_eq(&accum.consume_keyspace(), ranges.clone()); - assert_eq!(accum.size(), 0); + assert_eq!(accum.raw_size(), 0); assert_ks_eq(&accum.consume_keyspace(), vec![]); - assert_eq!(accum.size(), 0); + assert_eq!(accum.raw_size(), 0); for range in &ranges { accum.add_range(range.clone()); @@ -441,4 +818,559 @@ mod tests { // xxxxxxxxxxx assert!(ks.overlaps(&kr(0..30))); // XXXXX This fails currently! 
} + + #[test] + fn test_remove_full_overlapps() { + let mut key_space1 = KeySpace { + ranges: vec![ + Key::from_i128(1)..Key::from_i128(4), + Key::from_i128(5)..Key::from_i128(8), + Key::from_i128(10)..Key::from_i128(12), + ], + }; + let key_space2 = KeySpace { + ranges: vec![ + Key::from_i128(2)..Key::from_i128(3), + Key::from_i128(6)..Key::from_i128(7), + Key::from_i128(11)..Key::from_i128(13), + ], + }; + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(2)..Key::from_i128(3), + Key::from_i128(6)..Key::from_i128(7), + Key::from_i128(11)..Key::from_i128(12), + ], + }; + assert_eq!(removed, removed_expected); + + assert_eq!( + key_space1.ranges, + vec![ + Key::from_i128(1)..Key::from_i128(2), + Key::from_i128(3)..Key::from_i128(4), + Key::from_i128(5)..Key::from_i128(6), + Key::from_i128(7)..Key::from_i128(8), + Key::from_i128(10)..Key::from_i128(11) + ] + ); + } + + #[test] + fn test_remove_partial_overlaps() { + // Test partial overlaps + let mut key_space1 = KeySpace { + ranges: vec![ + Key::from_i128(1)..Key::from_i128(5), + Key::from_i128(7)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + ], + }; + let key_space2 = KeySpace { + ranges: vec![ + Key::from_i128(3)..Key::from_i128(6), + Key::from_i128(8)..Key::from_i128(11), + Key::from_i128(14)..Key::from_i128(17), + ], + }; + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(3)..Key::from_i128(5), + Key::from_i128(8)..Key::from_i128(10), + Key::from_i128(14)..Key::from_i128(15), + ], + }; + assert_eq!(removed, removed_expected); + + assert_eq!( + key_space1.ranges, + vec![ + Key::from_i128(1)..Key::from_i128(3), + Key::from_i128(7)..Key::from_i128(8), + Key::from_i128(12)..Key::from_i128(14), + ] + ); + } + + #[test] + fn test_remove_no_overlaps() { + let mut key_space1 = KeySpace { + ranges: vec![ + Key::from_i128(1)..Key::from_i128(5), + Key::from_i128(7)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + ], + }; + let key_space2 = KeySpace { + ranges: vec![ + Key::from_i128(6)..Key::from_i128(7), + Key::from_i128(11)..Key::from_i128(12), + Key::from_i128(15)..Key::from_i128(17), + ], + }; + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace::default(); + assert_eq!(removed, removed_expected); + + assert_eq!( + key_space1.ranges, + vec![ + Key::from_i128(1)..Key::from_i128(5), + Key::from_i128(7)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + ] + ); + } + + #[test] + fn test_remove_one_range_overlaps_multiple() { + let mut key_space1 = KeySpace { + ranges: vec![ + Key::from_i128(1)..Key::from_i128(3), + Key::from_i128(3)..Key::from_i128(6), + Key::from_i128(6)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + Key::from_i128(17)..Key::from_i128(20), + Key::from_i128(20)..Key::from_i128(30), + Key::from_i128(30)..Key::from_i128(40), + ], + }; + let key_space2 = KeySpace { + ranges: vec![Key::from_i128(9)..Key::from_i128(19)], + }; + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(9)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + Key::from_i128(17)..Key::from_i128(19), + ], + }; + assert_eq!(removed, removed_expected); + + assert_eq!( + key_space1.ranges, + vec![ + Key::from_i128(1)..Key::from_i128(3), + Key::from_i128(3)..Key::from_i128(6), +
Key::from_i128(6)..Key::from_i128(9), + Key::from_i128(19)..Key::from_i128(20), + Key::from_i128(20)..Key::from_i128(30), + Key::from_i128(30)..Key::from_i128(40), + ] + ); + } + #[test] + fn sharded_range_relation_gap() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067F00000005000040100300000000").unwrap(), + end: Key::from_hex("000000067F00000005000040130000004000").unwrap(), + }, + &shard_identity, + ); + + // Key range spans relations, expect MAX + assert_eq!(range.page_count(), u32::MAX); + } + + #[test] + fn shard_identity_keyspaces_single_key() { + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f000000010000007000ffffffff").unwrap(), + end: Key::from_hex("000000067f00000001000000700100000000").unwrap(), + }, + &shard_identity, + ); + // Single-key range on logical size key + assert_eq!(range.page_count(), 1); + } + + /// Test the helper that we use to identify ranges which go outside the data blocks of a single relation + #[test] + fn contiguous_range_check() { + assert!(!is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000003").unwrap()) + ),); + + // The range goes all the way up to 0xffffffff, including it: this is + // not considered a rel block range because 0xffffffff stores logical sizes, + // not blocks. + assert!(!is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000000").unwrap()) + ),); + + // Keys within the normal data region of a relation + assert!(is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df0000000000").unwrap() + ..Key::from_hex("000000067f00000001000004df0000000080").unwrap()) + ),); + + // The logical size key of one forkno, then some blocks in the next + assert!(is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00ffffffff").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000080").unwrap()) + ),); + } + + #[test] + fn shard_identity_keyspaces_forkno_gap() { + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f00000001000004df00fffffffe").unwrap(), + end: Key::from_hex("000000067f00000001000004df0100000003").unwrap(), + }, + &shard_identity, + ); + + // Range spanning the end of one forkno and the start of the next: we do not attempt to + // calculate a valid size, because we have no way to know if the keys between start + // and end are actually in use.
+ assert_eq!(range.page_count(), u32::MAX); + } + + #[test] + fn shard_identity_keyspaces_one_relation() { + for shard_number in 0..4 { + let shard_identity = ShardIdentity::new( + ShardNumber(shard_number), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f00000001000000ae0000000000").unwrap(), + end: Key::from_hex("000000067f00000001000000ae0000000001").unwrap(), + }, + &shard_identity, + ); + + // Very simple case: range covering block zero of one relation, where that block maps to shard zero + if shard_number == 0 { + assert_eq!(range.page_count(), 1); + } else { + // Other shards should perceive the range's size as zero + assert_eq!(range.page_count(), 0); + } + } + } + + /// Test helper: construct a ShardedRange and call fragment() on it, returning + /// the total page count in the range and the fragments. + fn do_fragment( + range_start: Key, + range_end: Key, + shard_identity: &ShardIdentity, + target_nblocks: u32, + ) -> (u32, Vec<(u32, Range)>) { + let range = ShardedRange::new( + Range { + start: range_start, + end: range_end, + }, + shard_identity, + ); + + let page_count = range.page_count(); + let fragments = range.fragment(target_nblocks); + + // Invariant: we always get at least one fragment + assert!(!fragments.is_empty()); + + // Invariant: the first/last fragment start/end should equal the input start/end + assert_eq!(fragments.first().unwrap().1.start, range_start); + assert_eq!(fragments.last().unwrap().1.end, range_end); + + if page_count > 0 { + // Invariant: every fragment must contain at least one shard-local page, if the + // total range contains at least one shard-local page + let all_nonzero = fragments.iter().all(|f| f.0 > 0); + if !all_nonzero { + eprintln!("Found a zero-length fragment: {:?}", fragments); + } + assert!(all_nonzero); + } else { + // A range with no shard-local pages should always be returned as a single fragment + assert_eq!(fragments, vec![(0, range_start..range_end)]); + } + + // Invariant: fragments must be ordered and non-overlapping + let mut last: Option> = None; + for frag in &fragments { + if let Some(last) = last { + assert!(frag.1.start >= last.end); + assert!(frag.1.start > last.start); + } + last = Some(frag.1.clone()) + } + + // Invariant: fragments respect target_nblocks + for frag in &fragments { + assert!(frag.0 == u32::MAX || frag.0 <= target_nblocks); + } + + (page_count, fragments) + } + + /// Really simple tests for fragment(), on a range that just contains a single stripe + /// for a single tenant. 
+ #[test] + fn sharded_range_fragment_simple() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + // A range which we happen to know covers exactly one stripe which belongs to this shard + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap(); + + // Ask for stripe_size blocks, we get the whole stripe + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 32768), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for more, we still get the whole stripe + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 10000000), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for target_nblocks of half the stripe size, we get two halves + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16384), + ( + 32768, + vec![ + (16384, input_start..input_start.add(16384)), + (16384, input_start.add(16384)..input_end) + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_multi_stripe() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + // A range which covers multiple stripes, exactly one of which belongs to the current shard. + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap(); + // Ask for all the blocks, get a fragment that covers the whole range but reports + // its size to be just the blocks belonging to our shard. + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 131072), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for a sub-stripe quantity + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16000), + ( + 32768, + vec![ + (16000, input_start..input_start.add(16000)), + (16000, input_start.add(16000)..input_start.add(32000)), + (768, input_start.add(32000)..input_end), + ] + ) + ); + + // Try on a range that starts slightly after our owned stripe + assert_eq!( + do_fragment(input_start.add(1), input_end, &shard_identity, 131072), + (32767, vec![(32767, input_start.add(1)..input_end)]) + ); + } + + /// Test our calculations work correctly when we start a range from the logical size key of + /// a previous relation. 
+ #[test] + fn sharded_range_fragment_starting_from_logical_size() { + let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap(); + + // Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x10000), + (0x8001, vec![(0x8001, input_start..input_end)]) + ); + + // Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards + // store all logical sizes) + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x10000), + (0x1, vec![(0x1, input_start..input_end)]) + ); + } + + /// Test that ShardedRange behaves properly when used on un-sharded data + #[test] + fn sharded_range_fragment_unsharded() { + let shard_identity = ShardIdentity::unsharded(); + + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000010000").unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x8000), + ( + 0x10000, + vec![ + (0x8000, input_start..input_start.add(0x8000)), + (0x8000, input_start.add(0x8000)..input_start.add(0x10000)) + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_cross_relation() { + let shard_identity = ShardIdentity::unsharded(); + + // A range that spans relations: expect fragmentation to give up and return a u32::MAX size + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000068f00000001000000ae0000010000").unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x8000), + (u32::MAX, vec![(u32::MAX, input_start..input_end),]) + ); + + // Same, but using a sharded identity + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x8000), + (u32::MAX, vec![(u32::MAX, input_start..input_end),]) + ); + } + + #[test] + fn sharded_range_fragment_tiny_nblocks() { + let shard_identity = ShardIdentity::unsharded(); + + // A small range within a single relation, split up with a tiny target_nblocks + let input_start = Key::from_hex("000000067F00000001000004E10000000000").unwrap(); + let input_end = Key::from_hex("000000067F00000001000004E10000000038").unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16), + ( + 0x38, + vec![ + (16, input_start..input_start.add(16)), + (16, input_start.add(16)..input_start.add(32)), + (16, input_start.add(32)..input_start.add(48)), + (8, input_start.add(48)..input_end), + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_fuzz() { + // Use a fixed seed: we don't want to explicitly pick values, but we do want + // the test to be reproducible.
+ let mut prng = rand::rngs::StdRng::seed_from_u64(0xdeadbeef); + + for _i in 0..1000 { + let shard_identity = if prng.next_u32() % 2 == 0 { + ShardIdentity::unsharded() + } else { + let shard_count = prng.next_u32() % 127 + 1; + ShardIdentity::new( + ShardNumber((prng.next_u32() % shard_count) as u8), + ShardCount::new(shard_count as u8), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap() + }; + + let target_nblocks = prng.next_u32() % 65536 + 1; + + let start_offset = prng.next_u32() % 16384; + + // Try ranges up to 8192 blocks in size, that are always at least 1 block long + let range_size = prng.next_u32() % 8192 + 1; + + // Pick a starting point somewhere within one relation's block space + let input_start = Key::from_hex("000000067F00000001000004E10000000000") + .unwrap() + .add(start_offset); + let input_end = input_start.add(range_size); + + // This test's main success conditions are the invariants baked into do_fragment + let (_total_size, fragments) = + do_fragment(input_start, input_end, &shard_identity, target_nblocks); + + // Pick a random key within the range and check it appears in the output + let example_key = input_start.add(prng.next_u32() % range_size); + + // Panic on unwrap if it isn't found + let example_key_frag = fragments + .iter() + .find(|f| f.1.contains(&example_key)) + .unwrap(); + + // Check that the fragment containing our random key has a nonzero size if + // that key is shard-local + let example_key_local = !shard_identity.is_key_disposable(&example_key); + if example_key_local { + assert!(example_key_frag.0 > 0); + } + } + } } diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index b236b93428..532185a366 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -1,16 +1,13 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] -use const_format::formatcp; -/// Public API types -pub mod control_api; +pub mod controller_api; pub mod key; pub mod keyspace; pub mod models; pub mod reltag; pub mod shard; +/// Public API types +pub mod upcall_api; -pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; -pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); -pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; -pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); +pub mod config; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 86d2c2a7ca..ffe79c8350 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1,21 +1,28 @@ +pub mod detach_ancestor; pub mod partitioning; +pub mod utilization; + +pub use utilization::PageserverUtilization; use std::{ collections::HashMap, + fmt::Display, io::{BufRead, Read}, - num::{NonZeroU64, NonZeroUsize}, + num::{NonZeroU32, NonZeroU64, NonZeroUsize}, + str::FromStr, + sync::atomic::AtomicUsize, time::{Duration, SystemTime}, }; use byteorder::{BigEndian, ReadBytesExt}; +use postgres_ffi::BLCKSZ; use serde::{Deserialize, Serialize}; use serde_with::serde_as; -use strum_macros; use utils::{ completion, - history_buffer::HistoryBufferWithDropCounter, id::{NodeId, TenantId, TimelineId}, lsn::Lsn, + serde_system_time, }; use crate::{ @@ -153,6 +160,36 @@ impl std::fmt::Debug for TenantState { } } +/// A temporary lease to a specific lsn inside a timeline. +/// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
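+///
+/// Serialized as JSON with an RFC 3339 millisecond timestamp, e.g. (illustrative value only):
+/// ```ignore
+/// {"valid_until": "2024-01-02T03:04:05.678Z"}
+/// ```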
+#[serde_as] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct LsnLease { + #[serde_as(as = "SystemTimeAsRfc3339Millis")] + pub valid_until: SystemTime, +} + +serde_with::serde_conv!( + SystemTimeAsRfc3339Millis, + SystemTime, + |time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(), + |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) } +); + +impl LsnLease { + /// The default length for an explicit LSN lease request (10 minutes). + pub const DEFAULT_LENGTH: Duration = Duration::from_secs(10 * 60); + + /// The default length for an implicit LSN lease granted during + /// `get_lsn_by_timestamp` request (1 minute). + pub const DEFAULT_LENGTH_FOR_TS: Duration = Duration::from_secs(60); + + /// Checks whether the lease is expired. + pub fn is_expired(&self, now: &SystemTime) -> bool { + now > &self.valid_until + } +} + /// The only [`TenantState`] variants we could be `TenantState::Activating` from. #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum ActivatingFrom { @@ -179,7 +216,7 @@ pub enum TimelineState { Broken { reason: String, backtrace: String }, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct TimelineCreateRequest { pub new_timeline_id: TimelineId, #[serde(default)] @@ -191,6 +228,28 @@ pub struct TimelineCreateRequest { pub pg_version: Option<u32>, } +#[derive(Serialize, Deserialize, Clone)] +pub struct LsnLeaseRequest { + pub lsn: Lsn, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantShardSplitRequest { + pub new_shard_count: u8, + + // A tenant's stripe size is only meaningful the first time its shard count goes + // above 1: therefore during a split from 1->N shards, we may modify the stripe size. + // + // If this is set while the shard count is being increased from an already >1 value, + // then the request will fail with 400. + pub new_stripe_size: Option<ShardStripeSize>, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantShardSplitResponse { + pub new_shards: Vec<TenantShardId>, +} + /// Parameters that apply to all shards in a tenant. Used during tenant creation.
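+///
+/// With the default stripe size below (`256 * 1024 / 8` = 32768 pages) and 8 KiB pages,
+/// one stripe covers 32768 * 8 KiB = 256 MiB of relation data (arithmetic worked out here
+/// for illustration; the constants are the ones defined in this struct's impl).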
#[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] @@ -203,52 +262,19 @@ impl ShardParameters { pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); pub fn is_unsharded(&self) -> bool { - self.count == ShardCount(0) + self.count.is_unsharded() } } impl Default for ShardParameters { fn default() -> Self { Self { - count: ShardCount(0), + count: ShardCount::new(0), stripe_size: Self::DEFAULT_STRIPE_SIZE, } } } -#[derive(Serialize, Deserialize, Debug)] -#[serde(deny_unknown_fields)] -pub struct TenantCreateRequest { - pub new_tenant_id: TenantShardId, - #[serde(default)] - #[serde(skip_serializing_if = "Option::is_none")] - pub generation: Option, - - // If omitted, create a single shard with TenantShardId::unsharded() - #[serde(default)] - #[serde(skip_serializing_if = "ShardParameters::is_unsharded")] - pub shard_parameters: ShardParameters, - - #[serde(flatten)] - pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it -} - -#[derive(Deserialize, Debug)] -#[serde(deny_unknown_fields)] -pub struct TenantLoadRequest { - #[serde(default)] - #[serde(skip_serializing_if = "Option::is_none")] - pub generation: Option, -} - -impl std::ops::Deref for TenantCreateRequest { - type Target = TenantConfig; - - fn deref(&self) -> &Self::Target { - &self.config - } -} - /// An alternative representation of `pageserver::tenant::TenantConf` with /// simpler types. #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)] @@ -258,6 +284,8 @@ pub struct TenantConfig { pub compaction_target_size: Option, pub compaction_period: Option, pub compaction_threshold: Option, + // defer parsing compaction_algorithm, like eviction_policy + pub compaction_algorithm: Option, pub gc_horizon: Option, pub gc_period: Option, pub image_creation_threshold: Option, @@ -265,12 +293,114 @@ pub struct TenantConfig { pub walreceiver_connect_timeout: Option, pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, - pub trace_read_requests: Option, pub eviction_policy: Option, pub min_resident_size_override: Option, pub evictions_low_residence_duration_metric_threshold: Option, - pub gc_feedback: Option, pub heatmap_period: Option, + pub lazy_slru_download: Option, + pub timeline_get_throttle: Option, + pub image_layer_creation_check_threshold: Option, + pub switch_aux_file_policy: Option, + pub lsn_lease_length: Option, + pub lsn_lease_length_for_ts: Option, +} + +/// The policy for the aux file storage. +/// +/// It can be switched through `switch_aux_file_policy` tenant config. +/// When the first aux file is written, the policy is persisted in the +/// `index_part.json` file and has a limited migration path. +/// +/// Currently, we only allow the following migration path: +/// +/// Unset -> V1 +/// -> V2 +/// -> CrossValidation -> V2 +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum AuxFilePolicy { + /// V1 aux file policy: store everything in AUX_FILE_KEY + #[strum(ascii_case_insensitive)] + V1, + /// V2 aux file policy: store in the AUX_FILE keyspace + #[strum(ascii_case_insensitive)] + V2, + /// Cross validation runs both formats on the write path and does validation + /// on the read path.
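+    ///
+    /// For example, per `is_valid_migration_path` below (a sketch, not a doctest):
+    /// ```ignore
+    /// use AuxFilePolicy::*;
+    /// assert!(AuxFilePolicy::is_valid_migration_path(Some(CrossValidation), V2));
+    /// assert!(!AuxFilePolicy::is_valid_migration_path(Some(V1), V2)); // not a listed path
+    /// ```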
+ #[strum(ascii_case_insensitive)] + CrossValidation, +} + +impl AuxFilePolicy { + pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool { + matches!( + (from, to), + (None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2) + ) + } + + /// If a tenant writes aux files without setting `switch_aux_file_policy`, this value will be used. + pub fn default_tenant_config() -> Self { + Self::V2 + } +} + +/// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified. +pub struct AtomicAuxFilePolicy(AtomicUsize); + +impl AtomicAuxFilePolicy { + pub fn new(policy: Option<AuxFilePolicy>) -> Self { + Self(AtomicUsize::new( + policy.map(AuxFilePolicy::to_usize).unwrap_or_default(), + )) + } + + pub fn load(&self) -> Option<AuxFilePolicy> { + match self.0.load(std::sync::atomic::Ordering::Acquire) { + 0 => None, + other => Some(AuxFilePolicy::from_usize(other)), + } + } + + pub fn store(&self, policy: Option<AuxFilePolicy>) { + self.0.store( + policy.map(AuxFilePolicy::to_usize).unwrap_or_default(), + std::sync::atomic::Ordering::Release, + ); + } +} + +impl AuxFilePolicy { + pub fn to_usize(self) -> usize { + match self { + Self::V1 => 1, + Self::CrossValidation => 2, + Self::V2 => 3, + } + } + + pub fn try_from_usize(this: usize) -> Option<Self> { + match this { + 1 => Some(Self::V1), + 2 => Some(Self::CrossValidation), + 3 => Some(Self::V2), + _ => None, + } + } + + pub fn from_usize(this: usize) -> Self { + Self::try_from_usize(this).unwrap() + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -278,6 +408,7 @@ pub struct TenantConfig { pub enum EvictionPolicy { NoEviction, LayerAccessThreshold(EvictionPolicyLayerAccessThreshold), + OnlyImitiate(EvictionPolicyLayerAccessThreshold), } impl EvictionPolicy { @@ -285,10 +416,92 @@ impl EvictionPolicy { match self { EvictionPolicy::NoEviction => "NoEviction", EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold", + EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate", } } } +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum CompactionAlgorithm { + Legacy, + Tiered, +} + +#[derive( + Debug, Clone, Copy, PartialEq, Eq, serde_with::DeserializeFromStr, serde_with::SerializeDisplay, +)] +pub enum ImageCompressionAlgorithm { + // Disabled for writes, support decompressing during read path + Disabled, + /// Zstandard compression. Level 0 and None mean the same thing (the default level). Levels can be negative as well. + /// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html).
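+    ///
+    /// Accepted textual forms, per the `FromStr` impl below (a sketch, not a doctest):
+    /// ```ignore
+    /// let a: ImageCompressionAlgorithm = "zstd".parse()?;      // Zstd { level: None }
+    /// let b: ImageCompressionAlgorithm = "zstd(-3)".parse()?;  // Zstd { level: Some(-3) }
+    /// let c: ImageCompressionAlgorithm = "disabled".parse()?;  // Disabled
+    /// ```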
+ Zstd { + level: Option<i8>, + }, +} + +impl FromStr for ImageCompressionAlgorithm { + type Err = anyhow::Error; + fn from_str(s: &str) -> Result<Self, Self::Err> { + let mut components = s.split(['(', ')']); + let first = components + .next() + .ok_or_else(|| anyhow::anyhow!("empty string"))?; + match first { + "disabled" => Ok(ImageCompressionAlgorithm::Disabled), + "zstd" => { + let level = if let Some(v) = components.next() { + let v: i8 = v.parse()?; + Some(v) + } else { + None + }; + + Ok(ImageCompressionAlgorithm::Zstd { level }) + } + _ => anyhow::bail!("invalid specifier '{first}'"), + } + } +} + +impl Display for ImageCompressionAlgorithm { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ImageCompressionAlgorithm::Disabled => write!(f, "disabled"), + ImageCompressionAlgorithm::Zstd { level } => { + if let Some(level) = level { + write!(f, "zstd({})", level) + } else { + write!(f, "zstd") + } + } + } + } +} + +#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)] +pub struct CompactionAlgorithmSettings { + pub kind: CompactionAlgorithm, +} + +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +pub enum L0FlushConfig { + #[serde(rename_all = "snake_case")] + Direct { max_concurrency: NonZeroUsize }, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub struct EvictionPolicyLayerAccessThreshold { #[serde(with = "humantime_serde")] @@ -297,10 +510,37 @@ pub struct EvictionPolicyLayerAccessThreshold { pub threshold: Duration, } +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] +pub struct ThrottleConfig { + pub task_kinds: Vec<String>, // TaskKind + pub initial: u32, + #[serde(with = "humantime_serde")] + pub refill_interval: Duration, + pub refill_amount: NonZeroU32, + pub max: u32, +} + +impl ThrottleConfig { + pub fn disabled() -> Self { + Self { + task_kinds: vec![], // effectively disables the throttle + // other values don't matter with empty `task_kinds`. + initial: 0, + refill_interval: Duration::from_millis(1), + refill_amount: NonZeroU32::new(1).unwrap(), + max: 1, + } + } + /// The requests per second allowed by the given config. + pub fn steady_rps(&self) -> f64 { + (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64()) + } +} + /// A flattened analog of a `pageserver::tenant::LocationMode`, which /// lists out all possible states (and the virtual "Detached" state) /// in a flat form rather than using rust-style enums.
-#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)] +#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)] pub enum LocationConfigMode { AttachedSingle, AttachedMulti, @@ -347,10 +587,6 @@ pub struct LocationConfigListResponse { pub tenant_shards: Vec<(TenantShardId, Option<LocationConfig>)>, } -#[derive(Serialize, Deserialize)] -#[serde(transparent)] -pub struct TenantCreateResponse(pub TenantId); - #[derive(Serialize)] pub struct StatusResponse { pub id: NodeId, @@ -359,11 +595,31 @@ pub struct StatusResponse { #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantLocationConfigRequest { - pub tenant_id: TenantShardId, #[serde(flatten)] pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it } +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantTimeTravelRequest { + pub shard_counts: Vec<ShardCount>, +} + +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantShardLocation { + pub shard_id: TenantShardId, + pub node_id: NodeId, +} + +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantLocationConfigResponse { + pub shards: Vec<TenantShardLocation>, + // If the shards' ShardCount is >1, stripe_size will be set. + pub stripe_size: Option<ShardStripeSize>, +} + #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantConfigRequest { @@ -387,31 +643,6 @@ impl TenantConfigRequest { } } -#[derive(Debug, Deserialize)] -pub struct TenantAttachRequest { - #[serde(default)] - pub config: TenantAttachConfig, - #[serde(default)] - pub generation: Option, -} - -/// Newtype to enforce deny_unknown_fields on TenantConfig for -/// its usage inside `TenantAttachRequest`. -#[derive(Debug, Serialize, Deserialize, Default)] -#[serde(deny_unknown_fields)] -pub struct TenantAttachConfig { - #[serde(flatten)] - allowing_unknown_fields: TenantConfig, -} - -impl std::ops::Deref for TenantAttachConfig { - type Target = TenantConfig; - - fn deref(&self) -> &Self::Target { - &self.allowing_unknown_fields - } -} - /// See [`TenantState::attachment_status`] and the OpenAPI docs for context. #[derive(Serialize, Deserialize, Clone)] #[serde(tag = "slug", content = "data", rename_all = "snake_case")] @@ -430,8 +661,14 @@ pub struct TenantInfo { /// If a layer is present in both local FS and S3, it counts only once. pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint pub attachment_status: TenantAttachmentStatus, + pub generation: u32, + + /// Opaque explanation if gc is being blocked. + /// + /// Only looked up for the individual tenant detail, not the listing. This is purely for + /// debugging, not included in openapi. #[serde(skip_serializing_if = "Option::is_none")] - pub generation: Option, + pub gc_blocking: Option<String>, } #[derive(Serialize, Deserialize, Clone)] @@ -439,9 +676,22 @@ pub struct TenantDetails { #[serde(flatten)] pub tenant_info: TenantInfo, + pub walredo: Option<WalRedoManagerStatus>, + pub timelines: Vec<TimelineId>, } +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)] +pub enum TimelineArchivalState { + Archived, + Unarchived, +} + +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)] +pub struct TimelineArchivalConfigRequest { + pub state: TimelineArchivalState, +} + /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { @@ -467,11 +717,23 @@ pub struct TimelineInfo { pub current_logical_size: u64, pub current_logical_size_is_accurate: bool, + pub directory_entries_counts: Vec, + /// Sum of the size of all layer files. /// If a layer is present in both local FS and S3, it counts only once. pub current_physical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, + /// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes + /// beyond the branch's branch point, we only count up to the branch point. + pub pitr_history_size: u64, + + /// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any + /// ancestor data used by this branch would have been retained anyway). If this is false, then + /// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would + /// otherwise be able to GC. + pub within_ancestor_pitr: bool, + pub timeline_dir_layer_file_size_sum: Option, pub wal_source_connstr: Option, @@ -483,66 +745,24 @@ pub struct TimelineInfo { pub state: TimelineState, pub walreceiver_status: String, + + // ALWAYS add new fields at the end of the struct with `Option` to ensure forward/backward compatibility. + // Backward compatibility: you will get a JSON not containing the newly-added field. + // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does + // not deny unknown fields by default so it's safe to set the field to some value, though it won't be + // read. + /// The last aux file policy being used on this timeline + pub last_aux_file_policy: Option, + pub is_archived: Option, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct LayerMapInfo { pub in_memory_layers: Vec, pub historic_layers: Vec, } -#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize, enum_map::Enum)] -#[repr(usize)] -pub enum LayerAccessKind { - GetValueReconstructData, - Iter, - KeyIter, - Dump, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct LayerAccessStatFullDetails { - pub when_millis_since_epoch: u64, - pub task_kind: &'static str, - pub access_kind: LayerAccessKind, -} - -/// An event that impacts the layer's residence status. -#[serde_as] -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct LayerResidenceEvent { - /// The time when the event occurred. - /// NB: this timestamp is captured while the residence status changes. - /// So, it might be behind/ahead of the actual residence change by a short amount of time. - /// - #[serde(rename = "timestamp_millis_since_epoch")] - #[serde_as(as = "serde_with::TimestampMilliSeconds")] - pub timestamp: SystemTime, - /// The new residence status of the layer. - pub status: LayerResidenceStatus, - /// The reason why we had to record this event. - pub reason: LayerResidenceEventReason, -} - -/// The reason for recording a given [`LayerResidenceEvent`]. -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub enum LayerResidenceEventReason { - /// The layer map is being populated, e.g. during timeline load or attach. - /// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`]. - /// We need to record such events because there is no persistent storage for the events. 
- /// - // https://github.com/rust-lang/rust/issues/74481 - /// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html - /// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote - LayerLoad, - /// We just created the layer (e.g., freeze_and_flush or compaction). - /// Such layers are always [`LayerResidenceStatus::Resident`]. - LayerCreate, - /// We on-demand downloaded or evicted the given layer. - ResidenceChange, -} - -/// The residence status of the layer, after the given [`LayerResidenceEvent`]. +/// The residence status of a layer #[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub enum LayerResidenceStatus { /// Residence status for a layer file that exists locally. @@ -552,33 +772,26 @@ pub enum LayerResidenceStatus { Evicted, } -impl LayerResidenceEvent { - pub fn new(status: LayerResidenceStatus, reason: LayerResidenceEventReason) -> Self { - Self { - status, - reason, - timestamp: SystemTime::now(), - } - } -} - -#[derive(Debug, Clone, Serialize)] +#[serde_as] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct LayerAccessStats { - pub access_count_by_access_kind: HashMap, - pub task_kind_access_flag: Vec<&'static str>, - pub first: Option, - pub accesses_history: HistoryBufferWithDropCounter, - pub residence_events_history: HistoryBufferWithDropCounter, + #[serde_as(as = "serde_with::TimestampMilliSeconds")] + pub access_time: SystemTime, + + #[serde_as(as = "serde_with::TimestampMilliSeconds")] + pub residence_time: SystemTime, + + pub visible: bool, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "kind")] pub enum InMemoryLayerInfo { Open { lsn_start: Lsn }, Frozen { lsn_start: Lsn, lsn_end: Lsn }, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "kind")] pub enum HistoricLayerInfo { Delta { @@ -589,6 +802,8 @@ pub enum HistoricLayerInfo { lsn_end: Lsn, remote: bool, access_stats: LayerAccessStats, + + l0: bool, }, Image { layer_file_name: String, @@ -600,11 +815,57 @@ pub enum HistoricLayerInfo { }, } +impl HistoricLayerInfo { + pub fn layer_file_name(&self) -> &str { + match self { + HistoricLayerInfo::Delta { + layer_file_name, .. + } => layer_file_name, + HistoricLayerInfo::Image { + layer_file_name, .. + } => layer_file_name, + } + } + pub fn is_remote(&self) -> bool { + match self { + HistoricLayerInfo::Delta { remote, .. } => *remote, + HistoricLayerInfo::Image { remote, .. } => *remote, + } + } + pub fn set_remote(&mut self, value: bool) { + let field = match self { + HistoricLayerInfo::Delta { remote, .. } => remote, + HistoricLayerInfo::Image { remote, .. } => remote, + }; + *field = value; + } + pub fn layer_file_size(&self) -> u64 { + match self { + HistoricLayerInfo::Delta { + layer_file_size, .. + } => *layer_file_size, + HistoricLayerInfo::Image { + layer_file_size, .. 
+ } => *layer_file_size, + } + } +} + #[derive(Debug, Serialize, Deserialize)] pub struct DownloadRemoteLayersTaskSpawnRequest { pub max_concurrent_downloads: NonZeroUsize, } +#[derive(Debug, Serialize, Deserialize)] +pub struct IngestAuxFilesRequest { + pub aux_files: HashMap, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct ListAuxFilesRequest { + pub lsn: Lsn, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct DownloadRemoteLayersTaskInfo { pub task_id: String, @@ -626,6 +887,168 @@ pub struct TimelineGcRequest { pub gc_horizon: Option, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalRedoManagerProcessStatus { + pub pid: u32, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalRedoManagerStatus { + pub last_redo_at: Option>, + pub process: Option, +} + +/// The progress of a secondary tenant. +/// +/// It is mostly useful when doing a long running download: e.g. initiating +/// a download job, timing out while waiting for it to run, and then inspecting this status to understand +/// what's happening. +#[derive(Default, Debug, Serialize, Deserialize, Clone)] +pub struct SecondaryProgress { + /// The remote storage LastModified time of the heatmap object we last downloaded. + pub heatmap_mtime: Option, + + /// The number of layers currently on-disk + pub layers_downloaded: usize, + /// The number of layers in the most recently seen heatmap + pub layers_total: usize, + + /// The number of layer bytes currently on-disk + pub bytes_downloaded: u64, + /// The number of layer bytes in the most recently seen heatmap + pub bytes_total: u64, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantScanRemoteStorageShard { + pub tenant_shard_id: TenantShardId, + pub generation: Option, +} + +#[derive(Serialize, Deserialize, Debug, Default)] +pub struct TenantScanRemoteStorageResponse { + pub shards: Vec, +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +#[serde(rename_all = "snake_case")] +pub enum TenantSorting { + ResidentSize, + MaxLogicalSize, +} + +impl Default for TenantSorting { + fn default() -> Self { + Self::ResidentSize + } +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct TopTenantShardsRequest { + // How would you like to sort the tenants? + pub order_by: TenantSorting, + + // How many results? + pub limit: usize, + + // Omit tenants with more than this many shards (e.g. 
if this is the max number of shards + // that the caller would ever split to) + pub where_shards_lt: Option, + + // Omit tenants where the ordering metric is less than this (this is an optimization to + // let us quickly exclude numerous tiny shards) + pub where_gt: Option, +} + +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)] +pub struct TopTenantShardItem { + pub id: TenantShardId, + + /// Total size of layers on local disk for all timelines in this tenant + pub resident_size: u64, + + /// Total size of layers in remote storage for all timelines in this tenant + pub physical_size: u64, + + /// The largest logical size of a timeline within this tenant + pub max_logical_size: u64, +} + +#[derive(Serialize, Deserialize, Debug, Default)] +pub struct TopTenantShardsResponse { + pub shards: Vec, +} + +pub mod virtual_file { + use std::path::PathBuf; + + #[derive( + Copy, + Clone, + PartialEq, + Eq, + Hash, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, + Debug, + )] + #[strum(serialize_all = "kebab-case")] + pub enum IoEngineKind { + StdFs, + #[cfg(target_os = "linux")] + TokioEpollUring, + } + + /// Direct IO modes for a pageserver. + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] + pub enum DirectIoMode { + /// Direct IO disabled (uses usual buffered IO). + #[default] + Disabled, + /// Direct IO disabled (performs checks and perf simulations). + Evaluate { + /// Alignment check level + alignment_check: DirectIoAlignmentCheckLevel, + /// Latency padded for performance simulation. + latency_padding: DirectIoLatencyPadding, + }, + /// Direct IO enabled. + Enabled { + /// Actions to perform on alignment error. + on_alignment_error: DirectIoOnAlignmentErrorAction, + }, + } + + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(rename_all = "kebab-case")] + pub enum DirectIoAlignmentCheckLevel { + #[default] + Error, + Log, + None, + } + + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(rename_all = "kebab-case")] + pub enum DirectIoOnAlignmentErrorAction { + Error, + #[default] + FallbackToBuffered, + } + + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] + #[serde(tag = "type", rename_all = "kebab-case")] + pub enum DirectIoLatencyPadding { + /// Pad virtual file operations with IO to a fake file. 
+ FakeFileRW { path: PathBuf }, + #[default] + None, + } +} + // Wrapped in libpq CopyData #[derive(PartialEq, Eq, Debug)] pub enum PagestreamFeMessage { @@ -633,6 +1056,7 @@ pub enum PagestreamFeMessage { Nblocks(PagestreamNblocksRequest), GetPage(PagestreamGetPageRequest), DbSize(PagestreamDbSizeRequest), + GetSlruSegment(PagestreamGetSlruSegmentRequest), } // Wrapped in libpq CopyData @@ -643,6 +1067,7 @@ pub enum PagestreamBeMessage { GetPage(PagestreamGetPageResponse), Error(PagestreamErrorResponse), DbSize(PagestreamDbSizeResponse), + GetSlruSegment(PagestreamGetSlruSegmentResponse), } // Keep in sync with `pagestore_client.h` @@ -653,6 +1078,7 @@ enum PagestreamBeMessageTag { GetPage = 102, Error = 103, DbSize = 104, + GetSlruSegment = 105, } impl TryFrom for PagestreamBeMessageTag { type Error = u8; @@ -663,40 +1089,78 @@ impl TryFrom for PagestreamBeMessageTag { 102 => Ok(PagestreamBeMessageTag::GetPage), 103 => Ok(PagestreamBeMessageTag::Error), 104 => Ok(PagestreamBeMessageTag::DbSize), + 105 => Ok(PagestreamBeMessageTag::GetSlruSegment), _ => Err(value), } } } +// A GetPage request contains two LSN values: +// +// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means +// "get the latest version present". It's used by the primary server, which knows that no one else +// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is +// Lsn::Max. Standby servers use the current replay LSN as the request LSN. +// +// not_modified_since: Hint to the pageserver that the client knows that the page has not been +// modified between 'not_modified_since' and the request LSN. It's always correct to set +// 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but +// passing an earlier LSN can speed up the request, by allowing the pageserver to process the +// request without waiting for 'request_lsn' to arrive. +// +// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was +// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and +// 'latest' was set to true. The V2 interface was added because there was no correct way for a +// standby to request a page at a particular non-latest LSN, and also include the +// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the +// request, if the standby knows that the page hasn't been modified since, and risk getting an error +// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could +// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2 +// interface allows sending both LSNs, and let the pageserver do the right thing. There was no +// difference in the responses between V1 and V2. 
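To make the two-LSN scheme concrete, here is a minimal sketch (not part of this patch) of how a client could fill in the fields for the two cases described above. It assumes the `Lsn::MAX` constant from `utils::lsn` and a hypothetical per-page `last_written` LSN tracked by the client:

```rust
use pageserver_api::models::PagestreamGetPageRequest;
use pageserver_api::reltag::RelTag;
use utils::lsn::Lsn;

/// Primary: nobody else writes WAL, so request the latest version, but still
/// pass a real `not_modified_since` hint so the pageserver can serve the read
/// without waiting for more WAL to arrive. `last_written` is hypothetical
/// client-side bookkeeping of when this page was last modified.
fn primary_get_page(rel: RelTag, blkno: u32, last_written: Lsn) -> PagestreamGetPageRequest {
    PagestreamGetPageRequest {
        request_lsn: Lsn::MAX, // "get the latest version present"
        not_modified_since: last_written,
        rel,
        blkno,
    }
}

/// Standby: pin the read to the current replay LSN. Using the same value for
/// both fields is always correct, just potentially slower than passing an
/// earlier `not_modified_since`.
fn standby_get_page(rel: RelTag, blkno: u32, replay_lsn: Lsn) -> PagestreamGetPageRequest {
    PagestreamGetPageRequest {
        request_lsn: replay_lsn,
        not_modified_since: replay_lsn,
        rel,
        blkno,
    }
}
```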
+// +#[derive(Clone, Copy)] +pub enum PagestreamProtocolVersion { + V2, +} + #[derive(Debug, PartialEq, Eq)] pub struct PagestreamExistsRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamNblocksRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamGetPageRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, pub blkno: u32, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamDbSizeRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub dbnode: u32, } +#[derive(Debug, PartialEq, Eq)] +pub struct PagestreamGetSlruSegmentRequest { + pub request_lsn: Lsn, + pub not_modified_since: Lsn, + pub kind: u8, + pub segno: u32, +} + #[derive(Debug)] pub struct PagestreamExistsResponse { pub exists: bool, @@ -712,6 +1176,11 @@ pub struct PagestreamGetPageResponse { pub page: Bytes, } +#[derive(Debug)] +pub struct PagestreamGetSlruSegmentResponse { + pub segment: Bytes, +} + #[derive(Debug)] pub struct PagestreamErrorResponse { pub message: String, @@ -734,14 +1203,16 @@ pub struct TenantHistorySize { } impl PagestreamFeMessage { + /// Serialize a compute -> pageserver message. This is currently only used in testing + /// tools. Always uses protocol version 2. pub fn serialize(&self) -> Bytes { let mut bytes = BytesMut::new(); match self { Self::Exists(req) => { bytes.put_u8(0); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -750,8 +1221,8 @@ impl PagestreamFeMessage { Self::Nblocks(req) => { bytes.put_u8(1); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -760,8 +1231,8 @@ impl PagestreamFeMessage { Self::GetPage(req) => { bytes.put_u8(2); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -771,27 +1242,38 @@ impl PagestreamFeMessage { Self::DbSize(req) => { bytes.put_u8(3); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.dbnode); } + + Self::GetSlruSegment(req) => { + bytes.put_u8(4); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); + bytes.put_u8(req.kind); + bytes.put_u32(req.segno); + } } bytes.into() } pub fn parse(body: &mut R) -> anyhow::Result { - // TODO these gets can fail - // these correspond to the NeonMessageTag enum in pagestore_client.h // // TODO: consider using protobuf or serde bincode for less error prone // serialization. 
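+        // The V2 wire layout shared by all request types (matching
+        // serialize() above) is:
+        //
+        //   tag: u8 | request_lsn: u64 | not_modified_since: u64 | per-type payload
+        //
+        // so the two LSNs can be read up front before dispatching on the tag.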
let msg_tag = body.read_u8()?; + + // these two fields are the same for every request type + let request_lsn = Lsn::from(body.read_u64::()?); + let not_modified_since = Lsn::from(body.read_u64::()?); + match msg_tag { 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -800,8 +1282,8 @@ impl PagestreamFeMessage { }, })), 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -810,8 +1292,8 @@ impl PagestreamFeMessage { }, })), 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -821,10 +1303,18 @@ impl PagestreamFeMessage { blkno: body.read_u32::()?, })), 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, dbnode: body.read_u32::()?, })), + 4 => Ok(PagestreamFeMessage::GetSlruSegment( + PagestreamGetSlruSegmentRequest { + request_lsn, + not_modified_since, + kind: body.read_u8()?, + segno: body.read_u32::()?, + }, + )), _ => bail!("unknown smgr message tag: {:?}", msg_tag), } } @@ -860,6 +1350,12 @@ impl PagestreamBeMessage { bytes.put_u8(Tag::DbSize as u8); bytes.put_i64(resp.db_size); } + + Self::GetSlruSegment(resp) => { + bytes.put_u8(Tag::GetSlruSegment as u8); + bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); + bytes.put(&resp.segment[..]); + } } bytes.into() @@ -900,6 +1396,14 @@ impl PagestreamBeMessage { let db_size = buf.read_i64::()?; Self::DbSize(PagestreamDbSizeResponse { db_size }) } + Tag::GetSlruSegment => { + let n_blocks = buf.read_u32::()?; + let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize]; + buf.read_exact(&mut segment)?; + Self::GetSlruSegment(PagestreamGetSlruSegmentResponse { + segment: segment.into(), + }) + } }; let remaining = buf.into_inner(); if !remaining.is_empty() { @@ -918,14 +1422,15 @@ impl PagestreamBeMessage { Self::GetPage(_) => "GetPage", Self::Error(_) => "Error", Self::DbSize(_) => "DbSize", + Self::GetSlruSegment(_) => "GetSlruSegment", } } } #[cfg(test)] mod tests { - use bytes::Buf; use serde_json::json; + use std::str::FromStr; use super::*; @@ -934,8 +1439,8 @@ mod tests { // Test serialization/deserialization of PagestreamFeMessage let messages = vec![ PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), rel: RelTag { forknum: 1, spcnode: 2, @@ -944,8 +1449,8 @@ mod tests { }, }), PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: false, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(4), rel: RelTag { forknum: 1, spcnode: 2, @@ -954,8 +1459,8 @@ mod tests { }, }), PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), rel: RelTag { forknum: 1, spcnode: 2, @@ -965,8 +1470,8 @@ mod tests { blkno: 7, }), PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), 
dbnode: 7, }), ]; @@ -985,7 +1490,8 @@ mod tests { state: TenantState::Active, current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, - generation: None, + generation: 1, + gc_blocking: None, }; let expected_active = json!({ "id": original_active.id.to_string(), @@ -995,7 +1501,8 @@ mod tests { "current_physical_size": 42, "attachment_status": { "slug":"attached", - } + }, + "generation" : 1 }); let original_broken = TenantInfo { @@ -1006,7 +1513,8 @@ mod tests { }, current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, - generation: None, + generation: 1, + gc_blocking: None, }; let expected_broken = json!({ "id": original_broken.id.to_string(), @@ -1020,7 +1528,8 @@ mod tests { "current_physical_size": 42, "attachment_status": { "slug":"attached", - } + }, + "generation" : 1 }); assert_eq!( @@ -1038,18 +1547,6 @@ mod tests { #[test] fn test_reject_unknown_field() { - let id = TenantId::generate(); - let create_request = json!({ - "new_tenant_id": id.to_string(), - "unknown_field": "unknown_value".to_string(), - }); - let err = serde_json::from_value::(create_request).unwrap_err(); - assert!( - err.to_string().contains("unknown field `unknown_field`"), - "expect unknown field `unknown_field` error, got: {}", - err - ); - let id = TenantId::generate(); let config_request = json!({ "tenant_id": id.to_string(), @@ -1061,18 +1558,6 @@ mod tests { "expect unknown field `unknown_field` error, got: {}", err ); - - let attach_request = json!({ - "config": { - "unknown_field": "unknown_value".to_string(), - }, - }); - let err = serde_json::from_value::(attach_request).unwrap_err(); - assert!( - err.to_string().contains("unknown field `unknown_field`"), - "expect unknown field `unknown_field` error, got: {}", - err - ); } #[test] @@ -1131,4 +1616,102 @@ mod tests { assert_eq!(actual, expected, "example on {line}"); } } + + #[test] + fn test_aux_file_migration_path() { + assert!(AuxFilePolicy::is_valid_migration_path( + None, + AuxFilePolicy::V1 + )); + assert!(AuxFilePolicy::is_valid_migration_path( + None, + AuxFilePolicy::V2 + )); + assert!(AuxFilePolicy::is_valid_migration_path( + None, + AuxFilePolicy::CrossValidation + )); + // Self-migration is not a valid migration path, and the caller should handle it by itself. 
+ assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V1), + AuxFilePolicy::V1 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V2), + AuxFilePolicy::V2 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::CrossValidation), + AuxFilePolicy::CrossValidation + )); + // Migrations not allowed + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::CrossValidation), + AuxFilePolicy::V1 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V1), + AuxFilePolicy::V2 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V2), + AuxFilePolicy::V1 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V2), + AuxFilePolicy::CrossValidation + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V1), + AuxFilePolicy::CrossValidation + )); + // Migrations allowed + assert!(AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::CrossValidation), + AuxFilePolicy::V2 + )); + } + + #[test] + fn test_aux_parse() { + assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2); + assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2); + assert_eq!( + AuxFilePolicy::from_str("cross-validation").unwrap(), + AuxFilePolicy::CrossValidation + ); + } + + #[test] + fn test_image_compression_algorithm_parsing() { + use ImageCompressionAlgorithm::*; + let cases = [ + ("disabled", Disabled), + ("zstd", Zstd { level: None }), + ("zstd(18)", Zstd { level: Some(18) }), + ("zstd(-3)", Zstd { level: Some(-3) }), + ]; + + for (display, expected) in cases { + assert_eq!( + ImageCompressionAlgorithm::from_str(display).unwrap(), + expected, + "parsing works" + ); + assert_eq!(format!("{expected}"), display, "Display FromStr roundtrip"); + + let ser = serde_json::to_string(&expected).expect("serialization"); + assert_eq!( + serde_json::from_str::(&ser).unwrap(), + expected, + "serde roundtrip" + ); + + assert_eq!( + serde_json::Value::String(display.to_string()), + serde_json::to_value(expected).unwrap(), + "Display is the serde serialization" + ); + } + } } diff --git a/libs/pageserver_api/src/models/detach_ancestor.rs b/libs/pageserver_api/src/models/detach_ancestor.rs new file mode 100644 index 0000000000..ad74d343ae --- /dev/null +++ b/libs/pageserver_api/src/models/detach_ancestor.rs @@ -0,0 +1,8 @@ +use std::collections::HashSet; + +use utils::id::TimelineId; + +#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)] +pub struct AncestorDetached { + pub reparented_timelines: HashSet, +} diff --git a/libs/pageserver_api/src/models/partitioning.rs b/libs/pageserver_api/src/models/partitioning.rs index 0d287f7be0..f6644be635 100644 --- a/libs/pageserver_api/src/models/partitioning.rs +++ b/libs/pageserver_api/src/models/partitioning.rs @@ -1,9 +1,11 @@ use utils::lsn::Lsn; +use crate::keyspace::SparseKeySpace; + #[derive(Debug, PartialEq, Eq)] pub struct Partitioning { pub keys: crate::keyspace::KeySpace, - + pub sparse_keys: crate::keyspace::SparseKeySpace, pub at_lsn: Lsn, } @@ -32,6 +34,8 @@ impl serde::Serialize for Partitioning { let mut map = serializer.serialize_map(Some(2))?; map.serialize_key("keys")?; map.serialize_value(&KeySpace(&self.keys))?; + map.serialize_key("sparse_keys")?; + map.serialize_value(&KeySpace(&self.sparse_keys.0))?; map.serialize_key("at_lsn")?; map.serialize_value(&WithDisplay(&self.at_lsn))?; map.end() @@ -99,6 +103,7 @@ impl<'a> serde::Deserialize<'a> for 
Partitioning {
         #[derive(serde::Deserialize)]
         struct De {
             keys: KeySpace,
+            sparse_keys: KeySpace,
             #[serde_as(as = "serde_with::DisplayFromStr")]
             at_lsn: Lsn,
         }
@@ -107,6 +112,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
         Ok(Self {
             at_lsn: de.at_lsn,
             keys: de.keys.0,
+            sparse_keys: SparseKeySpace(de.sparse_keys.0),
         })
     }
 }
@@ -133,6 +139,12 @@ mod tests {
                 "030000000000000000000000000000000003"
             ]
         ],
+        "sparse_keys": [
+            [
+                "620000000000000000000000000000000000",
+                "620000000000000000000000000000000003"
+            ]
+        ],
         "at_lsn": "0/2240160"
     }
     "#;
diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs
new file mode 100644
index 0000000000..844a0cda5d
--- /dev/null
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -0,0 +1,193 @@
+use std::time::SystemTime;
+use utils::{serde_percent::Percent, serde_system_time};
+
+/// Pageserver current utilization and scoring for how good a candidate the pageserver would be for
+/// the next tenant.
+///
+/// See and maintain pageserver openapi spec for `/v1/utilization_score` as the truth.
+///
+/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
+/// not handle full u64 values properly.
+#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
+pub struct PageserverUtilization {
+    /// Used disk space (physical, ground truth from statfs())
+    #[serde(serialize_with = "ser_saturating_u63")]
+    pub disk_usage_bytes: u64,
+    /// Free disk space
+    #[serde(serialize_with = "ser_saturating_u63")]
+    pub free_space_bytes: u64,
+
+    /// Wanted disk space, based on the tenant shards currently present on this pageserver: this
+    /// is like disk_usage_bytes, but it is stable and does not change with the cache state of
+    /// tenants, whereas disk_usage_bytes may reach the disk eviction `max_usage_pct` and stay
+    /// there, or may be unrealistically low if the pageserver has attached tenants which haven't
+    /// downloaded layers yet.
+    #[serde(serialize_with = "ser_saturating_u63", default)]
+    pub disk_wanted_bytes: u64,
+
+    // What proportion of total disk space will this pageserver use before it starts evicting data?
+    #[serde(default = "unity_percent")]
+    pub disk_usable_pct: Percent,
+
+    // How many shards are currently on this node?
+    #[serde(default)]
+    pub shard_count: u32,
+
+    // How many shards should this node be able to handle at most?
+    #[serde(default)]
+    pub max_shard_count: u32,
+
+    /// Cached result of [`Self::score`]
+    pub utilization_score: Option<u64>,
+
+    /// When was this snapshot captured, pageserver local time.
+    ///
+    /// Use millis to give confidence that the value is regenerated often enough.
+    pub captured_at: serde_system_time::SystemTime,
+}
+
+fn unity_percent() -> Percent {
+    Percent::new(0).unwrap()
+}
+
+pub type RawScore = u64;
+
+impl PageserverUtilization {
+    const UTILIZATION_FULL: u64 = 1000000;
+
+    /// Calculate a utilization score. The result is to be interpreted as a fraction of
+    /// Self::UTILIZATION_FULL.
+    ///
+    /// Lower values are more affine to scheduling more work on this node.
+    /// - UTILIZATION_FULL represents an ideal node which is fully utilized but should not receive any more work.
+    /// - 0.0 represents an empty node.
+    /// - Negative values are forbidden.
+    /// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
+    ///   layer eviction.
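+    ///
+    /// Worked example (illustrative numbers, not from this patch): a node that
+    /// wants 512 GiB of a 1 TiB fully-usable disk while hosting 100 of its
+    /// maximum 1000 shards scores max(500_000, 100_000) = 500_000, i.e. it is
+    /// considered 50% utilized.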
+    pub fn score(&self) -> RawScore {
+        let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
+            * self.disk_usable_pct.get() as u64)
+            / 100;
+        let disk_utilization_score =
+            self.disk_wanted_bytes * Self::UTILIZATION_FULL / disk_usable_capacity;
+
+        let shard_utilization_score =
+            self.shard_count as u64 * Self::UTILIZATION_FULL / self.max_shard_count as u64;
+        std::cmp::max(disk_utilization_score, shard_utilization_score)
+    }
+
+    pub fn cached_score(&mut self) -> RawScore {
+        match self.utilization_score {
+            None => {
+                let s = self.score();
+                self.utilization_score = Some(s);
+                s
+            }
+            Some(s) => s,
+        }
+    }
+
+    /// If a node is currently hosting more work than it can comfortably handle. This does not indicate that
+    /// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
+    pub fn is_overloaded(score: RawScore) -> bool {
+        score >= Self::UTILIZATION_FULL
+    }
+
+    pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
+        if self.shard_count < shard_count {
+            self.shard_count = shard_count;
+
+            // Dirty cache: this will be recalculated the next time someone retrieves the score
+            self.utilization_score = None;
+        }
+    }
+
+    /// A utilization structure that has a full utilization score: use this as a placeholder when
+    /// you need a utilization but don't have real values yet.
+    pub fn full() -> Self {
+        Self {
+            disk_usage_bytes: 1,
+            free_space_bytes: 0,
+            disk_wanted_bytes: 1,
+            disk_usable_pct: Percent::new(100).unwrap(),
+            shard_count: 1,
+            max_shard_count: 1,
+            utilization_score: Some(Self::UTILIZATION_FULL),
+            captured_at: serde_system_time::SystemTime(SystemTime::now()),
+        }
+    }
+}
+
+/// Test helper
+pub mod test_utilization {
+    use super::PageserverUtilization;
+    use std::time::SystemTime;
+    use utils::{
+        serde_percent::Percent,
+        serde_system_time::{self},
+    };
+
+    // Parameters of the imaginary node used for test utilization instances
+    const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024;
+    const TEST_SHARDS_MAX: u32 = 1000;
+
+    /// Unit test helper. Unconditionally compiled because cfg(test) doesn't carry across crates. Do
+    /// not abuse this function from non-test code.
+    ///
+    /// Emulates a node with a 1000 shard limit and a 1TB disk.
+    pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization {
+        PageserverUtilization {
+            disk_usage_bytes: disk_wanted_bytes,
+            free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE),
+            disk_wanted_bytes,
+            disk_usable_pct: Percent::new(100).unwrap(),
+            shard_count,
+            max_shard_count: TEST_SHARDS_MAX,
+            utilization_score: None,
+            captured_at: serde_system_time::SystemTime(SystemTime::now()),
+        }
+    }
+}
+
+/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
+///
+/// Instead of a newtype, use this, because a newtype would require handling deserialization of values
+/// with the highest bit set (which serde formats parse properly), creating a conundrum of how to
+/// handle and re-serialize such values at the type level. It will be a few
+/// years until we can use more than `i64::MAX` bytes on a disk.
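+///
+/// For example, `u64::MAX` is clamped and serialized as `9223372036854775807` (`i64::MAX`),
+/// as exercised by the test at the bottom of this file.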
+fn ser_saturating_u63(value: &u64, serializer: S) -> Result { + const MAX_FORMAT_INT64: u64 = i64::MAX as u64; + + let value = (*value).min(MAX_FORMAT_INT64); + + serializer.serialize_u64(value) +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use super::*; + + #[test] + fn u64_max_is_serialized_as_u63_max() { + let doc = PageserverUtilization { + disk_usage_bytes: u64::MAX, + free_space_bytes: 0, + disk_wanted_bytes: u64::MAX, + utilization_score: Some(13), + disk_usable_pct: Percent::new(90).unwrap(), + shard_count: 100, + max_shard_count: 200, + captured_at: serde_system_time::SystemTime( + std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), + ), + }; + + let s = serde_json::to_string(&doc).unwrap(); + + let expected = "{\"disk_usage_bytes\":9223372036854775807,\"free_space_bytes\":0,\"disk_wanted_bytes\":9223372036854775807,\"disk_usable_pct\":90,\"shard_count\":100,\"max_shard_count\":200,\"utilization_score\":13,\"captured_at\":\"2024-02-21T10:02:59.000Z\"}"; + + assert_eq!(s, expected); + } +} diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index 3f37af600d..010a9c2932 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -3,7 +3,7 @@ use std::cmp::Ordering; use std::fmt; use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; -use postgres_ffi::relfile_utils::forknumber_to_name; +use postgres_ffi::relfile_utils::{forkname_to_number, forknumber_to_name, MAIN_FORKNUM}; use postgres_ffi::Oid; /// @@ -68,6 +68,57 @@ impl fmt::Display for RelTag { } } +#[derive(Debug, thiserror::Error)] +pub enum ParseRelTagError { + #[error("invalid forknum")] + InvalidForknum(#[source] std::num::ParseIntError), + #[error("missing triplet member {}", .0)] + MissingTripletMember(usize), + #[error("invalid triplet member {}", .0)] + InvalidTripletMember(usize, #[source] std::num::ParseIntError), +} + +impl std::str::FromStr for RelTag { + type Err = ParseRelTagError; + + fn from_str(s: &str) -> Result { + use ParseRelTagError::*; + + // FIXME: in postgres logs this separator is dot + // Example: + // could not read block 2 in rel 1663/208101/2620.1 from page server at lsn 0/2431E6F0 + // with a regex we could get this more painlessly + let (triplet, forknum) = match s.split_once('_').or_else(|| s.split_once('.')) { + Some((t, f)) => { + let forknum = forkname_to_number(Some(f)); + let forknum = if let Ok(f) = forknum { + f + } else { + f.parse::().map_err(InvalidForknum)? 
+                };
+
+                (t, Some(forknum))
+            }
+            None => (s, None),
+        };
+
+        let mut split = triplet
+            .splitn(3, '/')
+            .enumerate()
+            .map(|(i, s)| s.parse::<u32>().map_err(|e| InvalidTripletMember(i, e)));
+        let spcnode = split.next().ok_or(MissingTripletMember(0))??;
+        let dbnode = split.next().ok_or(MissingTripletMember(1))??;
+        let relnode = split.next().ok_or(MissingTripletMember(2))??;
+
+        Ok(RelTag {
+            spcnode,
+            forknum: forknum.unwrap_or(MAIN_FORKNUM),
+            dbnode,
+            relnode,
+        })
+    }
+}
+
 impl RelTag {
     pub fn to_segfile_name(&self, segno: u32) -> String {
         let mut name = if self.spcnode == GLOBALTABLESPACE_OID {
@@ -123,9 +174,12 @@ impl RelTag {
     PartialOrd,
     Ord,
     strum_macros::EnumIter,
+    strum_macros::FromRepr,
+    enum_map::Enum,
 )]
+#[repr(u8)]
 pub enum SlruKind {
-    Clog,
+    Clog = 0,
     MultiXactMembers,
     MultiXactOffsets,
 }
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index e27aad8156..e83cf4c855 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -1,330 +1,63 @@
-use std::{ops::RangeInclusive, str::FromStr};
+//! See docs/rfcs/031-sharding-static.md for an overview of sharding.
+//!
+//! This module contains a variety of types used to represent the concept of sharding
+//! a Neon tenant across multiple physical shards. Since there are quite a few of these,
+//! we provide a summary here.
+//!
+//! Types used to describe shards:
+//! - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
+//!   which identifies a tenant which is not shard-aware. This means its storage paths do not include
+//!   a shard suffix.
+//! - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
+//! - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`; it's just like a `TenantShardId`
+//!   without the tenant ID. This is useful for things that are implicitly scoped to a particular
+//!   tenant, such as layer files.
+//! - [`ShardIdentity`] is the full description of a particular shard's parameters, in sufficient
+//!   detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
+//! - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
+//!   four hex digits. An unsharded tenant is `0000`.
+//! - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant.
+//!
+//! Types used to describe the parameters for data distribution in a sharded tenant:
+//! - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
+//!   multiple shards. Its value is given in 8kiB pages.
+//! - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
+//!   always zero: this is provided for future upgrades that might introduce different
+//!   data distribution schemes.
+//!
+//! Examples:
+//! - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000.
+//! - A single-sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001.
+//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i is in 0..N-1 (inclusive),
+//!   and their slugs are 0004, 0104, 0204, and 0304.
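A minimal sketch of the slug convention described above (not part of this patch; it re-derives what `ShardSlug` formats, and assumes the tuple-struct shapes of `ShardNumber`/`ShardCount` re-exported from `utils::shard`):

```rust
use pageserver_api::shard::{ShardCount, ShardNumber};

// Shard slug: two hex digits of shard number followed by two of shard count.
fn slug(number: ShardNumber, count: ShardCount) -> String {
    format!("{:02x}{:02x}", number.0, count.0)
}

fn main() {
    assert_eq!(slug(ShardNumber(0), ShardCount(0)), "0000"); // legacy unsharded tenant
    assert_eq!(slug(ShardNumber(0), ShardCount(1)), "0001"); // single-shard tenant
    assert_eq!(slug(ShardNumber(1), ShardCount(4)), "0104"); // second of four shards
}
```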
-use crate::{ - key::{is_rel_block_key, Key}, - models::ShardParameters, -}; -use hex::FromHex; +use crate::{key::Key, models::ShardParameters}; +use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; -use thiserror; -use utils::id::TenantId; -#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] -pub struct ShardNumber(pub u8); +#[doc(inline)] +pub use ::utils::shard::*; -#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] -pub struct ShardCount(pub u8); - -impl ShardCount { - pub const MAX: Self = Self(u8::MAX); -} - -impl ShardNumber { - pub const MAX: Self = Self(u8::MAX); -} - -/// TenantShardId identify the units of work for the Pageserver. -/// -/// These are written as `-`, for example: -/// -/// # The second shard in a two-shard tenant -/// 072f1291a5310026820b2fe4b2968934-0102 -/// -/// Historically, tenants could not have multiple shards, and were identified -/// by TenantId. To support this, TenantShardId has a special legacy -/// mode where `shard_count` is equal to zero: this represents a single-sharded -/// tenant which should be written as a TenantId with no suffix. -/// -/// The human-readable encoding of TenantShardId, such as used in API URLs, -/// is both forward and backward compatible: a legacy TenantId can be -/// decoded as a TenantShardId, and when re-encoded it will be parseable -/// as a TenantId. -/// -/// Note that the binary encoding is _not_ backward compatible, because -/// at the time sharding is introduced, there are no existing binary structures -/// containing TenantId that we need to handle. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct TenantShardId { - pub tenant_id: TenantId, - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - -impl TenantShardId { - pub fn unsharded(tenant_id: TenantId) -> Self { - Self { - tenant_id, - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - } - } - - /// The range of all TenantShardId that belong to a particular TenantId. This is useful when - /// you have a BTreeMap of TenantShardId, and are querying by TenantId. - pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive { - RangeInclusive::new( - Self { - tenant_id, - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - }, - Self { - tenant_id, - shard_number: ShardNumber::MAX, - shard_count: ShardCount::MAX, - }, - ) - } - - pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { - ShardSlug(self) - } - - /// Convenience for code that has special behavior on the 0th shard. - pub fn is_zero(&self) -> bool { - self.shard_number == ShardNumber(0) - } - - pub fn is_unsharded(&self) -> bool { - self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) - } - pub fn to_index(&self) -> ShardIndex { - ShardIndex { - shard_number: self.shard_number, - shard_count: self.shard_count, - } - } -} - -/// Formatting helper -struct ShardSlug<'a>(&'a TenantShardId); - -impl<'a> std::fmt::Display for ShardSlug<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{:02x}{:02x}", - self.0.shard_number.0, self.0.shard_count.0 - ) - } -} - -impl std::fmt::Display for TenantShardId { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if self.shard_count != ShardCount(0) { - write!(f, "{}-{}", self.tenant_id, self.shard_slug()) - } else { - // Legacy case (shard_count == 0) -- format as just the tenant id. 
Note that this - // is distinct from the normal single shard case (shard count == 1). - self.tenant_id.fmt(f) - } - } -} - -impl std::fmt::Debug for TenantShardId { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // Debug is the same as Display: the compact hex representation - write!(f, "{}", self) - } -} - -impl std::str::FromStr for TenantShardId { - type Err = hex::FromHexError; - - fn from_str(s: &str) -> Result { - // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count - if s.len() == 32 { - // Legacy case: no shard specified - Ok(Self { - tenant_id: TenantId::from_str(s)?, - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - }) - } else if s.len() == 37 { - let bytes = s.as_bytes(); - let tenant_id = TenantId::from_hex(&bytes[0..32])?; - let mut shard_parts: [u8; 2] = [0u8; 2]; - hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?; - Ok(Self { - tenant_id, - shard_number: ShardNumber(shard_parts[0]), - shard_count: ShardCount(shard_parts[1]), - }) - } else { - Err(hex::FromHexError::InvalidStringLength) - } - } -} - -impl From<[u8; 18]> for TenantShardId { - fn from(b: [u8; 18]) -> Self { - let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap(); - - Self { - tenant_id: TenantId::from(tenant_id_bytes), - shard_number: ShardNumber(b[16]), - shard_count: ShardCount(b[17]), - } - } -} - -/// For use within the context of a particular tenant, when we need to know which -/// shard we're dealing with, but do not need to know the full ShardIdentity (because -/// we won't be doing any page->shard mapping), and do not need to know the fully qualified -/// TenantShardId. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct ShardIndex { - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - -impl ShardIndex { - pub fn new(number: ShardNumber, count: ShardCount) -> Self { - Self { - shard_number: number, - shard_count: count, - } - } - pub fn unsharded() -> Self { - Self { - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - } - } - - pub fn is_unsharded(&self) -> bool { - self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) - } - - /// For use in constructing remote storage paths: concatenate this with a TenantId - /// to get a fully qualified TenantShardId. - /// - /// Backward compat: this function returns an empty string if Self::is_unsharded, such - /// that the legacy pre-sharding remote key format is preserved. 
- pub fn get_suffix(&self) -> String { - if self.is_unsharded() { - "".to_string() - } else { - format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0) - } - } -} - -impl std::fmt::Display for ShardIndex { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0) - } -} - -impl std::fmt::Debug for ShardIndex { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // Debug is the same as Display: the compact hex representation - write!(f, "{}", self) - } -} - -impl std::str::FromStr for ShardIndex { - type Err = hex::FromHexError; - - fn from_str(s: &str) -> Result { - // Expect format: 1 byte shard number, 1 byte shard count - if s.len() == 4 { - let bytes = s.as_bytes(); - let mut shard_parts: [u8; 2] = [0u8; 2]; - hex::decode_to_slice(bytes, &mut shard_parts)?; - Ok(Self { - shard_number: ShardNumber(shard_parts[0]), - shard_count: ShardCount(shard_parts[1]), - }) - } else { - Err(hex::FromHexError::InvalidStringLength) - } - } -} - -impl From<[u8; 2]> for ShardIndex { - fn from(b: [u8; 2]) -> Self { - Self { - shard_number: ShardNumber(b[0]), - shard_count: ShardCount(b[1]), - } - } -} - -impl Serialize for TenantShardId { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - if serializer.is_human_readable() { - serializer.collect_str(self) - } else { - let mut packed: [u8; 18] = [0; 18]; - packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); - packed[16] = self.shard_number.0; - packed[17] = self.shard_count.0; - - packed.serialize(serializer) - } - } -} - -impl<'de> Deserialize<'de> for TenantShardId { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - struct IdVisitor { - is_human_readable_deserializer: bool, - } - - impl<'de> serde::de::Visitor<'de> for IdVisitor { - type Value = TenantShardId; - - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { - if self.is_human_readable_deserializer { - formatter.write_str("value in form of hex string") - } else { - formatter.write_str("value in form of integer array([u8; 18])") - } - } - - fn visit_seq(self, seq: A) -> Result - where - A: serde::de::SeqAccess<'de>, - { - let s = serde::de::value::SeqAccessDeserializer::new(seq); - let id: [u8; 18] = Deserialize::deserialize(s)?; - Ok(TenantShardId::from(id)) - } - - fn visit_str(self, v: &str) -> Result - where - E: serde::de::Error, - { - TenantShardId::from_str(v).map_err(E::custom) - } - } - - if deserializer.is_human_readable() { - deserializer.deserialize_str(IdVisitor { - is_human_readable_deserializer: true, - }) - } else { - deserializer.deserialize_tuple( - 18, - IdVisitor { - is_human_readable_deserializer: false, - }, - ) - } - } +/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`], +/// and to check whether that [`ShardNumber`] is the same as the current shard. 
+#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] +pub struct ShardIdentity { + pub number: ShardNumber, + pub count: ShardCount, + pub stripe_size: ShardStripeSize, + layout: ShardLayout, } /// Stripe size in number of pages #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardStripeSize(pub u32); +impl Default for ShardStripeSize { + fn default() -> Self { + DEFAULT_STRIPE_SIZE + } +} + /// Layout version: for future upgrades where we might change how the key->shard mapping works #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardLayout(u8); @@ -336,16 +69,6 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255); /// Default stripe size in pages: 256MiB divided by 8kiB page size. const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); -/// The ShardIdentity contains the information needed for one member of map -/// to resolve a key to a shard, and then check whether that shard is ==self. -#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] -pub struct ShardIdentity { - pub number: ShardNumber, - pub count: ShardCount, - pub stripe_size: ShardStripeSize, - layout: ShardLayout, -} - #[derive(thiserror::Error, Debug, PartialEq, Eq)] pub enum ShardConfigError { #[error("Invalid shard count")] @@ -360,7 +83,7 @@ impl ShardIdentity { /// An identity with number=0 count=0 is a "none" identity, which represents legacy /// tenants. Modern single-shard tenants should not use this: they should /// have number=0 count=1. - pub fn unsharded() -> Self { + pub const fn unsharded() -> Self { Self { number: ShardNumber(0), count: ShardCount(0), @@ -385,6 +108,9 @@ impl ShardIdentity { } } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.number == ShardNumber(0) && self.count == ShardCount(0) } @@ -433,6 +159,8 @@ impl ShardIdentity { } /// Return true if the key should be ingested by this shard + /// + /// Shards must ingest _at least_ keys which return true from this check. pub fn is_key_local(&self, key: &Key) -> bool { assert!(!self.is_broken()); if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) { @@ -443,20 +171,32 @@ impl ShardIdentity { } /// Return true if the key should be discarded if found in this shard's - /// data store, e.g. during compaction after a split + /// data store, e.g. during compaction after a split. + /// + /// Shards _may_ drop keys which return false here, but are not obliged to. pub fn is_key_disposable(&self, key: &Key) -> bool { if key_is_shard0(key) { // Q: Why can't we dispose of shard0 content if we're not shard 0? - // A: because the WAL ingestion logic currently ingests some shard 0 - // content on all shards, even though it's only read on shard 0. If we - // dropped it, then subsequent WAL ingest to these keys would encounter - // an error. + // A1: because the WAL ingestion logic currently ingests some shard 0 + // content on all shards, even though it's only read on shard 0. If we + // dropped it, then subsequent WAL ingest to these keys would encounter + // an error. + // A2: because key_is_shard0 also covers relation size keys, which are written + // on all shards even though they're only maintained accurately on shard 0. 
false } else { !self.is_key_local(key) } } + /// Obtains the shard number and count combined into a `ShardIndex`. + pub fn shard_index(&self) -> ShardIndex { + ShardIndex { + shard_count: self.count, + shard_number: self.number, + } + } + pub fn shard_slug(&self) -> String { if self.count > ShardCount(0) { format!("-{:02x}{:02x}", self.number.0, self.count.0) @@ -467,82 +207,11 @@ impl ShardIdentity { /// Convenience for checking if this identity is the 0th shard in a tenant, /// for special cases on shard 0 such as ingesting relation sizes. - pub fn is_zero(&self) -> bool { + pub fn is_shard_zero(&self) -> bool { self.number == ShardNumber(0) } } -impl Serialize for ShardIndex { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - if serializer.is_human_readable() { - serializer.collect_str(self) - } else { - // Binary encoding is not used in index_part.json, but is included in anticipation of - // switching various structures (e.g. inter-process communication, remote metadata) to more - // compact binary encodings in future. - let mut packed: [u8; 2] = [0; 2]; - packed[0] = self.shard_number.0; - packed[1] = self.shard_count.0; - packed.serialize(serializer) - } - } -} - -impl<'de> Deserialize<'de> for ShardIndex { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - struct IdVisitor { - is_human_readable_deserializer: bool, - } - - impl<'de> serde::de::Visitor<'de> for IdVisitor { - type Value = ShardIndex; - - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { - if self.is_human_readable_deserializer { - formatter.write_str("value in form of hex string") - } else { - formatter.write_str("value in form of integer array([u8; 2])") - } - } - - fn visit_seq(self, seq: A) -> Result - where - A: serde::de::SeqAccess<'de>, - { - let s = serde::de::value::SeqAccessDeserializer::new(seq); - let id: [u8; 2] = Deserialize::deserialize(s)?; - Ok(ShardIndex::from(id)) - } - - fn visit_str(self, v: &str) -> Result - where - E: serde::de::Error, - { - ShardIndex::from_str(v).map_err(E::custom) - } - } - - if deserializer.is_human_readable() { - deserializer.deserialize_str(IdVisitor { - is_human_readable_deserializer: true, - }) - } else { - deserializer.deserialize_tuple( - 2, - IdVisitor { - is_human_readable_deserializer: false, - }, - ) - } - } -} - /// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys /// in order to be able to serve basebackup requests without peer communication). fn key_is_shard0(key: &Key) -> bool { @@ -550,7 +219,13 @@ fn key_is_shard0(key: &Key) -> bool { // relation pages are distributed to shards other than shard zero. Everything else gets // stored on shard 0. This guarantees that shard 0 can independently serve basebackup // requests, and any request other than those for particular blocks in relations. - !is_rel_block_key(key) + // + // The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table + // type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0 + // because they must be included in basebackups. 
+ let is_initfork = key.field5 == INIT_FORKNUM; + + !key.is_rel_block_key() || is_initfork } /// Provide the same result as the function in postgres `hashfn.h` with the same name @@ -597,11 +272,29 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke ShardNumber((hash % count.0 as u32) as u8) } +/// For debugging, while not exposing the internals. +#[derive(Debug)] +#[allow(unused)] // used by debug formatting by pagectl +struct KeyShardingInfo { + shard0: bool, + shard_number: ShardNumber, +} + +pub fn describe( + key: &Key, + shard_count: ShardCount, + stripe_size: ShardStripeSize, +) -> impl std::fmt::Debug { + KeyShardingInfo { + shard0: key_is_shard0(key), + shard_number: key_to_shard_number(shard_count, stripe_size, key), + } +} + #[cfg(test)] mod tests { use std::str::FromStr; - use bincode; use utils::{id::TenantId, Hex}; use super::*; @@ -793,4 +486,108 @@ mod tests { let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key); assert_eq!(shard, ShardNumber(8)); } + + #[test] + fn shard_id_split() { + let tenant_id = TenantId::generate(); + let parent = TenantShardId::unsharded(tenant_id); + + // Unsharded into 2 + assert_eq!( + parent.split(ShardCount(2)), + vec![ + TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(0) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(1) + } + ] + ); + + // Unsharded into 4 + assert_eq!( + parent.split(ShardCount(4)), + vec![ + TenantShardId { + tenant_id, + shard_count: ShardCount(4), + shard_number: ShardNumber(0) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(4), + shard_number: ShardNumber(1) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(4), + shard_number: ShardNumber(2) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(4), + shard_number: ShardNumber(3) + } + ] + ); + + // count=1 into 2 (check this works the same as unsharded.) 
+ let parent = TenantShardId { + tenant_id, + shard_count: ShardCount(1), + shard_number: ShardNumber(0), + }; + assert_eq!( + parent.split(ShardCount(2)), + vec![ + TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(0) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(1) + } + ] + ); + + // count=2 into count=8 + let parent = TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(1), + }; + assert_eq!( + parent.split(ShardCount(8)), + vec![ + TenantShardId { + tenant_id, + shard_count: ShardCount(8), + shard_number: ShardNumber(1) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(8), + shard_number: ShardNumber(3) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(8), + shard_number: ShardNumber(5) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(8), + shard_number: ShardNumber(7) + }, + ] + ); + } } diff --git a/libs/pageserver_api/src/control_api.rs b/libs/pageserver_api/src/upcall_api.rs similarity index 54% rename from libs/pageserver_api/src/control_api.rs rename to libs/pageserver_api/src/upcall_api.rs index 0acc3a7bb0..2e88836bd0 100644 --- a/libs/pageserver_api/src/control_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -6,19 +6,36 @@ use serde::{Deserialize, Serialize}; use utils::id::NodeId; -use crate::shard::TenantShardId; +use crate::{ + controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId, +}; +/// Upcall message sent by the pageserver to the configured `control_plane_api` on +/// startup. #[derive(Serialize, Deserialize)] pub struct ReAttachRequest { pub node_id: NodeId, + + /// Optional inline self-registration: this is useful with the storage controller, + /// if the node already has a node_id set. 
+ #[serde(skip_serializing_if = "Option::is_none", default)] + pub register: Option, } -#[derive(Serialize, Deserialize)] +fn default_mode() -> LocationConfigMode { + LocationConfigMode::AttachedSingle +} + +#[derive(Serialize, Deserialize, Debug)] pub struct ReAttachResponseTenant { pub id: TenantShardId, - pub gen: u32, -} + /// Mandatory if LocationConfigMode is None or set to an Attached* mode + pub gen: Option, + /// Default value only for backward compat: this field should be set + #[serde(default = "default_mode")] + pub mode: LocationConfigMode, +} #[derive(Serialize, Deserialize)] pub struct ReAttachResponse { pub tenants: Vec, diff --git a/libs/postgres_backend/Cargo.toml b/libs/postgres_backend/Cargo.toml index 8e249c09f7..f6854328fc 100644 --- a/libs/postgres_backend/Cargo.toml +++ b/libs/postgres_backend/Cargo.toml @@ -13,14 +13,14 @@ rustls.workspace = true serde.workspace = true thiserror.workspace = true tokio.workspace = true +tokio-util.workspace = true tokio-rustls.workspace = true tracing.workspace = true pq_proto.workspace = true -workspace_hack.workspace = true [dev-dependencies] once_cell.workspace = true rustls-pemfile.workspace = true tokio-postgres.workspace = true -tokio-postgres-rustls.workspace = true \ No newline at end of file +tokio-postgres-rustls.workspace = true diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 73d25619c3..600f1d728c 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -6,7 +6,6 @@ #![deny(clippy::undocumented_unsafe_blocks)] use anyhow::Context; use bytes::Bytes; -use futures::pin_mut; use serde::{Deserialize, Serialize}; use std::io::ErrorKind; use std::net::SocketAddr; @@ -17,6 +16,7 @@ use std::{fmt, io}; use std::{future::Future, str::FromStr}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_rustls::TlsAcceptor; +use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn}; use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter}; @@ -69,8 +69,10 @@ impl QueryError { } /// Returns true if the given error is a normal consequence of a network issue, -/// or the client closing the connection. These errors can happen during normal -/// operations, and don't indicate a bug in our code. +/// or the client closing the connection. +/// +/// These errors can happen during normal operations, +/// and don't indicate a bug in our code. pub fn is_expected_io_error(e: &io::Error) -> bool { use io::ErrorKind::*; matches!( @@ -378,8 +380,7 @@ impl PostgresBackend { &mut self, cx: &mut std::task::Context<'_>, ) -> Poll> { - let flush_fut = self.flush(); - pin_mut!(flush_fut); + let flush_fut = std::pin::pin!(self.flush()); flush_fut.poll(cx) } @@ -402,21 +403,15 @@ impl PostgresBackend { } /// Wrapper for run_message_loop() that shuts down socket when we are done - pub async fn run( + pub async fn run( mut self, handler: &mut impl Handler, - shutdown_watcher: F, - ) -> Result<(), QueryError> - where - F: Fn() -> S + Clone, - S: Future, - { - let ret = self - .run_message_loop(handler, shutdown_watcher.clone()) - .await; + cancel: &CancellationToken, + ) -> Result<(), QueryError> { + let ret = self.run_message_loop(handler, cancel).await; tokio::select! { - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // do nothing; we most likely got already stopped by shutdown and will log it next. 
} _ = self.framed.shutdown() => { @@ -446,21 +441,17 @@ impl PostgresBackend { } } - async fn run_message_loop( + async fn run_message_loop( &mut self, handler: &mut impl Handler, - shutdown_watcher: F, - ) -> Result<(), QueryError> - where - F: Fn() -> S, - S: Future, - { + cancel: &CancellationToken, + ) -> Result<(), QueryError> { trace!("postgres backend to {:?} started", self.peer_addr); tokio::select!( biased; - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // We were requested to shut down. tracing::info!("shutdown request received during handshake"); return Err(QueryError::Shutdown) @@ -475,7 +466,7 @@ impl PostgresBackend { let mut query_string = Bytes::new(); while let Some(msg) = tokio::select!( biased; - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // We were requested to shut down. tracing::info!("shutdown request received in run_message_loop"); return Err(QueryError::Shutdown) @@ -487,7 +478,7 @@ impl PostgresBackend { let result = self.process_message(handler, msg, &mut query_string).await; tokio::select!( biased; - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // We were requested to shut down. tracing::info!("shutdown request received during response flush"); @@ -674,11 +665,17 @@ impl PostgresBackend { assert!(self.state < ProtoState::Authentication); let have_tls = self.tls_config.is_some(); match msg { - FeStartupPacket::SslRequest => { + FeStartupPacket::SslRequest { direct } => { debug!("SSL requested"); - self.write_message(&BeMessage::EncryptionResponse(have_tls)) - .await?; + if !direct { + self.write_message(&BeMessage::EncryptionResponse(have_tls)) + .await?; + } else if !have_tls { + return Err(QueryError::Other(anyhow::anyhow!( + "direct SSL negotiation but no TLS support" + ))); + } if have_tls { self.start_tls().await?; @@ -822,10 +819,11 @@ impl PostgresBackend { Ok(ProcessMsgResult::Continue) } - /// Log as info/error result of handling COPY stream and send back - /// ErrorResponse if that makes sense. Shutdown the stream if we got - /// Terminate. TODO: transition into waiting for Sync msg if we initiate the - /// close. + /// - Log as info/error result of handling COPY stream and send back + /// ErrorResponse if that makes sense. + /// - Shutdown the stream if we got Terminate. + /// - Then close the connection because we don't handle exiting from COPY + /// stream normally. pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) { use CopyStreamHandlerEnd::*; @@ -851,10 +849,6 @@ impl PostgresBackend { } } - if let Terminate = &end { - self.state = ProtoState::Closed; - } - let err_to_send_and_errcode = match &end { ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)), Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)), @@ -884,6 +878,12 @@ impl PostgresBackend { error!("failed to send ErrorResponse: {}", ee); } } + + // Proper COPY stream finishing to continue using the connection is not + // implemented at the server side (we don't need it so far). To prevent + // further usages of the connection, close it. 
+ self.framed.shutdown().await.ok(); + self.state = ProtoState::Closed; } } diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index e046fa5260..7ec85f0dbe 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -3,13 +3,14 @@ use once_cell::sync::Lazy; use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError}; use pq_proto::{BeMessage, RowDescriptor}; use std::io::Cursor; -use std::{future, sync::Arc}; +use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio_postgres::config::SslMode; use tokio_postgres::tls::MakeTlsConnect; use tokio_postgres::{Config, NoTls, SimpleQueryMessage}; use tokio_postgres_rustls::MakeRustlsConnect; +use tokio_util::sync::CancellationToken; // generate client, server test streams async fn make_tcp_pair() -> (TcpStream, TcpStream) { @@ -50,7 +51,7 @@ async fn simple_select() { tokio::spawn(async move { let mut handler = TestHandler {}; - pgbackend.run(&mut handler, future::pending::<()>).await + pgbackend.run(&mut handler, &CancellationToken::new()).await }); let conf = Config::new(); @@ -72,14 +73,19 @@ async fn simple_select() { } } -static KEY: Lazy = Lazy::new(|| { +static KEY: Lazy> = Lazy::new(|| { let mut cursor = Cursor::new(include_bytes!("key.pem")); - rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone()) + let key = rustls_pemfile::rsa_private_keys(&mut cursor) + .next() + .unwrap() + .unwrap(); + rustls::pki_types::PrivateKeyDer::Pkcs1(key) }); -static CERT: Lazy = Lazy::new(|| { +static CERT: Lazy> = Lazy::new(|| { let mut cursor = Cursor::new(include_bytes!("cert.pem")); - rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone()) + let cert = rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap(); + cert }); // test that basic select with ssl works @@ -88,9 +94,8 @@ async fn simple_select_ssl() { let (client_sock, server_sock) = make_tcp_pair().await; let server_cfg = rustls::ServerConfig::builder() - .with_safe_defaults() .with_no_client_auth() - .with_single_cert(vec![CERT.clone()], KEY.clone()) + .with_single_cert(vec![CERT.clone()], KEY.clone_key()) .unwrap(); let tls_config = Some(Arc::new(server_cfg)); let pgbackend = @@ -98,14 +103,13 @@ async fn simple_select_ssl() { tokio::spawn(async move { let mut handler = TestHandler {}; - pgbackend.run(&mut handler, future::pending::<()>).await + pgbackend.run(&mut handler, &CancellationToken::new()).await }); let client_cfg = rustls::ClientConfig::builder() - .with_safe_defaults() .with_root_certificates({ let mut store = rustls::RootCertStore::empty(); - store.add(&CERT).unwrap(); + store.add(CERT.clone()).unwrap(); store }) .with_no_client_auth(); diff --git a/libs/postgres_connection/Cargo.toml b/libs/postgres_connection/Cargo.toml index fbfea80ae2..19027d13ff 100644 --- a/libs/postgres_connection/Cargo.toml +++ b/libs/postgres_connection/Cargo.toml @@ -11,7 +11,5 @@ postgres.workspace = true tokio-postgres.workspace = true url.workspace = true -workspace_hack.workspace = true - [dev-dependencies] once_cell.workspace = true diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index ccf9108895..ddf9f7b610 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -7,6 +7,7 @@ use std::fmt; use url::Host; /// Parses a string of format either `host:port` or `host` into a corresponding pair. 
+///
 /// The `host` part should be a correct `url::Host`, while `port` (if present) should be
 /// a valid decimal u16 of digits only.
 pub fn parse_host_port<S: AsRef<str>>(host_port: S) -> Result<(Host, Option<u16>), anyhow::Error> {
@@ -178,6 +179,13 @@ impl PgConnectionConfig {
     }
 }

+impl fmt::Display for PgConnectionConfig {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // The password is intentionally hidden and not part of this display string.
+        write!(f, "postgresql://{}:{}", self.host, self.port)
+    }
+}
+
 impl fmt::Debug for PgConnectionConfig {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         // We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")`
diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml
index 86e72f6bdd..ee69878f69 100644
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -19,8 +19,6 @@ thiserror.workspace = true
 serde.workspace = true
 utils.workspace = true

-workspace_hack.workspace = true
-
 [dev-dependencies]
 env_logger.workspace = true
 postgres.workspace = true
diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs
index 8e6761d6d3..d3e3ce648f 100644
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -14,7 +14,7 @@ impl ParseCallbacks for PostgresFfiCallbacks {
     fn include_file(&self, filename: &str) {
         // This does the equivalent of passing bindgen::CargoCallbacks
         // to the builder .parse_callbacks() method.
-        let cargo_callbacks = bindgen::CargoCallbacks;
+        let cargo_callbacks = bindgen::CargoCallbacks::new();
         cargo_callbacks.include_file(filename)
     }

@@ -126,6 +126,7 @@ fn main() -> anyhow::Result<()> {
         .allowlist_type("PageHeaderData")
         .allowlist_type("DBState")
         .allowlist_type("RelMapFile")
+        .allowlist_type("RepOriginId")
         // Because structs are used for serialization, tell bindgen to emit
         // explicit padding fields.
         .explicit_padding(true)
diff --git a/libs/postgres_ffi/src/controlfile_utils.rs b/libs/postgres_ffi/src/controlfile_utils.rs
index 0918d15001..eaa9450294 100644
--- a/libs/postgres_ffi/src/controlfile_utils.rs
+++ b/libs/postgres_ffi/src/controlfile_utils.rs
@@ -29,7 +29,7 @@ use anyhow::{bail, Result};
 use bytes::{Bytes, BytesMut};

 /// Equivalent to sizeof(ControlFileData) in C
-const SIZEOF_CONTROLDATA: usize = std::mem::size_of::<ControlFileData>();
+const SIZEOF_CONTROLDATA: usize = size_of::<ControlFileData>();

 impl ControlFileData {
     /// Compute the offset of the `crc` field within the `ControlFileData` struct.
diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs
index d10ebfe277..9acb105e9b 100644
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -3,7 +3,7 @@
 #![allow(non_snake_case)]
 // bindgen creates some unsafe code with no doc comments.
 #![allow(clippy::missing_safety_doc)]
-// noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code.
+// noted at 1.63 that in many cases there are u32 -> u32 transmutes in bindgen code.
 #![allow(clippy::useless_transmute)]
 // modules included with the postgres_ffi macro depend on the types of the specific version's
 // types, and trigger a too eager lint.
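For reference, the contract documented on `parse_host_port` above plays out like this in use (an illustrative sketch against that documentation, not part of the patch; it assumes `anyhow` on the caller's side):

```rust
use postgres_connection::parse_host_port;

// Sketch only: `host:port` yields the parsed url::Host plus Some(port),
// while a bare `host` leaves the port as None.
fn parse_examples() -> anyhow::Result<()> {
    let (host, port) = parse_host_port("localhost:5432")?;
    assert_eq!(host.to_string(), "localhost");
    assert_eq!(port, Some(5432));

    let (host, port) = parse_host_port("localhost")?;
    assert_eq!(host.to_string(), "localhost");
    assert_eq!(port, None);
    Ok(())
}
```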
@@ -110,6 +110,7 @@ pub mod pg_constants;
 pub mod relfile_utils;

 // Export some widely used datatypes that are unlikely to change across Postgres versions
+pub use v14::bindings::RepOriginId;
 pub use v14::bindings::{uint32, uint64, Oid};
 pub use v14::bindings::{BlockNumber, OffsetNumber};
 pub use v14::bindings::{MultiXactId, TransactionId};
@@ -118,7 +119,9 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo};
 // Likewise for these, although the assumption that these don't change is a little more iffy.
 pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
 pub use v14::bindings::{PageHeaderData, XLogRecord};
-pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
+pub use v14::xlog_utils::{
+    XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
+};

 pub use v14::bindings::{CheckPoint, ControlFileData};

@@ -133,15 +136,15 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;

 // Export some version independent functions that are used outside of this mod
 pub use v14::xlog_utils::encode_logical_message;
-pub use v14::xlog_utils::from_pg_timestamp;
 pub use v14::xlog_utils::get_current_timestamp;
 pub use v14::xlog_utils::to_pg_timestamp;
+pub use v14::xlog_utils::try_from_pg_timestamp;
 pub use v14::xlog_utils::XLogFileName;

 pub use v14::bindings::DBState_DB_SHUTDOWNED;

-pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result<bool> {
-    dispatch_pgversion!(version, Ok(pgv::bindings::bkpimg_is_compressed(bimg_info)))
+pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool {
+    dispatch_pgversion!(version, pgv::bindings::bkpimg_is_compressed(bimg_info))
 }

 pub fn generate_wal_segment(
diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs
index d59e0e4a15..6ce855c78e 100644
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -31,7 +31,7 @@ pub const SMGR_TRUNCATE_FSM: u32 = 0x0004;

 //
 // Assumes 8 byte alignment
-const SIZEOF_PAGE_HEADER_DATA: usize = std::mem::size_of::<PageHeaderData>();
+const SIZEOF_PAGE_HEADER_DATA: usize = size_of::<PageHeaderData>();
 pub const MAXALIGN_SIZE_OF_PAGE_HEADER_DATA: usize = (SIZEOF_PAGE_HEADER_DATA + 7) & !7;

 //
@@ -80,6 +80,9 @@ pub const XLOG_XACT_ABORT: u8 = 0x20;
 pub const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30;
 pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40;

+// From standbydefs.h
+pub const XLOG_RUNNING_XACTS: u8 = 0x10;
+
 // From srlu.h
 pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
 pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usize;
@@ -99,7 +102,7 @@ pub const XACT_XINFO_HAS_SUBXACTS: u32 = 1u32 << 1;
 pub const XACT_XINFO_HAS_RELFILENODES: u32 = 1u32 << 2;
 pub const XACT_XINFO_HAS_INVALS: u32 = 1u32 << 3;
 pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
-// pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
+pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
 // pub const XACT_XINFO_HAS_AE_LOCKS: u32 = 1u32 << 6;
 // pub const XACT_XINFO_HAS_GID: u32 = 1u32 << 7;

@@ -164,6 +167,7 @@ pub const RM_RELMAP_ID: u8 = 7;
 pub const RM_STANDBY_ID: u8 = 8;
 pub const RM_HEAP2_ID: u8 = 9;
 pub const RM_HEAP_ID: u8 = 10;
+pub const RM_REPLORIGIN_ID: u8 = 19;
 pub const RM_LOGICALMSG_ID: u8 = 21;

 // from neon_rmgr.h
@@ -187,7 +191,7 @@ pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
 pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
 pub const XLOG_TBLSPC_DROP: u8 = 0x10;

-pub const SIZEOF_XLOGRECORD: u32 = std::mem::size_of::<XLogRecord>() as u32;
+pub const SIZEOF_XLOGRECORD: u32 = size_of::<XLogRecord>() as u32;

 //
 // from xlogrecord.h
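With `bkpimage_is_compressed` no longer returning `Result`, call sites reduce to a plain branch. A hedged sketch (the flags value is a placeholder, and it assumes the `version` argument is the Postgres major version that `dispatch_pgversion!` dispatches on):

```rust
fn main() {
    // Sketch: bimg_info would come from a decoded WAL block-image header;
    // 0x04 is a placeholder flags value, 16 a Postgres major version.
    let bimg_info: u8 = 0x04;
    if postgres_ffi::bkpimage_is_compressed(bimg_info, 16) {
        println!("block image needs decompression before use");
    }
}
```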
@@ -220,6 +224,10 @@ pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
 pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
 pub const XLP_LONG_HEADER: u16 = 0x0002;

+/* From xlog.h */
+pub const XLOG_REPLORIGIN_SET: u8 = 0x00;
+pub const XLOG_REPLORIGIN_DROP: u8 = 0x10;
+
 /* From replication/slot.h */
 pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4 /* offset of `slotdata` in ReplicationSlotOnDisk */
     + 64 /* NameData */ + 4*4;
@@ -234,6 +242,9 @@ pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32;

 pub const VM_HEAPBLOCKS_PER_PAGE: u32 =
     (BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA) as u32 * (8 / 2); // MAPSIZE * (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)

+/* From origin.c */
+pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE;
+
 // List of subdirectories inside pgdata.
 // Copied from src/bin/initdb/initdb.c
 pub const PGDATA_SUBDIRS: [&str; 22] = [
diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs
index 56ce9c901e..0cfd56962e 100644
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -42,9 +42,9 @@ pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
 pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8;
 pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;

-pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::<XLogPageHeaderData>();
-pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::<XLogLongPageHeaderData>();
-pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
+pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = size_of::<XLogPageHeaderData>();
+pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = size_of::<XLogLongPageHeaderData>();
+pub const XLOG_SIZE_OF_XLOG_RECORD: usize = size_of::<XLogRecord>();

 #[allow(clippy::identity_op)]
 pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;
@@ -119,11 +119,6 @@ pub fn generate_pg_control(
     // Generate new pg_control needed for bootstrap
     checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0;

-    //reset some fields we don't want to preserve
-    //TODO Check this.
-    //We may need to determine the value from twophase data.
-    checkpoint.oldestActiveXid = 0;
-
     //save new values in pg_control
     pg_control.checkPoint = 0;
     pg_control.checkPointCopy = checkpoint;
@@ -140,6 +135,8 @@ pub fn get_current_timestamp() -> TimestampTz {
 mod timestamp_conversions {
     use std::time::Duration;

+    use anyhow::Context;
+
     use super::*;

     const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1)
@@ -159,18 +156,18 @@
         }
     }

-    pub fn from_pg_timestamp(time: TimestampTz) -> SystemTime {
+    pub fn try_from_pg_timestamp(time: TimestampTz) -> anyhow::Result<SystemTime> {
         let time: u64 = time
             .try_into()
-            .expect("timestamp before millenium (postgres epoch)");
+            .context("timestamp before millennium (postgres epoch)")?;
         let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC;
         SystemTime::UNIX_EPOCH
             .checked_add(Duration::from_micros(since_unix_epoch))
-            .expect("SystemTime overflow")
+            .context("SystemTime overflow")
     }
 }

-pub use timestamp_conversions::{from_pg_timestamp, to_pg_timestamp};
+pub use timestamp_conversions::{to_pg_timestamp, try_from_pg_timestamp};

 // Returns (aligned) end_lsn of the last record in data_dir with WAL segments.
// start_lsn must point to some previously known record boundary (beginning of @@ -207,10 +204,16 @@ pub fn find_end_of_wal( let seg_offs = curr_lsn.segment_offset(wal_seg_size); segment.seek(SeekFrom::Start(seg_offs as u64))?; // loop inside segment - loop { + while curr_lsn.segment_number(wal_seg_size) == segno { let bytes_read = segment.read(&mut buf)?; if bytes_read == 0 { - break; // EOF + debug!( + "find_end_of_wal reached end at {:?}, EOF in segment {:?} at offset {}", + result, + seg_file_path, + curr_lsn.segment_offset(wal_seg_size) + ); + return Ok(result); } curr_lsn += bytes_read as u64; decoder.feed_bytes(&buf[0..bytes_read]); @@ -310,7 +313,7 @@ impl XLogLongPageHeaderData { } } -pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::(); +pub const SIZEOF_CHECKPOINT: usize = size_of::(); impl CheckPoint { pub fn encode(&self) -> Result { @@ -330,7 +333,10 @@ impl CheckPoint { /// Returns 'true' if the XID was updated. pub fn update_next_xid(&mut self, xid: u32) -> bool { // nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround. - let mut new_xid = std::cmp::max(xid.wrapping_add(1), pg_constants::FIRST_NORMAL_TRANSACTION_ID); + let mut new_xid = std::cmp::max( + xid.wrapping_add(1), + pg_constants::FIRST_NORMAL_TRANSACTION_ID, + ); // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL. // XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE new_xid = @@ -352,6 +358,28 @@ impl CheckPoint { } false } + + /// Advance next multi-XID/offset to those given in arguments. + /// + /// It's important that this handles wraparound correctly. This should match the + /// MultiXactAdvanceNextMXact() logic in PostgreSQL's xlog_redo() function. + /// + /// Returns 'true' if the Checkpoint was updated. + pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool { + let mut modified = false; + + if multi_xid.wrapping_sub(self.nextMulti) as i32 > 0 { + self.nextMulti = multi_xid; + modified = true; + } + + if multi_offset.wrapping_sub(self.nextMultiOffset) as i32 > 0 { + self.nextMultiOffset = multi_offset; + modified = true; + } + + modified + } } /// Generate new, empty WAL segment, with correct block headers at the first @@ -366,8 +394,16 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result 0 { + assert!(seg_off >= XLOG_SIZE_OF_XLOG_LONG_PHD); + // xlp_rem_len doesn't include page header, hence the subtraction. + ( + seg_off - XLOG_SIZE_OF_XLOG_LONG_PHD, + pg_constants::XLP_FIRST_IS_CONTRECORD, + ) } else { (0, 0) }; @@ -396,20 +432,22 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result 0 { + assert!(page_off >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64); + ( + (page_off - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64) as u32, + pg_constants::XLP_FIRST_IS_CONTRECORD, + ) + } else { + (0, 0) + }; let header = XLogPageHeaderData { xlp_magic: XLOG_PAGE_MAGIC as u16, - xlp_info: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 { - pg_constants::XLP_FIRST_IS_CONTRECORD - } else { - 0 - }, + xlp_info, xlp_tli: PG_TLI, xlp_pageaddr: lsn.page_lsn().0, - xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 { - page_off as u32 - } else { - 0u32 - }, + xlp_rem_len, ..Default::default() // Put 0 in padding fields. 
}; let hdr_bytes = header.encode()?; @@ -425,11 +463,11 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result Result<()> { .init(); let arg_matches = cli().get_matches(); - let wal_craft = |arg_matches: &ArgMatches, client| { - let (intermediate_lsns, end_of_wal_lsn) = match arg_matches + let wal_craft = |arg_matches: &ArgMatches, client: &mut Client| { + let intermediate_lsns = match arg_matches .get_one::("type") .map(|s| s.as_str()) .context("'type' is required")? @@ -25,6 +26,7 @@ fn main() -> Result<()> { LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?, a => panic!("Unknown --type argument: {a}"), }; + let end_of_wal_lsn = client.pg_current_wal_insert_lsn()?; for lsn in intermediate_lsns { println!("intermediate_lsn = {lsn}"); } diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 281a180e3b..6052f04d11 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -4,8 +4,9 @@ use log::*; use postgres::types::PgLsn; use postgres::Client; use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; -use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; -use std::cmp::Ordering; +use postgres_ffi::{ + XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, +}; use std::path::{Path, PathBuf}; use std::process::Command; use std::time::{Duration, Instant}; @@ -232,59 +233,62 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow pub trait Crafter { const NAME: &'static str; - /// Generates WAL using the client `client`. Returns a pair of: - /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from. - /// May include or exclude Lsn(0) and the end-of-wal. - /// * The expected end-of-wal LSN. - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)>; + /// Generates WAL using the client `client`. Returns a vector of some valid + /// "interesting" intermediate LSNs which one may start reading from. + /// test_end_of_wal uses this to check various starting points. + /// + /// Note that postgres is generally keen about writing some WAL. While we + /// try to disable it (autovacuum, big wal_writer_delay, etc) it is always + /// possible, e.g. xl_running_xacts are dumped each 15s. So checks about + /// stable WAL end would be flaky unless postgres is shut down. For this + /// reason returning potential end of WAL here is pointless. Most of the + /// time this doesn't happen though, so it is reasonable to create needed + /// WAL structure and immediately kill postgres like test_end_of_wal does. + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result>; } +/// Wraps some WAL craft function, providing current LSN to it before the +/// insertion and flushing WAL afterwards. Also pushes initial LSN to the +/// result. 
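+/// (The flush at the end has to be careful: when the insert position sits just
+/// after a page header, it backs off to the page boundary so that it never
+/// flushes past the WAL that has actually been generated; see the comment in
+/// the function body.)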
fn craft_internal( client: &mut C, - f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec, Option)>, -) -> anyhow::Result<(Vec, PgLsn)> { + f: impl Fn(&mut C, PgLsn) -> anyhow::Result>, +) -> anyhow::Result> { ensure_server_config(client)?; let initial_lsn = client.pg_current_wal_insert_lsn()?; info!("LSN initial = {}", initial_lsn); - let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?; - let last_lsn = match last_lsn { - None => client.pg_current_wal_insert_lsn()?, - Some(last_lsn) => { - let insert_lsn = client.pg_current_wal_insert_lsn()?; - match last_lsn.cmp(&insert_lsn) { - Ordering::Less => bail!( - "Some records were inserted after the crafted WAL: {} vs {}", - last_lsn, - insert_lsn - ), - Ordering::Equal => last_lsn, - Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"), - } - } - }; + let mut intermediate_lsns = f(client, initial_lsn)?; if !intermediate_lsns.starts_with(&[initial_lsn]) { intermediate_lsns.insert(0, initial_lsn); } - // Some records may be not flushed, e.g. non-transactional logical messages. - client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?; - match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) { - Ordering::Less => bail!("Some records were flushed after the crafted WAL"), - Ordering::Equal => {} - Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"), + // Some records may be not flushed, e.g. non-transactional logical messages. Flush now. + // + // If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn + // returns the position just after the page header on the next page. That's where the next + // record will be inserted. But the page header hasn't actually been written to the WAL + // yet, and if you try to flush it, you get a "request to flush past end of generated WAL" + // error. Because of that, if the insert location is just after a page header, back off to + // previous page boundary. + let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?); + if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 { + lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64; + } else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 { + lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64; } - Ok((intermediate_lsns, last_lsn)) + client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?; + Ok(intermediate_lsns) } pub struct Simple; impl Crafter for Simple { const NAME: &'static str = "simple"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { craft_internal(client, |client, _| { client.execute("CREATE table t(x int)", &[])?; - Ok((Vec::new(), None)) + Ok(Vec::new()) }) } } @@ -292,97 +296,114 @@ impl Crafter for Simple { pub struct LastWalRecordXlogSwitch; impl Crafter for LastWalRecordXlogSwitch { const NAME: &'static str = "last_wal_record_xlog_switch"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { - // Do not use generate_internal because here we end up with flush_lsn exactly on + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { + // Do not use craft_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. 
ensure_server_config(client)?; client.execute("CREATE table t(x int)", &[])?; let before_xlog_switch = client.pg_current_wal_insert_lsn()?; - let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); - let next_segment = PgLsn::from(0x0200_0000); + // pg_switch_wal returns end of last record of the switched segment, + // i.e. end of SWITCH itself. + let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); + let before_xlog_switch_u64 = u64::from(before_xlog_switch); + let next_segment = PgLsn::from( + before_xlog_switch_u64 - (before_xlog_switch_u64 % WAL_SEGMENT_SIZE as u64) + + WAL_SEGMENT_SIZE as u64, + ); ensure!( - after_xlog_switch <= next_segment, - "XLOG_SWITCH message ended after the expected segment boundary: {} > {}", - after_xlog_switch, + xlog_switch_record_end <= next_segment, + "XLOG_SWITCH record ended after the expected segment boundary: {} > {}", + xlog_switch_record_end, next_segment ); - Ok((vec![before_xlog_switch, after_xlog_switch], next_segment)) + Ok(vec![before_xlog_switch, xlog_switch_record_end]) } } pub struct LastWalRecordXlogSwitchEndsOnPageBoundary; +/// Craft xlog SWITCH record ending at page boundary. impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { // Do not use generate_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. ensure_server_config(client)?; client.execute("CREATE table t(x int)", &[])?; - // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. - // We will use logical message as the padding. We start with detecting how much WAL - // it takes for one logical message, considering all alignments and headers. - let base_wal_advance = { + // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. We + // will use carefully-sized logical messages to advance WAL insert location such + // that there is just enough space on the page for the XLOG_SWITCH record. + loop { + // We start with measuring how much WAL it takes for one logical message, + // considering all alignments and headers. let before_lsn = client.pg_current_wal_insert_lsn()?; - // Small non-empty message bigger than few bytes is more likely than an empty - // message to have the same format as the big padding message. client.execute( "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))", &[], )?; - // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD. - (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize - + XLOG_SIZE_OF_XLOG_RECORD - }; - let mut remaining_lsn = - XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) 
as usize % XLOG_BLCKSZ; - if remaining_lsn < base_wal_advance { - remaining_lsn += XLOG_BLCKSZ; - } - let repeats = 10 + remaining_lsn - base_wal_advance; - info!( - "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}", - client.pg_current_wal_insert_lsn()?, - remaining_lsn, - base_wal_advance, - repeats - ); - client.execute( - "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", - &[&(repeats as i32)], - )?; - info!( - "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", - client.pg_current_wal_insert_lsn()?, - XLOG_SIZE_OF_XLOG_RECORD - ); + let after_lsn = client.pg_current_wal_insert_lsn()?; - // Emit the XLOG_SWITCH - let before_xlog_switch = client.pg_current_wal_insert_lsn()?; - let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); - let next_segment = PgLsn::from(0x0200_0000); - ensure!( - after_xlog_switch < next_segment, - "XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}", - after_xlog_switch, - next_segment - ); - ensure!( - u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD, - "XLOG_SWITCH message ended not on page boundary: {}, offset = {}", - after_xlog_switch, - u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ - ); - Ok((vec![before_xlog_switch, after_xlog_switch], next_segment)) + // Did the record cross a page boundary? If it did, start over. Crossing a + // page boundary adds to the apparent size of the record because of the page + // header, which throws off the calculation. + if u64::from(before_lsn) / XLOG_BLCKSZ as u64 + != u64::from(after_lsn) / XLOG_BLCKSZ as u64 + { + continue; + } + // base_size is the size of a logical message without the payload + let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10; + + // Is there enough space on the page for another logical message and an + // XLOG_SWITCH? If not, start over. + let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64; + if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 { + continue; + } + + // We will write another logical message, such that after the logical message + // record, there will be space for exactly one XLOG_SWITCH. How large should + // the logical message's payload be? An XLOG_SWITCH record has no data => its + // size is exactly XLOG_SIZE_OF_XLOG_RECORD. + let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64; + + client.execute( + "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", + &[&(repeats as i32)], + )?; + info!( + "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", + client.pg_current_wal_insert_lsn()?, + XLOG_SIZE_OF_XLOG_RECORD + ); + + // Emit the XLOG_SWITCH + let before_xlog_switch = client.pg_current_wal_insert_lsn()?; + let xlog_switch_record_end: PgLsn = + client.query_one("SELECT pg_switch_wal()", &[])?.get(0); + + if u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ + != XLOG_SIZE_OF_XLOG_SHORT_PHD + { + warn!( + "XLOG_SWITCH message ended not on page boundary: {}, offset = {}, repeating", + xlog_switch_record_end, + u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ + ); + continue; + } + return Ok(vec![before_xlog_switch, xlog_switch_record_end]); + } } } -fn craft_single_logical_message( +/// Write ~16MB logical message; it should cross WAL segment. 
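+/// (A WAL segment is 16 MiB here, so a single logical message with a roughly
+/// segment-sized payload cannot fit in the current segment and has to cross
+/// the boundary.)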
+fn craft_seg_size_logical_message( client: &mut impl postgres::GenericClient, transactional: bool, -) -> anyhow::Result<(Vec, PgLsn)> { +) -> anyhow::Result> { craft_internal(client, |client, initial_lsn| { ensure!( initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024), @@ -405,34 +426,24 @@ fn craft_single_logical_message( "Logical message crossed two segments" ); - if transactional { - // Transactional logical messages are part of a transaction, so the one above is - // followed by a small COMMIT record. - - let after_message_lsn = client.pg_current_wal_insert_lsn()?; - ensure!( - message_lsn < after_message_lsn, - "No record found after the emitted message" - ); - Ok((vec![message_lsn], Some(after_message_lsn))) - } else { - Ok((Vec::new(), Some(message_lsn))) - } + Ok(vec![message_lsn]) }) } pub struct WalRecordCrossingSegmentFollowedBySmallOne; impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne { const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { - craft_single_logical_message(client, true) + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { + // Transactional message crossing WAL segment will be followed by small + // commit record. + craft_seg_size_logical_message(client, true) } } pub struct LastWalRecordCrossingSegment; impl Crafter for LastWalRecordCrossingSegment { const NAME: &'static str = "last_wal_record_crossing_segment"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { - craft_single_logical_message(client, false) + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { + craft_seg_size_logical_message(client, false) } } diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 6ff4c563b2..79d45de67a 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -11,13 +11,15 @@ use utils::const_assert; use utils::lsn::Lsn; fn init_logging() { - let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or( - format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"), - )) + let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(format!( + "crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace" + ))) .is_test(true) .try_init(); } +/// Test that find_end_of_wal returns the same results as pg_dump on various +/// WALs created by Crafter. fn test_end_of_wal(test_name: &str) { use crate::*; @@ -38,13 +40,13 @@ fn test_end_of_wal(test_name: &str) { } cfg.initdb().unwrap(); let srv = cfg.start_server().unwrap(); - let (intermediate_lsns, expected_end_of_wal_partial) = - C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap(); + let intermediate_lsns = C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap(); let intermediate_lsns: Vec = intermediate_lsns .iter() .map(|&lsn| u64::from(lsn).into()) .collect(); - let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into(); + // Kill postgres. Note that it might have inserted to WAL something after + // 'craft' did its job. 
srv.kill(); // Check find_end_of_wal on the initial WAL @@ -56,7 +58,7 @@ fn test_end_of_wal(test_name: &str) { .filter(|fname| IsXLogFileName(fname)) .max() .unwrap(); - check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal); + let expected_end_of_wal = find_pg_waldump_end_of_wal(&cfg, &last_segment); for start_lsn in intermediate_lsns .iter() .chain(std::iter::once(&expected_end_of_wal)) @@ -91,11 +93,7 @@ fn test_end_of_wal(test_name: &str) { } } -fn check_pg_waldump_end_of_wal( - cfg: &crate::Conf, - last_segment: &str, - expected_end_of_wal: Lsn, -) { +fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn { // Get the actual end of WAL by pg_waldump let waldump_output = cfg .pg_waldump("000000010000000000000001", last_segment) @@ -113,11 +111,8 @@ fn check_pg_waldump_end_of_wal( } }; let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap(); - info!( - "waldump erred on {}, expected wal end at {}", - waldump_wal_end, expected_end_of_wal - ); - assert_eq!(waldump_wal_end, expected_end_of_wal); + info!("waldump erred on {}", waldump_wal_end); + waldump_wal_end } fn check_end_of_wal( @@ -183,7 +178,7 @@ pub fn test_find_end_of_wal_last_crossing_segment() { /// currently 1024. #[test] pub fn test_update_next_xid() { - let checkpoint_buf = [0u8; std::mem::size_of::()]; + let checkpoint_buf = [0u8; size_of::()]; let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap(); checkpoint.nextXid = FullTransactionId { value: 10 }; @@ -207,12 +202,59 @@ pub fn test_update_next_xid() { assert_eq!(checkpoint.nextXid.value, 2048); } +#[test] +pub fn test_update_next_multixid() { + let checkpoint_buf = [0u8; size_of::()]; + let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap(); + + // simple case + checkpoint.nextMulti = 20; + checkpoint.nextMultiOffset = 20; + checkpoint.update_next_multixid(1000, 2000); + assert_eq!(checkpoint.nextMulti, 1000); + assert_eq!(checkpoint.nextMultiOffset, 2000); + + // No change + checkpoint.update_next_multixid(500, 900); + assert_eq!(checkpoint.nextMulti, 1000); + assert_eq!(checkpoint.nextMultiOffset, 2000); + + // Close to wraparound, but not wrapped around yet + checkpoint.nextMulti = 0xffff0000; + checkpoint.nextMultiOffset = 0xfffe0000; + checkpoint.update_next_multixid(0xffff00ff, 0xfffe00ff); + assert_eq!(checkpoint.nextMulti, 0xffff00ff); + assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff); + + // Wraparound + checkpoint.update_next_multixid(1, 900); + assert_eq!(checkpoint.nextMulti, 1); + assert_eq!(checkpoint.nextMultiOffset, 900); + + // Wraparound nextMulti to 0. + // + // It's a bit surprising that nextMulti can be 0, because that's a special value + // (InvalidMultiXactId). However, that's how Postgres does it at multi-xid wraparound: + // nextMulti wraps around to 0, but then when the next multi-xid is assigned, it skips + // the 0 and the next multi-xid actually assigned is 1. 
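+    // (update_next_multixid compares via wrapping_sub cast to i32, so only
+    // values in the "future" half of the u32 circle advance the counters;
+    // that is what makes the wraparound cases below behave.)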
+ checkpoint.nextMulti = 0xffff0000; + checkpoint.nextMultiOffset = 0xfffe0000; + checkpoint.update_next_multixid(0, 0xfffe00ff); + assert_eq!(checkpoint.nextMulti, 0); + assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff); + + // Wraparound nextMultiOffset to 0 + checkpoint.update_next_multixid(0, 0); + assert_eq!(checkpoint.nextMulti, 0); + assert_eq!(checkpoint.nextMultiOffset, 0); +} + #[test] pub fn test_encode_logical_message() { let expected = [ - 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, - 38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, - 101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, + 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102, + 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, ]; let actual = encode_logical_message("prefix", "message"); assert_eq!(expected, actual[..]); diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index b286eb0358..66bbe03ebc 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -7,11 +7,11 @@ license.workspace = true [dependencies] bytes.workspace = true byteorder.workspace = true +itertools.workspace = true pin-project-lite.workspace = true postgres-protocol.workspace = true rand.workspace = true -tokio.workspace = true +tokio = { workspace = true, features = ["io-util"] } tracing.workspace = true thiserror.workspace = true - -workspace_hack.workspace = true +serde.workspace = true diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs index 6e97b8c2a0..ccbb90e384 100644 --- a/libs/pq_proto/src/framed.rs +++ b/libs/pq_proto/src/framed.rs @@ -44,9 +44,9 @@ impl ConnectionError { /// Wraps async io `stream`, providing messages to write/flush + read Postgres /// messages. 
pub struct Framed { - stream: S, - read_buf: BytesMut, - write_buf: BytesMut, + pub stream: S, + pub read_buf: BytesMut, + pub write_buf: BytesMut, } impl Framed { diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index c52a21bcd3..a01191bd5d 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -7,7 +7,9 @@ pub mod framed; use byteorder::{BigEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use std::{borrow::Cow, collections::HashMap, fmt, io, str}; +use itertools::Itertools; +use serde::{Deserialize, Serialize}; +use std::{borrow::Cow, fmt, io, str}; // re-export for use in utils pageserver_feedback.rs pub use postgres_protocol::PG_EPOCH; @@ -37,27 +39,74 @@ pub enum FeMessage { PasswordMessage(Bytes), } +#[derive(Clone, Copy, PartialEq, PartialOrd)] +pub struct ProtocolVersion(u32); + +impl ProtocolVersion { + pub const fn new(major: u16, minor: u16) -> Self { + Self((major as u32) << 16 | minor as u32) + } + pub const fn minor(self) -> u16 { + self.0 as u16 + } + pub const fn major(self) -> u16 { + (self.0 >> 16) as u16 + } +} + +impl fmt::Debug for ProtocolVersion { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_list() + .entry(&self.major()) + .entry(&self.minor()) + .finish() + } +} + #[derive(Debug)] pub enum FeStartupPacket { CancelRequest(CancelKeyData), - SslRequest, + SslRequest { + direct: bool, + }, GssEncRequest, StartupMessage { - major_version: u32, - minor_version: u32, + version: ProtocolVersion, params: StartupMessageParams, }, } -#[derive(Debug)] +#[derive(Debug, Clone, Default)] +pub struct StartupMessageParamsBuilder { + params: BytesMut, +} + +impl StartupMessageParamsBuilder { + /// Set parameter's value by its name. + /// name and value must not contain a \0 byte + pub fn insert(&mut self, name: &str, value: &str) { + self.params.put(name.as_bytes()); + self.params.put(&b"\0"[..]); + self.params.put(value.as_bytes()); + self.params.put(&b"\0"[..]); + } + + pub fn freeze(self) -> StartupMessageParams { + StartupMessageParams { + params: self.params.freeze(), + } + } +} + +#[derive(Debug, Clone, Default)] pub struct StartupMessageParams { - params: HashMap, + params: Bytes, } impl StartupMessageParams { /// Get parameter's value by its name. pub fn get(&self, name: &str) -> Option<&str> { - self.params.get(name).map(|s| s.as_str()) + self.iter().find_map(|(k, v)| (k == name).then_some(v)) } /// Split command-line options according to PostgreSQL's logic, @@ -111,19 +160,23 @@ impl StartupMessageParams { /// Iterate through key-value pairs in an arbitrary order. pub fn iter(&self) -> impl Iterator { - self.params.iter().map(|(k, v)| (k.as_str(), v.as_str())) + let params = + std::str::from_utf8(&self.params).expect("should be validated as utf8 already"); + params.split_terminator('\0').tuples() } // This function is mostly useful in tests. #[doc(hidden)] pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self { - Self { - params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(), + let mut b = StartupMessageParamsBuilder::default(); + for (k, v) in pairs { + b.insert(k, v) } + b.freeze() } } -#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] pub struct CancelKeyData { pub backend_pid: i32, pub cancel_key: i32, @@ -273,11 +326,23 @@ impl FeStartupPacket { /// different from [`FeMessage::parse`] because startup messages don't have /// message type byte; otherwise, its comments apply. 
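+    /// (Wire format reminder: a startup packet begins with a 4-byte big-endian
+    /// length that includes itself, followed by a 4-byte protocol code and the
+    /// payload, with no message type byte in front.)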
pub fn parse(buf: &mut BytesMut) -> Result, ProtocolError> { + /// const MAX_STARTUP_PACKET_LENGTH: usize = 10000; - const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234; - const CANCEL_REQUEST_CODE: u32 = 5678; - const NEGOTIATE_SSL_CODE: u32 = 5679; - const NEGOTIATE_GSS_CODE: u32 = 5680; + const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234; + /// + const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678); + /// + const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679); + /// + const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680); + + // + // First byte indicates standard SSL handshake message + // (It can't be a Postgres startup length because in network byte order + // that would be a startup packet hundreds of megabytes long) + if buf.first() == Some(&0x16) { + return Ok(Some(FeStartupPacket::SslRequest { direct: true })); + } // need at least 4 bytes with packet len if buf.len() < 4 { @@ -310,12 +375,10 @@ impl FeStartupPacket { let mut msg = buf.split_to(len).freeze(); msg.advance(4); // consume len - let request_code = msg.get_u32(); - let req_hi = request_code >> 16; - let req_lo = request_code & ((1 << 16) - 1); + let request_code = ProtocolVersion(msg.get_u32()); // StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code. - let message = match (req_hi, req_lo) { - (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => { + let message = match request_code { + CANCEL_REQUEST_CODE => { if msg.remaining() != 8 { return Err(ProtocolError::BadMessage( "CancelRequest message is malformed, backend PID / secret key missing" @@ -327,52 +390,38 @@ impl FeStartupPacket { cancel_key: msg.get_i32(), }) } - (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => { + NEGOTIATE_SSL_CODE => { // Requested upgrade to SSL (aka TLS) - FeStartupPacket::SslRequest + FeStartupPacket::SslRequest { direct: false } } - (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => { + NEGOTIATE_GSS_CODE => { // Requested upgrade to GSSAPI FeStartupPacket::GssEncRequest } - (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => { + version if version.major() == RESERVED_INVALID_MAJOR_VERSION => { return Err(ProtocolError::Protocol(format!( - "Unrecognized request code {unrecognized_code}" + "Unrecognized request code {}", + version.minor() ))); } // TODO bail if protocol major_version is not 3? - (major_version, minor_version) => { + version => { // StartupMessage - // Parse pairs of null-terminated strings (key, value). - // See `postgres: ProcessStartupPacket, build_startup_packet`. - let mut tokens = str::from_utf8(&msg) - .map_err(|_e| { - ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned()) - })? - .strip_suffix('\0') // drop packet's own null - .ok_or_else(|| { - ProtocolError::Protocol( - "StartupMessage params: missing null terminator".to_string(), - ) - })? 
- .split_terminator('\0'); - - let mut params = HashMap::new(); - while let Some(name) = tokens.next() { - let value = tokens.next().ok_or_else(|| { - ProtocolError::Protocol( - "StartupMessage params: key without value".to_string(), - ) - })?; - - params.insert(name.to_owned(), value.to_owned()); - } + let s = str::from_utf8(&msg).map_err(|_e| { + ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned()) + })?; + let s = s.strip_suffix('\0').ok_or_else(|| { + ProtocolError::Protocol( + "StartupMessage params: missing null terminator".to_string(), + ) + })?; FeStartupPacket::StartupMessage { - major_version, - minor_version, - params: StartupMessageParams { params }, + version, + params: StartupMessageParams { + params: msg.slice_ref(s.as_bytes()), + }, } } }; @@ -508,6 +557,10 @@ pub enum BeMessage<'a> { RowDescription(&'a [RowDescriptor<'a>]), XLogData(XLogDataBody<'a>), NoticeResponse(&'a str), + NegotiateProtocolVersion { + version: ProtocolVersion, + options: &'a [&'a str], + }, KeepAlive(WalSndKeepAlive), } @@ -931,6 +984,18 @@ impl<'a> BeMessage<'a> { buf.put_u8(u8::from(req.request_reply)); }); } + + BeMessage::NegotiateProtocolVersion { version, options } => { + buf.put_u8(b'v'); + write_body(buf, |buf| { + buf.put_u32(version.0); + buf.put_u32(options.len() as u32); + for option in options.iter() { + write_cstr(option, buf)?; + } + Ok(()) + })? + } } Ok(()) } diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 2cc59a947b..02adee058f 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true [dependencies] anyhow.workspace = true async-trait.workspace = true +async-stream.workspace = true once_cell.workspace = true aws-smithy-async.workspace = true aws-smithy-types.workspace = true @@ -14,12 +15,16 @@ aws-config.workspace = true aws-sdk-s3.workspace = true aws-credential-types.workspace = true bytes.workspace = true -camino.workspace = true +camino = { workspace = true, features = ["serde1"] } +humantime.workspace = true +humantime-serde.workspace = true hyper = { workspace = true, features = ["stream"] } futures.workspace = true +rand.workspace = true serde.workspace = true serde_json.workspace = true tokio = { workspace = true, features = ["sync", "fs", "io-util"] } +tokio-stream.workspace = true tokio-util = { workspace = true, features = ["compat"] } toml_edit.workspace = true tracing.workspace = true @@ -27,7 +32,7 @@ scopeguard.workspace = true metrics.workspace = true utils.workspace = true pin-project-lite.workspace = true -workspace_hack.workspace = true + azure_core.workspace = true azure_identity.workspace = true azure_storage.workspace = true @@ -35,8 +40,10 @@ azure_storage_blobs.workspace = true futures-util.workspace = true http-types.workspace = true itertools.workspace = true +sync_wrapper = { workspace = true, features = ["futures"] } [dev-dependencies] camino-tempfile.workspace = true test-context.workspace = true rand.workspace = true +tokio = { workspace = true, features = ["test-util"] } diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index abab32470b..cb7479f6cd 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -3,6 +3,8 @@ use std::borrow::Cow; use std::collections::HashMap; use std::env; +use std::fmt::Display; +use std::io; use std::num::NonZeroU32; use std::pin::Pin; use std::str::FromStr; @@ -13,41 +15,51 @@ use std::time::SystemTime; use 
super::REMOTE_STORAGE_PREFIX_SEPARATOR; use anyhow::Result; use azure_core::request_options::{MaxResults, Metadata, Range}; -use azure_core::RetryOptions; +use azure_core::{Continuable, RetryOptions}; use azure_identity::DefaultAzureCredential; use azure_storage::StorageCredentials; use azure_storage_blobs::blob::CopyStatus; use azure_storage_blobs::prelude::ClientBuilder; use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; use bytes::Bytes; +use futures::future::Either; use futures::stream::Stream; use futures_util::StreamExt; +use futures_util::TryStreamExt; use http_types::{StatusCode, Url}; -use tokio::time::Instant; +use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use tracing::debug; +use utils::backoff; -use crate::s3_bucket::RequestKind; +use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; +use crate::ListingObject; use crate::{ - AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, - RemoteStorage, StorageMetadata, + config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, Listing, + ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel, }; pub struct AzureBlobStorage { client: ContainerClient, + container_name: String, prefix_in_container: Option, max_keys_per_list_response: Option, concurrency_limiter: ConcurrencyLimiter, + // Per-request timeout. Accessible for tests. + pub timeout: Duration, } impl AzureBlobStorage { - pub fn new(azure_config: &AzureConfig) -> Result { + pub fn new(azure_config: &AzureConfig, timeout: Duration) -> Result { debug!( "Creating azure remote storage for azure container {}", azure_config.container_name ); - let account = env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT"); + // Use the storage account from the config by default, fall back to env var if not present. + let account = azure_config.storage_account.clone().unwrap_or_else(|| { + env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT") + }); // If the `AZURE_STORAGE_ACCESS_KEY` env var has an access key, use that, // otherwise try the token based credentials. 
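Every operation in the hunks below follows the same shape: acquire a concurrency permit, then race the actual request against the per-request timeout and the caller's `CancellationToken`, mapping the losing branches to timeout/cancel errors. Stripped of permits and metrics, the skeleton is roughly this (a sketch with a hypothetical helper name, not the crate's API; the real code maps failures to `TimeoutOrCancel` and records `BUCKET_METRICS`):

```rust
use std::{future::Future, time::Duration};

use tokio_util::sync::CancellationToken;

// Condensed sketch of the recurring select! pattern: whichever of the three
// futures finishes first decides the outcome.
async fn with_timeout_or_cancel<T>(
    op: impl Future<Output = anyhow::Result<T>>,
    timeout: Duration,
    cancel: &CancellationToken,
) -> anyhow::Result<T> {
    tokio::select! {
        res = op => res,
        _ = tokio::time::sleep(timeout) => anyhow::bail!("request timed out"),
        _ = cancel.cancelled() => anyhow::bail!("request cancelled"),
    }
}
```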
@@ -75,9 +87,11 @@ impl AzureBlobStorage { Ok(AzureBlobStorage { client, + container_name: azure_config.container_name.to_owned(), prefix_in_container: azure_config.prefix_in_container.to_owned(), max_keys_per_list_response, concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()), + timeout, }) } @@ -120,21 +134,47 @@ impl AzureBlobStorage { async fn download_for_builder( &self, builder: GetBlobBuilder, + cancel: &CancellationToken, ) -> Result { - let mut response = builder.into_stream(); + let kind = RequestKind::Get; + + let _permit = self.permit(kind, cancel).await?; + let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let cancel_or_timeout_ = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); let mut etag = None; let mut last_modified = None; let mut metadata = HashMap::new(); - // TODO give proper streaming response instead of buffering into RAM - // https://github.com/neondatabase/neon/issues/5563 - let mut bufs = Vec::new(); - while let Some(part) = response.next().await { - let part = part.map_err(to_download_error)?; - let etag_str: &str = part.blob.properties.etag.as_ref(); + let started_at = start_measuring_requests(kind); + + let download = async { + let response = builder + // convert to concrete Pageable + .into_stream() + // convert to TryStream + .into_stream() + .map_err(to_download_error); + + // apply per request timeout + let response = tokio_stream::StreamExt::timeout(response, self.timeout); + + // flatten + let response = response.map(|res| match res { + Ok(res) => res, + Err(_elapsed) => Err(DownloadError::Timeout), + }); + + let mut response = Box::pin(response); + + let Some(part) = response.next().await else { + return Err(DownloadError::Other(anyhow::anyhow!( + "Azure GET response contained no response body" + ))); + }; + let part = part?; if etag.is_none() { - etag = Some(etag.unwrap_or_else(|| etag_str.to_owned())); + etag = Some(part.blob.properties.etag); } if last_modified.is_none() { last_modified = Some(part.blob.properties.last_modified.into()); @@ -142,26 +182,68 @@ impl AzureBlobStorage { if let Some(blob_meta) = part.blob.metadata { metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); } - let data = part + + // unwrap safety: if these were None, bufs would be empty and we would have returned an error already + let etag = etag.unwrap(); + let last_modified = last_modified.unwrap(); + + let tail_stream = response + .map(|part| match part { + Ok(part) => Either::Left(part.data.map(|r| r.map_err(io::Error::other))), + Err(e) => { + Either::Right(futures::stream::once(async { Err(io::Error::other(e)) })) + } + }) + .flatten(); + let stream = part .data - .collect() - .await - .map_err(|e| DownloadError::Other(e.into()))?; - bufs.push(data); - } - Ok(Download { - download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))), - etag, - last_modified, - metadata: Some(StorageMetadata(metadata)), - }) + .map(|r| r.map_err(io::Error::other)) + .chain(sync_wrapper::SyncStream::new(tail_stream)); + //.chain(SyncStream::from_pin(Box::pin(tail_stream))); + + let download_stream = crate::support::DownloadStream::new(cancel_or_timeout_, stream); + + Ok(Download { + download_stream: Box::pin(download_stream), + etag, + last_modified, + metadata: Some(StorageMetadata(metadata)), + }) + }; + + let download = tokio::select! 
{ + bufs = download => bufs, + cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout { + TimeoutOrCancel::Timeout => return Err(DownloadError::Timeout), + TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled), + }, + }; + let started_at = ScopeGuard::into_inner(started_at); + let outcome = match &download { + Ok(_) => AttemptOutcome::Ok, + Err(_) => AttemptOutcome::Err, + }; + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, outcome, started_at); + download } - async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> { - self.concurrency_limiter - .acquire(kind) - .await - .expect("semaphore is never closed") + async fn permit( + &self, + kind: RequestKind, + cancel: &CancellationToken, + ) -> Result, Cancelled> { + let acquire = self.concurrency_limiter.acquire(kind); + + tokio::select! { + permit = acquire => Ok(permit.expect("never closed")), + _ = cancel.cancelled() => Err(Cancelled), + } + } + + pub fn container_name(&self) -> &str { + &self.container_name } } @@ -186,11 +268,13 @@ fn to_download_error(error: azure_core::Error) -> DownloadError { } impl RemoteStorage for AzureBlobStorage { - async fn list( + fn list_streaming( &self, prefix: Option<&RemotePath>, mode: ListingMode, - ) -> anyhow::Result { + max_keys: Option, + cancel: &CancellationToken, + ) -> impl Stream> { // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix .map(|p| self.relative_path_to_name(p)) @@ -206,37 +290,139 @@ impl RemoteStorage for AzureBlobStorage { p }); - let mut builder = self.client.list_blobs(); + async_stream::stream! { + let _permit = self.permit(RequestKind::List, cancel).await?; - if let ListingMode::WithDelimiter = mode { - builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); + let mut builder = self.client.list_blobs(); + + if let ListingMode::WithDelimiter = mode { + builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); + } + + if let Some(prefix) = list_prefix { + builder = builder.prefix(Cow::from(prefix.to_owned())); + } + + if let Some(limit) = self.max_keys_per_list_response { + builder = builder.max_results(MaxResults::new(limit)); + } + + let mut next_marker = None; + + 'outer: loop { + let mut builder = builder.clone(); + if let Some(marker) = next_marker.clone() { + builder = builder.marker(marker); + } + let response = builder.into_stream(); + let response = response.into_stream().map_err(to_download_error); + let response = tokio_stream::StreamExt::timeout(response, self.timeout); + let response = response.map(|res| match res { + Ok(res) => res, + Err(_elapsed) => Err(DownloadError::Timeout), + }); + + let mut response = std::pin::pin!(response); + + let mut max_keys = max_keys.map(|mk| mk.get()); + let next_item = tokio::select! { + op = response.next() => Ok(op), + _ = cancel.cancelled() => Err(DownloadError::Cancelled), + }?; + let Some(entry) = next_item else { + // The list is complete, so yield it. + break; + }; + + let mut res = Listing::default(); + let entry = match entry { + Ok(entry) => entry, + Err(e) => { + // The error is potentially retryable, so we must rewind the loop after yielding. 
+ yield Err(e); + continue; + } + }; + next_marker = entry.continuation(); + let prefix_iter = entry + .blobs + .prefixes() + .map(|prefix| self.name_to_relative_path(&prefix.name)); + res.prefixes.extend(prefix_iter); + + let blob_iter = entry + .blobs + .blobs() + .map(|k| ListingObject{ + key: self.name_to_relative_path(&k.name), + last_modified: k.properties.last_modified.into(), + size: k.properties.content_length, + } + ); + + for key in blob_iter { + res.keys.push(key); + + if let Some(mut mk) = max_keys { + assert!(mk > 0); + mk -= 1; + if mk == 0 { + yield Ok(res); // limit reached + break 'outer; + } + max_keys = Some(mk); + } + } + yield Ok(res); + + // We are done here + if next_marker.is_none() { + break; + } + } + } + } + + async fn head_object( + &self, + key: &RemotePath, + cancel: &CancellationToken, + ) -> Result { + let kind = RequestKind::Head; + let _permit = self.permit(kind, cancel).await?; + + let started_at = start_measuring_requests(kind); + + let blob_client = self.client.blob_client(self.relative_path_to_name(key)); + let properties_future = blob_client.get_properties().into_future(); + + let properties_future = tokio::time::timeout(self.timeout, properties_future); + + let res = tokio::select! { + res = properties_future => res, + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; + + if let Ok(inner) = &res { + // do not incl. timeouts as errors in metrics but cancellations + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, inner, started_at); } - if let Some(prefix) = list_prefix { - builder = builder.prefix(Cow::from(prefix.to_owned())); - } + let data = match res { + Ok(Ok(data)) => Ok(data), + Ok(Err(sdk)) => Err(to_download_error(sdk)), + Err(_timeout) => Err(DownloadError::Timeout), + }?; - if let Some(limit) = self.max_keys_per_list_response { - builder = builder.max_results(MaxResults::new(limit)); - } - - let mut response = builder.into_stream(); - let mut res = Listing::default(); - while let Some(l) = response.next().await { - let entry = l.map_err(to_download_error)?; - let prefix_iter = entry - .blobs - .prefixes() - .map(|prefix| self.name_to_relative_path(&prefix.name)); - res.prefixes.extend(prefix_iter); - - let blob_iter = entry - .blobs - .blobs() - .map(|k| self.name_to_relative_path(&k.name)); - res.keys.extend(blob_iter); - } - Ok(res) + let properties = data.blob.properties; + Ok(ListingObject { + key: key.to_owned(), + last_modified: SystemTime::from(properties.last_modified), + size: properties.content_length, + }) } async fn upload( @@ -245,35 +431,66 @@ impl RemoteStorage for AzureBlobStorage { data_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Put).await; - let blob_client = self.client.blob_client(self.relative_path_to_name(to)); + let kind = RequestKind::Put; + let _permit = self.permit(kind, cancel).await?; - let from: Pin> + Send + Sync + 'static>> = - Box::pin(from); + let started_at = start_measuring_requests(kind); - let from = NonSeekableStream::new(from, data_size_bytes); + let op = async { + let blob_client = self.client.blob_client(self.relative_path_to_name(to)); - let body = azure_core::Body::SeekableStream(Box::new(from)); + let from: Pin> + Send + Sync + 'static>> = + Box::pin(from); - let mut builder = blob_client.put_block_blob(body); + let from = NonSeekableStream::new(from, data_size_bytes); - if let 
Some(metadata) = metadata { - builder = builder.metadata(to_azure_metadata(metadata)); - } + let body = azure_core::Body::SeekableStream(Box::new(from)); - let _response = builder.into_future().await?; + let mut builder = blob_client.put_block_blob(body); - Ok(()) + if let Some(metadata) = metadata { + builder = builder.metadata(to_azure_metadata(metadata)); + } + + let fut = builder.into_future(); + let fut = tokio::time::timeout(self.timeout, fut); + + match fut.await { + Ok(Ok(_response)) => Ok(()), + Ok(Err(azure)) => Err(azure.into()), + Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()), + } + }; + + let res = tokio::select! { + res = op => res, + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; + + let outcome = match res { + Ok(_) => AttemptOutcome::Ok, + Err(_) => AttemptOutcome::Err, + }; + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, outcome, started_at); + + res } - async fn download(&self, from: &RemotePath) -> Result { - let _permit = self.permit(RequestKind::Get).await; + async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { let blob_client = self.client.blob_client(self.relative_path_to_name(from)); let builder = blob_client.get(); - self.download_for_builder(builder).await + self.download_for_builder(builder, cancel).await } async fn download_byte_range( @@ -281,8 +498,8 @@ impl RemoteStorage for AzureBlobStorage { from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result { - let _permit = self.permit(RequestKind::Get).await; let blob_client = self.client.blob_client(self.relative_path_to_name(from)); let mut builder = blob_client.get(); @@ -294,83 +511,161 @@ impl RemoteStorage for AzureBlobStorage { }; builder = builder.range(range); - self.download_for_builder(builder).await + self.download_for_builder(builder, cancel).await } - async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Delete).await; - let blob_client = self.client.blob_client(self.relative_path_to_name(path)); + async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> { + self.delete_objects(std::array::from_ref(path), cancel) + .await + } - let builder = blob_client.delete(); + async fn delete_objects<'a>( + &self, + paths: &'a [RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let kind = RequestKind::Delete; + let _permit = self.permit(kind, cancel).await?; + let started_at = start_measuring_requests(kind); - match builder.into_future().await { - Ok(_response) => Ok(()), - Err(e) => { - if let Some(http_err) = e.as_http_error() { - if http_err.status() == StatusCode::NotFound { - return Ok(()); + let op = async { + // TODO batch requests are not supported by the SDK + // https://github.com/Azure/azure-sdk-for-rust/issues/1068 + for path in paths { + #[derive(Debug)] + enum AzureOrTimeout { + AzureError(azure_core::Error), + Timeout, + Cancel, + } + impl Display for AzureOrTimeout { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{self:?}") } } - Err(anyhow::Error::new(e)) + let warn_threshold = 3; + let max_retries = 5; + backoff::retry( + || async { + let blob_client = self.client.blob_client(self.relative_path_to_name(path)); + + let request = blob_client.delete().into_future(); + + let res = tokio::time::timeout(self.timeout, request).await; + + match res { + 
Ok(Ok(_v)) => Ok(()), + Ok(Err(azure_err)) => { + if let Some(http_err) = azure_err.as_http_error() { + if http_err.status() == StatusCode::NotFound { + return Ok(()); + } + } + Err(AzureOrTimeout::AzureError(azure_err)) + } + Err(_elapsed) => Err(AzureOrTimeout::Timeout), + } + }, + |err| match err { + AzureOrTimeout::AzureError(_) | AzureOrTimeout::Timeout => false, + AzureOrTimeout::Cancel => true, + }, + warn_threshold, + max_retries, + "deleting remote object", + cancel, + ) + .await + .ok_or_else(|| AzureOrTimeout::Cancel) + .and_then(|x| x) + .map_err(|e| match e { + AzureOrTimeout::AzureError(err) => anyhow::Error::from(err), + AzureOrTimeout::Timeout => TimeoutOrCancel::Timeout.into(), + AzureOrTimeout::Cancel => TimeoutOrCancel::Cancel.into(), + })?; } - } + Ok(()) + }; + + let res = tokio::select! { + res = op => res, + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; + + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + res } - async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { - // Permit is already obtained by inner delete function + async fn copy( + &self, + from: &RemotePath, + to: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let kind = RequestKind::Copy; + let _permit = self.permit(kind, cancel).await?; + let started_at = start_measuring_requests(kind); - // TODO batch requests are also not supported by the SDK - // https://github.com/Azure/azure-sdk-for-rust/issues/1068 - // https://github.com/Azure/azure-sdk-for-rust/issues/1249 - for path in paths { - self.delete(path).await?; - } - Ok(()) - } + let timeout = tokio::time::sleep(self.timeout); - async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Copy).await; - let blob_client = self.client.blob_client(self.relative_path_to_name(to)); + let mut copy_status = None; - let source_url = format!( - "{}/{}", - self.client.url()?, - self.relative_path_to_name(from) - ); - let builder = blob_client.copy(Url::from_str(&source_url)?); + let op = async { + let blob_client = self.client.blob_client(self.relative_path_to_name(to)); - let result = builder.into_future().await?; + let source_url = format!( + "{}/{}", + self.client.url()?, + self.relative_path_to_name(from) + ); - let mut copy_status = result.copy_status; - let start_time = Instant::now(); - const MAX_WAIT_TIME: Duration = Duration::from_secs(60); - loop { - match copy_status { - CopyStatus::Aborted => { - anyhow::bail!("Received abort for copy from {from} to {to}."); + let builder = blob_client.copy(Url::from_str(&source_url)?); + let copy = builder.into_future(); + + let result = copy.await?; + + copy_status = Some(result.copy_status); + loop { + match copy_status.as_ref().expect("we always set it to Some") { + CopyStatus::Aborted => { + anyhow::bail!("Received abort for copy from {from} to {to}."); + } + CopyStatus::Failed => { + anyhow::bail!("Received failure response for copy from {from} to {to}."); + } + CopyStatus::Success => return Ok(()), + CopyStatus::Pending => (), } - CopyStatus::Failed => { - anyhow::bail!("Received failure response for copy from {from} to {to}."); - } - CopyStatus::Success => return Ok(()), - CopyStatus::Pending => (), + // The copy is taking longer. Waiting a second and then re-trying. 
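+                // (Note, an assumption worth stating: the request permit acquired at the
+                // start of `copy` stays held for this whole poll loop, so a slow
+                // server-side copy also occupies one concurrency slot until it
+                // finishes, fails, or times out.)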
+ // TODO estimate time based on copy_progress and adjust time based on that + tokio::time::sleep(Duration::from_millis(1000)).await; + let properties = blob_client.get_properties().into_future().await?; + let Some(status) = properties.blob.properties.copy_status else { + tracing::warn!("copy_status for copy is None!, from={from}, to={to}"); + return Ok(()); + }; + copy_status = Some(status); } - // The copy is taking longer. Waiting a second and then re-trying. - // TODO estimate time based on copy_progress and adjust time based on that - tokio::time::sleep(Duration::from_millis(1000)).await; - let properties = blob_client.get_properties().into_future().await?; - let Some(status) = properties.blob.properties.copy_status else { - tracing::warn!("copy_status for copy is None!, from={from}, to={to}"); - return Ok(()); - }; - if start_time.elapsed() > MAX_WAIT_TIME { - anyhow::bail!("Copy from from {from} to {to} took longer than limit MAX_WAIT_TIME={}s. copy_pogress={:?}.", - MAX_WAIT_TIME.as_secs_f32(), - properties.blob.properties.copy_progress, - ); - } - copy_status = status; - } + }; + + let res = tokio::select! { + res = op => res, + _ = cancel.cancelled() => return Err(anyhow::Error::new(TimeoutOrCancel::Cancel)), + _ = timeout => { + let e = anyhow::Error::new(TimeoutOrCancel::Timeout); + let e = e.context(format!("Timeout, last status: {copy_status:?}")); + Err(e) + }, + }; + + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + res } async fn time_travel_recover( @@ -378,13 +673,11 @@ impl RemoteStorage for AzureBlobStorage { _prefix: Option<&RemotePath>, _timestamp: SystemTime, _done_if_after: SystemTime, - _cancel: CancellationToken, - ) -> anyhow::Result<()> { + _cancel: &CancellationToken, + ) -> Result<(), TimeTravelError> { // TODO use Azure point in time recovery feature for this // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview - Err(anyhow::anyhow!( - "time travel recovery for azure blob storage is not implemented" - )) + Err(TimeTravelError::Unimplemented) } } diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs new file mode 100644 index 0000000000..f819a1572a --- /dev/null +++ b/libs/remote_storage/src/config.rs @@ -0,0 +1,289 @@ +use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration}; + +use aws_sdk_s3::types::StorageClass; +use camino::Utf8PathBuf; + +use serde::{Deserialize, Serialize}; + +use crate::{ + DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT, + DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, +}; + +/// External backup storage configuration, enough for creating a client for that storage. +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] +pub struct RemoteStorageConfig { + /// The storage connection configuration. + #[serde(flatten)] + pub storage: RemoteStorageKind, + /// A common timeout enforced for all requests after concurrency limiter permit has been + /// acquired. + #[serde( + with = "humantime_serde", + default = "default_timeout", + skip_serializing_if = "is_default_timeout" + )] + pub timeout: Duration, +} + +fn default_timeout() -> Duration { + RemoteStorageConfig::DEFAULT_TIMEOUT +} + +fn is_default_timeout(d: &Duration) -> bool { + *d == RemoteStorageConfig::DEFAULT_TIMEOUT +} + +/// A kind of a remote storage to connect to, with its connection configuration. 
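+///
+/// Deserialized with `#[serde(untagged)]` (see the attribute below), so the variant is
+/// picked by which keys are present. A sketch of the three accepted shapes, with all
+/// values illustrative only:
+///
+/// ```toml
+/// # LocalFs
+/// local_path = '/some/local/path/'
+///
+/// # AwsS3
+/// bucket_name = 'some-bucket'
+/// bucket_region = 'eu-central-1'
+///
+/// # AzureContainer
+/// container_name = 'some-container'
+/// container_region = 'westeurope'
+/// ```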
+#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
+#[serde(untagged)]
+pub enum RemoteStorageKind {
+    /// Storage based on local file system.
+    /// Specify a root folder to place all stored files into.
+    LocalFs { local_path: Utf8PathBuf },
+    /// AWS S3 based storage, storing all files in the S3 bucket
+    /// specified by the config
+    AwsS3(S3Config),
+    /// Azure Blob based storage, storing all files in the container
+    /// specified by the config
+    AzureContainer(AzureConfig),
+}
+
+/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
+#[derive(Clone, PartialEq, Eq, Deserialize, Serialize)]
+pub struct S3Config {
+    /// Name of the bucket to connect to.
+    pub bucket_name: String,
+    /// The region where the bucket is located.
+    pub bucket_region: String,
+    /// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once.
+    pub prefix_in_bucket: Option<String>,
+    /// A base URL to send S3 requests to.
+    /// By default, the endpoint is derived from a region name, assuming it's
+    /// an AWS S3 region name, erroring on wrong region name.
+    /// Endpoint provides a way to support other S3 flavors and their regions.
+    ///
+    /// Example: `http://127.0.0.1:5000`
+    pub endpoint: Option<String>,
+    /// AWS S3 has various limits on its API calls; we must not exceed those.
+    /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
+    #[serde(default = "default_remote_storage_s3_concurrency_limit")]
+    pub concurrency_limit: NonZeroUsize,
+    #[serde(default = "default_max_keys_per_list_response")]
+    pub max_keys_per_list_response: Option<i32>,
+    #[serde(
+        deserialize_with = "deserialize_storage_class",
+        serialize_with = "serialize_storage_class",
+        default
+    )]
+    pub upload_storage_class: Option<StorageClass>,
+}
+
+fn default_remote_storage_s3_concurrency_limit() -> NonZeroUsize {
+    DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
+        .try_into()
+        .unwrap()
+}
+
+fn default_max_keys_per_list_response() -> Option<i32> {
+    DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
+}
+
+impl Debug for S3Config {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("S3Config")
+            .field("bucket_name", &self.bucket_name)
+            .field("bucket_region", &self.bucket_region)
+            .field("prefix_in_bucket", &self.prefix_in_bucket)
+            .field("concurrency_limit", &self.concurrency_limit)
+            .field(
+                "max_keys_per_list_response",
+                &self.max_keys_per_list_response,
+            )
+            .finish()
+    }
+}
+
+/// Azure bucket coordinates and access credentials to manage the bucket contents (read and write).
+#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct AzureConfig {
+    /// Name of the container to connect to.
+    pub container_name: String,
+    /// Name of the storage account the container is inside of.
+    pub storage_account: Option<String>,
+    /// The region where the container is located.
+    pub container_region: String,
+    /// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once.
+    pub prefix_in_container: Option<String>,
+    /// Azure has various limits on its API calls; we must not exceed those.
+    /// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details.
+ #[serde(default = "default_remote_storage_azure_concurrency_limit")] + pub concurrency_limit: NonZeroUsize, + #[serde(default = "default_max_keys_per_list_response")] + pub max_keys_per_list_response: Option, +} + +fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize { + NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap() +} + +impl Debug for AzureConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AzureConfig") + .field("bucket_name", &self.container_name) + .field("storage_account", &self.storage_account) + .field("bucket_region", &self.container_region) + .field("prefix_in_container", &self.prefix_in_container) + .field("concurrency_limit", &self.concurrency_limit) + .field( + "max_keys_per_list_response", + &self.max_keys_per_list_response, + ) + .finish() + } +} + +fn deserialize_storage_class<'de, D: serde::Deserializer<'de>>( + deserializer: D, +) -> Result, D::Error> { + Option::::deserialize(deserializer).and_then(|s| { + if let Some(s) = s { + use serde::de::Error; + let storage_class = StorageClass::from_str(&s).expect("infallible"); + #[allow(deprecated)] + if matches!(storage_class, StorageClass::Unknown(_)) { + return Err(D::Error::custom(format!( + "Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", + StorageClass::values() + ))); + } + Ok(Some(storage_class)) + } else { + Ok(None) + } + }) +} + +fn serialize_storage_class( + val: &Option, + serializer: S, +) -> Result { + let val = val.as_ref().map(StorageClass::as_str); + Option::<&str>::serialize(&val, serializer) +} + +impl RemoteStorageConfig { + pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); + + pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { + Ok(utils::toml_edit_ext::deserialize_item(toml)?) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn parse(input: &str) -> anyhow::Result { + let toml = input.parse::().unwrap(); + RemoteStorageConfig::from_toml(toml.as_item()) + } + + #[test] + fn parse_localfs_config_with_timeout() { + let input = "local_path = '.' 
+timeout = '5s'"; + + let config = parse(input).unwrap(); + + assert_eq!( + config, + RemoteStorageConfig { + storage: RemoteStorageKind::LocalFs { + local_path: Utf8PathBuf::from(".") + }, + timeout: Duration::from_secs(5) + } + ); + } + + #[test] + fn test_s3_parsing() { + let toml = "\ + bucket_name = 'foo-bar' + bucket_region = 'eu-central-1' + upload_storage_class = 'INTELLIGENT_TIERING' + timeout = '7s' + "; + + let config = parse(toml).unwrap(); + + assert_eq!( + config, + RemoteStorageConfig { + storage: RemoteStorageKind::AwsS3(S3Config { + bucket_name: "foo-bar".into(), + bucket_region: "eu-central-1".into(), + prefix_in_bucket: None, + endpoint: None, + concurrency_limit: default_remote_storage_s3_concurrency_limit(), + max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + upload_storage_class: Some(StorageClass::IntelligentTiering), + }), + timeout: Duration::from_secs(7) + } + ); + } + + #[test] + fn test_storage_class_serde_roundtrip() { + let classes = [ + None, + Some(StorageClass::Standard), + Some(StorageClass::IntelligentTiering), + ]; + for class in classes { + #[derive(Serialize, Deserialize)] + struct Wrapper { + #[serde( + deserialize_with = "deserialize_storage_class", + serialize_with = "serialize_storage_class" + )] + class: Option, + } + let wrapped = Wrapper { + class: class.clone(), + }; + let serialized = serde_json::to_string(&wrapped).unwrap(); + let deserialized: Wrapper = serde_json::from_str(&serialized).unwrap(); + assert_eq!(class, deserialized.class); + } + } + + #[test] + fn test_azure_parsing() { + let toml = "\ + container_name = 'foo-bar' + container_region = 'westeurope' + upload_storage_class = 'INTELLIGENT_TIERING' + timeout = '7s' + "; + + let config = parse(toml).unwrap(); + + assert_eq!( + config, + RemoteStorageConfig { + storage: RemoteStorageKind::AzureContainer(AzureConfig { + container_name: "foo-bar".into(), + storage_account: None, + container_region: "westeurope".into(), + prefix_in_container: None, + concurrency_limit: default_remote_storage_azure_concurrency_limit(), + max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + }), + timeout: Duration::from_secs(7) + } + ); + } +} diff --git a/libs/remote_storage/src/error.rs b/libs/remote_storage/src/error.rs new file mode 100644 index 0000000000..5fd0eaabc7 --- /dev/null +++ b/libs/remote_storage/src/error.rs @@ -0,0 +1,204 @@ +/// Reasons for downloads or listings to fail. +#[derive(Debug)] +pub enum DownloadError { + /// Validation or other error happened due to user input. + BadInput(anyhow::Error), + /// The file was not found in the remote storage. + NotFound, + /// A cancellation token aborted the download, typically during + /// tenant detach or process shutdown. + Cancelled, + /// A timeout happened while executing the request. Possible reasons: + /// - stuck tcp connection + /// + /// Concurrency control is not timed within timeout. + Timeout, + /// The file was found in the remote storage, but the download failed. 
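+    ///
+    /// Together with [`DownloadError::Timeout`] this is the retryable case; see
+    /// [`DownloadError::is_permanent`] below for the exact classification.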
+    Other(anyhow::Error),
+}
+
+impl std::fmt::Display for DownloadError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            DownloadError::BadInput(e) => {
+                write!(f, "Failed to download a remote file due to user input: {e}")
+            }
+            DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
+            DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
+            DownloadError::Timeout => write!(f, "timeout"),
+            DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
+        }
+    }
+}
+
+impl std::error::Error for DownloadError {}
+
+impl DownloadError {
+    /// Returns true if the error should not be retried with backoff
+    pub fn is_permanent(&self) -> bool {
+        use DownloadError::*;
+        match self {
+            BadInput(_) | NotFound | Cancelled => true,
+            Timeout | Other(_) => false,
+        }
+    }
+
+    pub fn is_cancelled(&self) -> bool {
+        matches!(self, DownloadError::Cancelled)
+    }
+}
+
+impl From<std::io::Error> for DownloadError {
+    fn from(value: std::io::Error) -> Self {
+        let needs_unwrap = value.kind() == std::io::ErrorKind::Other
+            && value
+                .get_ref()
+                .and_then(|x| x.downcast_ref::<DownloadError>())
+                .is_some();
+
+        if needs_unwrap {
+            *value
+                .into_inner()
+                .expect("just checked")
+                .downcast::<DownloadError>()
+                .expect("just checked")
+        } else {
+            DownloadError::Other(value.into())
+        }
+    }
+}
+
+#[derive(Debug)]
+pub enum TimeTravelError {
+    /// Validation or other error happened due to user input.
+    BadInput(anyhow::Error),
+    /// The used remote storage does not have time travel recovery implemented
+    Unimplemented,
+    /// The number of versions/deletion markers is above our limit.
+    TooManyVersions,
+    /// A cancellation token aborted the process, typically during
+    /// request closure or process shutdown.
+    Cancelled,
+    /// Other errors
+    Other(anyhow::Error),
+}
+
+impl std::fmt::Display for TimeTravelError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            TimeTravelError::BadInput(e) => {
+                write!(
+                    f,
+                    "Failed to time travel recover a prefix due to user input: {e}"
+                )
+            }
+            TimeTravelError::Unimplemented => write!(
+                f,
+                "time travel recovery is not implemented for the current storage backend"
+            ),
+            TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"),
+            TimeTravelError::TooManyVersions => {
+                write!(f, "Number of versions/delete markers above limit")
+            }
+            TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"),
+        }
+    }
+}
+
+impl std::error::Error for TimeTravelError {}
+
+/// Plain cancelled error.
+///
+/// By design this type does not implement `std::error::Error`, so it cannot be put as the root
+/// cause of `std::io::Error` or `anyhow::Error`. It should never need to be exposed out of this
+/// crate.
+///
+/// It exists to implement permit acquiring in `{Download,TimeTravel}Error` and `anyhow::Error` returning
+/// operations and ensuring that those get converted to proper versions with just `?`.
+#[derive(Debug)]
+pub(crate) struct Cancelled;
+
+impl From<Cancelled> for anyhow::Error {
+    fn from(_: Cancelled) -> Self {
+        anyhow::Error::new(TimeoutOrCancel::Cancel)
+    }
+}
+
+impl From<Cancelled> for TimeTravelError {
+    fn from(_: Cancelled) -> Self {
+        TimeTravelError::Cancelled
+    }
+}
+
+impl From<Cancelled> for TimeoutOrCancel {
+    fn from(_: Cancelled) -> Self {
+        TimeoutOrCancel::Cancel
+    }
+}
+
+impl From<Cancelled> for DownloadError {
+    fn from(_: Cancelled) -> Self {
+        DownloadError::Cancelled
+    }
+}
+
+/// This type is used as the root cause for timeouts and cancellations with `anyhow::Error` returning
+/// RemoteStorage methods.
+///
+/// For use with `utils::backoff::retry` and `anyhow::Error` returning operations there is the
+/// `TimeoutOrCancel::caused_by_cancel` method to query "proper form" errors.
+#[derive(Debug)]
+pub enum TimeoutOrCancel {
+    Timeout,
+    Cancel,
+}
+
+impl std::fmt::Display for TimeoutOrCancel {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        use TimeoutOrCancel::*;
+        match self {
+            Timeout => write!(f, "timeout"),
+            Cancel => write!(f, "cancel"),
+        }
+    }
+}
+
+impl std::error::Error for TimeoutOrCancel {}
+
+impl TimeoutOrCancel {
+    /// Returns true if the error was caused by [`TimeoutOrCancel::Cancel`].
+    pub fn caused_by_cancel(error: &anyhow::Error) -> bool {
+        error
+            .root_cause()
+            .downcast_ref::<TimeoutOrCancel>()
+            .is_some_and(Self::is_cancel)
+    }
+
+    pub fn is_cancel(&self) -> bool {
+        matches!(self, TimeoutOrCancel::Cancel)
+    }
+
+    pub fn is_timeout(&self) -> bool {
+        matches!(self, TimeoutOrCancel::Timeout)
+    }
+}
+
+/// This conversion is used when [`crate::support::DownloadStream`] notices a cancellation or
+/// timeout to wrap it in an `std::io::Error`.
+impl From<TimeoutOrCancel> for std::io::Error {
+    fn from(value: TimeoutOrCancel) -> Self {
+        let e = DownloadError::from(value);
+        std::io::Error::other(e)
+    }
+}
+
+impl From<TimeoutOrCancel> for DownloadError {
+    fn from(value: TimeoutOrCancel) -> Self {
+        use TimeoutOrCancel::*;
+
+        match value {
+            Timeout => DownloadError::Timeout,
+            Cancel => DownloadError::Cancelled,
+        }
+    }
+}
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index bf9c51ad1a..b5b69c9faf 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -10,23 +10,26 @@
 #![deny(clippy::undocumented_unsafe_blocks)]
 
 mod azure_blob;
+mod config;
+mod error;
 mod local_fs;
+mod metrics;
 mod s3_bucket;
 mod simulate_failures;
+mod support;
 
 use std::{
-    collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime,
+    collections::HashMap, fmt::Debug, num::NonZeroU32, pin::Pin, sync::Arc, time::SystemTime,
 };
 
-use anyhow::{bail, Context};
+use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 
 use bytes::Bytes;
-use futures::stream::Stream;
+use futures::{stream::Stream, StreamExt};
 use serde::{Deserialize, Serialize};
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
-use toml_edit::Item;
 use tracing::info;
 
 pub use self::{
@@ -35,17 +38,26 @@ pub use self::{
 };
 use s3_bucket::RequestKind;
 
+pub use crate::config::{AzureConfig, RemoteStorageConfig, RemoteStorageKind, S3Config};
+
+/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here.
+pub use azure_core::Etag;
+
+pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
+
+/// Default concurrency limit for S3 operations
+///
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
 ///
 /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
 ///
 pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
-/// We set this a little bit low as we currently buffer the entire file into RAM
+/// Set this limit analogously to the S3 limit
 ///
 /// Here, a limit of max 20k concurrent connections was noted.
 ///
-pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30;
+pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100;
 /// No limits on the client side, which currently means 1000 for AWS S3.
 ///
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
@@ -107,8 +119,8 @@ impl RemotePath {
         self.0.file_name()
     }
 
-    pub fn join(&self, segment: &Utf8Path) -> Self {
-        Self(self.0.join(segment))
+    pub fn join(&self, path: impl AsRef<Utf8Path>) -> Self {
+        Self(self.0.join(path))
     }
 
     pub fn get_path(&self) -> &Utf8PathBuf {
@@ -122,6 +134,11 @@ impl RemotePath {
     pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
         self.0.strip_prefix(&p.0)
     }
+
+    pub fn add_trailing_slash(&self) -> Self {
+        // Unwrap safety: inputs are guaranteed to be valid UTF-8
+        Self(format!("{}/", self.0).try_into().unwrap())
+    }
 }
 
 /// We don't need callers to be able to pass arbitrary delimiters: just control
@@ -129,15 +146,23 @@ impl RemotePath {
 ///
 /// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The
 /// NoDelimiter mode will only populate `keys`.
+#[derive(Copy, Clone)]
 pub enum ListingMode {
     WithDelimiter,
     NoDelimiter,
 }
 
+#[derive(PartialEq, Eq, Debug, Clone)]
+pub struct ListingObject {
+    pub key: RemotePath,
+    pub last_modified: SystemTime,
+    pub size: u64,
+}
+
 #[derive(Default)]
 pub struct Listing {
     pub prefixes: Vec<RemotePath>,
-    pub keys: Vec<RemotePath>,
+    pub keys: Vec<ListingObject>,
 }
 
 /// Storage (potentially remote) API to manage its state.
@@ -145,43 +170,64 @@ pub struct Listing {
 /// providing basic CRUD operations for storage files.
 #[allow(async_fn_in_trait)]
 pub trait RemoteStorage: Send + Sync + 'static {
-    /// Lists all top level subdirectories for a given prefix
-    /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
-    /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
-    /// so this method doesnt need to.
-    async fn list_prefixes(
+    /// List objects in remote storage, with semantics matching AWS S3's [`ListObjectsV2`].
+    ///
+    /// The stream is guaranteed to return at least one element, even in the case of errors
+    /// (in that case it's an `Err()`), or an empty `Listing`.
+    ///
+    /// The stream does not terminate after yielding an error, as long as [`is_permanent`]
+    /// returns false for that error; `next` can be retried and may succeed on a later attempt.
+    ///
+    /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
+    /// from the absolute root of the bucket.
+    ///
+    /// `mode` configures whether to use a delimiter. Without a delimiter, all keys
+    /// within the prefix are listed in the `keys` of the result.
+    /// With a delimiter, any "directories" at the top level of the prefix are returned in the
+    /// `prefixes` of the result, and keys in the top level of the prefix are returned in `keys`.
+    ///
+    /// `max_keys` controls the maximum number of keys that will be returned. If this is None, this function
+    /// will iteratively call ListObjects until it runs out of keys. Note that this is not safe to use on
+    /// unlimited size buckets, as the full list of objects is allocated into a monolithic data structure.
+    ///
+    /// [`ListObjectsV2`]: <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>
+    /// [`is_permanent`]: DownloadError::is_permanent
+    fn list_streaming(
         &self,
         prefix: Option<&RemotePath>,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        let result = self
-            .list(prefix, ListingMode::WithDelimiter)
-            .await?
-            .prefixes;
-        Ok(result)
-    }
-    /// Lists all files in directory "recursively"
-    /// (not really recursively, because AWS has a flat namespace)
-    /// Note: This is subtely different than list_prefixes,
-    /// because it is for listing files instead of listing
-    /// names sharing common prefixes.
-    /// For example,
-    /// list_files("foo/bar") = ["foo/bar/cat123.txt",
-    /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
-    /// whereas,
-    /// list_prefixes("foo/bar/") = ["cat", "dog"]
-    /// See `test_real_s3.rs` for more details.
-    async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys;
-        Ok(result)
-    }
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> + Send;
 
     async fn list(
         &self,
         prefix: Option<&RemotePath>,
-        _mode: ListingMode,
-    ) -> anyhow::Result<Listing>;
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> Result<Listing, DownloadError> {
+        let mut stream = std::pin::pin!(self.list_streaming(prefix, mode, max_keys, cancel));
+        let mut combined = stream.next().await.expect("At least one item required")?;
+        while let Some(list) = stream.next().await {
+            let list = list?;
+            combined.keys.extend(list.keys.into_iter());
+            combined.prefixes.extend_from_slice(&list.prefixes);
+        }
+        Ok(combined)
+    }
+
+    /// Obtain metadata information about an object.
+    async fn head_object(
+        &self,
+        key: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<ListingObject, DownloadError>;
 
     /// Streams the local file contents into the remote storage entry.
+    ///
+    /// If the operation fails because of timeout or cancellation, the root cause of the error will be
+    /// set to `TimeoutOrCancel`.
     async fn upload(
         &self,
         from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
@@ -190,27 +236,61 @@ pub trait RemoteStorage: Send + Sync + 'static {
         data_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,
+        cancel: &CancellationToken,
     ) -> anyhow::Result<()>;
 
-    /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
+    /// Streams the remote storage entry contents.
+    ///
+    /// The returned download stream will obey initial timeout and cancellation signal by erroring
+    /// on whichever happens first. Only one of the reasons will fail the stream, which is usually
+    /// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out.
+    ///
     /// Returns the metadata, if any was stored with the file previously.
-    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError>;
+    async fn download(
+        &self,
+        from: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<Download, DownloadError>;
 
-    /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
+    /// Streams a given byte range of the remote storage entry contents.
+    ///
+    /// The returned download stream will obey initial timeout and cancellation signal by erroring
+    /// on whichever happens first. Only one of the reasons will fail the stream, which is usually
+    /// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out.
+    ///
     /// Returns the metadata, if any was stored with the file previously.
     async fn download_byte_range(
         &self,
         from: &RemotePath,
         start_inclusive: u64,
         end_exclusive: Option<u64>,
+        cancel: &CancellationToken,
     ) -> Result<Download, DownloadError>;
 
-    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
+    /// Delete a single path from remote storage.
+    ///
+    /// If the operation fails because of timeout or cancellation, the root cause of the error will be
+    /// set to `TimeoutOrCancel`. In such a situation it is unknown if the deletion went through.
+    async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()>;
 
-    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
+    /// Delete multiple paths from remote storage.
+    ///
+    /// If the operation fails because of timeout or cancellation, the root cause of the error will be
+    /// set to `TimeoutOrCancel`. In such a situation it is unknown which deletions, if any, went
+    /// through.
+    async fn delete_objects<'a>(
+        &self,
+        paths: &'a [RemotePath],
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()>;
 
     /// Copy a remote object inside a bucket from one path to another.
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
+    async fn copy(
+        &self,
+        from: &RemotePath,
+        to: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()>;
 
     /// Resets the content of everything with the given prefix to the given state
     async fn time_travel_recover(
@@ -218,17 +298,25 @@ pub trait RemoteStorage: Send + Sync + 'static {
         prefix: Option<&RemotePath>,
         timestamp: SystemTime,
         done_if_after: SystemTime,
-        cancel: CancellationToken,
-    ) -> anyhow::Result<()>;
+        cancel: &CancellationToken,
+    ) -> Result<(), TimeTravelError>;
 }
 
-pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
+/// Data part of an ongoing [`Download`].
+///
+/// `DownloadStream` is sensitive to the timeout and cancellation used with the original
+/// [`RemoteStorage::download`] request. The type yields `std::io::Result<Bytes>` to be compatible
+/// with `tokio::io::copy_buf`.
+// This has 'static because safekeepers do not use cancellation tokens (yet)
+pub type DownloadStream =
+    Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>>;
+
 pub struct Download {
     pub download_stream: DownloadStream,
     /// The last time the file was modified (`last-modified` HTTP header)
-    pub last_modified: Option<SystemTime>,
+    pub last_modified: SystemTime,
     /// A way to identify this specific version of the resource (`etag` HTTP header)
-    pub etag: Option<String>,
+    pub etag: Etag,
     /// Extra key-value data, associated with the current remote file.
     pub metadata: Option<StorageMetadata>,
 }
@@ -241,38 +329,10 @@ impl Debug for Download {
     }
 }
 
-#[derive(Debug)]
-pub enum DownloadError {
-    /// Validation or other error happened due to user input.
-    BadInput(anyhow::Error),
-    /// The file was not found in the remote storage.
-    NotFound,
-    /// A cancellation token aborted the download, typically during
-    /// tenant detach or process shutdown.
-    Cancelled,
-    /// The file was found in the remote storage, but the download failed.
- Other(anyhow::Error), -} - -impl std::fmt::Display for DownloadError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - DownloadError::BadInput(e) => { - write!(f, "Failed to download a remote file due to user input: {e}") - } - DownloadError::Cancelled => write!(f, "Cancelled, shutting down"), - DownloadError::NotFound => write!(f, "No file found for the remote object id given"), - DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"), - } - } -} - -impl std::error::Error for DownloadError {} - /// Every storage, currently supported. /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics. -#[derive(Clone)] // Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925 +#[derive(Clone)] pub enum GenericRemoteStorage> { LocalFs(LocalFs), AwsS3(Arc), @@ -281,67 +341,80 @@ pub enum GenericRemoteStorage> { } impl GenericRemoteStorage> { + // See [`RemoteStorage::list`]. pub async fn list( &self, prefix: Option<&RemotePath>, mode: ListingMode, - ) -> anyhow::Result { + max_keys: Option, + cancel: &CancellationToken, + ) -> Result { match self { - Self::LocalFs(s) => s.list(prefix, mode).await, - Self::AwsS3(s) => s.list(prefix, mode).await, - Self::AzureBlob(s) => s.list(prefix, mode).await, - Self::Unreliable(s) => s.list(prefix, mode).await, + Self::LocalFs(s) => s.list(prefix, mode, max_keys, cancel).await, + Self::AwsS3(s) => s.list(prefix, mode, max_keys, cancel).await, + Self::AzureBlob(s) => s.list(prefix, mode, max_keys, cancel).await, + Self::Unreliable(s) => s.list(prefix, mode, max_keys, cancel).await, } } - // A function for listing all the files in a "directory" - // Example: - // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"] - pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result> { + // See [`RemoteStorage::list_streaming`]. + pub fn list_streaming<'a>( + &'a self, + prefix: Option<&'a RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &'a CancellationToken, + ) -> impl Stream> + 'a + Send { match self { - Self::LocalFs(s) => s.list_files(folder).await, - Self::AwsS3(s) => s.list_files(folder).await, - Self::AzureBlob(s) => s.list_files(folder).await, - Self::Unreliable(s) => s.list_files(folder).await, + Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)) + as Pin> + Send>>, + Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), + Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), + Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)), } } - // lists common *prefixes*, if any of files - // Example: - // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"] - pub async fn list_prefixes( + // See [`RemoteStorage::head_object`]. 
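+    // A cheap existence/size probe that avoids fetching the body, e.g. (sketch):
+    // `let size = storage.head_object(&path, &cancel).await?.size;`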
+ pub async fn head_object( &self, - prefix: Option<&RemotePath>, - ) -> Result, DownloadError> { + key: &RemotePath, + cancel: &CancellationToken, + ) -> Result { match self { - Self::LocalFs(s) => s.list_prefixes(prefix).await, - Self::AwsS3(s) => s.list_prefixes(prefix).await, - Self::AzureBlob(s) => s.list_prefixes(prefix).await, - Self::Unreliable(s) => s.list_prefixes(prefix).await, + Self::LocalFs(s) => s.head_object(key, cancel).await, + Self::AwsS3(s) => s.head_object(key, cancel).await, + Self::AzureBlob(s) => s.head_object(key, cancel).await, + Self::Unreliable(s) => s.head_object(key, cancel).await, } } + /// See [`RemoteStorage::upload`] pub async fn upload( &self, from: impl Stream> + Send + Sync + 'static, data_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { match self { - Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await, - Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await, - Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata).await, - Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await, + Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, + Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, + Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, + Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, } } - pub async fn download(&self, from: &RemotePath) -> Result { + pub async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { match self { - Self::LocalFs(s) => s.download(from).await, - Self::AwsS3(s) => s.download(from).await, - Self::AzureBlob(s) => s.download(from).await, - Self::Unreliable(s) => s.download(from).await, + Self::LocalFs(s) => s.download(from, cancel).await, + Self::AwsS3(s) => s.download(from, cancel).await, + Self::AzureBlob(s) => s.download(from, cancel).await, + Self::Unreliable(s) => s.download(from, cancel).await, } } @@ -350,61 +423,79 @@ impl GenericRemoteStorage> { from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result { match self { Self::LocalFs(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive) + s.download_byte_range(from, start_inclusive, end_exclusive, cancel) .await } Self::AwsS3(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive) + s.download_byte_range(from, start_inclusive, end_exclusive, cancel) .await } Self::AzureBlob(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive) + s.download_byte_range(from, start_inclusive, end_exclusive, cancel) .await } Self::Unreliable(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive) + s.download_byte_range(from, start_inclusive, end_exclusive, cancel) .await } } } - pub async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { + /// See [`RemoteStorage::delete`] + pub async fn delete( + &self, + path: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { match self { - Self::LocalFs(s) => s.delete(path).await, - Self::AwsS3(s) => s.delete(path).await, - Self::AzureBlob(s) => s.delete(path).await, - Self::Unreliable(s) => s.delete(path).await, + Self::LocalFs(s) => s.delete(path, cancel).await, + Self::AwsS3(s) => s.delete(path, cancel).await, + Self::AzureBlob(s) => s.delete(path, cancel).await, + Self::Unreliable(s) => s.delete(path, cancel).await, } } - pub 
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + /// See [`RemoteStorage::delete_objects`] + pub async fn delete_objects( + &self, + paths: &[RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { match self { - Self::LocalFs(s) => s.delete_objects(paths).await, - Self::AwsS3(s) => s.delete_objects(paths).await, - Self::AzureBlob(s) => s.delete_objects(paths).await, - Self::Unreliable(s) => s.delete_objects(paths).await, + Self::LocalFs(s) => s.delete_objects(paths, cancel).await, + Self::AwsS3(s) => s.delete_objects(paths, cancel).await, + Self::AzureBlob(s) => s.delete_objects(paths, cancel).await, + Self::Unreliable(s) => s.delete_objects(paths, cancel).await, } } - pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + /// See [`RemoteStorage::copy`] + pub async fn copy_object( + &self, + from: &RemotePath, + to: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { match self { - Self::LocalFs(s) => s.copy(from, to).await, - Self::AwsS3(s) => s.copy(from, to).await, - Self::AzureBlob(s) => s.copy(from, to).await, - Self::Unreliable(s) => s.copy(from, to).await, + Self::LocalFs(s) => s.copy(from, to, cancel).await, + Self::AwsS3(s) => s.copy(from, to, cancel).await, + Self::AzureBlob(s) => s.copy(from, to, cancel).await, + Self::Unreliable(s) => s.copy(from, to, cancel).await, } } + /// See [`RemoteStorage::time_travel_recover`]. pub async fn time_travel_recover( &self, prefix: Option<&RemotePath>, timestamp: SystemTime, done_if_after: SystemTime, - cancel: CancellationToken, - ) -> anyhow::Result<()> { + cancel: &CancellationToken, + ) -> Result<(), TimeTravelError> { match self { Self::LocalFs(s) => { s.time_travel_recover(prefix, timestamp, done_if_after, cancel) @@ -427,21 +518,31 @@ impl GenericRemoteStorage> { } impl GenericRemoteStorage { - pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { + pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { + let timeout = storage_config.timeout; Ok(match &storage_config.storage { - RemoteStorageKind::LocalFs(root) => { - info!("Using fs root '{root}' as a remote storage"); - Self::LocalFs(LocalFs::new(root.clone())?) + RemoteStorageKind::LocalFs { local_path: path } => { + info!("Using fs root '{path}' as a remote storage"); + Self::LocalFs(LocalFs::new(path.clone(), timeout)?) } RemoteStorageKind::AwsS3(s3_config) => { - info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'", + // The profile and access key id are only printed here for debugging purposes, + // their values don't indicate the eventually taken choice for auth. 
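+                // (Assuming standard SDK behaviour, the actual credentials are resolved
+                // later by the AWS SDK's default provider chain; these env vars are
+                // merely hints for operators reading the logs.)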
+ let profile = std::env::var("AWS_PROFILE").unwrap_or_else(|_| "".into()); + let access_key_id = + std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "".into()); + info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); - Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?)) + Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout).await?)) } RemoteStorageKind::AzureContainer(azure_config) => { - info!("Using azure container '{}' in region '{}' as a remote storage, prefix in container: '{:?}'", + let storage_account = azure_config + .storage_account + .as_deref() + .unwrap_or(""); + info!("Using azure container '{}' in account '{storage_account}' in region '{}' as a remote storage, prefix in container: '{:?}'", azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container); - Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config)?)) + Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config, timeout)?)) } }) } @@ -450,18 +551,15 @@ impl GenericRemoteStorage { Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first))) } - /// Takes storage object contents and its size and uploads to remote storage, - /// mapping `from_path` to the corresponding remote object id in the storage. - /// - /// The storage object does not have to be present on the `from_path`, - /// this path is used for the remote object id conversion only. + /// See [`RemoteStorage::upload`], which this method calls with `None` as metadata. pub async fn upload_storage_object( &self, from: impl Stream> + Send + Sync + 'static, from_size_bytes: usize, to: &RemotePath, + cancel: &CancellationToken, ) -> anyhow::Result<()> { - self.upload(from, from_size_bytes, to, None) + self.upload(from, from_size_bytes, to, None, cancel) .await .with_context(|| { format!("Failed to upload data of length {from_size_bytes} to storage path {to:?}") @@ -474,10 +572,21 @@ impl GenericRemoteStorage { &self, byte_range: Option<(u64, Option)>, from: &RemotePath, + cancel: &CancellationToken, ) -> Result { match byte_range { - Some((start, end)) => self.download_byte_range(from, start, end).await, - None => self.download(from).await, + Some((start, end)) => self.download_byte_range(from, start, end, cancel).await, + None => self.download(from, cancel).await, + } + } + + /// The name of the bucket/container/etc. + pub fn bucket_name(&self) -> Option<&str> { + match self { + Self::LocalFs(_s) => None, + Self::AwsS3(s) => Some(s.bucket_name()), + Self::AzureBlob(s) => Some(s.container_name()), + Self::Unreliable(_s) => None, } } } @@ -487,214 +596,16 @@ impl GenericRemoteStorage { #[derive(Debug, Clone, PartialEq, Eq)] pub struct StorageMetadata(HashMap); -/// External backup storage configuration, enough for creating a client for that storage. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct RemoteStorageConfig { - /// The storage connection configuration. - pub storage: RemoteStorageKind, -} - -/// A kind of a remote storage to connect to, with its connection configuration. -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum RemoteStorageKind { - /// Storage based on local file system. - /// Specify a root folder to place all stored files into. 
- LocalFs(Utf8PathBuf), - /// AWS S3 based storage, storing all files in the S3 bucket - /// specified by the config - AwsS3(S3Config), - /// Azure Blob based storage, storing all files in the container - /// specified by the config - AzureContainer(AzureConfig), -} - -/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write). -#[derive(Clone, PartialEq, Eq)] -pub struct S3Config { - /// Name of the bucket to connect to. - pub bucket_name: String, - /// The region where the bucket is located at. - pub bucket_region: String, - /// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once. - pub prefix_in_bucket: Option, - /// A base URL to send S3 requests to. - /// By default, the endpoint is derived from a region name, assuming it's - /// an AWS S3 region name, erroring on wrong region name. - /// Endpoint provides a way to support other S3 flavors and their regions. - /// - /// Example: `http://127.0.0.1:5000` - pub endpoint: Option, - /// AWS S3 has various limits on its API calls, we need not to exceed those. - /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. - pub concurrency_limit: NonZeroUsize, - pub max_keys_per_list_response: Option, -} - -impl Debug for S3Config { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("S3Config") - .field("bucket_name", &self.bucket_name) - .field("bucket_region", &self.bucket_region) - .field("prefix_in_bucket", &self.prefix_in_bucket) - .field("concurrency_limit", &self.concurrency_limit) - .field( - "max_keys_per_list_response", - &self.max_keys_per_list_response, - ) - .finish() +impl From<[(&str, &str); N]> for StorageMetadata { + fn from(arr: [(&str, &str); N]) -> Self { + let map: HashMap = arr + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); + Self(map) } } -/// Azure bucket coordinates and access credentials to manage the bucket contents (read and write). -#[derive(Clone, PartialEq, Eq)] -pub struct AzureConfig { - /// Name of the container to connect to. - pub container_name: String, - /// The region where the bucket is located at. - pub container_region: String, - /// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once. - pub prefix_in_container: Option, - /// Azure has various limits on its API calls, we need not to exceed those. - /// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details. 
- pub concurrency_limit: NonZeroUsize, - pub max_keys_per_list_response: Option, -} - -impl Debug for AzureConfig { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("AzureConfig") - .field("bucket_name", &self.container_name) - .field("bucket_region", &self.container_region) - .field("prefix_in_bucket", &self.prefix_in_container) - .field("concurrency_limit", &self.concurrency_limit) - .field( - "max_keys_per_list_response", - &self.max_keys_per_list_response, - ) - .finish() - } -} - -impl RemoteStorageConfig { - pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result> { - let local_path = toml.get("local_path"); - let bucket_name = toml.get("bucket_name"); - let bucket_region = toml.get("bucket_region"); - let container_name = toml.get("container_name"); - let container_region = toml.get("container_region"); - - let use_azure = container_name.is_some() && container_region.is_some(); - - let default_concurrency_limit = if use_azure { - DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT - } else { - DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT - }; - let concurrency_limit = NonZeroUsize::new( - parse_optional_integer("concurrency_limit", toml)?.unwrap_or(default_concurrency_limit), - ) - .context("Failed to parse 'concurrency_limit' as a positive integer")?; - - let max_keys_per_list_response = - parse_optional_integer::("max_keys_per_list_response", toml) - .context("Failed to parse 'max_keys_per_list_response' as a positive integer")? - .or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE); - - let endpoint = toml - .get("endpoint") - .map(|endpoint| parse_toml_string("endpoint", endpoint)) - .transpose()?; - - let storage = match ( - local_path, - bucket_name, - bucket_region, - container_name, - container_region, - ) { - // no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled - (None, None, None, None, None) => return Ok(None), - (_, Some(_), None, ..) => { - bail!("'bucket_region' option is mandatory if 'bucket_name' is given ") - } - (_, None, Some(_), ..) => { - bail!("'bucket_name' option is mandatory if 'bucket_region' is given ") - } - (None, Some(bucket_name), Some(bucket_region), ..) => { - RemoteStorageKind::AwsS3(S3Config { - bucket_name: parse_toml_string("bucket_name", bucket_name)?, - bucket_region: parse_toml_string("bucket_region", bucket_region)?, - prefix_in_bucket: toml - .get("prefix_in_bucket") - .map(|prefix_in_bucket| { - parse_toml_string("prefix_in_bucket", prefix_in_bucket) - }) - .transpose()?, - endpoint, - concurrency_limit, - max_keys_per_list_response, - }) - } - (_, _, _, Some(_), None) => { - bail!("'container_name' option is mandatory if 'container_region' is given ") - } - (_, _, _, None, Some(_)) => { - bail!("'container_name' option is mandatory if 'container_region' is given ") - } - (None, None, None, Some(container_name), Some(container_region)) => { - RemoteStorageKind::AzureContainer(AzureConfig { - container_name: parse_toml_string("container_name", container_name)?, - container_region: parse_toml_string("container_region", container_region)?, - prefix_in_container: toml - .get("prefix_in_container") - .map(|prefix_in_container| { - parse_toml_string("prefix_in_container", prefix_in_container) - }) - .transpose()?, - concurrency_limit, - max_keys_per_list_response, - }) - } - (Some(local_path), None, None, None, None) => RemoteStorageKind::LocalFs( - Utf8PathBuf::from(parse_toml_string("local_path", local_path)?), - ), - (Some(_), Some(_), ..) 
=> { - bail!("'local_path' and 'bucket_name' are mutually exclusive") - } - (Some(_), _, _, Some(_), Some(_)) => { - bail!("local_path and 'container_name' are mutually exclusive") - } - }; - - Ok(Some(RemoteStorageConfig { storage })) - } -} - -// Helper functions to parse a toml Item -fn parse_optional_integer(name: &str, item: &toml_edit::Item) -> anyhow::Result> -where - I: TryFrom, - E: std::error::Error + Send + Sync + 'static, -{ - let toml_integer = match item.get(name) { - Some(item) => item - .as_integer() - .with_context(|| format!("configure option {name} is not an integer"))?, - None => return Ok(None), - }; - - I::try_from(toml_integer) - .map(Some) - .with_context(|| format!("configure option {name} is too large")) -} - -fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result { - let s = item - .as_str() - .with_context(|| format!("configure option {name} is not a string"))?; - Ok(s.to_string()) -} - struct ConcurrencyLimiter { // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. @@ -712,6 +623,7 @@ impl ConcurrencyLimiter { RequestKind::Delete => &self.write, RequestKind::Copy => &self.write, RequestKind::TimeTravel => &self.write, + RequestKind::Head => &self.read, } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 34a6658a69..c3ef18cab1 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -4,7 +4,12 @@ //! This storage used in tests, but can also be used in cases when a certain persistent //! volume is mounted to the local FS. -use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin, time::SystemTime}; +use std::{ + collections::HashSet, + io::ErrorKind, + num::NonZeroU32, + time::{Duration, SystemTime, UNIX_EPOCH}, +}; use anyhow::{bail, ensure, Context}; use bytes::Bytes; @@ -15,24 +20,28 @@ use tokio::{ io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, }; use tokio_util::{io::ReaderStream, sync::CancellationToken}; -use tracing::*; -use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; +use utils::crashsafe::path_with_suffix_extension; -use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath}; +use crate::{ + Download, DownloadError, Listing, ListingMode, ListingObject, RemotePath, TimeTravelError, + TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR, +}; use super::{RemoteStorage, StorageMetadata}; +use crate::Etag; const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp"; #[derive(Debug, Clone)] pub struct LocalFs { storage_root: Utf8PathBuf, + timeout: Duration, } impl LocalFs { /// Attempts to create local FS storage, along with its root directory. /// Storage root will be created (if does not exist) and transformed into an absolute path (if passed as relative). 
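+    /// The `timeout` is applied to individual operations, mirroring the remote backends:
+    /// for example, listings return `DownloadError::Timeout` once it elapses.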
- pub fn new(mut storage_root: Utf8PathBuf) -> anyhow::Result { + pub fn new(mut storage_root: Utf8PathBuf, timeout: Duration) -> anyhow::Result { if !storage_root.exists() { std::fs::create_dir_all(&storage_root).with_context(|| { format!("Failed to create all directories in the given root path {storage_root:?}") @@ -44,7 +53,10 @@ impl LocalFs { })?; } - Ok(Self { storage_root }) + Ok(Self { + storage_root, + timeout, + }) } // mirrors S3Bucket::s3_object_to_relative_path @@ -79,7 +91,47 @@ impl LocalFs { #[cfg(test)] async fn list_all(&self) -> anyhow::Result> { - Ok(get_all_files(&self.storage_root, true) + use std::{future::Future, pin::Pin}; + fn get_all_files<'a, P>( + directory_path: P, + ) -> Pin>> + Send + Sync + 'a>> + where + P: AsRef + Send + Sync + 'a, + { + Box::pin(async move { + let directory_path = directory_path.as_ref(); + if directory_path.exists() { + if directory_path.is_dir() { + let mut paths = Vec::new(); + let mut dir_contents = fs::read_dir(directory_path).await?; + while let Some(dir_entry) = dir_contents.next_entry().await? { + let file_type = dir_entry.file_type().await?; + let entry_path = + Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| { + anyhow::Error::msg(format!( + "non-Unicode path: {}", + pb.to_string_lossy() + )) + })?; + if file_type.is_symlink() { + tracing::debug!("{entry_path:?} is a symlink, skipping") + } else if file_type.is_dir() { + paths.extend(get_all_files(&entry_path).await?.into_iter()) + } else { + paths.push(entry_path); + } + } + Ok(paths) + } else { + bail!("Path {directory_path:?} is not a directory") + } + } else { + Ok(Vec::new()) + } + }) + } + + Ok(get_all_files(&self.storage_root) .await? .into_iter() .map(|path| { @@ -106,6 +158,14 @@ impl LocalFs { // S3 object list prefixes can be arbitrary strings, but when reading // the local filesystem we need a directory to start calling read_dir on. let mut initial_dir = full_path.clone(); + + // If there's no trailing slash, we have to start looking from one above: even if + // `initial_dir` is a directory, we should still list any prefixes in the parent + // that start with the same string. + if !full_path.to_string().ends_with('/') { + initial_dir.pop(); + } + loop { // Did we make it to the root? if initial_dir.parent().is_none() { @@ -155,76 +215,14 @@ impl LocalFs { Ok(files) } -} -impl RemoteStorage for LocalFs { - async fn list( - &self, - prefix: Option<&RemotePath>, - mode: ListingMode, - ) -> Result { - let mut result = Listing::default(); - - if let ListingMode::NoDelimiter = mode { - let keys = self - .list_recursive(prefix) - .await - .map_err(DownloadError::Other)?; - - result.keys = keys - .into_iter() - .filter(|k| { - let path = k.with_base(&self.storage_root); - !path.is_dir() - }) - .collect(); - - return Ok(result); - } - - let path = match prefix { - Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)), - None => Cow::Borrowed(&self.storage_root), - }; - - let prefixes_to_filter = get_all_files(path.as_ref(), false) - .await - .map_err(DownloadError::Other)?; - - // filter out empty directories to mirror s3 behavior. - for prefix in prefixes_to_filter { - if prefix.is_dir() - && is_directory_empty(&prefix) - .await - .map_err(DownloadError::Other)? 
- { - continue; - } - - let stripped = prefix - .strip_prefix(&self.storage_root) - .context("Failed to strip prefix") - .and_then(RemotePath::new) - .expect( - "We list files for storage root, hence should be able to remote the prefix", - ); - - if prefix.is_dir() { - result.prefixes.push(stripped); - } else { - result.keys.push(stripped); - } - } - - Ok(result) - } - - async fn upload( + async fn upload0( &self, data: impl Stream> + Send + Sync, data_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { let target_file_path = to.with_base(&self.storage_root); create_target_directory(&target_file_path).await?; @@ -246,6 +244,7 @@ impl RemoteStorage for LocalFs { fs::OpenOptions::new() .write(true) .create(true) + .truncate(true) .open(&temp_file_path) .await .with_context(|| { @@ -259,9 +258,26 @@ impl RemoteStorage for LocalFs { let mut buffer_to_read = data.take(from_size_bytes); // alternatively we could just write the bytes to a file, but local_fs is a testing utility - let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination) - .await - .with_context(|| { + let copy = io::copy_buf(&mut buffer_to_read, &mut destination); + + let bytes_read = tokio::select! { + biased; + _ = cancel.cancelled() => { + let file = destination.into_inner(); + // wait for the inflight operation(s) to complete so that there could be a next + // attempt right away and our writes are not directed to their file. + file.into_std().await; + + // TODO: leave the temp or not? leaving is probably less racy. enabled truncate at + // least. + fs::remove_file(temp_file_path).await.context("remove temp_file_path after cancellation or timeout")?; + return Err(TimeoutOrCancel::Cancel.into()); + } + read = copy => read, + }; + + let bytes_read = + bytes_read.with_context(|| { format!( "Failed to upload file (write temp) to the local storage at '{temp_file_path}'", ) @@ -293,6 +309,9 @@ impl RemoteStorage for LocalFs { })?; if let Some(storage_metadata) = metadata { + // FIXME: we must not be using metadata much, since this would forget the old metadata + // for new writes? or perhaps metadata is sticky; could consider removing if it's never + // used. let storage_metadata_path = storage_metadata_path(&target_file_path); fs::write( &storage_metadata_path, @@ -309,41 +328,214 @@ impl RemoteStorage for LocalFs { Ok(()) } +} - async fn download(&self, from: &RemotePath) -> Result { - let target_path = from.with_base(&self.storage_root); - if file_exists(&target_path).map_err(DownloadError::BadInput)? { - let source = ReaderStream::new( - fs::OpenOptions::new() - .read(true) - .open(&target_path) - .await - .with_context(|| { - format!("Failed to open source file {target_path:?} to use in the download") - }) - .map_err(DownloadError::Other)?, - ); +impl RemoteStorage for LocalFs { + fn list_streaming( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, + ) -> impl Stream> { + let listing = self.list(prefix, mode, max_keys, cancel); + futures::stream::once(listing) + } - let metadata = self - .read_storage_metadata(&target_path) + async fn list( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, + ) -> Result { + let op = async { + let mut result = Listing::default(); + + // Filter out directories: in S3 directories don't exist, only the keys within them do. 
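+            // (`list_recursive` yields files and directories alike; directory
+            // entries are dropped below so the listing mimics S3's flat keyspace.)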
+ let keys = self + .list_recursive(prefix) .await .map_err(DownloadError::Other)?; - Ok(Download { - metadata, - last_modified: None, - etag: None, - download_stream: Box::pin(source), - }) - } else { - Err(DownloadError::NotFound) + let objects = keys + .into_iter() + .filter_map(|k| { + let path = k.with_base(&self.storage_root); + if path.is_dir() { + None + } else { + Some(ListingObject { + key: k.clone(), + // LocalFs is just for testing, so just specify a dummy time + last_modified: SystemTime::now(), + size: 0, + }) + } + }) + .collect(); + + if let ListingMode::NoDelimiter = mode { + result.keys = objects; + } else { + let mut prefixes = HashSet::new(); + for object in objects { + let key = object.key; + // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`. + let relative_key = if let Some(prefix) = prefix { + let mut prefix = prefix.clone(); + // We only strip the dirname of the prefix, so that when we strip it from the start of keys we + // end up with full file/dir names. + let prefix_full_local_path = prefix.with_base(&self.storage_root); + let has_slash = prefix.0.to_string().ends_with('/'); + let strip_prefix = if prefix_full_local_path.is_dir() && has_slash { + prefix + } else { + prefix.0.pop(); + prefix + }; + + RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap() + } else { + key + }; + + let relative_key = format!("{}", relative_key); + if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) { + let first_part = relative_key + .split(REMOTE_STORAGE_PREFIX_SEPARATOR) + .next() + .unwrap() + .to_owned(); + prefixes.insert(first_part); + } else { + result.keys.push(ListingObject { + key: RemotePath::from_string(&relative_key).unwrap(), + // LocalFs is just for testing + last_modified: SystemTime::now(), + size: 0, + }); + } + } + result.prefixes = prefixes + .into_iter() + .map(|s| RemotePath::from_string(&s).unwrap()) + .collect(); + } + + if let Some(max_keys) = max_keys { + result.keys.truncate(max_keys.get() as usize); + } + Ok(result) + }; + + let timeout = async { + tokio::time::sleep(self.timeout).await; + Err(DownloadError::Timeout) + }; + + let cancelled = async { + cancel.cancelled().await; + Err(DownloadError::Cancelled) + }; + + tokio::select! { + res = op => res, + res = timeout => res, + res = cancelled => res, } } + async fn head_object( + &self, + key: &RemotePath, + _cancel: &CancellationToken, + ) -> Result { + let target_file_path = key.with_base(&self.storage_root); + let metadata = file_metadata(&target_file_path).await?; + Ok(ListingObject { + key: key.clone(), + last_modified: metadata.modified()?, + size: metadata.len(), + }) + } + + async fn upload( + &self, + data: impl Stream> + Send + Sync, + data_size_bytes: usize, + to: &RemotePath, + metadata: Option, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let cancel = cancel.child_token(); + + let op = self.upload0(data, data_size_bytes, to, metadata, &cancel); + let mut op = std::pin::pin!(op); + + // race the upload0 to the timeout; if it goes over, do a graceful shutdown + let (res, timeout) = tokio::select! 
{ + res = &mut op => (res, false), + _ = tokio::time::sleep(self.timeout) => { + cancel.cancel(); + (op.await, true) + } + }; + + match res { + Err(e) if timeout && TimeoutOrCancel::caused_by_cancel(&e) => { + // we caused this cancel (or they happened simultaneously) -- swap it out to + // Timeout + Err(TimeoutOrCancel::Timeout.into()) + } + res => res, + } + } + + async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { + let target_path = from.with_base(&self.storage_root); + + let file_metadata = file_metadata(&target_path).await?; + + let source = ReaderStream::new( + fs::OpenOptions::new() + .read(true) + .open(&target_path) + .await + .with_context(|| { + format!("Failed to open source file {target_path:?} to use in the download") + }) + .map_err(DownloadError::Other)?, + ); + + let metadata = self + .read_storage_metadata(&target_path) + .await + .map_err(DownloadError::Other)?; + + let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let source = crate::support::DownloadStream::new(cancel_or_timeout, source); + + let etag = mock_etag(&file_metadata); + Ok(Download { + metadata, + last_modified: file_metadata + .modified() + .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?, + etag, + download_stream: Box::pin(source), + }) + } + async fn download_byte_range( &self, from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result { if let Some(end_exclusive) = end_exclusive { if end_exclusive <= start_inclusive { @@ -353,44 +545,54 @@ impl RemoteStorage for LocalFs { return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes"))); } } - let target_path = from.with_base(&self.storage_root); - if file_exists(&target_path).map_err(DownloadError::BadInput)? { - let mut source = tokio::fs::OpenOptions::new() - .read(true) - .open(&target_path) - .await - .with_context(|| { - format!("Failed to open source file {target_path:?} to use in the download") - }) - .map_err(DownloadError::Other)?; - source - .seek(io::SeekFrom::Start(start_inclusive)) - .await - .context("Failed to seek to the range start in a local storage file") - .map_err(DownloadError::Other)?; - let metadata = self - .read_storage_metadata(&target_path) - .await - .map_err(DownloadError::Other)?; - let download_stream: DownloadStream = match end_exclusive { - Some(end_exclusive) => Box::pin(ReaderStream::new( - source.take(end_exclusive - start_inclusive), - )), - None => Box::pin(ReaderStream::new(source)), - }; - Ok(Download { - metadata, - last_modified: None, - etag: None, - download_stream, + let target_path = from.with_base(&self.storage_root); + let file_metadata = file_metadata(&target_path).await?; + let mut source = tokio::fs::OpenOptions::new() + .read(true) + .open(&target_path) + .await + .with_context(|| { + format!("Failed to open source file {target_path:?} to use in the download") }) - } else { - Err(DownloadError::NotFound) - } + .map_err(DownloadError::Other)?; + + let len = source + .metadata() + .await + .context("query file length") + .map_err(DownloadError::Other)? 
+ .len(); + + source + .seek(io::SeekFrom::Start(start_inclusive)) + .await + .context("Failed to seek to the range start in a local storage file") + .map_err(DownloadError::Other)?; + + let metadata = self + .read_storage_metadata(&target_path) + .await + .map_err(DownloadError::Other)?; + + let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive); + let source = ReaderStream::new(source); + + let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let source = crate::support::DownloadStream::new(cancel_or_timeout, source); + + let etag = mock_etag(&file_metadata); + Ok(Download { + metadata, + last_modified: file_metadata + .modified() + .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?, + etag, + download_stream: Box::pin(source), + }) } - async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { + async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> { let file_path = path.with_base(&self.storage_root); match fs::remove_file(&file_path).await { Ok(()) => Ok(()), @@ -402,14 +604,23 @@ impl RemoteStorage for LocalFs { } } - async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + async fn delete_objects<'a>( + &self, + paths: &'a [RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { for path in paths { - self.delete(path).await? + self.delete(path, cancel).await? } Ok(()) } - async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + async fn copy( + &self, + from: &RemotePath, + to: &RemotePath, + _cancel: &CancellationToken, + ) -> anyhow::Result<()> { let from_path = from.with_base(&self.storage_root); let to_path = to.with_base(&self.storage_root); create_target_directory(&to_path).await?; @@ -423,15 +634,14 @@ impl RemoteStorage for LocalFs { Ok(()) } - #[allow(clippy::diverging_sub_expression)] async fn time_travel_recover( &self, _prefix: Option<&RemotePath>, _timestamp: SystemTime, _done_if_after: SystemTime, - _cancel: CancellationToken, - ) -> anyhow::Result<()> { - unimplemented!() + _cancel: &CancellationToken, + ) -> Result<(), TimeTravelError> { + Err(TimeTravelError::Unimplemented) } } @@ -439,50 +649,6 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf { path_with_suffix_extension(original_path, "metadata") } -fn get_all_files<'a, P>( - directory_path: P, - recursive: bool, -) -> Pin>> + Send + Sync + 'a>> -where - P: AsRef + Send + Sync + 'a, -{ - Box::pin(async move { - let directory_path = directory_path.as_ref(); - if directory_path.exists() { - if directory_path.is_dir() { - let mut paths = Vec::new(); - let mut dir_contents = fs::read_dir(directory_path).await?; - while let Some(dir_entry) = dir_contents.next_entry().await? 
{ - let file_type = dir_entry.file_type().await?; - let entry_path = - Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| { - anyhow::Error::msg(format!( - "non-Unicode path: {}", - pb.to_string_lossy() - )) - })?; - if file_type.is_symlink() { - debug!("{entry_path:?} is a symlink, skipping") - } else if file_type.is_dir() { - if recursive { - paths.extend(get_all_files(&entry_path, true).await?.into_iter()) - } else { - paths.push(entry_path) - } - } else { - paths.push(entry_path); - } - } - Ok(paths) - } else { - bail!("Path {directory_path:?} is not a directory") - } - } else { - Ok(Vec::new()) - } - }) -} - async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> { let target_dir = match target_file_path.parent() { Some(parent_dir) => parent_dir, @@ -494,33 +660,39 @@ async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result< Ok(()) } -fn file_exists(file_path: &Utf8Path) -> anyhow::Result { - if file_path.exists() { - ensure!(file_path.is_file(), "file path '{file_path}' is not a file"); - Ok(true) - } else { - Ok(false) - } +async fn file_metadata(file_path: &Utf8Path) -> Result { + tokio::fs::metadata(&file_path).await.map_err(|e| { + if e.kind() == ErrorKind::NotFound { + DownloadError::NotFound + } else { + DownloadError::BadInput(e.into()) + } + }) +} + +// Use mtime as stand-in for ETag. We could calculate a meaningful one by md5'ing the contents of files we +// read, but that's expensive and the local_fs test helper's whole reason for existence is to run small tests +// quickly, with less overhead than using a mock S3 server. +fn mock_etag(meta: &std::fs::Metadata) -> Etag { + let mtime = meta.modified().expect("Filesystem mtime missing"); + format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis()).into() } #[cfg(test)] mod fs_tests { use super::*; - use bytes::Bytes; use camino_tempfile::tempdir; - use futures_util::Stream; use std::{collections::HashMap, io::Write}; - async fn read_and_assert_remote_file_contents( + async fn read_and_check_metadata( storage: &LocalFs, - #[allow(clippy::ptr_arg)] - // have to use &Utf8PathBuf due to `storage.local_path` parameter requirements remote_storage_path: &RemotePath, expected_metadata: Option<&StorageMetadata>, ) -> anyhow::Result { + let cancel = CancellationToken::new(); let download = storage - .download(remote_storage_path) + .download(remote_storage_path, &cancel) .await .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?; ensure!( @@ -535,16 +707,16 @@ mod fs_tests { #[tokio::test] async fn upload_file() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; - let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?; + let target_path_1 = upload_dummy_file(&storage, "upload_1", None, &cancel).await?; assert_eq!( storage.list_all().await?, vec![target_path_1.clone()], "Should list a single file after first upload" ); - let target_path_2 = upload_dummy_file(&storage, "upload_2", None).await?; + let target_path_2 = upload_dummy_file(&storage, "upload_2", None, &cancel).await?; assert_eq!( list_files_sorted(&storage).await?, vec![target_path_1.clone(), target_path_2.clone()], @@ -556,7 +728,7 @@ mod fs_tests { #[tokio::test] async fn upload_file_negatives() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let id = RemotePath::new(Utf8Path::new("dummy"))?; let content = Bytes::from_static(b"12345"); @@ -565,36 +737,36 @@ mod fs_tests { // Check 
that you get an error if the size parameter doesn't match the actual // size of the stream. storage - .upload(content(), 0, &id, None) + .upload(content(), 0, &id, None, &cancel) .await .expect_err("upload with zero size succeeded"); storage - .upload(content(), 4, &id, None) + .upload(content(), 4, &id, None, &cancel) .await .expect_err("upload with too short size succeeded"); storage - .upload(content(), 6, &id, None) + .upload(content(), 6, &id, None, &cancel) .await .expect_err("upload with too large size succeeded"); // Correct size is 5, this should succeed. - storage.upload(content(), 5, &id, None).await?; + storage.upload(content(), 5, &id, None, &cancel).await?; Ok(()) } - fn create_storage() -> anyhow::Result { + fn create_storage() -> anyhow::Result<(LocalFs, CancellationToken)> { let storage_root = tempdir()?.path().to_path_buf(); - LocalFs::new(storage_root) + LocalFs::new(storage_root, Duration::from_secs(120)).map(|s| (s, CancellationToken::new())) } #[tokio::test] async fn download_file() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; - let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?; + let contents = read_and_check_metadata(&storage, &upload_target, None).await?; assert_eq!( dummy_contents(upload_name), contents, @@ -602,7 +774,7 @@ mod fs_tests { ); let non_existing_path = "somewhere/else"; - match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?).await { + match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?, &cancel).await { Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"), } @@ -611,12 +783,12 @@ mod fs_tests { #[tokio::test] async fn download_file_range_positive() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; let full_range_download_contents = - read_and_assert_remote_file_contents(&storage, &upload_target, None).await?; + read_and_check_metadata(&storage, &upload_target, None).await?; assert_eq!( dummy_contents(upload_name), full_range_download_contents, @@ -627,7 +799,12 @@ mod fs_tests { let (first_part_local, second_part_local) = uploaded_bytes.split_at(3); let first_part_download = storage - .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64)) + .download_byte_range( + &upload_target, + 0, + Some(first_part_local.len() as u64), + &cancel, + ) .await?; assert!( first_part_download.metadata.is_none(), @@ -645,6 +822,7 @@ mod fs_tests { &upload_target, first_part_local.len() as u64, Some((first_part_local.len() + second_part_local.len()) as u64), + &cancel, ) .await?; assert!( @@ -658,14 +836,30 @@ mod fs_tests { "Second part bytes should be returned when requested" ); + let suffix_bytes = storage + .download_byte_range(&upload_target, 13, None, &cancel) + .await? 
+ .download_stream; + let suffix_bytes = aggregate(suffix_bytes).await?; + let suffix = std::str::from_utf8(&suffix_bytes)?; + assert_eq!(upload_name, suffix); + + let all_bytes = storage + .download_byte_range(&upload_target, 0, None, &cancel) + .await? + .download_stream; + let all_bytes = aggregate(all_bytes).await?; + let all_bytes = std::str::from_utf8(&all_bytes)?; + assert_eq!(dummy_contents("upload_1"), all_bytes); + Ok(()) } #[tokio::test] async fn download_file_range_negative() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; let start = 1_000_000_000; let end = start + 1; @@ -674,6 +868,7 @@ mod fs_tests { &upload_target, start, Some(end), // exclusive end + &cancel, ) .await { @@ -690,7 +885,7 @@ mod fs_tests { let end = 234; assert!(start > end, "Should test an incorrect range"); match storage - .download_byte_range(&upload_target, start, Some(end)) + .download_byte_range(&upload_target, start, Some(end), &cancel) .await { Ok(_) => panic!("Should not allow downloading wrong ranges"), @@ -707,15 +902,15 @@ mod fs_tests { #[tokio::test] async fn delete_file() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; - storage.delete(&upload_target).await?; + storage.delete(&upload_target, &cancel).await?; assert!(storage.list_all().await?.is_empty()); storage - .delete(&upload_target) + .delete(&upload_target, &cancel) .await .expect("Should allow deleting non-existing storage files"); @@ -724,17 +919,17 @@ mod fs_tests { #[tokio::test] async fn file_with_metadata() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; let metadata = StorageMetadata(HashMap::from([ ("one".to_string(), "1".to_string()), ("two".to_string(), "2".to_string()), ])); let upload_target = - upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?; + upload_dummy_file(&storage, upload_name, Some(metadata.clone()), &cancel).await?; let full_range_download_contents = - read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?; + read_and_check_metadata(&storage, &upload_target, Some(&metadata)).await?; assert_eq!( dummy_contents(upload_name), full_range_download_contents, @@ -745,7 +940,12 @@ mod fs_tests { let (first_part_local, _) = uploaded_bytes.split_at(3); let partial_download_with_metadata = storage - .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64)) + .download_byte_range( + &upload_target, + 0, + Some(first_part_local.len() as u64), + &cancel, + ) .await?; let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?; assert_eq!( @@ -766,16 +966,29 @@ mod fs_tests { #[tokio::test] async fn list() -> anyhow::Result<()> { // No delimiter: should recursively list everything - let storage = create_storage()?; - let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?; - let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?; + let (storage, cancel) = create_storage()?; + let child = 
upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
+        let child_sibling =
+            upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?;
+        let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
 
-        let listing = storage.list(None, ListingMode::NoDelimiter).await?;
+        let listing = storage
+            .list(None, ListingMode::NoDelimiter, None, &cancel)
+            .await?;
         assert!(listing.prefixes.is_empty());
-        assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
+        assert_eq!(
+            listing
+                .keys
+                .into_iter()
+                .map(|o| o.key)
+                .collect::<HashSet<_>>(),
+            HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()])
+        );
 
         // Delimiter: should only go one deep
-        let listing = storage.list(None, ListingMode::WithDelimiter).await?;
+        let listing = storage
+            .list(None, ListingMode::WithDelimiter, None, &cancel)
+            .await?;
 
         assert_eq!(
             listing.prefixes,
@@ -783,19 +996,157 @@
-        // Delimiter & prefix
+        // Delimiter & prefix with a trailing slash
+        let listing = storage
+            .list(
+                Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()),
+                ListingMode::WithDelimiter,
+                None,
+                &cancel,
+            )
+            .await?;
+        assert_eq!(
+            listing.keys.into_iter().map(|o| o.key).collect::<Vec<_>>(),
+            [RemotePath::from_string("uncle").unwrap()].to_vec()
+        );
+        assert_eq!(
+            listing.prefixes,
+            [RemotePath::from_string("parent").unwrap()].to_vec()
+        );
+
+        // Delimiter and prefix without a trailing slash
         let listing = storage
             .list(
                 Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
                 ListingMode::WithDelimiter,
+                None,
+                &cancel,
             )
             .await?;
+        assert_eq!(listing.keys, vec![]);
         assert_eq!(
             listing.prefixes,
-            [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
-                .to_vec()
+            [RemotePath::from_string("grandparent").unwrap()].to_vec()
         );
-        assert_eq!(listing.keys, [uncle.clone()].to_vec());
+
+        // Delimiter and prefix that's partway through a path component
+        let listing = storage
+            .list(
+                Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()),
+                ListingMode::WithDelimiter,
+                None,
+                &cancel,
+            )
+            .await?;
+        assert_eq!(listing.keys, vec![]);
+        assert_eq!(
+            listing.prefixes,
+            [RemotePath::from_string("grandparent").unwrap()].to_vec()
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn list_part_component() -> anyhow::Result<()> {
+        // No delimiter: should recursively list everything
+        let (storage, cancel) = create_storage()?;
+
+        // Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing
+        // of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as
+        // a freeform prefix.
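The rule this test (and the trailing-slash cases above) pins down is that an S3-style prefix is a raw string match, not a directory boundary: listing with prefix `tenant` must surface both `tenant/` and `tenant-01/`. A toy model of the delimiter semantics, independent of `LocalFs` (the helper and names are illustrative only):

```rust
use std::collections::BTreeSet;

/// Toy model of S3 `WithDelimiter` listing: a prefix is a plain string match,
/// so the partial component "tenant" matches both "tenant/" and "tenant-01/".
fn common_prefixes<'a>(keys: &[&'a str], prefix: &str, delimiter: char) -> BTreeSet<&'a str> {
    keys.iter()
        .filter_map(|k| Some((*k, k.strip_prefix(prefix)?)))
        .filter_map(|(k, rest)| {
            // Cut at the first delimiter after the end of the prefix match.
            let i = rest.find(delimiter)?;
            Some(&k[..prefix.len() + i])
        })
        .collect()
}

#[test]
fn partial_component_prefix() {
    let keys = ["grandparent/tenant-01/child", "grandparent/tenant/child"];
    let got: Vec<_> = common_prefixes(&keys, "grandparent/tenant", '/')
        .into_iter()
        .collect();
    assert_eq!(got, ["grandparent/tenant", "grandparent/tenant-01"]);
}
```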
+ let _child_a = + upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?; + let _child_b = + upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?; + + // Delimiter and prefix that's partway through a path component + let listing = storage + .list( + Some( + &RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(), + ), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await?; + assert_eq!(listing.keys, vec![]); + + let mut found_prefixes = listing.prefixes.clone(); + found_prefixes.sort(); + assert_eq!( + found_prefixes, + [ + RemotePath::from_string("tenant").unwrap(), + RemotePath::from_string("tenant-01").unwrap(), + ] + .to_vec() + ); + + Ok(()) + } + + #[tokio::test] + async fn overwrite_shorter_file() -> anyhow::Result<()> { + let (storage, cancel) = create_storage()?; + + let path = RemotePath::new("does/not/matter/file".into())?; + + let body = Bytes::from_static(b"long file contents is long"); + { + let len = body.len(); + let body = + futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone()))); + storage.upload(body, len, &path, None, &cancel).await?; + } + + let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?; + assert_eq!(body, read); + + let shorter = Bytes::from_static(b"shorter body"); + { + let len = shorter.len(); + let body = + futures::stream::once(futures::future::ready(std::io::Result::Ok(shorter.clone()))); + storage.upload(body, len, &path, None, &cancel).await?; + } + + let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?; + assert_eq!(shorter, read); + Ok(()) + } + + #[tokio::test] + async fn cancelled_upload_can_later_be_retried() -> anyhow::Result<()> { + let (storage, cancel) = create_storage()?; + + let path = RemotePath::new("does/not/matter/file".into())?; + + let body = Bytes::from_static(b"long file contents is long"); + { + let len = body.len(); + let body = + futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone()))); + let cancel = cancel.child_token(); + cancel.cancel(); + let e = storage + .upload(body, len, &path, None, &cancel) + .await + .unwrap_err(); + + assert!(TimeoutOrCancel::caused_by_cancel(&e)); + } + + { + let len = body.len(); + let body = + futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone()))); + storage.upload(body, len, &path, None, &cancel).await?; + } + + let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?; + assert_eq!(body, read); Ok(()) } @@ -804,6 +1155,7 @@ mod fs_tests { storage: &LocalFs, name: &str, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result { let from_path = storage .storage_root @@ -825,7 +1177,9 @@ mod fs_tests { let file = tokio_util::io::ReaderStream::new(file); - storage.upload(file, size, &relative_path, metadata).await?; + storage + .upload(file, size, &relative_path, metadata, cancel) + .await?; Ok(relative_path) } diff --git a/libs/remote_storage/src/s3_bucket/metrics.rs b/libs/remote_storage/src/metrics.rs similarity index 72% rename from libs/remote_storage/src/s3_bucket/metrics.rs rename to libs/remote_storage/src/metrics.rs index beca755920..f1aa4c433b 100644 --- a/libs/remote_storage/src/s3_bucket/metrics.rs +++ b/libs/remote_storage/src/metrics.rs @@ -13,8 +13,10 @@ pub(crate) enum RequestKind { List = 3, Copy = 4, TimeTravel = 5, + Head = 6, } +use scopeguard::ScopeGuard; use RequestKind::*; impl RequestKind { @@ -26,6 +28,7 @@ impl 
RequestKind { List => "list_objects", Copy => "copy_object", TimeTravel => "time_travel_recover", + Head => "head_object", } } const fn as_index(&self) -> usize { @@ -33,17 +36,18 @@ impl RequestKind { } } -pub(super) struct RequestTyped([C; 6]); +const REQUEST_KIND_COUNT: usize = 7; +pub(crate) struct RequestTyped([C; REQUEST_KIND_COUNT]); impl RequestTyped { - pub(super) fn get(&self, kind: RequestKind) -> &C { + pub(crate) fn get(&self, kind: RequestKind) -> &C { &self.0[kind.as_index()] } fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self { use RequestKind::*; - let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter(); - let arr = std::array::from_fn::(|index| { + let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter(); + let arr = std::array::from_fn::(|index| { let next = it.next().unwrap(); assert_eq!(index, next.as_index()); f(next) @@ -58,19 +62,19 @@ impl RequestTyped { } impl RequestTyped { - pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) { + pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) { self.get(kind).observe(started_at.elapsed().as_secs_f64()) } } -pub(super) struct PassFailCancelledRequestTyped { +pub(crate) struct PassFailCancelledRequestTyped { success: RequestTyped, fail: RequestTyped, cancelled: RequestTyped, } #[derive(Debug, Clone, Copy)] -pub(super) enum AttemptOutcome { +pub(crate) enum AttemptOutcome { Ok, Err, Cancelled, @@ -86,7 +90,7 @@ impl From<&Result> for AttemptOutcome { } impl AttemptOutcome { - pub(super) fn as_str(&self) -> &'static str { + pub(crate) fn as_str(&self) -> &'static str { match self { AttemptOutcome::Ok => "ok", AttemptOutcome::Err => "err", @@ -96,7 +100,7 @@ impl AttemptOutcome { } impl PassFailCancelledRequestTyped { - pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C { + pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C { let target = match outcome { AttemptOutcome::Ok => &self.success, AttemptOutcome::Err => &self.fail, @@ -119,7 +123,7 @@ impl PassFailCancelledRequestTyped { } impl PassFailCancelledRequestTyped { - pub(super) fn observe_elapsed( + pub(crate) fn observe_elapsed( &self, kind: RequestKind, outcome: impl Into, @@ -130,19 +134,44 @@ impl PassFailCancelledRequestTyped { } } -pub(super) struct BucketMetrics { +/// On drop (cancellation) count towards [`BucketMetrics::cancelled_waits`]. +pub(crate) fn start_counting_cancelled_wait( + kind: RequestKind, +) -> ScopeGuard { + scopeguard::guard_on_success(std::time::Instant::now(), move |_| { + crate::metrics::BUCKET_METRICS + .cancelled_waits + .get(kind) + .inc() + }) +} + +/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`]. +pub(crate) fn start_measuring_requests( + kind: RequestKind, +) -> ScopeGuard { + scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| { + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Cancelled, + started_at, + ) + }) +} + +pub(crate) struct BucketMetrics { /// Full request duration until successful completion, error or cancellation. - pub(super) req_seconds: PassFailCancelledRequestTyped, + pub(crate) req_seconds: PassFailCancelledRequestTyped, /// Total amount of seconds waited on queue. - pub(super) wait_seconds: RequestTyped, + pub(crate) wait_seconds: RequestTyped, /// Track how many semaphore awaits were cancelled per request type. /// /// This is in case cancellations are happening more than expected. 
- pub(super) cancelled_waits: RequestTyped, + pub(crate) cancelled_waits: RequestTyped, /// Total amount of deleted objects in batches or single requests. - pub(super) deleted_objects_total: IntCounter, + pub(crate) deleted_objects_total: IntCounter, } impl Default for BucketMetrics { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 4909b8522b..11f6598cbf 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -7,35 +7,30 @@ use std::{ borrow::Cow, collections::HashMap, + num::NonZeroU32, pin::Pin, sync::Arc, task::{Context, Poll}, - time::SystemTime, + time::{Duration, SystemTime}, }; use anyhow::{anyhow, Context as _}; use aws_config::{ - environment::credentials::EnvironmentVariableCredentialsProvider, - imds::credentials::ImdsCredentialsProvider, - meta::credentials::CredentialsProviderChain, - profile::ProfileFileCredentialsProvider, - provider_config::ProviderConfig, + default_provider::credentials::DefaultCredentialsChain, retry::{RetryConfigBuilder, RetryMode}, - web_identity_token::WebIdentityTokenCredentialsProvider, BehaviorVersion, }; -use aws_credential_types::provider::SharedCredentialsProvider; use aws_sdk_s3::{ - config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep}, + config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, - operation::get_object::GetObjectError, - types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion}, + operation::{get_object::GetObjectError, head_object::HeadObjectError}, + types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, Client, }; use aws_smithy_async::rt::sleep::TokioSleep; -use aws_smithy_types::byte_stream::ByteStream; use aws_smithy_types::{body::SdkBody, DateTime}; +use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError}; use bytes::Bytes; use futures::stream::Stream; use hyper::Body; @@ -45,14 +40,17 @@ use utils::backoff; use super::StorageMetadata; use crate::{ - ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, - S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, + config::S3Config, + error::Cancelled, + metrics::{start_counting_cancelled_wait, start_measuring_requests}, + support::PermitCarrying, + ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, ListingObject, RemotePath, + RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, + REMOTE_STORAGE_PREFIX_SEPARATOR, }; -pub(super) mod metrics; - -use self::metrics::AttemptOutcome; -pub(super) use self::metrics::RequestKind; +use crate::metrics::AttemptOutcome; +pub(super) use crate::metrics::RequestKind; /// AWS S3 storage. pub struct S3Bucket { @@ -60,10 +58,12 @@ pub struct S3Bucket { bucket_name: String, prefix_in_bucket: Option, max_keys_per_list_response: Option, + upload_storage_class: Option, concurrency_limiter: ConcurrencyLimiter, + // Per-request timeout. Accessible for tests. + pub timeout: Duration, } -#[derive(Default)] struct GetObjectRequest { bucket: String, key: String, @@ -71,44 +71,63 @@ struct GetObjectRequest { } impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. 
- pub fn new(aws_config: &S3Config) -> anyhow::Result { + pub async fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result { tracing::debug!( "Creating s3 remote storage for S3 bucket {}", - aws_config.bucket_name + remote_storage_config.bucket_name ); - let region = Some(Region::new(aws_config.bucket_region.clone())); + let region = Region::new(remote_storage_config.bucket_region.clone()); + let region_opt = Some(region.clone()); - let provider_conf = ProviderConfig::without_region().with_region(region.clone()); - - let credentials_provider = { - // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - CredentialsProviderChain::first_try( - "env", - EnvironmentVariableCredentialsProvider::new(), - ) - // uses "AWS_PROFILE" / `aws sso login --profile ` - .or_else( - "profile-sso", - ProfileFileCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" - // needed to access remote extensions bucket - .or_else( - "token", - WebIdentityTokenCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses imds v2 - .or_else("imds", ImdsCredentialsProvider::builder().build()) - }; + // https://docs.aws.amazon.com/sdkref/latest/guide/standardized-credentials.html + // https://docs.rs/aws-config/latest/aws_config/default_provider/credentials/struct.DefaultCredentialsChain.html + // Incomplete list of auth methods used by this: + // * "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + // * "AWS_PROFILE" / `aws sso login --profile ` + // * "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" + // * http (ECS/EKS) container credentials + // * imds v2 + let credentials_provider = DefaultCredentialsChain::builder() + .region(region) + .build() + .await; // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off let sleep_impl: Arc = Arc::new(TokioSleep::new()); + let sdk_config_loader: aws_config::ConfigLoader = aws_config::defaults( + #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ + BehaviorVersion::v2023_11_09(), + ) + .region(region_opt) + .identity_cache(IdentityCache::lazy().build()) + .credentials_provider(credentials_provider) + .sleep_impl(SharedAsyncSleep::from(sleep_impl)); + + let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| { + s.spawn(|| { + // TODO: make this function async. + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap() + .block_on(sdk_config_loader.load()) + }) + .join() + .unwrap() + }); + + let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config); + + // Technically, the `remote_storage_config.endpoint` field only applies to S3 interactions. + // (In case we ever re-use the `sdk_config` for more than just the S3 client in the future) + if let Some(custom_endpoint) = remote_storage_config.endpoint.clone() { + s3_config_builder = s3_config_builder + .endpoint_url(custom_endpoint) + .force_path_style(true); + } + // We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled. 
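Two patterns in this hunk are worth calling out: the hand-rolled credentials chain is replaced by `DefaultCredentialsChain`, and the async `ConfigLoader` is driven from a scoped thread that owns a throwaway current-thread runtime, since `block_on` panics when invoked from inside an async context (the diff's own TODO is to drop this once the call-site is fully async). A minimal sketch of that load pattern, with a hypothetical helper name:

```rust
// Hypothetical sync wrapper around the async SDK config loader, mirroring the
// scoped-thread pattern above. Useful when the caller is sync, or must not
// block_on the ambient runtime from within an async context.
fn load_sdk_config_blocking(loader: aws_config::ConfigLoader) -> aws_config::SdkConfig {
    std::thread::scope(|s| {
        s.spawn(|| {
            // A single-use, current-thread runtime owned by this thread only.
            tokio::runtime::Builder::new_current_thread()
                .enable_all()
                .build()
                .expect("failed to build single-use runtime")
                .block_on(loader.load())
        })
        .join()
        .expect("config-loading thread panicked")
    })
}
```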
@@ -116,41 +135,37 @@ impl S3Bucket { retry_config .set_max_attempts(Some(1)) .set_mode(Some(RetryMode::Adaptive)); + s3_config_builder = s3_config_builder.retry_config(retry_config.build()); - let mut config_builder = Builder::default() - .behavior_version(BehaviorVersion::v2023_11_09()) - .region(region) - .identity_cache(IdentityCache::lazy().build()) - .credentials_provider(SharedCredentialsProvider::new(credentials_provider)) - .retry_config(retry_config.build()) - .sleep_impl(SharedAsyncSleep::from(sleep_impl)); + let s3_config = s3_config_builder.build(); + let client = aws_sdk_s3::Client::from_conf(s3_config); - if let Some(custom_endpoint) = aws_config.endpoint.clone() { - config_builder = config_builder - .endpoint_url(custom_endpoint) - .force_path_style(true); - } + let prefix_in_bucket = remote_storage_config + .prefix_in_bucket + .as_deref() + .map(|prefix| { + let mut prefix = prefix; + while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + prefix = &prefix[1..] + } - let client = Client::from_conf(config_builder.build()); + let mut prefix = prefix.to_string(); + while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + prefix.pop(); + } + prefix + }); - let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| { - let mut prefix = prefix; - while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { - prefix = &prefix[1..] - } - - let mut prefix = prefix.to_string(); - while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { - prefix.pop(); - } - prefix - }); Ok(Self { client, - bucket_name: aws_config.bucket_name.clone(), - max_keys_per_list_response: aws_config.max_keys_per_list_response, + bucket_name: remote_storage_config.bucket_name.clone(), + max_keys_per_list_response: remote_storage_config.max_keys_per_list_response, prefix_in_bucket, - concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()), + concurrency_limiter: ConcurrencyLimiter::new( + remote_storage_config.concurrency_limit.get(), + ), + upload_storage_class: remote_storage_config.upload_storage_class.clone(), + timeout, }) } @@ -174,50 +189,62 @@ impl S3Bucket { pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String { assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); - let path_string = path - .get_path() - .as_str() - .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR); + let path_string = path.get_path().as_str(); match &self.prefix_in_bucket { Some(prefix) => prefix.clone() + "/" + path_string, None => path_string.to_string(), } } - async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> { + async fn permit( + &self, + kind: RequestKind, + cancel: &CancellationToken, + ) -> Result, Cancelled> { let started_at = start_counting_cancelled_wait(kind); - let permit = self - .concurrency_limiter - .acquire(kind) - .await - .expect("semaphore is never closed"); + let acquire = self.concurrency_limiter.acquire(kind); + + let permit = tokio::select! 
{ + permit = acquire => permit.expect("semaphore is never closed"), + _ = cancel.cancelled() => return Err(Cancelled), + }; let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .wait_seconds .observe_elapsed(kind, started_at); - permit + Ok(permit) } - async fn owned_permit(&self, kind: RequestKind) -> tokio::sync::OwnedSemaphorePermit { + async fn owned_permit( + &self, + kind: RequestKind, + cancel: &CancellationToken, + ) -> Result { let started_at = start_counting_cancelled_wait(kind); - let permit = self - .concurrency_limiter - .acquire_owned(kind) - .await - .expect("semaphore is never closed"); + let acquire = self.concurrency_limiter.acquire_owned(kind); + + let permit = tokio::select! { + permit = acquire => permit.expect("semaphore is never closed"), + _ = cancel.cancelled() => return Err(Cancelled), + }; let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .wait_seconds .observe_elapsed(kind, started_at); - permit + Ok(permit) } - async fn download_object(&self, request: GetObjectRequest) -> Result { + async fn download_object( + &self, + request: GetObjectRequest, + cancel: &CancellationToken, + ) -> Result { let kind = RequestKind::Get; - let permit = self.owned_permit(kind).await; + + let permit = self.owned_permit(kind, cancel).await?; let started_at = start_measuring_requests(kind); @@ -227,83 +254,115 @@ impl S3Bucket { .bucket(request.bucket) .key(request.key) .set_range(request.range) - .send() - .await; + .send(); + + let get_object = tokio::select! { + res = get_object => res, + _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout), + _ = cancel.cancelled() => return Err(DownloadError::Cancelled), + }; let started_at = ScopeGuard::into_inner(started_at); - match get_object { - Ok(object_output) => { - let metadata = object_output.metadata().cloned().map(StorageMetadata); - let etag = object_output.e_tag.clone(); - let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok()); - - let body = object_output.body; - let body = ByteStreamAsStream::from(body); - let body = PermitCarrying::new(permit, body); - let body = TimedDownload::new(started_at, body); - - Ok(Download { - metadata, - etag, - last_modified, - download_stream: Box::pin(body), - }) - } + let object_output = match get_object { + Ok(object_output) => object_output, Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => { // Count this in the AttemptOutcome::Ok bucket, because 404 is not // an error: we expect to sometimes fetch an object and find it missing, // e.g. when probing for timeline indices. - metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( kind, AttemptOutcome::Ok, started_at, ); - Err(DownloadError::NotFound) + return Err(DownloadError::NotFound); } Err(e) => { - metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( kind, AttemptOutcome::Err, started_at, ); - Err(DownloadError::Other( + return Err(DownloadError::Other( anyhow::Error::new(e).context("download s3 object"), - )) + )); } - } + }; + + // even if we would have no timeout left, continue anyways. the caller can decide to ignore + // the errors considering timeouts and cancellation. 
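After the response headers arrive, the remaining timeout budget (`self.timeout` minus time already spent) is applied to the body stream via `crate::support::cancel_or_timeout` and `DownloadStream`, as seen just below. Roughly, such a wrapper forwards chunks until the deadline or the token fires, then yields a single error and terminates; a sketch under those assumptions, with `io::Error` standing in for the crate's error type and no claim to match the real `support` module:

```rust
use std::io;
use std::time::Duration;

use bytes::Bytes;
use futures::{Stream, StreamExt};
use tokio_util::sync::CancellationToken;

// Illustrative stand-in for `cancel_or_timeout` + `DownloadStream`: forward
// body chunks until the deadline or the token fires, then emit one error and
// end the stream.
fn with_cancel_or_timeout(
    mut body: impl Stream<Item = io::Result<Bytes>> + Unpin,
    timeout: Duration,
    cancel: CancellationToken,
) -> impl Stream<Item = io::Result<Bytes>> {
    async_stream::try_stream! {
        let deadline = tokio::time::sleep(timeout);
        tokio::pin!(deadline);
        loop {
            let next = tokio::select! {
                _ = &mut deadline => Some(Err(io::Error::new(io::ErrorKind::TimedOut, "timeout"))),
                _ = cancel.cancelled() => Some(Err(io::Error::new(io::ErrorKind::Interrupted, "cancelled"))),
                item = body.next() => item,
            };
            match next {
                Some(chunk) => {
                    // `?` re-raises both body errors and the injected ones,
                    // which terminates the generated stream.
                    let chunk = chunk?;
                    yield chunk;
                }
                None => break,
            }
        }
    }
}
```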
+ let remaining = self.timeout.saturating_sub(started_at.elapsed()); + + let metadata = object_output.metadata().cloned().map(StorageMetadata); + let etag = object_output + .e_tag + .ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))? + .into(); + let last_modified = object_output + .last_modified + .ok_or(DownloadError::Other(anyhow::anyhow!( + "Missing LastModified header" + )))? + .try_into() + .map_err(|e: ConversionError| DownloadError::Other(e.into()))?; + + let body = object_output.body; + let body = ByteStreamAsStream::from(body); + let body = PermitCarrying::new(permit, body); + let body = TimedDownload::new(started_at, body); + + let cancel_or_timeout = crate::support::cancel_or_timeout(remaining, cancel.clone()); + let body = crate::support::DownloadStream::new(cancel_or_timeout, body); + + Ok(Download { + metadata, + etag, + last_modified, + download_stream: Box::pin(body), + }) } async fn delete_oids( &self, - kind: RequestKind, + _permit: &tokio::sync::SemaphorePermit<'_>, delete_objects: &[ObjectIdentifier], + cancel: &CancellationToken, ) -> anyhow::Result<()> { + let kind = RequestKind::Delete; + let mut cancel = std::pin::pin!(cancel.cancelled()); + for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) { let started_at = start_measuring_requests(kind); - let resp = self + let req = self .client .delete_objects() .bucket(self.bucket_name.clone()) .delete( Delete::builder() .set_objects(Some(chunk.to_vec())) - .build()?, + .build() + .context("build request")?, ) - .send() - .await; + .send(); + + let resp = tokio::select! { + resp = req => resp, + _ = tokio::time::sleep(self.timeout) => return Err(TimeoutOrCancel::Timeout.into()), + _ = &mut cancel => return Err(TimeoutOrCancel::Cancel.into()), + }; let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &resp, started_at); - let resp = resp?; - metrics::BUCKET_METRICS + let resp = resp.context("request deletion")?; + crate::metrics::BUCKET_METRICS .deleted_objects_total .inc_by(chunk.len() as u64); + if let Some(errors) = resp.errors { // Log a bounded number of the errors within the response: // these requests can carry 1000 keys so logging each one @@ -319,14 +378,19 @@ impl S3Bucket { ); } - return Err(anyhow::format_err!( - "Failed to delete {} objects", - errors.len() + return Err(anyhow::anyhow!( + "Failed to delete {}/{} objects", + errors.len(), + chunk.len(), )); } } Ok(()) } + + pub fn bucket_name(&self) -> &str { + &self.bucket_name + } } pin_project_lite::pin_project! { @@ -354,45 +418,18 @@ impl Stream for ByteStreamAsStream { // sense and Stream::size_hint does not really } -pin_project_lite::pin_project! { - /// An `AsyncRead` adapter which carries a permit for the lifetime of the value. - struct PermitCarrying { - permit: tokio::sync::OwnedSemaphorePermit, - #[pin] - inner: S, - } -} - -impl PermitCarrying { - fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self { - Self { permit, inner } - } -} - -impl>> Stream for PermitCarrying { - type Item = ::Item; - - fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_next(cx) - } - - fn size_hint(&self) -> (usize, Option) { - self.inner.size_hint() - } -} - pin_project_lite::pin_project! { /// Times and tracks the outcome of the request. 
struct TimedDownload { started_at: std::time::Instant, - outcome: metrics::AttemptOutcome, + outcome: AttemptOutcome, #[pin] inner: S } impl PinnedDrop for TimedDownload { fn drop(mut this: Pin<&mut Self>) { - metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at); + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at); } } } @@ -401,7 +438,7 @@ impl TimedDownload { fn new(started_at: std::time::Instant, inner: S) -> Self { TimedDownload { started_at, - outcome: metrics::AttemptOutcome::Cancelled, + outcome: AttemptOutcome::Cancelled, inner, } } @@ -418,8 +455,8 @@ impl>> Stream for TimedDownload { let res = ready!(this.inner.poll_next(cx)); match &res { Some(Ok(_)) => {} - Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err, - None => *this.outcome = metrics::AttemptOutcome::Ok, + Some(Err(_)) => *this.outcome = AttemptOutcome::Err, + None => *this.outcome = AttemptOutcome::Ok, } Poll::Ready(res) @@ -431,86 +468,212 @@ impl>> Stream for TimedDownload { } impl RemoteStorage for S3Bucket { - async fn list( + fn list_streaming( &self, prefix: Option<&RemotePath>, mode: ListingMode, - ) -> Result { + max_keys: Option, + cancel: &CancellationToken, + ) -> impl Stream> { let kind = RequestKind::List; - let mut result = Listing::default(); + // s3 sdk wants i32 + let mut max_keys = max_keys.map(|mk| mk.get() as i32); // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix .map(|p| self.relative_path_to_s3_object(p)) - .or_else(|| self.prefix_in_bucket.clone()) - .map(|mut p| { - // required to end with a separator - // otherwise request will return only the entry of a prefix - if matches!(mode, ListingMode::WithDelimiter) - && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) - { - p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - } - p + .or_else(|| { + self.prefix_in_bucket.clone().map(|mut s| { + s.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + s + }) }); - let mut continuation_token = None; + async_stream::stream! { + let _permit = self.permit(kind, cancel).await?; - loop { - let _guard = self.permit(kind).await; - let started_at = start_measuring_requests(kind); + let mut continuation_token = None; + 'outer: loop { + let started_at = start_measuring_requests(kind); - let mut request = self - .client - .list_objects_v2() - .bucket(self.bucket_name.clone()) - .set_prefix(list_prefix.clone()) - .set_continuation_token(continuation_token) - .set_max_keys(self.max_keys_per_list_response); + // min of two Options, returning Some if one is value and another is + // None (None is smaller than anything, so plain min doesn't work). + let request_max_keys = self + .max_keys_per_list_response + .into_iter() + .chain(max_keys.into_iter()) + .min(); + let mut request = self + .client + .list_objects_v2() + .bucket(self.bucket_name.clone()) + .set_prefix(list_prefix.clone()) + .set_continuation_token(continuation_token.clone()) + .set_max_keys(request_max_keys); - if let ListingMode::WithDelimiter = mode { - request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); + if let ListingMode::WithDelimiter = mode { + request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); + } + + let request = request.send(); + + let response = tokio::select! 
{ + res = request => Ok(res), + _ = tokio::time::sleep(self.timeout) => Err(DownloadError::Timeout), + _ = cancel.cancelled() => Err(DownloadError::Cancelled), + }?; + + let response = response + .context("Failed to list S3 prefixes") + .map_err(DownloadError::Other); + + let started_at = ScopeGuard::into_inner(started_at); + + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &response, started_at); + + let response = match response { + Ok(response) => response, + Err(e) => { + // The error is potentially retryable, so we must rewind the loop after yielding. + yield Err(e); + continue 'outer; + }, + }; + + let keys = response.contents(); + let prefixes = response.common_prefixes.as_deref().unwrap_or_default(); + + tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len()); + let mut result = Listing::default(); + + for object in keys { + let key = object.key().expect("response does not contain a key"); + let key = self.s3_object_to_relative_path(key); + + let last_modified = match object.last_modified.map(SystemTime::try_from) { + Some(Ok(t)) => t, + Some(Err(_)) => { + tracing::warn!("Remote storage last_modified {:?} for {} is out of bounds", + object.last_modified, key + ); + SystemTime::now() + }, + None => { + SystemTime::now() + } + }; + + let size = object.size.unwrap_or(0) as u64; + + result.keys.push(ListingObject{ + key, + last_modified, + size, + }); + if let Some(mut mk) = max_keys { + assert!(mk > 0); + mk -= 1; + if mk == 0 { + // limit reached + yield Ok(result); + break 'outer; + } + max_keys = Some(mk); + } + } + + // S3 gives us prefixes like "foo/", we return them like "foo" + result.prefixes.extend(prefixes.iter().filter_map(|o| { + Some( + self.s3_object_to_relative_path( + o.prefix()? + .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR), + ), + ) + })); + + yield Ok(result); + + continuation_token = match response.next_continuation_token { + Some(new_token) => Some(new_token), + None => break, + }; } - - let response = request - .send() - .await - .context("Failed to list S3 prefixes") - .map_err(DownloadError::Other); - - let started_at = ScopeGuard::into_inner(started_at); - - metrics::BUCKET_METRICS - .req_seconds - .observe_elapsed(kind, &response, started_at); - - let response = response?; - - let keys = response.contents(); - let empty = Vec::new(); - let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty); - - tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len()); - - for object in keys { - let object_path = object.key().expect("response does not contain a key"); - let remote_path = self.s3_object_to_relative_path(object_path); - result.keys.push(remote_path); - } - - result.prefixes.extend( - prefixes - .iter() - .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))), - ); - - continuation_token = match response.next_continuation_token { - Some(new_token) => Some(new_token), - None => break, - }; } + } - Ok(result) + async fn head_object( + &self, + key: &RemotePath, + cancel: &CancellationToken, + ) -> Result { + let kind = RequestKind::Head; + let _permit = self.permit(kind, cancel).await?; + + let started_at = start_measuring_requests(kind); + + let head_future = self + .client + .head_object() + .bucket(self.bucket_name()) + .key(self.relative_path_to_s3_object(key)) + .send(); + + let head_future = tokio::time::timeout(self.timeout, head_future); + + let res = tokio::select! 
{ + res = head_future => res, + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; + + let res = res.map_err(|_e| DownloadError::Timeout)?; + + // do not incl. timeouts as errors in metrics but cancellations + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + + let data = match res { + Ok(object_output) => object_output, + Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => { + // Count this in the AttemptOutcome::Ok bucket, because 404 is not + // an error: we expect to sometimes fetch an object and find it missing, + // e.g. when probing for timeline indices. + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Ok, + started_at, + ); + return Err(DownloadError::NotFound); + } + Err(e) => { + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Err, + started_at, + ); + + return Err(DownloadError::Other( + anyhow::Error::new(e).context("s3 head object"), + )); + } + }; + + let (Some(last_modified), Some(size)) = (data.last_modified, data.content_length) else { + return Err(DownloadError::Other(anyhow!( + "head_object doesn't contain last_modified or content_length" + )))?; + }; + Ok(ListingObject { + key: key.to_owned(), + last_modified: SystemTime::try_from(last_modified).map_err(|e| { + DownloadError::Other(anyhow!("can't convert time '{last_modified}': {e}")) + })?, + size: size as u64, + }) } async fn upload( @@ -519,39 +682,59 @@ impl RemoteStorage for S3Bucket { from_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { let kind = RequestKind::Put; - let _guard = self.permit(kind).await; + let _permit = self.permit(kind, cancel).await?; let started_at = start_measuring_requests(kind); let body = Body::wrap_stream(from); let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body)); - let res = self + let upload = self .client .put_object() .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) .set_metadata(metadata.map(|m| m.0)) + .set_storage_class(self.upload_storage_class.clone()) .content_length(from_size_bytes.try_into()?) .body(bytes_stream) - .send() - .await; + .send(); - let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS - .req_seconds - .observe_elapsed(kind, &res, started_at); + let upload = tokio::time::timeout(self.timeout, upload); - res?; + let res = tokio::select! { + res = upload => res, + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; - Ok(()) + if let Ok(inner) = &res { + // do not incl. 
timeouts as errors in metrics but cancellations + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, inner, started_at); + } + + match res { + Ok(Ok(_put)) => Ok(()), + Ok(Err(sdk)) => Err(sdk.into()), + Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()), + } } - async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + async fn copy( + &self, + from: &RemotePath, + to: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { let kind = RequestKind::Copy; - let _guard = self.permit(kind).await; + let _permit = self.permit(kind, cancel).await?; + + let timeout = tokio::time::sleep(self.timeout); let started_at = start_measuring_requests(kind); @@ -562,17 +745,23 @@ impl RemoteStorage for S3Bucket { self.relative_path_to_s3_object(from) ); - let res = self + let op = self .client .copy_object() .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) + .set_storage_class(self.upload_storage_class.clone()) .copy_source(copy_source) - .send() - .await; + .send(); + + let res = tokio::select! { + res = op => res, + _ = timeout => return Err(TimeoutOrCancel::Timeout.into()), + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &res, started_at); @@ -581,14 +770,21 @@ impl RemoteStorage for S3Bucket { Ok(()) } - async fn download(&self, from: &RemotePath) -> Result { + async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { // if prefix is not none then download file `prefix/from` // if prefix is none then download file `from` - self.download_object(GetObjectRequest { - bucket: self.bucket_name.clone(), - key: self.relative_path_to_s3_object(from), - range: None, - }) + self.download_object( + GetObjectRequest { + bucket: self.bucket_name.clone(), + key: self.relative_path_to_s3_object(from), + range: None, + }, + cancel, + ) .await } @@ -597,6 +793,7 @@ impl RemoteStorage for S3Bucket { from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result { // S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 // and needs both ends to be exclusive @@ -606,31 +803,39 @@ impl RemoteStorage for S3Bucket { None => format!("bytes={start_inclusive}-"), }); - self.download_object(GetObjectRequest { - bucket: self.bucket_name.clone(), - key: self.relative_path_to_s3_object(from), - range, - }) + self.download_object( + GetObjectRequest { + bucket: self.bucket_name.clone(), + key: self.relative_path_to_s3_object(from), + range, + }, + cancel, + ) .await } - async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { - let kind = RequestKind::Delete; - let _guard = self.permit(kind).await; + async fn delete_objects<'a>( + &self, + paths: &'a [RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let kind = RequestKind::Delete; + let permit = self.permit(kind, cancel).await?; let mut delete_objects = Vec::with_capacity(paths.len()); for path in paths { let obj_id = ObjectIdentifier::builder() .set_key(Some(self.relative_path_to_s3_object(path))) - .build()?; + .build() + .context("convert path to oid")?; delete_objects.push(obj_id); } - self.delete_oids(kind, &delete_objects).await + self.delete_oids(&permit, &delete_objects, cancel).await } - async fn 
delete(&self, path: &RemotePath) -> anyhow::Result<()> { + async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> { let paths = std::array::from_ref(path); - self.delete_objects(paths).await + self.delete_objects(paths, cancel).await } async fn time_travel_recover( @@ -638,10 +843,10 @@ impl RemoteStorage for S3Bucket { prefix: Option<&RemotePath>, timestamp: SystemTime, done_if_after: SystemTime, - cancel: CancellationToken, - ) -> anyhow::Result<()> { + cancel: &CancellationToken, + ) -> Result<(), TimeTravelError> { let kind = RequestKind::TimeTravel; - let _guard = self.permit(kind).await; + let permit = self.permit(kind, cancel).await?; let timestamp = DateTime::from(timestamp); let done_if_after = DateTime::from(done_if_after); @@ -655,77 +860,120 @@ impl RemoteStorage for S3Bucket { let warn_threshold = 3; let max_retries = 10; - let is_permanent = |_e: &_| false; + let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled); - let list = backoff::retry( - || async { - Ok(self - .client - .list_object_versions() - .bucket(self.bucket_name.clone()) - .set_prefix(prefix.clone()) - .send() - .await?) - }, - is_permanent, - warn_threshold, - max_retries, - "listing object versions for time_travel_recover", - backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")), - ) - .await?; + let mut key_marker = None; + let mut version_id_marker = None; + let mut versions_and_deletes = Vec::new(); - if list.is_truncated().unwrap_or_default() { - anyhow::bail!("Received truncated ListObjectVersions response for prefix={prefix:?}"); + loop { + let response = backoff::retry( + || async { + let op = self + .client + .list_object_versions() + .bucket(self.bucket_name.clone()) + .set_prefix(prefix.clone()) + .set_key_marker(key_marker.clone()) + .set_version_id_marker(version_id_marker.clone()) + .send(); + + tokio::select! { + res = op => res.map_err(|e| TimeTravelError::Other(e.into())), + _ = cancel.cancelled() => Err(TimeTravelError::Cancelled), + } + }, + is_permanent, + warn_threshold, + max_retries, + "listing object versions for time_travel_recover", + cancel, + ) + .await + .ok_or_else(|| TimeTravelError::Cancelled) + .and_then(|x| x)?; + + tracing::trace!( + " Got List response version_id_marker={:?}, key_marker={:?}", + response.version_id_marker, + response.key_marker + ); + let versions = response + .versions + .unwrap_or_default() + .into_iter() + .map(VerOrDelete::from_version); + let deletes = response + .delete_markers + .unwrap_or_default() + .into_iter() + .map(VerOrDelete::from_delete_marker); + itertools::process_results(versions.chain(deletes), |n_vds| { + versions_and_deletes.extend(n_vds) + }) + .map_err(TimeTravelError::Other)?; + fn none_if_empty(v: Option) -> Option { + v.filter(|v| !v.is_empty()) + } + version_id_marker = none_if_empty(response.next_version_id_marker); + key_marker = none_if_empty(response.next_key_marker); + if version_id_marker.is_none() { + // The final response is not supposed to be truncated + if response.is_truncated.unwrap_or_default() { + return Err(TimeTravelError::Other(anyhow::anyhow!( + "Received truncated ListObjectVersions response for prefix={prefix:?}" + ))); + } + break; + } + // Limit the number of versions deletions, mostly so that we don't + // keep requesting forever if the list is too long, as we'd put the + // list in RAM. + // Building a list of 100k entries that reaches the limit roughly takes + // 40 seconds, and roughly corresponds to tenants of 2 TiB physical size. 
+ const COMPLEXITY_LIMIT: usize = 100_000; + if versions_and_deletes.len() >= COMPLEXITY_LIMIT { + return Err(TimeTravelError::TooManyVersions); + } } - let mut versions_deletes = list - .versions() - .iter() - .map(VerOrDelete::Version) - .chain(list.delete_markers().iter().map(VerOrDelete::DeleteMarker)) - .collect::>(); + tracing::info!( + "Built list for time travel with {} versions and deletions", + versions_and_deletes.len() + ); - versions_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified())); + // Work on the list of references instead of the objects directly, + // otherwise we get lifetime errors in the sort_by_key call below. + let mut versions_and_deletes = versions_and_deletes.iter().collect::>(); + + versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified)); let mut vds_for_key = HashMap::<_, Vec<_>>::new(); - for vd in versions_deletes { - let last_modified = vd.last_modified(); - let version_id = vd.version_id(); - let key = vd.key(); - let (Some(last_modified), Some(version_id), Some(key)) = - (last_modified, version_id, key) - else { - anyhow::bail!( - "One (or more) of last_modified, key, and id is None. \ - Is versioning enabled in the bucket? last_modified={:?} key={:?} version_id={:?}", - last_modified, key, version_id, - ); - }; + for vd in &versions_and_deletes { + let VerOrDelete { + version_id, key, .. + } = &vd; if version_id == "null" { - anyhow::bail!("Received ListVersions response for key={key} with version_id='null', \ - indicating either disabled versioning, or legacy objects with null version id values"); + return Err(TimeTravelError::Other(anyhow!("Received ListVersions response for key={key} with version_id='null', \ + indicating either disabled versioning, or legacy objects with null version id values"))); } tracing::trace!( - "Parsing version key={key} version_id={version_id} is_delete={}", - matches!(vd, VerOrDelete::DeleteMarker(_)) + "Parsing version key={key} version_id={version_id} kind={:?}", + vd.kind ); - vds_for_key - .entry(key) - .or_default() - .push((vd, last_modified, version_id)); + vds_for_key.entry(key).or_default().push(vd); } for (key, versions) in vds_for_key { - let (last_vd, last_last_modified, _version_id) = versions.last().unwrap(); - if last_last_modified > &&done_if_after { + let last_vd = versions.last().unwrap(); + if last_vd.last_modified > done_if_after { tracing::trace!("Key {key} has version later than done_if_after, skipping"); continue; } // the version we want to restore to. let version_to_restore_to = - match versions.binary_search_by_key(×tamp, |tpl| *tpl.1) { + match versions.binary_search_by_key(×tamp, |tpl| tpl.last_modified) { Ok(v) => v, Err(e) => e, }; @@ -743,7 +991,11 @@ impl RemoteStorage for S3Bucket { do_delete = true; } else { match &versions[version_to_restore_to - 1] { - (VerOrDelete::Version(_), _last_modified, version_id) => { + VerOrDelete { + kind: VerOrDeleteKind::Version, + version_id, + .. + } => { tracing::trace!("Copying old version {version_id} for {key}..."); // Restore the state to the last version by copying let source_id = @@ -751,37 +1003,61 @@ impl RemoteStorage for S3Bucket { backoff::retry( || async { - Ok(self + let op = self .client .copy_object() .bucket(self.bucket_name.clone()) .key(key) + .set_storage_class(self.upload_storage_class.clone()) .copy_source(&source_id) - .send() - .await?) + .send(); + + tokio::select! 
{ + res = op => res.map_err(|e| TimeTravelError::Other(e.into())), + _ = cancel.cancelled() => Err(TimeTravelError::Cancelled), + } }, is_permanent, warn_threshold, max_retries, - "listing object versions for time_travel_recover", - backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")), + "copying object version for time_travel_recover", + cancel, ) - .await?; + .await + .ok_or_else(|| TimeTravelError::Cancelled) + .and_then(|x| x)?; + tracing::info!(%version_id, %key, "Copied old version in S3"); } - (VerOrDelete::DeleteMarker(_), _last_modified, _version_id) => { + VerOrDelete { + kind: VerOrDeleteKind::DeleteMarker, + .. + } => { do_delete = true; } } }; if do_delete { - if matches!(last_vd, VerOrDelete::DeleteMarker(_)) { + if matches!(last_vd.kind, VerOrDeleteKind::DeleteMarker) { // Key has since been deleted (but there was some history), no need to do anything tracing::trace!("Key {key} already deleted, skipping."); } else { tracing::trace!("Deleting {key}..."); - let oid = ObjectIdentifier::builder().key(key.to_owned()).build()?; - self.delete_oids(kind, &[oid]).await?; + let oid = ObjectIdentifier::builder() + .key(key.to_owned()) + .build() + .map_err(|e| TimeTravelError::Other(e.into()))?; + + self.delete_oids(&permit, &[oid], cancel) + .await + .map_err(|e| { + // delete_oid0 will use TimeoutOrCancel + if TimeoutOrCancel::caused_by_cancel(&e) { + TimeTravelError::Cancelled + } else { + TimeTravelError::Other(e) + } + })?; } } } @@ -789,51 +1065,59 @@ impl RemoteStorage for S3Bucket { } } -/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`]. -fn start_counting_cancelled_wait( - kind: RequestKind, -) -> ScopeGuard { - scopeguard::guard_on_success(std::time::Instant::now(), move |_| { - metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc() - }) +// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry +struct VerOrDelete { + kind: VerOrDeleteKind, + last_modified: DateTime, + version_id: String, + key: String, } -/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`]. -fn start_measuring_requests( - kind: RequestKind, -) -> ScopeGuard { - scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| { - metrics::BUCKET_METRICS.req_seconds.observe_elapsed( +#[derive(Debug)] +enum VerOrDeleteKind { + Version, + DeleteMarker, +} + +impl VerOrDelete { + fn with_kind( + kind: VerOrDeleteKind, + last_modified: Option, + version_id: Option, + key: Option, + ) -> anyhow::Result { + let lvk = (last_modified, version_id, key); + let (Some(last_modified), Some(version_id), Some(key)) = lvk else { + anyhow::bail!( + "One (or more) of last_modified, key, and id is None. \ + Is versioning enabled in the bucket? 
last_modified={:?}, version_id={:?}, key={:?}", + lvk.0, + lvk.1, + lvk.2, + ); + }; + Ok(Self { kind, - AttemptOutcome::Cancelled, - started_at, + last_modified, + version_id, + key, + }) + } + fn from_version(v: ObjectVersion) -> anyhow::Result { + Self::with_kind( + VerOrDeleteKind::Version, + v.last_modified, + v.version_id, + v.key, ) - }) -} - -enum VerOrDelete<'a> { - Version(&'a ObjectVersion), - DeleteMarker(&'a DeleteMarkerEntry), -} - -impl<'a> VerOrDelete<'a> { - fn last_modified(&self) -> Option<&'a DateTime> { - match self { - VerOrDelete::Version(v) => v.last_modified(), - VerOrDelete::DeleteMarker(v) => v.last_modified(), - } } - fn version_id(&self) -> Option<&'a str> { - match self { - VerOrDelete::Version(v) => v.version_id(), - VerOrDelete::DeleteMarker(v) => v.version_id(), - } - } - fn key(&self) -> Option<&'a str> { - match self { - VerOrDelete::Version(v) => v.key(), - VerOrDelete::DeleteMarker(v) => v.key(), - } + fn from_delete_marker(v: DeleteMarkerEntry) -> anyhow::Result { + Self::with_kind( + VerOrDeleteKind::DeleteMarker, + v.last_modified, + v.version_id, + v.key, + ) } } @@ -844,8 +1128,8 @@ mod tests { use crate::{RemotePath, S3Bucket, S3Config}; - #[test] - fn relative_path() { + #[tokio::test] + async fn relative_path() { let all_paths = ["", "some/path", "some/path/"]; let all_paths: Vec = all_paths .iter() @@ -858,23 +1142,23 @@ mod tests { Some("test/prefix/"), Some("/test/prefix/"), ]; - let expected_outputs = vec![ - vec!["", "some/path", "some/path"], - vec!["/", "/some/path", "/some/path"], + let expected_outputs = [ + vec!["", "some/path", "some/path/"], + vec!["/", "/some/path", "/some/path/"], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], ]; @@ -886,8 +1170,11 @@ mod tests { endpoint: None, concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response: Some(5), + upload_storage_class: None, }; - let storage = S3Bucket::new(&config).expect("remote storage init"); + let storage = S3Bucket::new(&config, std::time::Duration::ZERO) + .await + .expect("remote storage init"); for (test_path_idx, test_path) in all_paths.iter().enumerate() { let result = storage.relative_path_to_s3_object(test_path); let expected = expected_outputs[prefix_idx][test_path_idx]; diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index fc4c4b315b..c7eb634af3 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -3,7 +3,9 @@ //! testing purposes. 
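+//! Each distinct operation must be attempted `attempts_to_fail` times before it is let through; the counter then resets, so the next identical operation starts failing again.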
use bytes::Bytes; use futures::stream::Stream; +use futures::StreamExt; use std::collections::HashMap; +use std::num::NonZeroU32; use std::sync::Mutex; use std::time::SystemTime; use std::{collections::hash_map::Entry, sync::Arc}; @@ -11,7 +13,7 @@ use tokio_util::sync::CancellationToken; use crate::{ Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage, - StorageMetadata, + StorageMetadata, TimeTravelError, }; pub struct UnreliableWrapper { @@ -28,6 +30,7 @@ pub struct UnreliableWrapper { #[derive(Debug, Hash, Eq, PartialEq)] enum RemoteOp { ListPrefixes(Option), + HeadObject(RemotePath), Upload(RemotePath), Download(RemotePath), Delete(RemotePath), @@ -60,7 +63,7 @@ impl UnreliableWrapper { /// On the first attempts of this operation, return an error. After 'attempts_to_fail' /// attempts, let the operation go ahead, and clear the counter. /// - fn attempt(&self, op: RemoteOp) -> Result { + fn attempt(&self, op: RemoteOp) -> anyhow::Result { let mut attempts = self.attempts.lock().unwrap(); match attempts.entry(op) { @@ -78,22 +81,27 @@ impl UnreliableWrapper { } else { let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); - Err(DownloadError::Other(error)) + Err(error) } } Entry::Vacant(e) => { let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); e.insert(1); - Err(DownloadError::Other(error)) + Err(error) } } } - async fn delete_inner(&self, path: &RemotePath, attempt: bool) -> anyhow::Result<()> { + async fn delete_inner( + &self, + path: &RemotePath, + attempt: bool, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { if attempt { self.attempt(RemoteOp::Delete(path.clone()))?; } - self.inner.delete(path).await + self.inner.delete(path, cancel).await } } @@ -101,26 +109,43 @@ impl UnreliableWrapper { type VoidStorage = crate::LocalFs; impl RemoteStorage for UnreliableWrapper { - async fn list_prefixes( + fn list_streaming( &self, prefix: Option<&RemotePath>, - ) -> Result, DownloadError> { - self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?; - self.inner.list_prefixes(prefix).await + mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, + ) -> impl Stream> + Send { + async_stream::stream! 
{ + self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) + .map_err(DownloadError::Other)?; + let mut stream = self.inner + .list_streaming(prefix, mode, max_keys, cancel); + while let Some(item) = stream.next().await { + yield item; + } + } } - - async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result> { - self.attempt(RemoteOp::ListPrefixes(folder.cloned()))?; - self.inner.list_files(folder).await - } - async fn list( &self, prefix: Option<&RemotePath>, mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, ) -> Result { - self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?; - self.inner.list(prefix, mode).await + self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) + .map_err(DownloadError::Other)?; + self.inner.list(prefix, mode, max_keys, cancel).await + } + + async fn head_object( + &self, + key: &RemotePath, + cancel: &CancellationToken, + ) -> Result { + self.attempt(RemoteOp::HeadObject(key.clone())) + .map_err(DownloadError::Other)?; + self.inner.head_object(key, cancel).await } async fn upload( @@ -131,14 +156,22 @@ impl RemoteStorage for UnreliableWrapper { data_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { self.attempt(RemoteOp::Upload(to.clone()))?; - self.inner.upload(data, data_size_bytes, to, metadata).await + self.inner + .upload(data, data_size_bytes, to, metadata, cancel) + .await } - async fn download(&self, from: &RemotePath) -> Result { - self.attempt(RemoteOp::Download(from.clone()))?; - self.inner.download(from).await + async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { + self.attempt(RemoteOp::Download(from.clone())) + .map_err(DownloadError::Other)?; + self.inner.download(from, cancel).await } async fn download_byte_range( @@ -146,26 +179,32 @@ impl RemoteStorage for UnreliableWrapper { from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result { // Note: We treat any download_byte_range as an "attempt" of the same // operation. We don't pay attention to the ranges. That's good enough // for now. 
- self.attempt(RemoteOp::Download(from.clone()))?; + self.attempt(RemoteOp::Download(from.clone())) + .map_err(DownloadError::Other)?; self.inner - .download_byte_range(from, start_inclusive, end_exclusive) + .download_byte_range(from, start_inclusive, end_exclusive, cancel) .await } - async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { - self.delete_inner(path, true).await + async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> { + self.delete_inner(path, true, cancel).await } - async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + async fn delete_objects<'a>( + &self, + paths: &'a [RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?; let mut error_counter = 0; for path in paths { // Don't record an attempt because it was already recorded above - if (self.delete_inner(path, false).await).is_err() { + if (self.delete_inner(path, false, cancel).await).is_err() { error_counter += 1; } } @@ -178,11 +217,16 @@ impl RemoteStorage for UnreliableWrapper { Ok(()) } - async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + async fn copy( + &self, + from: &RemotePath, + to: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { // copy is equivalent to download + upload self.attempt(RemoteOp::Download(from.clone()))?; self.attempt(RemoteOp::Upload(to.clone()))?; - self.inner.copy_object(from, to).await + self.inner.copy_object(from, to, cancel).await } async fn time_travel_recover( @@ -190,9 +234,10 @@ impl RemoteStorage for UnreliableWrapper { prefix: Option<&RemotePath>, timestamp: SystemTime, done_if_after: SystemTime, - cancel: CancellationToken, - ) -> anyhow::Result<()> { - self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))?; + cancel: &CancellationToken, + ) -> Result<(), TimeTravelError> { + self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned()))) + .map_err(TimeTravelError::Other)?; self.inner .time_travel_recover(prefix, timestamp, done_if_after, cancel) .await diff --git a/libs/remote_storage/src/support.rs b/libs/remote_storage/src/support.rs new file mode 100644 index 0000000000..1ed9ed9305 --- /dev/null +++ b/libs/remote_storage/src/support.rs @@ -0,0 +1,215 @@ +use std::{ + future::Future, + pin::Pin, + task::{Context, Poll}, + time::Duration, +}; + +use bytes::Bytes; +use futures_util::Stream; +use tokio_util::sync::CancellationToken; + +use crate::TimeoutOrCancel; + +pin_project_lite::pin_project! { + /// A [`Stream`] adapter which carries a permit for the lifetime of the value. + pub(crate) struct PermitCarrying<S> { + permit: tokio::sync::OwnedSemaphorePermit, + #[pin] + inner: S, + } +} + +impl<S> PermitCarrying<S> { + pub(crate) fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self { + Self { permit, inner } + } +} + +impl<S: Stream> Stream for PermitCarrying<S> { + type Item = <S as Stream>::Item; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> { + self.project().inner.poll_next(cx) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.inner.size_hint() + } +} + +pin_project_lite::pin_project! { + pub(crate) struct DownloadStream<F, S> { + hit: bool, + #[pin] + cancellation: F, + #[pin] + inner: S, + } +} + +impl<F, S> DownloadStream<F, S> { + pub(crate) fn new(cancellation: F, inner: S) -> Self { + Self { + cancellation, + hit: false, + inner, + } + } +} + +/// See documentation on [`crate::DownloadStream`] for the rationale why `std::io::Error` is used.
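+// Note: `poll_next` polls the cancellation future before the inner stream, so a stalled
+// download surfaces `TimeoutOrCancel` as soon as the deadline or the token fires,
+// without waiting for the next chunk to arrive.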
+impl<E, F, S> Stream for DownloadStream<F, S> +where + std::io::Error: From<E>, + F: Future<Output = E>, + S: Stream<Item = std::io::Result<Bytes>>, +{ + type Item = <S as Stream>::Item; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> { + let this = self.project(); + + if !*this.hit { + if let Poll::Ready(e) = this.cancellation.poll(cx) { + *this.hit = true; + + // most likely this will be a std::io::Error wrapping a DownloadError + let e = Err(std::io::Error::from(e)); + return Poll::Ready(Some(e)); + } + } else { + // this would be perfectly valid behaviour when doing a graceful completion of the + // download, for example, but not one we expect to do right now. + tracing::warn!("continuing to poll after cancellation or timeout"); + } + + this.inner.poll_next(cx) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.inner.size_hint() + } +} + +/// Fires only on the first cancel or timeout, not on both. +pub(crate) fn cancel_or_timeout( + timeout: Duration, + cancel: CancellationToken, +) -> impl std::future::Future<Output = TimeoutOrCancel> + 'static { + // futures are lazy, they don't do anything before being polled. + // + // "precalculate" the wanted deadline before returning the future, so that we can use pause + // failpoint to trigger a timeout in test. + let deadline = tokio::time::Instant::now() + timeout; + async move { + tokio::select! { + _ = tokio::time::sleep_until(deadline) => TimeoutOrCancel::Timeout, + _ = cancel.cancelled() => { + TimeoutOrCancel::Cancel + }, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::DownloadError; + use futures::stream::StreamExt; + + #[tokio::test(start_paused = true)] + async fn cancelled_download_stream() { + let inner = futures::stream::pending(); + let timeout = Duration::from_secs(120); + let cancel = CancellationToken::new(); + + let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner); + let mut stream = std::pin::pin!(stream); + + let mut first = stream.next(); + + tokio::select! { + _ = &mut first => unreachable!("we haven't cancelled yet, nor has the timeout passed"), + _ = tokio::time::sleep(Duration::from_secs(1)) => {}, + } + + cancel.cancel(); + + let e = first.await.expect("there must be some").unwrap_err(); + assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}"); + let inner = e.get_ref().expect("inner should be set"); + assert!( + inner + .downcast_ref::<DownloadError>() + .is_some_and(|e| matches!(e, DownloadError::Cancelled)), + "{inner:?}" + ); + let e = DownloadError::from(e); + assert!(matches!(e, DownloadError::Cancelled), "{e:?}"); + + tokio::select! { + _ = stream.next() => unreachable!("no timeout ever happens as we were already cancelled"), + _ = tokio::time::sleep(Duration::from_secs(121)) => {}, + } + } + + #[tokio::test(start_paused = true)] + async fn timeouted_download_stream() { + let inner = futures::stream::pending(); + let timeout = Duration::from_secs(120); + let cancel = CancellationToken::new(); + + let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner); + let mut stream = std::pin::pin!(stream); + + // because the stream uses a 120s timeout and we are paused, we advance to 120s right away.
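+ // (tokio's paused clock auto-advances to the next pending timer whenever all tasks are idle)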
+ let first = stream.next(); + + let e = first.await.expect("there must be some").unwrap_err(); + assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}"); + let inner = e.get_ref().expect("inner should be set"); + assert!( + inner + .downcast_ref::() + .is_some_and(|e| matches!(e, DownloadError::Timeout)), + "{inner:?}" + ); + let e = DownloadError::from(e); + assert!(matches!(e, DownloadError::Timeout), "{e:?}"); + + cancel.cancel(); + + tokio::select! { + _ = stream.next() => unreachable!("no cancellation ever happens because we already timed out"), + _ = tokio::time::sleep(Duration::from_secs(121)) => {}, + } + } + + #[tokio::test] + async fn notified_but_pollable_after() { + let inner = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from_static( + b"hello world", + )))); + let timeout = Duration::from_secs(120); + let cancel = CancellationToken::new(); + + cancel.cancel(); + let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner); + let mut stream = std::pin::pin!(stream); + + let next = stream.next().await; + let ioe = next.unwrap().unwrap_err(); + assert!( + matches!( + ioe.get_ref().unwrap().downcast_ref::(), + Some(&DownloadError::Cancelled) + ), + "{ioe:?}" + ); + + let next = stream.next().await; + let bytes = next.unwrap().unwrap(); + assert_eq!(&b"hello world"[..], bytes); + } +} diff --git a/libs/remote_storage/tests/common/mod.rs b/libs/remote_storage/tests/common/mod.rs index bca117ed1a..daab05d91a 100644 --- a/libs/remote_storage/tests/common/mod.rs +++ b/libs/remote_storage/tests/common/mod.rs @@ -10,6 +10,7 @@ use futures::stream::Stream; use once_cell::sync::OnceCell; use remote_storage::{Download, GenericRemoteStorage, RemotePath}; use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; use tracing::{debug, error, info}; static LOGGING_DONE: OnceCell<()> = OnceCell::new(); @@ -58,8 +59,12 @@ pub(crate) async fn upload_simple_remote_data( ) -> ControlFlow, HashSet> { info!("Creating {upload_tasks_count} remote files"); let mut upload_tasks = JoinSet::new(); + let cancel = CancellationToken::new(); + for i in 1..upload_tasks_count + 1 { let task_client = Arc::clone(client); + let cancel = cancel.clone(); + upload_tasks.spawn(async move { let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i)); let blob_path = RemotePath::new( @@ -69,7 +74,9 @@ pub(crate) async fn upload_simple_remote_data( debug!("Creating remote item {i} at path {blob_path:?}"); let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, len, &blob_path, None).await?; + task_client + .upload(data, len, &blob_path, None, &cancel) + .await?; Ok::<_, anyhow::Error>(blob_path) }); @@ -107,13 +114,15 @@ pub(crate) async fn cleanup( "Removing {} objects from the remote storage during cleanup", objects_to_delete.len() ); + let cancel = CancellationToken::new(); let mut delete_tasks = JoinSet::new(); for object_to_delete in objects_to_delete { let task_client = Arc::clone(client); + let cancel = cancel.clone(); delete_tasks.spawn(async move { debug!("Deleting remote item at path {object_to_delete:?}"); task_client - .delete(&object_to_delete) + .delete(&object_to_delete, &cancel) .await .with_context(|| format!("{object_to_delete:?} removal")) }); @@ -141,8 +150,12 @@ pub(crate) async fn upload_remote_data( ) -> ControlFlow { info!("Creating {upload_tasks_count} remote files"); let mut upload_tasks = JoinSet::new(); - for i in 1..upload_tasks_count + 1 { + let cancel = 
CancellationToken::new(); + + for i in 1..=upload_tasks_count { let task_client = Arc::clone(client); + let cancel = cancel.clone(); + upload_tasks.spawn(async move { let prefix = format!("{base_prefix_str}/sub_prefix_{i}/"); let blob_prefix = RemotePath::new(Utf8Path::new(&prefix)) @@ -152,7 +165,9 @@ pub(crate) async fn upload_remote_data( let (data, data_len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, data_len, &blob_path, None).await?; + task_client + .upload(data, data_len, &blob_path, None, &cancel) + .await?; Ok::<_, anyhow::Error>((blob_prefix, blob_path)) }); diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index abccc24c97..86c55872c1 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -1,9 +1,12 @@ use anyhow::Context; use camino::Utf8Path; +use futures::StreamExt; +use remote_storage::ListingMode; use remote_storage::RemotePath; -use std::collections::HashSet; use std::sync::Arc; +use std::{collections::HashSet, num::NonZeroU32}; use test_context::test_context; +use tokio_util::sync::CancellationToken; use tracing::debug; use crate::common::{download_to_vec, upload_stream, wrap_stream}; @@ -27,10 +30,10 @@ use super::{ /// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only /// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}` /// -/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys. -/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3, -/// since current default AWS S3 pagination limit is 1000. -/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax) +/// In the `MaybeEnabledStorageWithTestBlobs::setup`, we set the `max_keys_in_list_response` param to limit the keys in a single response. +/// This way, we are able to test the pagination, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3, +/// as the current default AWS S3 pagination limit is 1000. +/// (see ). /// /// Lastly, the test attempts to clean up and remove all uploaded S3 files. /// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished. @@ -45,15 +48,17 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a } }; + let cancel = CancellationToken::new(); + let test_client = Arc::clone(&ctx.enabled.client); let expected_remote_prefixes = ctx.remote_prefixes.clone(); let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix)) .context("common_prefix construction")?; let root_remote_prefixes = test_client - .list_prefixes(None) - .await - .context("client list root prefixes failure")? + .list(None, ListingMode::WithDelimiter, None, &cancel) + .await? + .prefixes .into_iter() .collect::>(); assert_eq!( @@ -62,9 +67,14 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a ); let nested_remote_prefixes = test_client - .list_prefixes(Some(&base_prefix)) - .await - .context("client list nested prefixes failure")? + .list( + Some(&base_prefix.add_trailing_slash()), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await? 
+ .prefixes .into_iter() .collect::<HashSet<_>>(); let remote_only_prefixes = nested_remote_prefixes @@ -78,6 +88,41 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> anyhow::Result<()> { "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", ); + // list_streaming + + let prefix_with_slash = base_prefix.add_trailing_slash(); + let mut nested_remote_prefixes_st = test_client.list_streaming( + Some(&prefix_with_slash), + ListingMode::WithDelimiter, + None, + &cancel, + ); + let mut nested_remote_prefixes_combined = HashSet::new(); + let mut segments = 0; + let mut segment_max_size = 0; + while let Some(st) = nested_remote_prefixes_st.next().await { + let st = st?; + segment_max_size = segment_max_size.max(st.prefixes.len()); + nested_remote_prefixes_combined.extend(st.prefixes.into_iter()); + segments += 1; + } + assert!(segments > 1, "less than 2 segments: {segments}"); + assert!( + segment_max_size * 2 <= nested_remote_prefixes_combined.len(), + "double of segment_max_size={segment_max_size} is larger than the total number of remote prefixes, {}", + nested_remote_prefixes_combined.len() + ); + let remote_only_prefixes = nested_remote_prefixes_combined + .difference(&expected_remote_prefixes) + .collect::<HashSet<_>>(); + let missing_uploaded_prefixes = expected_remote_prefixes + .difference(&nested_remote_prefixes_combined) + .collect::<HashSet<_>>(); + assert_eq!( + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", + ); + Ok(()) } @@ -87,11 +132,13 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> anyhow::Result<()> { /// /// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`] /// Then it performs the following queries: -/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` -/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` +/// 1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` +/// 2. `list("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` #[test_context(MaybeEnabledStorageWithSimpleTestBlobs)] #[tokio::test] -async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> { +async fn list_no_delimiter_works( + ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs, +) -> anyhow::Result<()> { let ctx = match ctx { MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx, MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()), @@ -99,25 +146,44 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> { anyhow::bail!("S3 init failed: {e:?}") } }; + let cancel = CancellationToken::new(); let test_client = Arc::clone(&ctx.enabled.client); let base_prefix = RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?; let root_files = test_client - .list_files(None) + .list(None, ListingMode::NoDelimiter, None, &cancel) .await .context("client list root files failure")? + .keys .into_iter() + .map(|o| o.key) .collect::<HashSet<_>>(); assert_eq!( root_files, ctx.remote_blobs.clone(), - "remote storage list_files on root mismatches with the uploads." + "remote storage list on root mismatches with the uploads."
); + + // Test that max_keys limit works. In total there are about 21 files (see + // upload_simple_remote_data call in test_real_s3.rs). + let limited_root_files = test_client + .list( + None, + ListingMode::NoDelimiter, + Some(NonZeroU32::new(2).unwrap()), + &cancel, + ) + .await + .context("client list root files failure")?; + assert_eq!(limited_root_files.keys.len(), 2); + let nested_remote_files = test_client - .list_files(Some(&base_prefix)) + .list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel) .await .context("client list nested files failure")? + .keys .into_iter() + .map(|o| o.key) .collect::<HashSet<_>>(); let trim_remote_blobs: HashSet<_> = ctx .remote_blobs @@ -128,7 +194,7 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> { .collect(); assert_eq!( nested_remote_files, trim_remote_blobs, - "remote storage list_files on subdirrectory mismatches with the uploads." + "remote storage list on subdirectory mismatches with the uploads." ); Ok(()) } @@ -141,12 +207,17 @@ async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { MaybeEnabledStorage::Disabled => return Ok(()), }; + let cancel = CancellationToken::new(); + let path = RemotePath::new(Utf8Path::new( format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(), )) .with_context(|| "RemotePath conversion")?; - ctx.client.delete(&path).await.expect("should succeed"); + ctx.client + .delete(&path, &cancel) + .await + .expect("should succeed"); Ok(()) } @@ -159,6 +230,8 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { MaybeEnabledStorage::Disabled => return Ok(()), }; + let cancel = CancellationToken::new(); + let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; .with_context(|| "RemotePath conversion")?; let (data, len) = upload_stream("remote blob data1".as_bytes().into()); - ctx.client.upload(data, len, &path1, None).await?; + ctx.client.upload(data, len, &path1, None, &cancel).await?; let (data, len) = upload_stream("remote blob data2".as_bytes().into()); - ctx.client.upload(data, len, &path2, None).await?; + ctx.client.upload(data, len, &path2, None, &cancel).await?; let (data, len) = upload_stream("remote blob data3".as_bytes().into()); - ctx.client.upload(data, len, &path3, None).await?; + ctx.client.upload(data, len, &path3, None, &cancel).await?; - ctx.client.delete_objects(&[path1, path2]).await?; + ctx.client.delete_objects(&[path1, path2], &cancel).await?; - let prefixes = ctx.client.list_prefixes(None).await?; + let prefixes = ctx + .client + .list(None, ListingMode::WithDelimiter, None, &cancel) + .await?
+ .prefixes; assert_eq!(prefixes.len(), 1); - ctx.client.delete_objects(&[path3]).await?; + ctx.client.delete_objects(&[path3], &cancel).await?; Ok(()) } @@ -195,6 +272,8 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result< return Ok(()); }; + let cancel = CancellationToken::new(); + let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; @@ -202,47 +281,56 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result< let (data, len) = wrap_stream(orig.clone()); - ctx.client.upload(data, len, &path, None).await?; + ctx.client.upload(data, len, &path, None, &cancel).await?; // Normal download request - let dl = ctx.client.download(&path).await?; + let dl = ctx.client.download(&path, &cancel).await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); // Full range (end specified) let dl = ctx .client - .download_byte_range(&path, 0, Some(len as u64)) + .download_byte_range(&path, 0, Some(len as u64), &cancel) .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); // partial range (end specified) - let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?; + let dl = ctx + .client + .download_byte_range(&path, 4, Some(10), &cancel) + .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[4..10]); // partial range (end beyond real end) let dl = ctx .client - .download_byte_range(&path, 8, Some(len as u64 * 100)) + .download_byte_range(&path, 8, Some(len as u64 * 100), &cancel) .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[8..]); // Partial range (end unspecified) - let dl = ctx.client.download_byte_range(&path, 4, None).await?; + let dl = ctx + .client + .download_byte_range(&path, 4, None, &cancel) + .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[4..]); // Full range (end unspecified) - let dl = ctx.client.download_byte_range(&path, 0, None).await?; + let dl = ctx + .client + .download_byte_range(&path, 0, None, &cancel) + .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); debug!("Cleanup: deleting file at path {path:?}"); ctx.client - .delete(&path) + .delete(&path, &cancel) .await .with_context(|| format!("{path:?} removal"))?; @@ -256,6 +344,8 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { return Ok(()); }; + let cancel = CancellationToken::new(); + let path = RemotePath::new(Utf8Path::new( format!("{}/file_to_copy", ctx.base_prefix).as_str(), )) @@ -269,18 +359,18 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { let (data, len) = wrap_stream(orig.clone()); - ctx.client.upload(data, len, &path, None).await?; + ctx.client.upload(data, len, &path, None, &cancel).await?; // Normal download request - ctx.client.copy_object(&path, &path_dest).await?; + ctx.client.copy_object(&path, &path_dest, &cancel).await?; - let dl = ctx.client.download(&path_dest).await?; + let dl = ctx.client.download(&path_dest, &cancel).await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); debug!("Cleanup: deleting file at path {path:?}"); ctx.client - .delete_objects(&[path.clone(), path_dest.clone()]) + .delete_objects(&[path.clone(), path_dest.clone()], &cancel) .await .with_context(|| format!("{path:?} removal"))?; diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 6f9a1ec6f7..3a20649490 100644 --- 
a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -1,9 +1,9 @@ -use std::collections::HashSet; use std::env; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; use std::time::UNIX_EPOCH; +use std::{collections::HashSet, time::Duration}; use anyhow::Context; use remote_storage::{ @@ -31,6 +31,7 @@ struct EnabledAzure { impl EnabledAzure { async fn setup(max_keys_in_list_response: Option) -> Self { let client = create_azure_client(max_keys_in_list_response) + .await .context("Azure client creation") .expect("Azure client creation failed"); @@ -39,6 +40,17 @@ impl EnabledAzure { base_prefix: BASE_PREFIX, } } + + #[allow(unused)] // this will be needed when moving the timeout integration tests back + fn configure_request_timeout(&mut self, timeout: Duration) { + match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") { + GenericRemoteStorage::AzureBlob(azure) => { + let azure = Arc::get_mut(azure).expect("inner Arc::get_mut"); + azure.timeout = timeout; + } + _ => unreachable!(), + } + } } enum MaybeEnabledStorage { @@ -46,7 +58,6 @@ enum MaybeEnabledStorage { Disabled, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorage { async fn setup() -> Self { ensure_logging_ready(); @@ -75,7 +86,6 @@ struct AzureWithTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { async fn setup() -> Self { ensure_logging_ready(); @@ -123,10 +133,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { } } -// NOTE: the setups for the list_prefixes test and the list_files test are very similar -// However, they are not idential. The list_prefixes function is concerned with listing prefixes, -// whereas the list_files function is concerned with listing files. 
-// See `RemoteStorage::list_files` documentation for more details enum MaybeEnabledStorageWithSimpleTestBlobs { Enabled(AzureWithSimpleTestBlobs), Disabled, @@ -137,7 +143,6 @@ struct AzureWithSimpleTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { async fn setup() -> Self { ensure_logging_ready(); @@ -183,7 +188,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { } } -fn create_azure_client( +async fn create_azure_client( max_keys_per_list_response: Option, ) -> anyhow::Result> { use rand::Rng; @@ -208,13 +213,17 @@ fn create_azure_client( let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::AzureContainer(AzureConfig { container_name: remote_storage_azure_container, + storage_account: None, container_region: remote_storage_azure_region, prefix_in_container: Some(format!("test_{millis}_{random:08x}/")), concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, }), + timeout: Duration::from_secs(120), }; Ok(Arc::new( - GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, + GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?, )) } diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 9e1b989e4d..b893beeebd 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -1,4 +1,6 @@ use std::env; +use std::fmt::{Debug, Display}; +use std::future::Future; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; @@ -8,11 +10,14 @@ use std::{collections::HashSet, time::SystemTime}; use crate::common::{download_to_vec, upload_stream}; use anyhow::Context; use camino::Utf8Path; +use futures_util::StreamExt; use remote_storage::{ - GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, + DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig, + RemoteStorageKind, S3Config, }; use test_context::test_context; use test_context::AsyncTestContext; +use tokio::io::AsyncBufReadExt; use tokio_util::sync::CancellationToken; use tracing::info; @@ -22,9 +27,9 @@ mod common; mod tests_s3; use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data}; +use utils::backoff; const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; - const BASE_PREFIX: &str = "test"; #[test_context(MaybeEnabledStorage)] @@ -39,6 +44,26 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: // to take the time from S3 response headers. const WAIT_TIME: Duration = Duration::from_millis(3_000); + async fn retry(op: O) -> Result + where + E: Display + Debug + 'static, + O: FnMut() -> F, + F: Future>, + { + let warn_threshold = 3; + let max_retries = 10; + backoff::retry( + op, + |_e| false, + warn_threshold, + max_retries, + "test retry", + &CancellationToken::new(), + ) + .await + .expect("never cancelled") + } + async fn time_point() -> SystemTime { tokio::time::sleep(WAIT_TIME).await; let ret = SystemTime::now(); @@ -46,15 +71,23 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: ret } - async fn list_files(client: &Arc) -> anyhow::Result> { - Ok(client - .list_files(None) - .await - .context("list root files failure")? 
- .into_iter() - .collect::>()) + async fn list_files( + client: &Arc, + cancel: &CancellationToken, + ) -> anyhow::Result> { + Ok( + retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel)) + .await + .context("list root files failure")? + .keys + .into_iter() + .map(|o| o.key) + .collect::>(), + ) } + let cancel = CancellationToken::new(); + let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; @@ -64,25 +97,32 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; - let (data, len) = upload_stream("remote blob data1".as_bytes().into()); - ctx.client.upload(data, len, &path1, None).await?; + retry(|| { + let (data, len) = upload_stream("remote blob data1".as_bytes().into()); + ctx.client.upload(data, len, &path1, None, &cancel) + }) + .await?; - let t0_files = list_files(&ctx.client).await?; + let t0_files = list_files(&ctx.client, &cancel).await?; let t0 = time_point().await; println!("at t0: {t0_files:?}"); let old_data = "remote blob data2"; - let (data, len) = upload_stream(old_data.as_bytes().into()); - ctx.client.upload(data, len, &path2, None).await?; - let t1_files = list_files(&ctx.client).await?; + retry(|| { + let (data, len) = upload_stream(old_data.as_bytes().into()); + ctx.client.upload(data, len, &path2, None, &cancel) + }) + .await?; + + let t1_files = list_files(&ctx.client, &cancel).await?; let t1 = time_point().await; println!("at t1: {t1_files:?}"); // A little check to ensure that our clock is not too far off from the S3 clock { - let dl = ctx.client.download(&path2).await?; - let last_modified = dl.last_modified.unwrap(); + let dl = retry(|| ctx.client.download(&path2, &cancel)).await?; + let last_modified = dl.last_modified; let half_wt = WAIT_TIME.mul_f32(0.5); let t0_hwt = t0 + half_wt; let t1_hwt = t1 - half_wt; @@ -92,52 +132,60 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: } } - let (data, len) = upload_stream("remote blob data3".as_bytes().into()); - ctx.client.upload(data, len, &path3, None).await?; + retry(|| { + let (data, len) = upload_stream("remote blob data3".as_bytes().into()); + ctx.client.upload(data, len, &path3, None, &cancel) + }) + .await?; let new_data = "new remote blob data2"; - let (data, len) = upload_stream(new_data.as_bytes().into()); - ctx.client.upload(data, len, &path2, None).await?; - ctx.client.delete(&path1).await?; + retry(|| { + let (data, len) = upload_stream(new_data.as_bytes().into()); + ctx.client.upload(data, len, &path2, None, &cancel) + }) + .await?; - let t2_files = list_files(&ctx.client).await?; + retry(|| ctx.client.delete(&path1, &cancel)).await?; + let t2_files = list_files(&ctx.client, &cancel).await?; let t2 = time_point().await; println!("at t2: {t2_files:?}"); // No changes after recovery to t2 (no-op) let t_final = time_point().await; ctx.client - .time_travel_recover(None, t2, t_final, CancellationToken::new()) + .time_travel_recover(None, t2, t_final, &cancel) .await?; - let t2_files_recovered = list_files(&ctx.client).await?; + let t2_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t2: {t2_files_recovered:?}"); assert_eq!(t2_files, t2_files_recovered); - let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2).await?).await?; + let path2_recovered_t2 = 
download_to_vec(ctx.client.download(&path2, &cancel).await?).await?; assert_eq!(path2_recovered_t2, new_data.as_bytes()); // after recovery to t1: path1 is back, path2 has the old content let t_final = time_point().await; ctx.client - .time_travel_recover(None, t1, t_final, CancellationToken::new()) + .time_travel_recover(None, t1, t_final, &cancel) .await?; - let t1_files_recovered = list_files(&ctx.client).await?; + let t1_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t1: {t1_files_recovered:?}"); assert_eq!(t1_files, t1_files_recovered); - let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?; + let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?; assert_eq!(path2_recovered_t1, old_data.as_bytes()); // after recovery to t0: everything is gone except for path1 let t_final = time_point().await; ctx.client - .time_travel_recover(None, t0, t_final, CancellationToken::new()) + .time_travel_recover(None, t0, t_final, &cancel) .await?; - let t0_files_recovered = list_files(&ctx.client).await?; + let t0_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t0: {t0_files_recovered:?}"); assert_eq!(t0_files, t0_files_recovered); // cleanup - ctx.client.delete_objects(&[path1, path2, path3]).await?; + + let paths = &[path1, path2, path3]; + retry(|| ctx.client.delete_objects(paths, &cancel)).await?; Ok(()) } @@ -150,6 +198,7 @@ struct EnabledS3 { impl EnabledS3 { async fn setup(max_keys_in_list_response: Option) -> Self { let client = create_s3_client(max_keys_in_list_response) + .await .context("S3 client creation") .expect("S3 client creation failed"); @@ -158,6 +207,16 @@ impl EnabledS3 { base_prefix: BASE_PREFIX, } } + + fn configure_request_timeout(&mut self, timeout: Duration) { + match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") { + GenericRemoteStorage::AwsS3(s3) => { + let s3 = Arc::get_mut(s3).expect("inner Arc::get_mut"); + s3.timeout = timeout; + } + _ => unreachable!(), + } + } } enum MaybeEnabledStorage { @@ -165,7 +224,6 @@ enum MaybeEnabledStorage { Disabled, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorage { async fn setup() -> Self { ensure_logging_ready(); @@ -194,7 +252,6 @@ struct S3WithTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { async fn setup() -> Self { ensure_logging_ready(); @@ -242,10 +299,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { } } -// NOTE: the setups for the list_prefixes test and the list_files test are very similar -// However, they are not idential. The list_prefixes function is concerned with listing prefixes, -// whereas the list_files function is concerned with listing files. 
-// See `RemoteStorage::list_files` documentation for more details enum MaybeEnabledStorageWithSimpleTestBlobs { Enabled(S3WithSimpleTestBlobs), Disabled, @@ -256,7 +309,6 @@ struct S3WithSimpleTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { async fn setup() -> Self { ensure_logging_ready(); @@ -302,7 +354,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { } } -fn create_s3_client( +async fn create_s3_client( max_keys_per_list_response: Option, ) -> anyhow::Result> { use rand::Rng; @@ -330,9 +382,178 @@ fn create_s3_client( endpoint: None, concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, + upload_storage_class: None, }), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; Ok(Arc::new( - GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, + GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?, )) } + +#[test_context(MaybeEnabledStorage)] +#[tokio::test] +async fn download_is_timeouted(ctx: &mut MaybeEnabledStorage) { + let MaybeEnabledStorage::Enabled(ctx) = ctx else { + return; + }; + + let cancel = CancellationToken::new(); + + let path = RemotePath::new(Utf8Path::new( + format!("{}/file_to_copy", ctx.base_prefix).as_str(), + )) + .unwrap(); + + let len = upload_large_enough_file(&ctx.client, &path, &cancel).await; + + let timeout = std::time::Duration::from_secs(5); + + ctx.configure_request_timeout(timeout); + + let started_at = std::time::Instant::now(); + let mut stream = ctx + .client + .download(&path, &cancel) + .await + .expect("download succeeds") + .download_stream; + + if started_at.elapsed().mul_f32(0.9) >= timeout { + tracing::warn!( + elapsed_ms = started_at.elapsed().as_millis(), + "timeout might be too low, consumed most of it during headers" + ); + } + + let first = stream + .next() + .await + .expect("should have the first blob") + .expect("should have succeeded"); + + tracing::info!(len = first.len(), "downloaded first chunk"); + + assert!( + first.len() < len, + "uploaded file is too small, we downloaded all on first chunk" + ); + + tokio::time::sleep(timeout).await; + + { + let started_at = std::time::Instant::now(); + let next = stream + .next() + .await + .expect("stream should not have ended yet"); + + tracing::info!( + next.is_err = next.is_err(), + elapsed_ms = started_at.elapsed().as_millis(), + "received item after timeout" + ); + + let e = next.expect_err("expected an error, but got a chunk?"); + + let inner = e.get_ref().expect("std::io::Error::inner should be set"); + assert!( + inner + .downcast_ref::() + .is_some_and(|e| matches!(e, DownloadError::Timeout)), + "{inner:?}" + ); + } + + ctx.configure_request_timeout(RemoteStorageConfig::DEFAULT_TIMEOUT); + + ctx.client.delete_objects(&[path], &cancel).await.unwrap() +} + +#[test_context(MaybeEnabledStorage)] +#[tokio::test] +async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) { + let MaybeEnabledStorage::Enabled(ctx) = ctx else { + return; + }; + + let cancel = CancellationToken::new(); + + let path = RemotePath::new(Utf8Path::new( + format!("{}/file_to_copy", ctx.base_prefix).as_str(), + )) + .unwrap(); + + let file_len = upload_large_enough_file(&ctx.client, &path, &cancel).await; + + { + let stream = ctx + .client + .download(&path, &cancel) + .await + .expect("download succeeds") + .download_stream; + + let mut reader = 
std::pin::pin!(tokio_util::io::StreamReader::new(stream)); + + let first = reader.fill_buf().await.expect("should have the first blob"); + + let len = first.len(); + tracing::info!(len, "downloaded first chunk"); + + assert!( + first.len() < file_len, + "uploaded file is too small, we downloaded all on first chunk" + ); + + reader.consume(len); + + cancel.cancel(); + + let next = reader.fill_buf().await; + + let e = next.expect_err("expected an error, but got a chunk?"); + + let inner = e.get_ref().expect("std::io::Error::inner should be set"); + assert!( + inner + .downcast_ref::() + .is_some_and(|e| matches!(e, DownloadError::Cancelled)), + "{inner:?}" + ); + + let e = DownloadError::from(e); + + assert!(matches!(e, DownloadError::Cancelled), "{e:?}"); + } + + let cancel = CancellationToken::new(); + + ctx.client.delete_objects(&[path], &cancel).await.unwrap(); +} + +/// Upload a long enough file so that we cannot download it in single chunk +/// +/// For s3 the first chunk seems to be less than 10kB, so this has a bit of a safety margin +async fn upload_large_enough_file( + client: &GenericRemoteStorage, + path: &RemotePath, + cancel: &CancellationToken, +) -> usize { + let header = bytes::Bytes::from_static("remote blob data content".as_bytes()); + let body = bytes::Bytes::from(vec![0u8; 1024]); + let contents = std::iter::once(header).chain(std::iter::repeat(body).take(128)); + + let len = contents.clone().fold(0, |acc, next| acc + next.len()); + + let contents = futures::stream::iter(contents.map(std::io::Result::Ok)); + + client + .upload(contents, len, path, None, cancel) + .await + .expect("upload succeeds"); + + len +} diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 327d98ee77..e1f4bcca46 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -9,5 +9,3 @@ serde.workspace = true serde_with.workspace = true const_format.workspace = true utils.workspace = true - -workspace_hack.workspace = true diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index ce5a1e411e..2fbc333075 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -50,6 +50,9 @@ pub struct SkTimelineInfo { pub safekeeper_connstr: Option, #[serde(default)] pub http_connstr: Option, + // Minimum of all active RO replicas flush LSN + #[serde(default = "lsn_invalid")] + pub standby_horizon: Lsn, } #[derive(Debug, Clone, Deserialize, Serialize)] diff --git a/libs/tenant_size_model/Cargo.toml b/libs/tenant_size_model/Cargo.toml index 15e78932a8..8aa3c54f62 100644 --- a/libs/tenant_size_model/Cargo.toml +++ b/libs/tenant_size_model/Cargo.toml @@ -9,5 +9,3 @@ license.workspace = true anyhow.workspace = true serde.workspace = true serde_json.workspace = true - -workspace_hack.workspace = true diff --git a/libs/tenant_size_model/src/calculation.rs b/libs/tenant_size_model/src/calculation.rs index f05997ee65..be00562219 100644 --- a/libs/tenant_size_model/src/calculation.rs +++ b/libs/tenant_size_model/src/calculation.rs @@ -34,10 +34,10 @@ struct SegmentSize { } struct SizeAlternatives { - // cheapest alternative if parent is available. + /// cheapest alternative if parent is available. 
incremental: SegmentSize, - // cheapest alternative if parent node is not available + /// cheapest alternative if parent node is not available non_incremental: Option, } diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs index a3e12cf0e3..974a498404 100644 --- a/libs/tenant_size_model/src/lib.rs +++ b/libs/tenant_size_model/src/lib.rs @@ -5,9 +5,10 @@ mod calculation; pub mod svg; -/// StorageModel is the input to the synthetic size calculation. It represents -/// a tree of timelines, with just the information that's needed for the -/// calculation. This doesn't track timeline names or where each timeline +/// StorageModel is the input to the synthetic size calculation. +/// +/// It represents a tree of timelines, with just the information that's needed +/// for the calculation. This doesn't track timeline names or where each timeline /// begins and ends, for example. Instead, it consists of "points of interest" /// on the timelines. A point of interest could be the timeline start or end point, /// the oldest point on a timeline that needs to be retained because of PITR diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs index f26d3aa79d..0de2890bb4 100644 --- a/libs/tenant_size_model/src/svg.rs +++ b/libs/tenant_size_model/src/svg.rs @@ -3,10 +3,17 @@ use std::fmt::Write; const SVG_WIDTH: f32 = 500.0; +/// Different branch kind for SVG drawing. +#[derive(PartialEq)] +pub enum SvgBranchKind { + Timeline, + Lease, +} + struct SvgDraw<'a> { storage: &'a StorageModel, branches: &'a [String], - seg_to_branch: &'a [usize], + seg_to_branch: &'a [(usize, SvgBranchKind)], sizes: &'a [SegmentSizeResult], // layout @@ -42,13 +49,18 @@ fn draw_legend(result: &mut String) -> anyhow::Result<()> { "" )?; writeln!(result, "WAL not retained")?; + writeln!( + result, + "" + )?; + writeln!(result, "LSN lease")?; Ok(()) } pub fn draw_svg( storage: &StorageModel, branches: &[String], - seg_to_branch: &[usize], + seg_to_branch: &[(usize, SvgBranchKind)], sizes: &SizeResult, ) -> anyhow::Result { let mut draw = SvgDraw { @@ -100,7 +112,7 @@ impl<'a> SvgDraw<'a> { // Layout the timelines on Y dimension. 
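+        // (The starting offset below grows from 100 to 120, presumably to
+        // leave room for the extra "LSN lease" legend row added in
+        // draw_legend above.)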
// TODO - let mut y = 100.0; + let mut y = 120.0; let mut branch_y_coordinates = Vec::new(); for _branch in self.branches { branch_y_coordinates.push(y); @@ -109,7 +121,7 @@ impl<'a> SvgDraw<'a> { // Calculate coordinates for each point let seg_coordinates = std::iter::zip(segments, self.seg_to_branch) - .map(|(seg, branch_id)| { + .map(|(seg, (branch_id, _))| { let x = (seg.lsn - min_lsn) as f32 / xscale; let y = branch_y_coordinates[*branch_id]; (x, y) @@ -175,6 +187,22 @@ impl<'a> SvgDraw<'a> { // draw a snapshot point if it's needed let (coord_x, coord_y) = self.seg_coordinates[seg_id]; + + let (_, kind) = &self.seg_to_branch[seg_id]; + if kind == &SvgBranchKind::Lease { + let (x1, y1) = (coord_x, coord_y - 10.0); + let (x2, y2) = (coord_x, coord_y + 10.0); + + let style = "stroke-width=\"3\" stroke=\"blue\""; + + writeln!( + result, + "", + )?; + writeln!(result, " leased lsn at {}", seg.lsn)?; + writeln!(result, "")?; + } + if self.sizes[seg_id].method == SegmentMethod::SnapshotHere { writeln!( result, diff --git a/libs/tenant_size_model/tests/tests.rs b/libs/tenant_size_model/tests/tests.rs index 7660d41c56..0ffea0f2cd 100644 --- a/libs/tenant_size_model/tests/tests.rs +++ b/libs/tenant_size_model/tests/tests.rs @@ -247,7 +247,7 @@ fn scenario_4() { // // This is in total 5000 + 1000 + 5000 + 1000 = 12000 // - // (If we used the the method from the previous scenario, and + // (If we used the method from the previous scenario, and // kept only snapshot at the branch point, we'd need to keep // all the WAL between 10000-18000 on the main branch, so // the total size would be 5000 + 1000 + 8000 = 14000. The diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml index b285c9b5b0..5ea8db6b42 100644 --- a/libs/tracing-utils/Cargo.toml +++ b/libs/tracing-utils/Cargo.toml @@ -7,12 +7,10 @@ license.workspace = true [dependencies] hyper.workspace = true opentelemetry = { workspace = true, features=["rt-tokio"] } -opentelemetry-otlp = { workspace = true, default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions.workspace = true reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tracing.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true - -workspace_hack.workspace = true diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 706b7a3187..19deaab63f 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -13,14 +13,14 @@ testing = ["fail/failpoints"] [dependencies] arc-swap.workspace = true sentry.workspace = true -async-trait.workspace = true +async-compression.workspace = true anyhow.workspace = true bincode.workspace = true bytes.workspace = true camino.workspace = true chrono.workspace = true -heapless.workspace = true hex = { workspace = true, features = ["serde"] } +humantime.workspace = true hyper = { workspace = true, features = ["full"] } fail.workspace = true futures = { workspace = true} @@ -35,7 +35,9 @@ serde_json.workspace = true signal-hook.workspace = true thiserror.workspace = true tokio.workspace = true +tokio-tar.workspace = true tokio-util.workspace = true +toml_edit = { workspace = true, features = ["serde"] } tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, 
features = ["json", "registry"] } @@ -45,11 +47,11 @@ strum.workspace = true strum_macros.workspace = true url.workspace = true uuid.workspace = true +walkdir.workspace = true pq_proto.workspace = true postgres_connection.workspace = true metrics.workspace = true -workspace_hack.workspace = true const_format.workspace = true @@ -66,6 +68,7 @@ criterion.workspace = true hex-literal.workspace = true camino-tempfile.workspace = true serde_assert.workspace = true +tokio = { workspace = true, features = ["test-util"] } [[bench]] name = "benchmarks" diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index 98d839ca55..44eb36387c 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -1,5 +1,3 @@ -#![allow(unused)] - use criterion::{criterion_group, criterion_main, Criterion}; use utils::id; diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 66b1f6e866..7b735875b7 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -1,7 +1,6 @@ // For details about authentication see docs/authentication.md use arc_swap::ArcSwap; -use serde; use std::{borrow::Cow, fmt::Display, fs, sync::Arc}; use anyhow::Result; @@ -19,16 +18,25 @@ const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA; #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)] #[serde(rename_all = "lowercase")] pub enum Scope { - // Provides access to all data for a specific tenant (specified in `struct Claims` below) + /// Provides access to all data for a specific tenant (specified in `struct Claims` below) // TODO: join these two? Tenant, - // Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs. - // Should only be used e.g. for status check/tenant creation/list. + /// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs. + /// Should only be used e.g. for status check/tenant creation/list. PageServerApi, - // Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs. - // Should only be used e.g. for status check. - // Currently also used for connection from any pageserver to any safekeeper. + /// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs. + /// Should only be used e.g. for status check. + /// Currently also used for connection from any pageserver to any safekeeper. SafekeeperData, + /// The scope used by pageservers in upcalls to storage controller and cloud control plane + #[serde(rename = "generations_api")] + GenerationsApi, + /// Allows access to control plane managment API and some storage controller endpoints. + Admin, + + /// Allows access to storage controller APIs used by the scrubber, to interrogate the state + /// of a tenant & post scrub results. + Scrubber, } /// JWT payload. See docs/authentication.md for the format @@ -127,6 +135,10 @@ impl JwtAuth { Ok(Self::new(decoding_keys)) } + pub fn from_key(key: String) -> Result { + Ok(Self::new(vec![DecodingKey::from_ed_pem(key.as_bytes())?])) + } + /// Attempt to decode the token with the internal decoding keys. 
    ///
    /// The function tries the stored decoding keys in succession,
@@ -197,12 +209,11 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
     // "scope": "tenant",
     // "tenant_id": "3d1f7595b468230304e0b73cecbcb081",
     // "iss": "neon.controlplane",
-    // "exp": 1709200879,
     // "iat": 1678442479
     // }
     // ```
     //
-    let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
+    let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJpYXQiOjE2Nzg0NDI0Nzl9.rNheBnluMJNgXzSTTJoTNIGy4P_qe0JUHl_nVEGuDCTgHOThPVr552EnmKccrCKquPeW3c2YUk0Y9Oh4KyASAw";
 
     // Check it can be validated with the public key
     let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
diff --git a/libs/utils/src/backoff.rs b/libs/utils/src/backoff.rs
index d50ad39585..096c7e5854 100644
--- a/libs/utils/src/backoff.rs
+++ b/libs/utils/src/backoff.rs
@@ -37,69 +37,53 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
     }
 }
 
-/// Configure cancellation for a retried operation: when to cancel (the token), and
-/// what kind of error to return on cancellation
-pub struct Cancel<E, CF>
-where
-    E: Display + Debug + 'static,
-    CF: Fn() -> E,
-{
-    token: CancellationToken,
-    on_cancel: CF,
-}
-
-impl<E, CF> Cancel<E, CF>
-where
-    E: Display + Debug + 'static,
-    CF: Fn() -> E,
-{
-    pub fn new(token: CancellationToken, on_cancel: CF) -> Self {
-        Self { token, on_cancel }
-    }
-}
-
-/// retries passed operation until one of the following conditions are met:
-/// Encountered error is considered as permanent (non-retryable)
-/// Retries have been exhausted.
-/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors
-/// When attempts cross `warn_threshold` function starts to emit log warnings.
+/// Retries the passed operation until one of the following conditions is met:
+/// - the encountered error is considered permanent (non-retryable)
+/// - retries have been exhausted
+/// - the cancellation token has been cancelled
+///
+/// The `is_permanent` closure should be used to distinguish permanent from non-permanent
+/// errors. When attempts cross `warn_threshold`, the function starts to emit log warnings.
 /// `description` argument is added to log messages. Its value should identify what the `op` is doing
-/// `cancel` argument is required: any time we are looping on retry, we should be using a CancellationToken
-/// to drop out promptly on shutdown.
-pub async fn retry<T, O, F, E, CF>(
+/// `cancel` cancels new attempts and the backoff sleep.
+///
+/// If attempts fail, they are logged with `{:#}`, which works for anyhow but not
+/// for other error types. The final failed attempt is logged with `{:?}`.
+///
+/// Returns `None` if cancellation was noticed during backoff, otherwise `Some` with the terminal result.
+pub async fn retry<T, O, F, E>(
     mut op: O,
     is_permanent: impl Fn(&E) -> bool,
     warn_threshold: u32,
     max_retries: u32,
     description: &str,
-    cancel: Cancel<E, CF>,
-) -> Result<T, E>
+    cancel: &CancellationToken,
+) -> Option<Result<T, E>>
 where
     // Not std::error::Error because anyhow::Error doesn't implement it.
    // For context see https://github.com/dtolnay/anyhow/issues/63
    E: Display + Debug + 'static,
    O: FnMut() -> F,
    F: Future<Output = Result<T, E>>,
-    CF: Fn() -> E,
{
    let mut attempts = 0;
    loop {
-        if cancel.token.is_cancelled() {
-            return Err((cancel.on_cancel)());
+        if cancel.is_cancelled() {
+            return None;
        }
 
        let result = op().await;
-        match result {
+        match &result {
            Ok(_) => {
                if attempts > 0 {
                    tracing::info!("{description} succeeded after {attempts} retries");
                }
-                return result;
+                return Some(result);
            }
            // These are "permanent" errors that should not be retried.
-            Err(ref e) if is_permanent(e) => {
-                return result;
+            Err(e) if is_permanent(e) => {
+                return Some(result);
            }
            // Assume that any other failure might be transient, and the operation might
            // succeed if we just keep trying.
@@ -109,12 +93,12 @@ where
            Err(err) if attempts < max_retries => {
                tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
            }
-            Err(ref err) => {
+            Err(err) => {
                // Operation failed `max_attempts` times. Time to give up.
                tracing::warn!(
                    "{description} still failed after {attempts} retries, giving up: {err:?}"
                );
-                return result;
+                return Some(result);
            }
        }
        // sleep and retry
@@ -122,7 +106,7 @@ where
            attempts,
            DEFAULT_BASE_BACKOFF_SECONDS,
            DEFAULT_MAX_BACKOFF_SECONDS,
-            &cancel.token,
+            cancel,
        )
        .await;
        attempts += 1;
@@ -131,11 +115,9 @@ where
 
 #[cfg(test)]
 mod tests {
-    use std::io;
-
-    use tokio::sync::Mutex;
-
     use super::*;
+    use std::io;
+    use tokio::sync::Mutex;
 
     #[test]
     fn backoff_defaults_produce_growing_backoff_sequence() {
@@ -166,7 +148,7 @@ mod tests {
     #[tokio::test(start_paused = true)]
     async fn retry_always_error() {
         let count = Mutex::new(0);
-        let err_result = retry(
+        retry(
             || async {
                 *count.lock().await += 1;
                 Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other))
             },
             |_e| false,
             1,
             1,
             "work",
-            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
+            &CancellationToken::new(),
         )
-        .await;
-
-        assert!(err_result.is_err());
+        .await
+        .expect("not cancelled")
+        .expect_err("it can only fail");
 
         assert_eq!(*count.lock().await, 2);
     }
 
@@ -201,10 +183,11 @@ mod tests {
             2,
             2,
             "work",
-            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
+            &CancellationToken::new(),
         )
         .await
-        .unwrap();
+        .expect("not cancelled")
+        .expect("success on second try");
     }
 
     #[tokio::test(start_paused = true)]
@@ -224,10 +207,11 @@ mod tests {
             2,
             2,
             "work",
-            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
+            &CancellationToken::new(),
         )
         .await
-        .unwrap_err();
+        .expect("was not cancellation")
+        .expect_err("it was permanent error");
 
         assert_eq!(*count.lock().await, 1);
     }
diff --git a/libs/utils/src/circuit_breaker.rs b/libs/utils/src/circuit_breaker.rs
new file mode 100644
index 0000000000..e1ddfd8650
--- /dev/null
+++ b/libs/utils/src/circuit_breaker.rs
@@ -0,0 +1,116 @@
+use std::{
+    fmt::Display,
+    time::{Duration, Instant},
+};
+
+use metrics::IntCounter;
+
+/// Circuit breakers are for operations that are expensive and fallible.
+///
+/// If the protected operation fails repeatedly, we will stop attempting it for some
+/// period of time, to avoid denial-of-service from retries, and
+/// to mitigate the log spam from repeated failures.
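+///
+/// A minimal usage sketch (illustration only; the counters and
+/// `do_expensive_operation` are hypothetical stand-ins, not APIs from this diff):
+///
+/// ```ignore
+/// let mut breaker = CircuitBreaker::new("compaction".to_string(), 5, Some(Duration::from_secs(60)));
+/// if !breaker.is_broken() {
+///     match do_expensive_operation() {
+///         Ok(_) => breaker.success(&unbroken_counter),
+///         Err(e) => breaker.fail(&broken_counter, e),
+///     }
+/// }
+/// ```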
+pub struct CircuitBreaker { + /// An identifier that enables us to log useful errors when a circuit is broken + name: String, + + /// Consecutive failures since last success + fail_count: usize, + + /// How many consecutive failures before we break the circuit + fail_threshold: usize, + + /// If circuit is broken, when was it broken? + broken_at: Option, + + /// If set, we will auto-reset the circuit this long after it was broken. If None, broken + /// circuits stay broken forever, or until success() is called. + reset_period: Option, + + /// If this is true, no actual circuit-breaking happens. This is for overriding a circuit breaker + /// to permit something to keep running even if it would otherwise have tripped it. + short_circuit: bool, +} + +impl CircuitBreaker { + pub fn new(name: String, fail_threshold: usize, reset_period: Option) -> Self { + Self { + name, + fail_count: 0, + fail_threshold, + broken_at: None, + reset_period, + short_circuit: false, + } + } + + /// Construct an unbreakable circuit breaker, for use in unit tests etc. + pub fn short_circuit() -> Self { + Self { + name: String::new(), + fail_threshold: 0, + fail_count: 0, + broken_at: None, + reset_period: None, + short_circuit: true, + } + } + + pub fn fail(&mut self, metric: &IntCounter, error: E) + where + E: Display, + { + if self.short_circuit { + return; + } + + self.fail_count += 1; + if self.broken_at.is_none() && self.fail_count >= self.fail_threshold { + self.break_circuit(metric, error); + } + } + + /// Call this after successfully executing an operation + pub fn success(&mut self, metric: &IntCounter) { + self.fail_count = 0; + if let Some(broken_at) = &self.broken_at { + tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})", + humantime::format_duration(broken_at.elapsed())); + self.broken_at = None; + metric.inc(); + } + } + + /// Call this before attempting an operation, and skip the operation if we are currently broken. + pub fn is_broken(&mut self) -> bool { + if self.short_circuit { + return false; + } + + if let Some(broken_at) = self.broken_at { + match self.reset_period { + Some(reset_period) if broken_at.elapsed() > reset_period => { + self.reset_circuit(); + false + } + _ => true, + } + } else { + false + } + } + + fn break_circuit(&mut self, metric: &IntCounter, error: E) + where + E: Display, + { + self.broken_at = Some(Instant::now()); + tracing::error!(breaker=%self.name, "Circuit breaker broken! Last error: {error}"); + metric.inc(); + } + + fn reset_circuit(&mut self) { + self.broken_at = None; + self.fail_count = 0; + } +} diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs index ca6827c9b8..f65c080ad4 100644 --- a/libs/utils/src/completion.rs +++ b/libs/utils/src/completion.rs @@ -4,12 +4,41 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker}; /// /// Can be cloned, moved and kept around in futures as "guard objects". #[derive(Clone)] -pub struct Completion(TaskTrackerToken); +pub struct Completion { + token: TaskTrackerToken, +} + +impl std::fmt::Debug for Completion { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Completion") + .field("siblings", &self.token.task_tracker().len()) + .finish() + } +} + +impl Completion { + /// Returns true if this completion is associated with the given barrier. 
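+    /// (A `Barrier` keeps waiting for as long as any `Completion` associated with it is alive.)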
+    pub fn blocks(&self, barrier: &Barrier) -> bool {
+        TaskTracker::ptr_eq(self.token.task_tracker(), &barrier.0)
+    }
+
+    pub fn barrier(&self) -> Barrier {
+        Barrier(self.token.task_tracker().clone())
+    }
+}
 
 /// Barrier will wait until all clones of [`Completion`] have been dropped.
 #[derive(Clone)]
 pub struct Barrier(TaskTracker);
 
+impl std::fmt::Debug for Barrier {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Barrier")
+            .field("remaining", &self.0.len())
+            .finish()
+    }
+}
+
 impl Default for Barrier {
     fn default() -> Self {
         let (_, rx) = channel();
@@ -27,6 +56,11 @@ impl Barrier {
             b.wait().await
         }
     }
+
+    /// Return true if a call to wait() would complete immediately
+    pub fn is_ready(&self) -> bool {
+        futures::future::FutureExt::now_or_never(self.0.wait()).is_some()
+    }
 }
 
 impl PartialEq for Barrier {
@@ -44,5 +78,5 @@ pub fn channel() -> (Completion, Barrier) {
     tracker.close();
     let token = tracker.token();
 
-    (Completion(token), Barrier(tracker))
+    (Completion { token }, Barrier(tracker))
 }
diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs
index 0c6855d17b..756b19138c 100644
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -112,14 +112,66 @@ pub async fn fsync_async(path: impl AsRef<Utf8Path>) -> Result<(), std::io::Error> {
     tokio::fs::File::open(path.as_ref()).await?.sync_all().await
 }
 
-/// Writes a file to the specified `final_path` in a crash safe fasion
+pub async fn fsync_async_opt(
+    path: impl AsRef<Utf8Path>,
+    do_fsync: bool,
+) -> Result<(), std::io::Error> {
+    if do_fsync {
+        fsync_async(path.as_ref()).await?;
+    }
+    Ok(())
+}
+
+/// Like postgres' durable_rename, renames a file, issuing fsyncs to make the
+/// rename durable. After return, both the file and the rename are guaranteed
+/// to be persisted.
 ///
-/// The file is first written to the specified tmp_path, and in a second
-/// step, the tmp path is renamed to the final path. As renames are
-/// atomic, a crash during the write operation will never leave behind a
-/// partially written file.
+/// Unlike postgres, it only fsyncs 1) the file to be renamed, to make its
+/// contents durable; 2) its directory entry, to make the rename durable; 3) the
+/// already renamed file again, which is not required by standards but postgres
+/// does it, so let's stick to that. Postgres additionally fsyncs newpath *before*
+/// the rename, if it exists, to ensure that at least one of the files survives, but
+/// current callers don't need that.
 ///
-/// NB: an async variant of this code exists in Pageserver's VirtualFile.
+/// virtual_file.rs has similar code, but it doesn't use vfs.
+///
+/// Useful links:
+///
+///
+pub async fn durable_rename(
+    old_path: impl AsRef<Utf8Path>,
+    new_path: impl AsRef<Utf8Path>,
+    do_fsync: bool,
+) -> io::Result<()> {
+    // first fsync the file
+    fsync_async_opt(old_path.as_ref(), do_fsync).await?;
+
+    // Time to do the real deal.
+    tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?;
+
+    // Postgres'ish fsync of renamed file.
+    fsync_async_opt(new_path.as_ref(), do_fsync).await?;
+
+    // Now fsync the parent
+    let parent = match new_path.as_ref().parent() {
+        Some(p) => p,
+        None => Utf8Path::new("./"), // assume current dir if there is no parent
+    };
+    fsync_async_opt(parent, do_fsync).await?;
+
+    Ok(())
+}
+
+/// Writes a file to the specified `final_path` in a crash safe fashion, using [`std::fs`].
+///
+/// The file is first written to the specified `tmp_path`, and in a second
+/// step, the `tmp_path` is renamed to the `final_path`. Intermediary fsync
Intermediary fsync +/// and atomic rename guarantee that, if we crash at any point, there will never +/// be a partially written file at `final_path` (but maybe at `tmp_path`). +/// +/// Callers are responsible for serializing calls of this function for a given `final_path`. +/// If they don't, there may be an error due to conflicting `tmp_path`, or there will +/// be no error and the content of `final_path` will be the "winner" caller's `content`. +/// I.e., the atomticity guarantees still hold. pub fn overwrite( final_path: &Utf8Path, tmp_path: &Utf8Path, @@ -139,17 +191,14 @@ pub fn overwrite( .open(tmp_path)?; file.write_all(content)?; file.sync_all()?; - drop(file); // before the rename, that's important! - // renames are atomic + drop(file); // don't keep the fd open for longer than we have to + std::fs::rename(tmp_path, final_path)?; - // Only open final path parent dirfd now, so that this operation only - // ever holds one VirtualFile fd at a time. That's important because - // the current `find_victim_slot` impl might pick the same slot for both - // VirtualFile., and it eventually does a blocking write lock instead of - // try_lock. + let final_parent_dirfd = std::fs::OpenOptions::new() .read(true) .open(final_path_parent)?; + final_parent_dirfd.sync_all()?; Ok(()) } diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs new file mode 100644 index 0000000000..b3e326bfd0 --- /dev/null +++ b/libs/utils/src/env.rs @@ -0,0 +1,21 @@ +//! Wrapper around `std::env::var` for parsing environment variables. + +use std::{fmt::Display, str::FromStr}; + +pub fn var(varname: &str) -> Option +where + V: FromStr, + E: Display, +{ + match std::env::var(varname) { + Ok(s) => Some( + s.parse() + .map_err(|e| format!("failed to parse env var {varname}: {e:#}")) + .unwrap(), + ), + Err(std::env::VarError::NotPresent) => None, + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var {varname} is not unicode") + } + } +} diff --git a/libs/utils/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs index 8704b72921..870684b399 100644 --- a/libs/utils/src/failpoint_support.rs +++ b/libs/utils/src/failpoint_support.rs @@ -9,6 +9,33 @@ use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; use tracing::*; +/// Declare a failpoint that can use the `pause` failpoint action. +/// We don't want to block the executor thread, hence, spawn_blocking + await. +#[macro_export] +macro_rules! pausable_failpoint { + ($name:literal) => { + if cfg!(feature = "testing") { + tokio::task::spawn_blocking({ + let current = tracing::Span::current(); + move || { + let _entered = current.entered(); + tracing::info!("at failpoint {}", $name); + fail::fail_point!($name); + } + }) + .await + .expect("spawn_blocking"); + } + }; + ($name:literal, $cond:expr) => { + if cfg!(feature = "testing") { + if $cond { + pausable_failpoint!($name) + } + } + }; +} + /// use with fail::cfg("$name", "return(2000)") /// /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs index 90ba348a02..8e53d2c79b 100644 --- a/libs/utils/src/fs_ext.rs +++ b/libs/utils/src/fs_ext.rs @@ -3,6 +3,9 @@ use std::{fs, io, path::Path}; use anyhow::Context; +mod rename_noreplace; +pub use rename_noreplace::rename_noreplace; + pub trait PathExt { /// Returns an error if `self` is not a directory. 
fn is_empty_dir(&self) -> io::Result; diff --git a/libs/utils/src/fs_ext/rename_noreplace.rs b/libs/utils/src/fs_ext/rename_noreplace.rs new file mode 100644 index 0000000000..897e30d7f1 --- /dev/null +++ b/libs/utils/src/fs_ext/rename_noreplace.rs @@ -0,0 +1,109 @@ +use nix::NixPath; + +/// Rename a file without replacing an existing file. +/// +/// This is a wrapper around platform-specific APIs. +pub fn rename_noreplace( + src: &P1, + dst: &P2, +) -> nix::Result<()> { + { + #[cfg(target_os = "linux")] + { + nix::fcntl::renameat2( + None, + src, + None, + dst, + nix::fcntl::RenameFlags::RENAME_NOREPLACE, + ) + } + #[cfg(target_os = "macos")] + { + let res = src.with_nix_path(|src| { + dst.with_nix_path(|dst| + // SAFETY: `src` and `dst` are valid C strings as per the NixPath trait and they outlive the call to renamex_np. + unsafe { + nix::libc::renamex_np(src.as_ptr(), dst.as_ptr(), nix::libc::RENAME_EXCL) + }) + })??; + nix::errno::Errno::result(res).map(drop) + } + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + { + std::compile_error!("OS does not support no-replace renames"); + } + } +} + +#[cfg(test)] +mod test { + use std::{fs, path::PathBuf}; + + use super::*; + + fn testdir() -> camino_tempfile::Utf8TempDir { + match crate::env::var("NEON_UTILS_RENAME_NOREPLACE_TESTDIR") { + Some(path) => { + let path: camino::Utf8PathBuf = path; + camino_tempfile::tempdir_in(path).unwrap() + } + None => camino_tempfile::tempdir().unwrap(), + } + } + + #[test] + fn test_absolute_paths() { + let testdir = testdir(); + println!("testdir: {}", testdir.path()); + + let src = testdir.path().join("src"); + let dst = testdir.path().join("dst"); + + fs::write(&src, b"").unwrap(); + fs::write(&dst, b"").unwrap(); + + let src = src.canonicalize().unwrap(); + assert!(src.is_absolute()); + let dst = dst.canonicalize().unwrap(); + assert!(dst.is_absolute()); + + let result = rename_noreplace(&src, &dst); + assert_eq!(result.unwrap_err(), nix::Error::EEXIST); + } + + #[test] + fn test_relative_paths() { + let testdir = testdir(); + println!("testdir: {}", testdir.path()); + + // this is fine because we run in nextest => process per test + std::env::set_current_dir(testdir.path()).unwrap(); + + let src = PathBuf::from("src"); + let dst = PathBuf::from("dst"); + + fs::write(&src, b"").unwrap(); + fs::write(&dst, b"").unwrap(); + + let result = rename_noreplace(&src, &dst); + assert_eq!(result.unwrap_err(), nix::Error::EEXIST); + } + + #[test] + fn test_works_when_not_exists() { + let testdir = testdir(); + println!("testdir: {}", testdir.path()); + + let src = testdir.path().join("src"); + let dst = testdir.path().join("dst"); + + fs::write(&src, b"content").unwrap(); + + rename_noreplace(src.as_std_path(), dst.as_std_path()).unwrap(); + assert_eq!( + "content", + String::from_utf8(std::fs::read(&dst).unwrap()).unwrap() + ); + } +} diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 46eadee1da..5970836033 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -9,20 +9,11 @@ use serde::{Deserialize, Serialize}; /// numbers are used. #[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] pub enum Generation { - // Generations with this magic value will not add a suffix to S3 keys, and will not - // be included in persisted index_part.json. This value is only to be used - // during migration from pre-generation metadata to generation-aware metadata, - // and should eventually go away. 
- // - // A special Generation is used rather than always wrapping Generation in an Option, - // so that code handling generations doesn't have to be aware of the legacy - // case everywhere it touches a generation. + // The None Generation is used in the metadata of layers written before generations were + // introduced. A running Tenant always has a valid generation, but the layer metadata may + // include None generations. None, - // Generations with this magic value may never be used to construct S3 keys: - // we will panic if someone tries to. This is for Tenants in the "Broken" state, - // so that we can satisfy their constructor with a Generation without risking - // a code bug using it in an S3 write (broken tenants should never write) - Broken, + Valid(u32), } @@ -34,18 +25,15 @@ pub enum Generation { /// scenarios where pageservers might otherwise issue conflicting writes to /// remote storage impl Generation { + pub const MAX: Self = Self::Valid(u32::MAX); + /// Create a new Generation that represents a legacy key format with /// no generation suffix pub fn none() -> Self { Self::None } - // Create a new generation that will panic if you try to use get_suffix - pub fn broken() -> Self { - Self::Broken - } - - pub fn new(v: u32) -> Self { + pub const fn new(v: u32) -> Self { Self::Valid(v) } @@ -54,15 +42,10 @@ impl Generation { } #[track_caller] - pub fn get_suffix(&self) -> String { + pub fn get_suffix(&self) -> impl std::fmt::Display { match self { - Self::Valid(v) => { - format!("-{:08x}", v) - } - Self::None => "".into(), - Self::Broken => { - panic!("Tried to use a broken generation"); - } + Self::Valid(v) => GenerationFileSuffix(Some(*v)), + Self::None => GenerationFileSuffix(None), } } @@ -86,15 +69,14 @@ impl Generation { } } Self::None => Self::None, - Self::Broken => panic!("Attempted to use a broken generation"), } } + #[track_caller] pub fn next(&self) -> Generation { match self { Self::Valid(n) => Self::Valid(*n + 1), Self::None => Self::Valid(1), - Self::Broken => panic!("Attempted to use a broken generation"), } } @@ -107,6 +89,18 @@ impl Generation { } } +struct GenerationFileSuffix(Option); + +impl std::fmt::Display for GenerationFileSuffix { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if let Some(g) = self.0 { + write!(f, "-{g:08x}") + } else { + Ok(()) + } + } +} + impl Serialize for Generation { fn serialize(&self, serializer: S) -> Result where @@ -115,7 +109,7 @@ impl Serialize for Generation { if let Self::Valid(v) = self { v.serialize(serializer) } else { - // We should never be asked to serialize a None or Broken. Structures + // We should never be asked to serialize a None. 
Structures
             // that include an optional generation should convert None to an
             // Option::None
             Err(serde::ser::Error::custom(
@@ -146,9 +140,6 @@ impl Debug for Generation {
             Self::None => {
                 write!(f, "<none>")
             }
-            Self::Broken => {
-                write!(f, "<broken>")
-            }
         }
     }
 }
@@ -164,4 +155,24 @@ mod test {
         assert!(Generation::none() < Generation::new(0));
         assert!(Generation::none() < Generation::new(1));
     }
+
+    #[test]
+    fn suffix_is_stable() {
+        use std::fmt::Write as _;
+
+        // the suffix must remain stable throughout the pageserver remote storage evolution and
+        // not be changed accidentally without thinking about migration
+        let examples = [
+            (line!(), Generation::None, ""),
+            (line!(), Generation::Valid(0), "-00000000"),
+            (line!(), Generation::Valid(u32::MAX), "-ffffffff"),
+        ];
+
+        let mut s = String::new();
+        for (line, gen, expected) in examples {
+            s.clear();
+            write!(s, "{}", &gen.get_suffix()).expect("string grows");
+            assert_eq!(s, expected, "example on {line}");
+        }
+    }
 }
diff --git a/libs/utils/src/hex.rs b/libs/utils/src/hex.rs
index fc0bb7e4a2..382f805a96 100644
--- a/libs/utils/src/hex.rs
+++ b/libs/utils/src/hex.rs
@@ -19,13 +19,13 @@
 /// // right: [0x68; 1]
 /// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
 /// ```
-#[derive(PartialEq)]
-pub struct Hex<'a>(pub &'a [u8]);
+pub struct Hex<S>(pub S);
 
-impl std::fmt::Debug for Hex<'_> {
+impl<S: AsRef<[u8]>> std::fmt::Debug for Hex<S> {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "[")?;
-        for (i, c) in self.0.chunks(16).enumerate() {
+        let chunks = self.0.as_ref().chunks(16);
+        for (i, c) in chunks.enumerate() {
             if i > 0 && !c.is_empty() {
                 writeln!(f, ", ")?;
             }
@@ -36,6 +36,15 @@ impl std::fmt::Debug for Hex<'_> {
                 write!(f, "0x{b:02x}")?;
             }
         }
-        write!(f, "; {}]", self.0.len())
+        write!(f, "; {}]", self.0.as_ref().len())
+    }
+}
+
+impl<S: AsRef<[u8]>, L: AsRef<[u8]>> PartialEq<Hex<L>> for Hex<S> {
+    fn eq(&self, other: &Hex<L>) -> bool {
+        let left = self.0.as_ref();
+        let right = other.0.as_ref();
+
+        left == right
     }
 }
diff --git a/libs/utils/src/history_buffer.rs b/libs/utils/src/history_buffer.rs
deleted file mode 100644
index 1f07f5560f..0000000000
--- a/libs/utils/src/history_buffer.rs
+++ /dev/null
@@ -1,161 +0,0 @@
-//! A heapless buffer for events of sorts.
- -use std::ops; - -use heapless::HistoryBuffer; - -#[derive(Debug, Clone)] -pub struct HistoryBufferWithDropCounter { - buffer: HistoryBuffer, - drop_count: u64, -} - -impl HistoryBufferWithDropCounter { - pub fn write(&mut self, data: T) { - let len_before = self.buffer.len(); - self.buffer.write(data); - let len_after = self.buffer.len(); - self.drop_count += u64::from(len_before == len_after); - } - pub fn drop_count(&self) -> u64 { - self.drop_count - } - pub fn map U>(&self, f: F) -> HistoryBufferWithDropCounter { - let mut buffer = HistoryBuffer::new(); - buffer.extend(self.buffer.oldest_ordered().map(f)); - HistoryBufferWithDropCounter:: { - buffer, - drop_count: self.drop_count, - } - } -} - -impl Default for HistoryBufferWithDropCounter { - fn default() -> Self { - Self { - buffer: HistoryBuffer::default(), - drop_count: 0, - } - } -} - -impl ops::Deref for HistoryBufferWithDropCounter { - type Target = HistoryBuffer; - - fn deref(&self) -> &Self::Target { - &self.buffer - } -} - -#[derive(serde::Serialize)] -struct SerdeRepr { - buffer: Vec, - drop_count: u64, -} - -impl<'a, T, const L: usize> From<&'a HistoryBufferWithDropCounter> for SerdeRepr -where - T: Clone + serde::Serialize, -{ - fn from(value: &'a HistoryBufferWithDropCounter) -> Self { - let HistoryBufferWithDropCounter { buffer, drop_count } = value; - SerdeRepr { - buffer: buffer.iter().cloned().collect(), - drop_count: *drop_count, - } - } -} - -impl serde::Serialize for HistoryBufferWithDropCounter -where - T: Clone + serde::Serialize, -{ - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - SerdeRepr::from(self).serialize(serializer) - } -} - -#[cfg(test)] -mod test { - use super::HistoryBufferWithDropCounter; - - #[test] - fn test_basics() { - let mut b = HistoryBufferWithDropCounter::<_, 2>::default(); - b.write(1); - b.write(2); - b.write(3); - assert!(b.iter().any(|e| *e == 2)); - assert!(b.iter().any(|e| *e == 3)); - assert!(!b.iter().any(|e| *e == 1)); - } - - #[test] - fn test_drop_count_works() { - let mut b = HistoryBufferWithDropCounter::<_, 2>::default(); - b.write(1); - assert_eq!(b.drop_count(), 0); - b.write(2); - assert_eq!(b.drop_count(), 0); - b.write(3); - assert_eq!(b.drop_count(), 1); - b.write(4); - assert_eq!(b.drop_count(), 2); - } - - #[test] - fn test_clone_works() { - let mut b = HistoryBufferWithDropCounter::<_, 2>::default(); - b.write(1); - b.write(2); - b.write(3); - assert_eq!(b.drop_count(), 1); - let mut c = b.clone(); - assert_eq!(c.drop_count(), 1); - assert!(c.iter().any(|e| *e == 2)); - assert!(c.iter().any(|e| *e == 3)); - assert!(!c.iter().any(|e| *e == 1)); - - c.write(4); - assert!(c.iter().any(|e| *e == 4)); - assert!(!b.iter().any(|e| *e == 4)); - } - - #[test] - fn test_map() { - let mut b = HistoryBufferWithDropCounter::<_, 2>::default(); - - b.write(1); - assert_eq!(b.drop_count(), 0); - { - let c = b.map(|i| i + 10); - assert_eq!(c.oldest_ordered().cloned().collect::>(), vec![11]); - assert_eq!(c.drop_count(), 0); - } - - b.write(2); - assert_eq!(b.drop_count(), 0); - { - let c = b.map(|i| i + 10); - assert_eq!( - c.oldest_ordered().cloned().collect::>(), - vec![11, 12] - ); - assert_eq!(c.drop_count(), 0); - } - - b.write(3); - assert_eq!(b.drop_count(), 1); - { - let c = b.map(|i| i + 10); - assert_eq!( - c.oldest_ordered().cloned().collect::>(), - vec![12, 13] - ); - assert_eq!(c.drop_count(), 1); - } - } -} diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 550ab10700..8ee5abd434 100644 --- 
a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -9,7 +9,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
 use once_cell::sync::Lazy;
 use routerify::ext::RequestExt;
 use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
-use tracing::{self, debug, info, info_span, warn, Instrument};
+use tracing::{debug, info, info_span, warn, Instrument};
 
 use std::future::Future;
 use std::str::FromStr;
@@ -52,17 +52,17 @@ struct RequestId(String);
 /// There could be other ways to implement similar functionality:
 ///
 /// * procmacros placed on top of all handler methods
-/// With all the drawbacks of procmacros, brings no difference implementation-wise,
-/// and little code reduction compared to the existing approach.
+///   With all the drawbacks of procmacros, this brings no difference implementation-wise,
+///   and little code reduction compared to the existing approach.
 ///
 /// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic,
-/// implemented for [`RouterBuilder`].
-/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later.
+///   implemented for [`RouterBuilder`].
+///   Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use another library later.
 ///
 /// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped
-/// later, in a post-response middleware.
-/// Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures`
-/// tries to achive with its `.instrument` used in the current approach.
+///   later, in a post-response middleware.
+///   Due to the suspendable nature of futures, this would give contradictory results, which is exactly the opposite of what `tracing-futures`
+///   tries to achieve with its `.instrument` used in the current approach.
 ///
 /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
 pub async fn request_span<R, H>(request: Request<Body>, handler: H) -> R::Output
@@ -156,6 +156,10 @@ pub struct ChannelWriter {
     buffer: BytesMut,
     pub tx: mpsc::Sender<std::io::Result<Bytes>>,
     written: usize,
+    /// Time spent waiting for the channel to make progress. It is not the same as time to upload a
+    /// buffer because we cannot know anything about that, but this should allow us to understand
+    /// the actual time taken without the time spent `std::thread::park`ed.
+    wait_time: std::time::Duration,
 }
 
 impl ChannelWriter {
@@ -168,6 +172,7 @@ impl ChannelWriter {
             buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
             tx,
             written: 0,
+            wait_time: std::time::Duration::ZERO,
         }
     }
 
@@ -180,6 +185,8 @@ impl ChannelWriter {
         tracing::trace!(n, "flushing");
         let ready = self.buffer.split().freeze();
 
+        let wait_started_at = std::time::Instant::now();
+
         // not ideal to call from blocking code to block_on, but we are sure that this
         // operation does not spawn_blocking other tasks
         let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
@@ -192,6 +199,9 @@ impl ChannelWriter {
             // sending it to the client.
            Ok(())
        });
+
+        self.wait_time += wait_started_at.elapsed();
+
        if res.is_err() {
            return Err(std::io::ErrorKind::BrokenPipe.into());
        }
@@ -202,6 +212,10 @@ impl ChannelWriter {
     pub fn flushed_bytes(&self) -> usize {
         self.written
     }
+
+    pub fn wait_time(&self) -> std::time::Duration {
+        self.wait_time
+    }
 }
 
 impl std::io::Write for ChannelWriter {
@@ -231,7 +245,7 @@ impl std::io::Write for ChannelWriter {
     }
 }
 
-async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
     SERVE_METRICS_COUNT.inc();
 
     let started_at = std::time::Instant::now();
@@ -252,22 +266,52 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
         Ok(()) => {
             tracing::info!(
                 bytes = writer.flushed_bytes(),
-                elapsed_ms = started_at.elapsed().as_millis(),
+                total_ms = total.as_millis(),
+                spawning_ms = spawned_in.as_millis(),
+                collection_ms = collected_in.as_millis(),
+                encoding_ms = encoded_in.as_millis(),
                 "responded /metrics"
             );
         }
         Err(e) => {
-            tracing::warn!("failed to write out /metrics response: {e:#}");
+            // there is a chance that this error is not the BrokenPipe we generate in the writer
+            // for "closed connection", but it is highly unlikely.
+            tracing::warn!(
+                after_bytes = writer.flushed_bytes(),
+                total_ms = total.as_millis(),
+                spawning_ms = spawned_in.as_millis(),
+                collection_ms = collected_in.as_millis(),
+                encoding_ms = encoded_in.as_millis(),
+                "failed to write out /metrics response: {e:?}"
+            );
            // semantics of this error are quite... unclear. we want to error the stream out to
            // abort the response to somehow notify the client that we failed.
            //
@@ -323,7 +367,6 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
         .middleware(Middleware::post_with_info(
             add_request_id_header_to_response,
         ))
-        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
         .err_handler(route_error_handler)
 }
diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs
index d55823b0b7..3d863a6518 100644
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -34,6 +34,9 @@ pub enum ApiError {
     #[error("Timeout")]
     Timeout(Cow<'static, str>),
 
+    #[error("Request cancelled")]
+    Cancelled,
+
     #[error(transparent)]
     InternalServerError(anyhow::Error),
 }
@@ -74,6 +77,10 @@ impl ApiError {
             err.to_string(),
             StatusCode::REQUEST_TIMEOUT,
         ),
+        ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status(
+            self.to_string(),
+            StatusCode::INTERNAL_SERVER_ERROR,
+        ),
         ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
             err.to_string(),
             StatusCode::INTERNAL_SERVER_ERROR,
@@ -133,6 +140,7 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
         ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
         ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
         ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
+        ApiError::Cancelled => info!("Request cancelled while processing HTTP request"),
         _ => info!("Error processing HTTP request: {api_error:#}"),
     }
diff --git a/libs/utils/src/http/json.rs b/libs/utils/src/http/json.rs
index 7ca62561fe..6c25440b42 100644
--- a/libs/utils/src/http/json.rs
+++ b/libs/utils/src/http/json.rs
@@ -8,22 +8,15 @@ use super::error::ApiError;
 pub async fn json_request<T: for<'de> Deserialize<'de>>(
     request: &mut Request<Body>,
 ) -> Result<T, ApiError> {
-    json_request_or_empty_body(request)
-        .await?
- .context("missing request body") - .map_err(ApiError::BadRequest) -} - -/// Will be removed as part of -pub async fn json_request_or_empty_body Deserialize<'de>>( - request: &mut Request, -) -> Result, ApiError> { let body = hyper::body::aggregate(request.body_mut()) .await .context("Failed to read request body") .map_err(ApiError::BadRequest)?; + if body.remaining() == 0 { - return Ok(None); + return Err(ApiError::BadRequest(anyhow::anyhow!( + "missing request body" + ))); } let mut deser = serde_json::de::Deserializer::from_reader(body.reader()); @@ -31,7 +24,6 @@ pub async fn json_request_or_empty_body Deserialize<'de>>( serde_path_to_error::deserialize(&mut deser) // intentionally stringify because the debug version is not helpful in python logs .map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}")) - .map(Some) .map_err(ApiError::BadRequest) } diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs index 766bbfc9df..8b8ed5a67f 100644 --- a/libs/utils/src/http/request.rs +++ b/libs/utils/src/http/request.rs @@ -74,6 +74,15 @@ pub fn parse_query_param>( .transpose() } +pub fn must_parse_query_param>( + request: &Request, + param_name: &str, +) -> Result { + parse_query_param(request, param_name)?.ok_or_else(|| { + ApiError::BadRequest(anyhow!("no {param_name} specified in query parameters")) + }) +} + pub async fn ensure_no_body(request: &mut Request) -> Result<(), ApiError> { match request.body_mut().data().await { Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))), diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 0409001f4f..2cda899b15 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -249,8 +249,10 @@ macro_rules! id_newtype { }; } -/// Neon timeline IDs are different from PostgreSQL timeline -/// IDs. They serve a similar purpose though: they differentiate +/// Neon timeline ID. +/// +/// They are different from PostgreSQL timeline +/// IDs, but serve a similar purpose: they differentiate /// between different "histories" of the same cluster. However, /// PostgreSQL timeline IDs are a bit cumbersome, because they are only /// 32-bits wide, and they must be in ascending order in any given @@ -302,17 +304,6 @@ pub struct TenantId(Id); id_newtype!(TenantId); -/// Neon Connection Id identifies long-lived connections (for example a pagestream -/// connection with the page_service). Is used for better logging and tracing -/// -/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look -/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. -/// See [`Id`] for alternative ways to serialize it. -#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] -pub struct ConnectionId(Id); - -id_newtype!(ConnectionId); - // A pair uniquely identifying Neon instance. #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct TenantTimelineId { diff --git a/libs/utils/src/leaky_bucket.rs b/libs/utils/src/leaky_bucket.rs new file mode 100644 index 0000000000..a120dc0ac5 --- /dev/null +++ b/libs/utils/src/leaky_bucket.rs @@ -0,0 +1,280 @@ +//! This module implements the Generic Cell Rate Algorithm for a simplified +//! version of the Leaky Bucket rate limiting system. +//! +//! # Leaky Bucket +//! +//! If the bucket is full, no new requests are allowed and are throttled/errored. +//! If the bucket is partially full/empty, new requests are added to the bucket in +//! terms of "tokens". +//! +//! 
Over time, tokens are removed from the bucket, naturally allowing new requests at a steady rate.
+//!
+//! The bucket size tunes the burst support. The drain rate tunes the steady-rate requests per second.
+//!
+//! # [GCRA](https://en.wikipedia.org/wiki/Generic_cell_rate_algorithm)
+//!
+//! GCRA is a continuous rate leaky-bucket impl that stores minimal state and requires
+//! no background jobs to drain tokens, as the design utilises timestamps to drain automatically over time.
+//!
+//! We store an "empty_at" timestamp as the only state. As time progresses, we will naturally approach
+//! the empty state. The full-bucket state is calculated from `empty_at - config.bucket_width`.
+//!
+//! Another explanation can be found here:
+
+use std::{sync::Mutex, time::Duration};
+
+use tokio::{sync::Notify, time::Instant};
+
+pub struct LeakyBucketConfig {
+    /// This is the "time cost" of a single request unit.
+    /// Should loosely represent how long it takes to handle a request unit in active resource time.
+    /// Loosely speaking this is the inverse of the steady-rate requests-per-second
+    pub cost: Duration,
+
+    /// total size of the bucket
+    pub bucket_width: Duration,
+}
+
+impl LeakyBucketConfig {
+    pub fn new(rps: f64, bucket_size: f64) -> Self {
+        let cost = Duration::from_secs_f64(rps.recip());
+        let bucket_width = cost.mul_f64(bucket_size);
+        Self { cost, bucket_width }
+    }
+}
+
+pub struct LeakyBucketState {
+    /// Bucket is represented by `allow_at..empty_at` where `allow_at = empty_at - config.bucket_width`.
+    ///
+    /// At any given time, `empty_at - now` represents the number of tokens in the bucket, multiplied by the "time_cost".
+    /// Adding `n` tokens to the bucket is done by moving `empty_at` forward by `n * config.time_cost`.
+    /// If `now < allow_at`, the bucket is considered filled and cannot accept any more tokens.
+    /// Draining the bucket will happen naturally as `now` moves forward.
+    ///
+    /// Let `n` be some "time cost" for the request,
+    /// If now is after empty_at, the bucket is empty and the empty_at is reset to now,
+    /// If now is within the `bucket window + n`, we are within time budget.
+    /// If now is before the `bucket window + n`, we have run out of budget.
+    ///
+    /// This is inspired by the generic cell rate algorithm (GCRA) and works
+    /// exactly the same as a leaky-bucket.
+    pub empty_at: Instant,
+}
+
+impl LeakyBucketState {
+    pub fn with_initial_tokens(config: &LeakyBucketConfig, initial_tokens: f64) -> Self {
+        LeakyBucketState {
+            empty_at: Instant::now() + config.cost.mul_f64(initial_tokens),
+        }
+    }
+
+    pub fn bucket_is_empty(&self, now: Instant) -> bool {
+        // if self.end is after now, the bucket is not empty
+        self.empty_at <= now
+    }
+
+    /// Immediately adds tokens to the bucket, if there is space.
+    ///
+    /// In a scenario where you are waiting for available rate,
+    /// rather than just erroring immediately, `started` corresponds to when this waiting started.
+    ///
+    /// `n` is the number of tokens that will be filled in the bucket.
+    ///
+    /// # Errors
+    ///
+    /// If there is not enough space, no tokens are added. Instead, an error is returned with the time when
+    /// there will be space again.
+    pub fn add_tokens(
+        &mut self,
+        config: &LeakyBucketConfig,
+        started: Instant,
+        n: f64,
+    ) -> Result<(), Instant> {
+        let now = Instant::now();
+
+        // invariant: started <= now
+        debug_assert!(started <= now);
+
+        // If the bucket was empty when we started our search,
+        // we should update the `empty_at` value accordingly.
+ // this prevents us from having negative tokens in the bucket. + let mut empty_at = self.empty_at; + if empty_at < started { + empty_at = started; + } + + let n = config.cost.mul_f64(n); + let new_empty_at = empty_at + n; + let allow_at = new_empty_at.checked_sub(config.bucket_width); + + // empty_at + // allow_at | new_empty_at + // / | / + // -------o-[---------o-|--]--------- + // now1 ^ now2 ^ + // + // at now1, the bucket would be completely filled if we add n tokens. + // at now2, the bucket would be partially filled if we add n tokens. + + match allow_at { + Some(allow_at) if now < allow_at => Err(allow_at), + _ => { + self.empty_at = new_empty_at; + Ok(()) + } + } + } +} + +pub struct RateLimiter { + pub config: LeakyBucketConfig, + pub state: Mutex, + /// a queue to provide this fair ordering. + pub queue: Notify, +} + +struct Requeue<'a>(&'a Notify); + +impl Drop for Requeue<'_> { + fn drop(&mut self) { + self.0.notify_one(); + } +} + +impl RateLimiter { + pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self { + RateLimiter { + state: Mutex::new(LeakyBucketState::with_initial_tokens( + &config, + initial_tokens, + )), + config, + queue: { + let queue = Notify::new(); + queue.notify_one(); + queue + }, + } + } + + pub fn steady_rps(&self) -> f64 { + self.config.cost.as_secs_f64().recip() + } + + /// returns true if we did throttle + pub async fn acquire(&self, count: usize) -> bool { + let mut throttled = false; + + let start = tokio::time::Instant::now(); + + // wait until we are the first in the queue + let mut notified = std::pin::pin!(self.queue.notified()); + if !notified.as_mut().enable() { + throttled = true; + notified.await; + } + + // notify the next waiter in the queue when we are done. + let _guard = Requeue(&self.queue); + + loop { + let res = self + .state + .lock() + .unwrap() + .add_tokens(&self.config, start, count as f64); + match res { + Ok(()) => return throttled, + Err(ready_at) => { + throttled = true; + tokio::time::sleep_until(ready_at).await; + } + } + } + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use tokio::time::Instant; + + use super::{LeakyBucketConfig, LeakyBucketState}; + + #[tokio::test(start_paused = true)] + async fn check() { + let config = LeakyBucketConfig { + // average 100rps + cost: Duration::from_millis(10), + // burst up to 100 requests + bucket_width: Duration::from_millis(1000), + }; + + let mut state = LeakyBucketState { + empty_at: Instant::now(), + }; + + // supports burst + { + // should work for 100 requests this instant + for _ in 0..100 { + state.add_tokens(&config, Instant::now(), 1.0).unwrap(); + } + let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_millis(10)); + } + + // doesn't overfill + { + // after 1s we should have an empty bucket again. + tokio::time::advance(Duration::from_secs(1)).await; + assert!(state.bucket_is_empty(Instant::now())); + + // after 1s more, we should not over count the tokens and allow more than 200 requests. 
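+            // (add_tokens clamps a stale `empty_at` up to `started`, so an idle
+            // bucket never banks more than one `bucket_width` of credit.)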
+ tokio::time::advance(Duration::from_secs(1)).await; + for _ in 0..100 { + state.add_tokens(&config, Instant::now(), 1.0).unwrap(); + } + let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_millis(10)); + } + + // supports sustained rate over a long period + { + tokio::time::advance(Duration::from_secs(1)).await; + + // should sustain 100rps + for _ in 0..2000 { + tokio::time::advance(Duration::from_millis(10)).await; + state.add_tokens(&config, Instant::now(), 1.0).unwrap(); + } + } + + // supports requesting more tokens than can be stored in the bucket + // we just wait a little bit longer upfront. + { + // start the bucket completely empty + tokio::time::advance(Duration::from_secs(5)).await; + assert!(state.bucket_is_empty(Instant::now())); + + // requesting 200 tokens of space should take 200*cost = 2s + // but we already have 1s available, so we wait 1s from start. + let start = Instant::now(); + + let ready = state.add_tokens(&config, start, 200.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_secs(1)); + + tokio::time::advance(Duration::from_millis(500)).await; + let ready = state.add_tokens(&config, start, 200.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_millis(500)); + + tokio::time::advance(Duration::from_millis(500)).await; + state.add_tokens(&config, start, 200.0).unwrap(); + + // bucket should be completely full now + let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_millis(10)); + } + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 890061dc59..218dd468b1 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -26,6 +26,8 @@ pub mod auth; // utility functions and helper traits for unified unique id generation/serialization etc. pub mod id; +pub mod shard; + mod hex; pub use hex::Hex; @@ -57,12 +59,11 @@ pub mod signals; pub mod fs_ext; -pub mod history_buffer; - pub mod measured_stream; pub mod serde_percent; pub mod serde_regex; +pub mod serde_system_time; pub mod pageserver_feedback; @@ -70,6 +71,7 @@ pub mod postgres_client; pub mod tracing_span_assert; +pub mod leaky_bucket; pub mod rate_limit; /// Simple once-barrier and a guard which keeps barrier awaiting. @@ -87,6 +89,16 @@ pub mod failpoint_support; pub mod yielding_loop; +pub mod zstd; + +pub mod env; + +pub mod poison; + +pub mod toml_edit_ext; + +pub mod circuit_breaker; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: @@ -117,7 +129,7 @@ pub mod yielding_loop; /// /// ############################################################################################# /// TODO this macro is not the way the library is intended to be used, see for details. -/// We use `cachepot` to reduce our current CI build times: +/// We used `cachepot` to reduce our current CI build times: /// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains /// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation. /// The problem needs further investigation and regular `const` declaration instead of a macro. 
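To make the GCRA arithmetic above concrete, here is a small sketch in the style of the module's own tests (an editor's illustration against the `LeakyBucketConfig`/`LeakyBucketState` API added in this diff, not part of the change; it relies on tokio's `test-util` feature, which this diff adds to `utils`' dev-dependencies):

```rust
use std::time::Duration;
use tokio::time::Instant;
use utils::leaky_bucket::{LeakyBucketConfig, LeakyBucketState};

#[tokio::test(start_paused = true)]
async fn leaky_bucket_sketch() {
    // 10 requests/second steady rate, bursts of up to 20 requests:
    // cost = 100ms per token, bucket_width = 2s.
    let config = LeakyBucketConfig::new(10.0, 20.0);
    let mut state = LeakyBucketState::with_initial_tokens(&config, 0.0);

    // A burst of 20 requests is admitted immediately: each one pushes
    // `empty_at` forward by one `cost` until `allow_at` catches up with now.
    for _ in 0..20 {
        state.add_tokens(&config, Instant::now(), 1.0).unwrap();
    }

    // The 21st request is rejected, and the error carries the instant at
    // which it would fit: exactly one `cost` (100ms) in the future.
    let ready_at = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
    assert_eq!(ready_at - Instant::now(), Duration::from_millis(100));
}
```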
diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs index 987b9d9ad2..3a2ed3e830 100644 --- a/libs/utils/src/lock_file.rs +++ b/libs/utils/src/lock_file.rs @@ -63,6 +63,7 @@ impl UnwrittenLockFile { pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result { let lock_file = fs::OpenOptions::new() .create(true) // O_CREAT + .truncate(true) .write(true) .open(lock_file_path) .context("open lock file")?; @@ -99,7 +100,9 @@ pub enum LockFileRead { } /// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to -/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked. +/// inspect its content. +/// +/// It is not an `Err(...)` if the file does not exist or is already locked. /// Check the [`LockFileRead`] variants for details. pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result { let res = fs::OpenOptions::new().read(true).open(path); diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index f7b73dc984..71af43a4da 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -5,7 +5,9 @@ use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, EnumVariantNames}; -#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)] +#[derive( + EnumString, strum_macros::Display, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy, +)] #[strum(serialize_all = "snake_case")] pub enum LogFormat { Plain, @@ -274,6 +276,14 @@ impl From for SecretString { } } +impl FromStr for SecretString { + type Err = std::convert::Infallible; + + fn from_str(s: &str) -> Result { + Ok(Self(s.to_string())) + } +} + impl std::fmt::Debug for SecretString { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "[SECRET]") diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index b3269ae049..1aebe91428 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -415,7 +415,6 @@ mod tests { use super::*; - use serde::ser::Serialize; use serde_assert::{Deserializer, Serializer, Token, Tokens}; #[test] diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs index c9fbdde928..dede65e699 100644 --- a/libs/utils/src/pageserver_feedback.rs +++ b/libs/utils/src/pageserver_feedback.rs @@ -8,6 +8,7 @@ use tracing::{trace, warn}; use crate::lsn::Lsn; /// Feedback pageserver sends to safekeeper and safekeeper resends to compute. +/// /// Serialized in custom flexible key/value format. In replication protocol, it /// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres /// Standby status update / Hot standby feedback messages. @@ -29,12 +30,10 @@ pub struct PageserverFeedback { // Serialize with RFC3339 format. #[serde(with = "serde_systemtime")] pub replytime: SystemTime, + /// Used to track feedbacks from different shards. Always zero for unsharded tenants. + pub shard_number: u32, } -// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback. -// Do not remove previously available fields because this might be backwards incompatible. 
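As an aside on the `logging.rs` change above: adding `strum_macros::Display` next to the existing `EnumString` derive reuses the same `snake_case` serialization, so the names now round-trip in both directions. A small sketch:

```rust
use std::str::FromStr;
use utils::logging::LogFormat;

fn main() {
    let fmt = LogFormat::from_str("plain").unwrap(); // via EnumString
    assert_eq!(fmt, LogFormat::Plain);
    assert_eq!(fmt.to_string(), "plain"); // via the new Display derive
}
```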
-pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5; - impl PageserverFeedback { pub fn empty() -> PageserverFeedback { PageserverFeedback { @@ -43,6 +42,7 @@ impl PageserverFeedback { remote_consistent_lsn: Lsn::INVALID, disk_consistent_lsn: Lsn::INVALID, replytime: *PG_EPOCH, + shard_number: 0, } } @@ -59,17 +59,26 @@ impl PageserverFeedback { // // TODO: change serialized fields names once all computes migrate to rename. pub fn serialize(&self, buf: &mut BytesMut) { - buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys + let buf_ptr = buf.len(); + buf.put_u8(0); // # of keys, will be filled later + let mut nkeys = 0; + + nkeys += 1; buf.put_slice(b"current_timeline_size\0"); buf.put_i32(8); buf.put_u64(self.current_timeline_size); + nkeys += 1; buf.put_slice(b"ps_writelsn\0"); buf.put_i32(8); buf.put_u64(self.last_received_lsn.0); + + nkeys += 1; buf.put_slice(b"ps_flushlsn\0"); buf.put_i32(8); buf.put_u64(self.disk_consistent_lsn.0); + + nkeys += 1; buf.put_slice(b"ps_applylsn\0"); buf.put_i32(8); buf.put_u64(self.remote_consistent_lsn.0); @@ -80,9 +89,19 @@ impl PageserverFeedback { .expect("failed to serialize pg_replytime earlier than PG_EPOCH") .as_micros() as i64; + nkeys += 1; buf.put_slice(b"ps_replytime\0"); buf.put_i32(8); buf.put_i64(timestamp); + + if self.shard_number > 0 { + nkeys += 1; + buf.put_slice(b"shard_number\0"); + buf.put_i32(4); + buf.put_u32(self.shard_number); + } + + buf[buf_ptr] = nkeys; } // Deserialize PageserverFeedback message @@ -123,6 +142,11 @@ impl PageserverFeedback { rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); } } + b"shard_number" => { + let len = buf.get_i32(); + assert_eq!(len, 4); + rf.shard_number = buf.get_u32(); + } _ => { let len = buf.get_i32(); warn!( @@ -194,10 +218,7 @@ mod tests { rf.serialize(&mut data); // Add an extra field to the buffer and adjust number of keys - if let Some(first) = data.first_mut() { - *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1; - } - + data[0] += 1; data.put_slice(b"new_field_one\0"); data.put_i32(8); data.put_u64(42); diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs new file mode 100644 index 0000000000..c3e2fba20c --- /dev/null +++ b/libs/utils/src/poison.rs @@ -0,0 +1,123 @@ +//! Protect a piece of state from reuse after it is left in an inconsistent state. +//! +//! # Example +//! +//! ``` +//! # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async { +//! use utils::poison::Poison; +//! use std::time::Duration; +//! +//! struct State { +//! clean: bool, +//! } +//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true })); +//! +//! let mut mutex_guard = state.lock().await; +//! let mut poison_guard = mutex_guard.check_and_arm()?; +//! let state = poison_guard.data_mut(); +//! state.clean = false; +//! // If we get cancelled at this await point, subsequent check_and_arm() calls will fail. +//! tokio::time::sleep(Duration::from_secs(10)).await; +//! state.clean = true; +//! poison_guard.disarm(); +//! # Ok::<(), utils::poison::Error>(()) +//! # }); +//! ``` + +use tracing::warn; + +pub struct Poison { + what: &'static str, + state: State, + data: T, +} + +#[derive(Clone, Copy)] +enum State { + Clean, + Armed, + Poisoned { at: chrono::DateTime }, +} + +impl Poison { + /// We log `what` `warning!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed. 
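The rewritten `PageserverFeedback::serialize` above replaces the hard-coded `PAGESERVER_FEEDBACK_FIELDS_NUMBER` with a key count that is backpatched once serialization is done, which is what allows `shard_number` to be written only when non-zero. The same pattern in isolation, with hypothetical field names:

```rust
use bytes::{BufMut, BytesMut};

// Hypothetical message with one mandatory and one optional field.
fn serialize_kv(buf: &mut BytesMut, optional: u32) {
    let count_at = buf.len();
    buf.put_u8(0); // placeholder for the key count
    let mut nkeys = 0;

    nkeys += 1;
    buf.put_slice(b"mandatory\0");
    buf.put_i32(8);
    buf.put_u64(42);

    if optional > 0 {
        // Optional keys bump the count only when actually written.
        nkeys += 1;
        buf.put_slice(b"optional\0");
        buf.put_i32(4);
        buf.put_u32(optional);
    }

    // BytesMut derefs to a mutable slice, so we can backpatch in place.
    buf[count_at] = nkeys;
}
```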
+ pub fn new(what: &'static str, data: T) -> Self { + Self { + what, + state: State::Clean, + data, + } + } + + /// Check for poisoning and return a [`Guard`] that provides access to the wrapped state. + pub fn check_and_arm(&mut self) -> Result, Error> { + match self.state { + State::Clean => { + self.state = State::Armed; + Ok(Guard(self)) + } + State::Armed => unreachable!("transient state"), + State::Poisoned { at } => Err(Error::Poisoned { + what: self.what, + at, + }), + } + } +} + +/// Armed pointer to a [`Poison`]. +/// +/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state. +/// Once modifications are done, use [`Self::disarm`]. +/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned +/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error. +pub struct Guard<'a, T>(&'a mut Poison); + +impl<'a, T> Guard<'a, T> { + pub fn data(&self) -> &T { + &self.0.data + } + pub fn data_mut(&mut self) -> &mut T { + &mut self.0.data + } + + pub fn disarm(self) { + match self.0.state { + State::Clean => unreachable!("we set it to Armed in check_and_arm()"), + State::Armed => { + self.0.state = State::Clean; + } + State::Poisoned { at } => { + unreachable!("we fail check_and_arm() if it's in that state: {at}") + } + } + } +} + +impl<'a, T> Drop for Guard<'a, T> { + fn drop(&mut self) { + match self.0.state { + State::Clean => { + // set by disarm() + } + State::Armed => { + // still armed => poison it + let at = chrono::Utc::now(); + self.0.state = State::Poisoned { at }; + warn!(at=?at, "poisoning {}", self.0.what); + } + State::Poisoned { at } => { + unreachable!("we fail check_and_arm() if it's in that state: {at}") + } + } + } +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("poisoned at {at}: {what}")] + Poisoned { + what: &'static str, + at: chrono::DateTime, + }, +} diff --git a/libs/utils/src/rate_limit.rs b/libs/utils/src/rate_limit.rs index 557955bb88..f3f8f219e3 100644 --- a/libs/utils/src/rate_limit.rs +++ b/libs/utils/src/rate_limit.rs @@ -5,6 +5,15 @@ use std::time::{Duration, Instant}; pub struct RateLimit { last: Option, interval: Duration, + dropped: u64, +} + +pub struct RateLimitStats(u64); + +impl std::fmt::Display for RateLimitStats { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{} dropped calls", self.0) + } } impl RateLimit { @@ -12,20 +21,27 @@ impl RateLimit { Self { last: None, interval, + dropped: 0, } } /// Call `f` if the rate limit allows. /// Don't call it otherwise. 
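Complementing the happy-path doctest in `poison.rs` above, a sketch of what the guard's `Drop` impl buys: a guard dropped without `disarm` (early return, cancellation) poisons the state, so later callers fail loudly instead of reusing a half-updated value:

```rust
use utils::poison::{Error, Poison};

fn main() {
    let mut p = Poison::new("demo state", 0u32);

    {
        let mut guard = p.check_and_arm().unwrap();
        *guard.data_mut() += 1;
        // Dropped while still armed: the state is now poisoned and a
        // warning is logged.
    }

    // Every subsequent arm attempt reports when the poisoning happened.
    assert!(matches!(p.check_and_arm(), Err(Error::Poisoned { .. })));
}
```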
pub fn call(&mut self, f: F) { + self.call2(|_| f()) + } + + pub fn call2(&mut self, f: F) { let now = Instant::now(); match self.last { Some(last) if now - last <= self.interval => { // ratelimit + self.dropped += 1; } _ => { self.last = Some(now); - f(); + f(RateLimitStats(self.dropped)); + self.dropped = 0; } } } diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index effc9c67b5..375b227b99 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -1,12 +1,11 @@ #![warn(missing_docs)] -use std::cmp::{Eq, Ordering, PartialOrd}; +use std::cmp::{Eq, Ordering}; use std::collections::BinaryHeap; -use std::fmt::Debug; use std::mem; use std::sync::Mutex; use std::time::Duration; -use tokio::sync::watch::{channel, Receiver, Sender}; +use tokio::sync::watch::{self, channel}; use tokio::time::timeout; /// An error happened while waiting for a number @@ -35,23 +34,73 @@ pub trait MonotonicCounter { fn cnt_value(&self) -> V; } -/// Internal components of a `SeqWait` -struct SeqWaitInt +/// Heap of waiters, lowest numbers pop first. +struct Waiters where - S: MonotonicCounter, V: Ord, { - waiters: BinaryHeap>, - current: S, - shutdown: bool, + heap: BinaryHeap>, + /// Number of the first waiter in the heap, or None if there are no waiters. + status_channel: watch::Sender>, +} + +impl Waiters +where + V: Ord + Copy, +{ + fn new() -> Self { + Waiters { + heap: BinaryHeap::new(), + status_channel: channel(None).0, + } + } + + /// `status_channel` contains the number of the first waiter in the heap. + /// This function should be called whenever waiters heap changes. + fn update_status(&self) { + let first_waiter = self.heap.peek().map(|w| w.wake_num); + let _ = self.status_channel.send_replace(first_waiter); + } + + /// Add new waiter to the heap, return a channel that will be notified when the number arrives. + fn add(&mut self, num: V) -> watch::Receiver<()> { + let (tx, rx) = channel(()); + self.heap.push(Waiter { + wake_num: num, + wake_channel: tx, + }); + self.update_status(); + rx + } + + /// Pop all waiters <= num from the heap. Collect channels in a vector, + /// so that caller can wake them up. + fn pop_leq(&mut self, num: V) -> Vec> { + let mut wake_these = Vec::new(); + while let Some(n) = self.heap.peek() { + if n.wake_num > num { + break; + } + wake_these.push(self.heap.pop().unwrap().wake_channel); + } + self.update_status(); + wake_these + } + + /// Used on shutdown to efficiently drop all waiters. + fn take_all(&mut self) -> BinaryHeap> { + let heap = mem::take(&mut self.heap); + self.update_status(); + heap + } } struct Waiter where T: Ord, { - wake_num: T, // wake me when this number arrives ... - wake_channel: Sender<()>, // ... by sending a message to this channel + wake_num: T, // wake me when this number arrives ... + wake_channel: watch::Sender<()>, // ... by sending a message to this channel } // BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here @@ -76,6 +125,17 @@ impl PartialEq for Waiter { impl Eq for Waiter {} +/// Internal components of a `SeqWait` +struct SeqWaitInt +where + S: MonotonicCounter, + V: Ord, +{ + waiters: Waiters, + current: S, + shutdown: bool, +} + /// A tool for waiting on a sequence number /// /// This provides a way to wait the arrival of a number. 
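The `dropped` counter added to `RateLimit` above makes throttled logging self-describing: `call2` hands the closure the number of calls suppressed since it last ran. A usage sketch:

```rust
use std::time::Duration;
use utils::rate_limit::RateLimit;

fn main() {
    // Run the closure at most once every 10 seconds.
    let mut rate_limit = RateLimit::new(Duration::from_secs(10));
    for i in 0..1_000_000 {
        rate_limit.call2(|stats| {
            // `stats` displays as "N dropped calls".
            println!("made progress: i={i} ({stats})");
        });
    }
}
```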
@@ -108,7 +168,7 @@ where /// Create a new `SeqWait`, initialized to a particular number pub fn new(starting_num: S) -> Self { let internal = SeqWaitInt { - waiters: BinaryHeap::new(), + waiters: Waiters::new(), current: starting_num, shutdown: false, }; @@ -128,9 +188,8 @@ where // Block any future waiters from starting internal.shutdown = true; - // This will steal the entire waiters map. - // When we drop it all waiters will be woken. - mem::take(&mut internal.waiters) + // Take all waiters to drop them later. + internal.waiters.take_all() // Drop the lock as we exit this scope. }; @@ -182,9 +241,21 @@ where } } + /// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`. + pub fn would_wait_for(&self, num: V) -> Result<(), V> { + let internal = self.internal.lock().unwrap(); + let cnt = internal.current.cnt_value(); + drop(internal); + if cnt >= num { + Ok(()) + } else { + Err(cnt) + } + } + /// Register and return a channel that will be notified when a number arrives, /// or None, if it has already arrived. - fn queue_for_wait(&self, num: V) -> Result>, SeqWaitError> { + fn queue_for_wait(&self, num: V) -> Result>, SeqWaitError> { let mut internal = self.internal.lock().unwrap(); if internal.current.cnt_value() >= num { return Ok(None); @@ -193,12 +264,8 @@ where return Err(SeqWaitError::Shutdown); } - // Create a new channel. - let (tx, rx) = channel(()); - internal.waiters.push(Waiter { - wake_num: num, - wake_channel: tx, - }); + // Add waiter channel to the queue. + let rx = internal.waiters.add(num); // Drop the lock as we exit this scope. Ok(Some(rx)) } @@ -219,16 +286,8 @@ where } internal.current.cnt_advance(num); - // Pop all waiters <= num from the heap. Collect them in a vector, and - // wake them up after releasing the lock. - let mut wake_these = Vec::new(); - while let Some(n) = internal.waiters.peek() { - if n.wake_num > num { - break; - } - wake_these.push(internal.waiters.pop().unwrap().wake_channel); - } - wake_these + // Pop all waiters <= num from the heap. + internal.waiters.pop_leq(num) }; for tx in wake_these { @@ -243,13 +302,29 @@ where pub fn load(&self) -> S { self.internal.lock().unwrap().current } + + /// Get a Receiver for the current status. + /// + /// The current status is the number of the first waiter in the queue, + /// or None if there are no waiters. + /// + /// This receiver will be notified whenever the status changes. + /// It is useful for receiving notifications when the first waiter + /// starts waiting for a number, or when there are no more waiters left. + pub fn status_receiver(&self) -> watch::Receiver> { + self.internal + .lock() + .unwrap() + .waiters + .status_channel + .subscribe() + } } #[cfg(test)] mod tests { use super::*; use std::sync::Arc; - use std::time::Duration; impl MonotonicCounter for i32 { fn cnt_advance(&mut self, val: i32) { diff --git a/libs/utils/src/serde_system_time.rs b/libs/utils/src/serde_system_time.rs new file mode 100644 index 0000000000..b0f6934e87 --- /dev/null +++ b/libs/utils/src/serde_system_time.rs @@ -0,0 +1,55 @@ +//! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision. 
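A sketch exercising the two `SeqWait` additions above, `would_wait_for` and `status_receiver`. Note that `i32` implements `MonotonicCounter` only inside the module's tests; a real caller supplies its own counter type:

```rust
use std::sync::Arc;
use utils::seqwait::SeqWait;

#[tokio::main]
async fn main() {
    // i32 implements MonotonicCounter<i32> in the module's tests.
    let seq = Arc::new(SeqWait::new(0i32));

    // Err(current) means a wait_for(5) would block right now.
    assert_eq!(seq.would_wait_for(5), Err(0));

    // The status channel reports the lowest number anyone is waiting on.
    let mut status = seq.status_receiver();

    let task = tokio::spawn({
        let seq = Arc::clone(&seq);
        async move { seq.wait_for(5).await.unwrap() }
    });

    // Block until the spawned task has queued itself as a waiter.
    status.wait_for(|first| *first == Some(5)).await.unwrap();

    seq.advance(5);
    task.await.unwrap();
    assert_eq!(seq.would_wait_for(5), Ok(()));
}
```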
+ +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct SystemTime( + #[serde( + deserialize_with = "deser_rfc3339_millis", + serialize_with = "ser_rfc3339_millis" + )] + pub std::time::SystemTime, +); + +fn ser_rfc3339_millis( + ts: &std::time::SystemTime, + serializer: S, +) -> Result { + serializer.collect_str(&humantime::format_rfc3339_millis(*ts)) +} + +fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result +where + D: serde::de::Deserializer<'de>, +{ + let s: String = serde::de::Deserialize::deserialize(deserializer)?; + humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom) +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds. + fn to_millisecond_precision(time: SystemTime) -> SystemTime { + match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) { + Ok(duration) => { + let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis()); + SystemTime( + std::time::SystemTime::UNIX_EPOCH + + std::time::Duration::from_millis(total_millis), + ) + } + Err(_) => time, + } + } + + #[test] + fn test_serialize_deserialize() { + let input = SystemTime(std::time::SystemTime::now()); + let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0)); + let serialized = serde_json::to_string(&input).unwrap(); + assert_eq!(expected_serialized, serialized); + let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap(); + assert_eq!(to_millisecond_precision(input), deserialized); + } +} diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs new file mode 100644 index 0000000000..d146010b41 --- /dev/null +++ b/libs/utils/src/shard.rs @@ -0,0 +1,453 @@ +//! See `pageserver_api::shard` for description on sharding. + +use std::{ops::RangeInclusive, str::FromStr}; + +use hex::FromHex; +use serde::{Deserialize, Serialize}; + +use crate::id::TenantId; + +#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] +pub struct ShardNumber(pub u8); + +#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] +pub struct ShardCount(pub u8); + +/// Combination of ShardNumber and ShardCount. +/// +/// For use within the context of a particular tenant, when we need to know which shard we're +/// dealing with, but do not need to know the full ShardIdentity (because we won't be doing +/// any page->shard mapping), and do not need to know the fully qualified TenantShardId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct ShardIndex { + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +/// Formatting helper, for generating the `shard_id` label in traces. +pub struct ShardSlug<'a>(&'a TenantShardId); + +/// TenantShardId globally identifies a particular shard in a particular tenant. +/// +/// These are written as `-`, for example: +/// # The second shard in a two-shard tenant +/// 072f1291a5310026820b2fe4b2968934-0102 +/// +/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without +/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables +/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`]. 
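Because the new wrapper above is `#[serde(transparent)]`, it drops straight into API types that want millisecond-precision RFC3339 timestamps. A sketch with a hypothetical `ConsumptionEvent` struct:

```rust
use utils::serde_system_time::SystemTime;

// Hypothetical API type using the wrapper.
#[derive(serde::Serialize, serde::Deserialize)]
struct ConsumptionEvent {
    // Serializes as e.g. "2024-04-01T12:34:56.789Z".
    recorded_at: SystemTime,
}

fn main() {
    let e = ConsumptionEvent {
        recorded_at: SystemTime(std::time::SystemTime::UNIX_EPOCH),
    };
    assert_eq!(
        serde_json::to_string(&e).unwrap(),
        r#"{"recorded_at":"1970-01-01T00:00:00.000Z"}"#
    );
}
```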
+/// +/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, +/// is both forward and backward compatible with TenantId: a legacy TenantId can be +/// decoded as a TenantShardId, and when re-encoded it will be parseable +/// as a TenantId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct TenantShardId { + pub tenant_id: TenantId, + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +impl ShardCount { + pub const MAX: Self = Self(u8::MAX); + pub const MIN: Self = Self(0); + + /// The internal value of a ShardCount may be zero, which means "1 shard, but use + /// legacy format for TenantShardId that excludes the shard suffix", also known + /// as [`TenantShardId::unsharded`]. + /// + /// This method returns the actual number of shards, i.e. if our internal value is + /// zero, we return 1 (unsharded tenants have 1 shard). + pub fn count(&self) -> u8 { + if self.0 > 0 { + self.0 + } else { + 1 + } + } + + /// The literal internal value: this is **not** the number of shards in the + /// tenant, as we have a special zero value for legacy unsharded tenants. Use + /// [`Self::count`] if you want to know the cardinality of shards. + pub fn literal(&self) -> u8 { + self.0 + } + + /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but + /// uses the legacy format for `TenantShardId`. See also the documentation for + /// [`Self::count`]. + pub fn is_unsharded(&self) -> bool { + self.0 == 0 + } + + /// `v` may be zero, or the number of shards in the tenant. `v` is what + /// [`Self::literal`] would return. + pub const fn new(val: u8) -> Self { + Self(val) + } +} + +impl ShardNumber { + pub const MAX: Self = Self(u8::MAX); +} + +impl TenantShardId { + pub fn unsharded(tenant_id: TenantId) -> Self { + Self { + tenant_id, + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + } + } + + /// The range of all TenantShardId that belong to a particular TenantId. This is useful when + /// you have a BTreeMap of TenantShardId, and are querying by TenantId. + pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive { + RangeInclusive::new( + Self { + tenant_id, + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + }, + Self { + tenant_id, + shard_number: ShardNumber::MAX, + shard_count: ShardCount::MAX, + }, + ) + } + + pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { + ShardSlug(self) + } + + /// Convenience for code that has special behavior on the 0th shard. + pub fn is_shard_zero(&self) -> bool { + self.shard_number == ShardNumber(0) + } + + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. + pub fn is_unsharded(&self) -> bool { + self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() + } + + /// Convenience for dropping the tenant_id and just getting the ShardIndex: this + /// is useful when logging from code that is already in a span that includes tenant ID, to + /// keep messages reasonably terse. + pub fn to_index(&self) -> ShardIndex { + ShardIndex { + shard_number: self.shard_number, + shard_count: self.shard_count, + } + } + + /// Calculate the children of this TenantShardId when splitting the overall tenant into + /// the given number of shards. 
+ pub fn split(&self, new_shard_count: ShardCount) -> Vec { + let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1); + let mut child_shards = Vec::new(); + for shard_number in 0..ShardNumber(new_shard_count.0).0 { + // Key mapping is based on a round robin mapping of key hash modulo shard count, + // so our child shards are the ones which the same keys would map to. + if shard_number % effective_old_shard_count == self.shard_number.0 { + child_shards.push(TenantShardId { + tenant_id: self.tenant_id, + shard_number: ShardNumber(shard_number), + shard_count: new_shard_count, + }) + } + } + + child_shards + } +} + +impl<'a> std::fmt::Display for ShardSlug<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{:02x}{:02x}", + self.0.shard_number.0, self.0.shard_count.0 + ) + } +} + +impl std::fmt::Display for TenantShardId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.shard_count != ShardCount(0) { + write!(f, "{}-{}", self.tenant_id, self.shard_slug()) + } else { + // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this + // is distinct from the normal single shard case (shard count == 1). + self.tenant_id.fmt(f) + } + } +} + +impl std::fmt::Debug for TenantShardId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // Debug is the same as Display: the compact hex representation + write!(f, "{}", self) + } +} + +impl std::str::FromStr for TenantShardId { + type Err = hex::FromHexError; + + fn from_str(s: &str) -> Result { + // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count + if s.len() == 32 { + // Legacy case: no shard specified + Ok(Self { + tenant_id: TenantId::from_str(s)?, + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + }) + } else if s.len() == 37 { + let bytes = s.as_bytes(); + let tenant_id = TenantId::from_hex(&bytes[0..32])?; + let mut shard_parts: [u8; 2] = [0u8; 2]; + hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?; + Ok(Self { + tenant_id, + shard_number: ShardNumber(shard_parts[0]), + shard_count: ShardCount(shard_parts[1]), + }) + } else { + Err(hex::FromHexError::InvalidStringLength) + } + } +} + +impl From<[u8; 18]> for TenantShardId { + fn from(b: [u8; 18]) -> Self { + let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap(); + + Self { + tenant_id: TenantId::from(tenant_id_bytes), + shard_number: ShardNumber(b[16]), + shard_count: ShardCount(b[17]), + } + } +} + +impl ShardIndex { + pub fn new(number: ShardNumber, count: ShardCount) -> Self { + Self { + shard_number: number, + shard_count: count, + } + } + pub fn unsharded() -> Self { + Self { + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + } + } + + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. + pub fn is_unsharded(&self) -> bool { + self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) + } + + /// For use in constructing remote storage paths: concatenate this with a TenantId + /// to get a fully qualified TenantShardId. + /// + /// Backward compat: this function returns an empty string if Self::is_unsharded, such + /// that the legacy pre-sharding remote key format is preserved. 
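A sketch tying together the `TenantShardId` encoding rules and `split` above; the hex literals follow the doc comment's own example of a two-shard tenant:

```rust
use std::str::FromStr;
use utils::shard::{ShardCount, ShardNumber, TenantShardId};

fn main() {
    // The second shard (number 1) of a two-shard tenant.
    let id = TenantShardId::from_str("072f1291a5310026820b2fe4b2968934-0102").unwrap();
    assert_eq!((id.shard_number, id.shard_count), (ShardNumber(1), ShardCount(2)));

    // A bare TenantId parses as the legacy unsharded form and re-encodes
    // without a suffix.
    let legacy = TenantShardId::from_str("072f1291a5310026820b2fe4b2968934").unwrap();
    assert!(legacy.is_unsharded());
    assert_eq!(legacy.to_string(), "072f1291a5310026820b2fe4b2968934");

    // Splitting 2 -> 4 shards: shard 1's keys land on children 1 and 3.
    let children: Vec<ShardNumber> = id
        .split(ShardCount(4))
        .into_iter()
        .map(|c| c.shard_number)
        .collect();
    assert_eq!(children, vec![ShardNumber(1), ShardNumber(3)]);
}
```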
+ pub fn get_suffix(&self) -> String { + if self.is_unsharded() { + "".to_string() + } else { + format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0) + } + } +} + +impl std::fmt::Display for ShardIndex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0) + } +} + +impl std::fmt::Debug for ShardIndex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // Debug is the same as Display: the compact hex representation + write!(f, "{}", self) + } +} + +impl std::str::FromStr for ShardIndex { + type Err = hex::FromHexError; + + fn from_str(s: &str) -> Result { + // Expect format: 1 byte shard number, 1 byte shard count + if s.len() == 4 { + let bytes = s.as_bytes(); + let mut shard_parts: [u8; 2] = [0u8; 2]; + hex::decode_to_slice(bytes, &mut shard_parts)?; + Ok(Self { + shard_number: ShardNumber(shard_parts[0]), + shard_count: ShardCount(shard_parts[1]), + }) + } else { + Err(hex::FromHexError::InvalidStringLength) + } + } +} + +impl From<[u8; 2]> for ShardIndex { + fn from(b: [u8; 2]) -> Self { + Self { + shard_number: ShardNumber(b[0]), + shard_count: ShardCount(b[1]), + } + } +} + +impl Serialize for TenantShardId { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + if serializer.is_human_readable() { + serializer.collect_str(self) + } else { + // Note: while human encoding of [`TenantShardId`] is backward and forward + // compatible, this binary encoding is not. + let mut packed: [u8; 18] = [0; 18]; + packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); + packed[16] = self.shard_number.0; + packed[17] = self.shard_count.0; + + packed.serialize(serializer) + } + } +} + +impl<'de> Deserialize<'de> for TenantShardId { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct IdVisitor { + is_human_readable_deserializer: bool, + } + + impl<'de> serde::de::Visitor<'de> for IdVisitor { + type Value = TenantShardId; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + if self.is_human_readable_deserializer { + formatter.write_str("value in form of hex string") + } else { + formatter.write_str("value in form of integer array([u8; 18])") + } + } + + fn visit_seq(self, seq: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + let s = serde::de::value::SeqAccessDeserializer::new(seq); + let id: [u8; 18] = Deserialize::deserialize(s)?; + Ok(TenantShardId::from(id)) + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + TenantShardId::from_str(v).map_err(E::custom) + } + } + + if deserializer.is_human_readable() { + deserializer.deserialize_str(IdVisitor { + is_human_readable_deserializer: true, + }) + } else { + deserializer.deserialize_tuple( + 18, + IdVisitor { + is_human_readable_deserializer: false, + }, + ) + } + } +} + +impl Serialize for ShardIndex { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + if serializer.is_human_readable() { + serializer.collect_str(self) + } else { + // Binary encoding is not used in index_part.json, but is included in anticipation of + // switching various structures (e.g. inter-process communication, remote metadata) to more + // compact binary encodings in future. 
+ let mut packed: [u8; 2] = [0; 2]; + packed[0] = self.shard_number.0; + packed[1] = self.shard_count.0; + packed.serialize(serializer) + } + } +} + +impl<'de> Deserialize<'de> for ShardIndex { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct IdVisitor { + is_human_readable_deserializer: bool, + } + + impl<'de> serde::de::Visitor<'de> for IdVisitor { + type Value = ShardIndex; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + if self.is_human_readable_deserializer { + formatter.write_str("value in form of hex string") + } else { + formatter.write_str("value in form of integer array([u8; 2])") + } + } + + fn visit_seq(self, seq: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + let s = serde::de::value::SeqAccessDeserializer::new(seq); + let id: [u8; 2] = Deserialize::deserialize(s)?; + Ok(ShardIndex::from(id)) + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + ShardIndex::from_str(v).map_err(E::custom) + } + } + + if deserializer.is_human_readable() { + deserializer.deserialize_str(IdVisitor { + is_human_readable_deserializer: true, + }) + } else { + deserializer.deserialize_tuple( + 2, + IdVisitor { + is_human_readable_deserializer: false, + }, + ) + } + } +} diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs index dc4a599111..01750b2aef 100644 --- a/libs/utils/src/simple_rcu.rs +++ b/libs/utils/src/simple_rcu.rs @@ -49,12 +49,11 @@ use std::sync::{RwLock, RwLockWriteGuard}; use tokio::sync::watch; -/// /// Rcu allows multiple readers to read and hold onto a value without blocking -/// (for very long). Storing to the Rcu updates the value, making new readers -/// immediately see the new value, but it also waits for all current readers to -/// finish. +/// (for very long). /// +/// Storing to the Rcu updates the value, making new readers immediately see +/// the new value, but it also waits for all current readers to finish. pub struct Rcu { inner: RwLock>, } @@ -221,7 +220,7 @@ impl RcuWaitList { #[cfg(test)] mod tests { use super::*; - use std::sync::{Arc, Mutex}; + use std::sync::Mutex; use std::time::Duration; #[tokio::test] diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index abc3842da8..16ec563fa7 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -1,4 +1,10 @@ -use std::{sync::Arc, time::Duration}; +use std::{ + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + time::Duration, +}; /// Gates are a concurrency helper, primarily used for implementing safe shutdown. /// @@ -6,79 +12,92 @@ use std::{sync::Arc, time::Duration}; /// the resource calls `close()` when they want to ensure that all holders of guards /// have released them, and that no future guards will be issued. pub struct Gate { - /// Each caller of enter() takes one unit from the semaphore. In close(), we - /// take all the units to ensure all GateGuards are destroyed. 
- sem: Arc, - - /// For observability only: a name that will be used to log warnings if a particular - /// gate is holding up shutdown - name: String, + inner: Arc, } impl std::fmt::Debug for Gate { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "Gate<{}>", self.name) + f.debug_struct("Gate") + // use this for identification + .field("ptr", &Arc::as_ptr(&self.inner)) + .field("inner", &self.inner) + .finish() + } +} + +struct GateInner { + sem: tokio::sync::Semaphore, + closing: std::sync::atomic::AtomicBool, +} + +impl std::fmt::Debug for GateInner { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let avail = self.sem.available_permits(); + + let guards = u32::try_from(avail) + .ok() + // the sem only supports 32-bit ish amount, but lets play it safe + .and_then(|x| Gate::MAX_UNITS.checked_sub(x)); + + let closing = self.closing.load(Ordering::Relaxed); + + if let Some(guards) = guards { + f.debug_struct("Gate") + .field("remaining_guards", &guards) + .field("closing", &closing) + .finish() + } else { + f.debug_struct("Gate") + .field("avail_permits", &avail) + .field("closing", &closing) + .finish() + } } } /// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will /// not complete. #[derive(Debug)] -pub struct GateGuard(tokio::sync::OwnedSemaphorePermit); - -/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate -async fn warn_if_stuck( - fut: Fut, - name: &str, - warn_period: std::time::Duration, -) -> ::Output { - let started = std::time::Instant::now(); - - let mut fut = std::pin::pin!(fut); - - let mut warned = false; - let ret = loop { - match tokio::time::timeout(warn_period, &mut fut).await { - Ok(ret) => break ret, - Err(_) => { - tracing::warn!( - gate = name, - elapsed_ms = started.elapsed().as_millis(), - "still waiting, taking longer than expected..." - ); - warned = true; - } - } - }; - - // If we emitted a warning for slowness, also emit a message when we complete, so that - // someone debugging a shutdown can know for sure whether we have moved past this operation. - if warned { - tracing::info!( - gate = name, - elapsed_ms = started.elapsed().as_millis(), - "completed, after taking longer than expected" - ) - } - - ret +pub struct GateGuard { + // Record the span where the gate was entered, so that we can identify who was blocking Gate::close + span_at_enter: tracing::Span, + gate: Arc, } -#[derive(Debug)] +impl Drop for GateGuard { + fn drop(&mut self) { + if self.gate.closing.load(Ordering::Relaxed) { + self.span_at_enter.in_scope( + || tracing::info!(gate = ?Arc::as_ptr(&self.gate), "kept the gate from closing"), + ); + } + + // when the permit was acquired, it was forgotten to allow us to manage it's lifecycle + // manually, so "return" the permit now. + self.gate.sem.add_permits(1); + } +} + +#[derive(Debug, thiserror::Error)] pub enum GateError { + #[error("gate is closed")] GateClosed, } +impl Default for Gate { + fn default() -> Self { + Self { + inner: Arc::new(GateInner { + sem: tokio::sync::Semaphore::new(Self::MAX_UNITS as usize), + closing: AtomicBool::new(false), + }), + } + } +} + impl Gate { const MAX_UNITS: u32 = u32::MAX; - pub fn new(name: String) -> Self { - Self { - sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)), - name, - } - } - /// Acquire a guard that will prevent close() calls from completing. 
If close() /// was already called, this will return an error which should be interpreted /// as "shutting down". @@ -88,11 +107,23 @@ impl Gate { /// to avoid blocking close() indefinitely: typically types that contain a Gate will /// also contain a CancellationToken. pub fn enter(&self) -> Result { - self.sem - .clone() - .try_acquire_owned() - .map(GateGuard) - .map_err(|_| GateError::GateClosed) + let permit = self + .inner + .sem + .try_acquire() + .map_err(|_| GateError::GateClosed)?; + + // we now have the permit, let's disable the normal raii functionality and leave + // "returning" the permit to our GateGuard::drop. + // + // this is done to avoid the need for multiple Arcs (one for semaphore, next for other + // fields). + permit.forget(); + + Ok(GateGuard { + span_at_enter: tracing::Span::current(), + gate: self.inner.clone(), + }) } /// Types with a shutdown() method and a gate should call this method at the @@ -102,48 +133,89 @@ impl Gate { /// important that the holders of such guards are respecting a CancellationToken which has /// been cancelled before entering this function. pub async fn close(&self) { - warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await + let started_at = std::time::Instant::now(); + let mut do_close = std::pin::pin!(self.do_close()); + + // with 1s we rarely saw anything, let's try if we get more gate closing reasons with 100ms + let nag_after = Duration::from_millis(100); + + let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else { + return; + }; + + tracing::info!( + gate = ?self.as_ptr(), + elapsed_ms = started_at.elapsed().as_millis(), + "closing is taking longer than expected" + ); + + // close operation is not trying to be cancellation safe as pageserver does not need it. + // + // note: "closing" is not checked in Gate::enter -- it exists just for observability, + // dropping of GateGuard after this will log who they were. + self.inner.closing.store(true, Ordering::Relaxed); + + do_close.await; + + tracing::info!( + gate = ?self.as_ptr(), + elapsed_ms = started_at.elapsed().as_millis(), + "close completed" + ); + } + + /// Used as an identity of a gate. This identity will be resolved to something useful when + /// it's actually closed in a hopefully sensible `tracing::Span` which will describe it even + /// more. + /// + /// `GateGuard::drop` also logs this pointer when it has realized it has been keeping the gate + /// open for too long. + fn as_ptr(&self) -> *const GateInner { + Arc::as_ptr(&self.inner) } /// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish. This /// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking /// the CancellationToken on such types is analogous to "Did shutdown start?" pub fn close_complete(&self) -> bool { - self.sem.is_closed() + self.inner.sem.is_closed() } + #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(gate = ?self.as_ptr()))] async fn do_close(&self) { - tracing::debug!(gate = self.name, "Closing Gate..."); - match self.sem.acquire_many(Self::MAX_UNITS).await { - Ok(_units) => { + tracing::debug!("Closing Gate..."); + + match self.inner.sem.acquire_many(Self::MAX_UNITS).await { + Ok(_permit) => { // While holding all units, close the semaphore. All subsequent calls to enter() will fail. 
- self.sem.close(); + self.inner.sem.close(); } - Err(_) => { + Err(_closed) => { // Semaphore closed: we are the only function that can do this, so it indicates a double-call. // This is legal. Timeline::shutdown for example is not protected from being called more than // once. - tracing::debug!(gate = self.name, "Double close") + tracing::debug!("Double close") } } - tracing::debug!(gate = self.name, "Closed Gate.") + tracing::debug!("Closed Gate.") } } #[cfg(test)] mod tests { - use futures::FutureExt; - use super::*; #[tokio::test] - async fn test_idle_gate() { - // Having taken no gates, we should not be blocked in close - let gate = Gate::new("test".to_string()); + async fn close_unused() { + // Having taken no guards, we should not be blocked in close + let gate = Gate::default(); gate.close().await; + } + #[tokio::test] + async fn close_idle() { // If a guard is dropped before entering, close should not be blocked - let gate = Gate::new("test".to_string()); + let gate = Gate::default(); let guard = gate.enter().unwrap(); drop(guard); gate.close().await; @@ -152,25 +224,30 @@ mod tests { gate.enter().expect_err("enter should fail after close"); } - #[tokio::test] - async fn test_busy_gate() { - let gate = Gate::new("test".to_string()); + #[tokio::test(start_paused = true)] + async fn close_busy_gate() { + let gate = Gate::default(); + let forever = Duration::from_secs(24 * 7 * 365); - let guard = gate.enter().unwrap(); + let guard = + tracing::info_span!("i am holding back the gate").in_scope(|| gate.enter().unwrap()); let mut close_fut = std::pin::pin!(gate.close()); - // Close should be blocked - assert!(close_fut.as_mut().now_or_never().is_none()); + // Close should be waiting for guards to drop + tokio::time::timeout(forever, &mut close_fut) + .await + .unwrap_err(); // Attempting to enter() should fail, even though close isn't done yet. gate.enter() .expect_err("enter should fail after entering close"); + // this will now log, which we cannot verify except manually drop(guard); // Guard is gone, close should finish - assert!(close_fut.as_mut().now_or_never().is_some()); + close_fut.await; // Attempting to enter() is still forbidden gate.enter().expect_err("enter should fail finishing close"); diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 0ccaf4e716..dc711fb028 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -5,7 +5,9 @@ use std::sync::{ use tokio::sync::Semaphore; /// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of -/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard +/// `SemaphorePermit`. +/// +/// Allows use of `take` which does not require holding an outer mutex guard /// for the duration of initialization. /// /// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`]. 
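With gate names gone, a `Gate` above is identified by its `Arc` pointer plus the tracing span captured at `enter()`. A shutdown-ordering sketch, assuming a `tokio` runtime:

```rust
use utils::sync::gate::Gate;

#[tokio::main]
async fn main() {
    let gate = Gate::default(); // replaces Gate::new(name)

    {
        // Hold a guard across an operation on the shared resource. If
        // close() is already waiting when this guard drops, the span the
        // guard was created in is logged as what kept the gate open.
        let _guard = gate.enter().expect("gate is not closed yet");
        // ... do work ...
    }

    // Shutdown: wait for outstanding guards (nagging after ~100ms),
    // then refuse all future enter() calls.
    gate.close().await;
    assert!(gate.enter().is_err());
    assert!(gate.close_complete());
}
```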
@@ -69,37 +71,87 @@ impl OnceCell { F: FnOnce(InitPermit) -> Fut, Fut: std::future::Future>, { - let sem = { + loop { + let sem = { + let guard = self.inner.lock().unwrap(); + if guard.value.is_some() { + return Ok(Guard(guard)); + } + guard.init_semaphore.clone() + }; + + { + let permit = { + // increment the count for the duration of queued + let _guard = CountWaitingInitializers::start(self); + sem.acquire().await + }; + + let Ok(permit) = permit else { + let guard = self.inner.lock().unwrap(); + if !Arc::ptr_eq(&sem, &guard.init_semaphore) { + // there was a take_and_deinit in between + continue; + } + assert!( + guard.value.is_some(), + "semaphore got closed, must be initialized" + ); + return Ok(Guard(guard)); + }; + + permit.forget(); + } + + let permit = InitPermit(sem); + let (value, _permit) = factory(permit).await?; + let guard = self.inner.lock().unwrap(); - if guard.value.is_some() { - return Ok(Guard(guard)); - } - guard.init_semaphore.clone() - }; - let permit = { - // increment the count for the duration of queued - let _guard = CountWaitingInitializers::start(self); - sem.acquire_owned().await - }; - - match permit { - Ok(permit) => { - let permit = InitPermit(permit); - let (value, _permit) = factory(permit).await?; + return Ok(Self::set0(value, guard)); + } + } + /// Returns a guard to an existing initialized value, or returns an unique initialization + /// permit which can be used to initialize this `OnceCell` using `OnceCell::set`. + pub async fn get_or_init_detached(&self) -> Result, InitPermit> { + // It looks like OnceCell::get_or_init could be implemented using this method instead of + // duplication. However, that makes the future be !Send due to possibly holding on to the + // MutexGuard over an await point. + loop { + let sem = { let guard = self.inner.lock().unwrap(); + if guard.value.is_some() { + return Ok(Guard(guard)); + } + guard.init_semaphore.clone() + }; - Ok(Self::set0(value, guard)) - } - Err(_closed) => { - let guard = self.inner.lock().unwrap(); - assert!( - guard.value.is_some(), - "semaphore got closed, must be initialized" - ); - return Ok(Guard(guard)); + { + let permit = { + // increment the count for the duration of queued + let _guard = CountWaitingInitializers::start(self); + sem.acquire().await + }; + + let Ok(permit) = permit else { + let guard = self.inner.lock().unwrap(); + if !Arc::ptr_eq(&sem, &guard.init_semaphore) { + // there was a take_and_deinit in between + continue; + } + assert!( + guard.value.is_some(), + "semaphore got closed, must be initialized" + ); + return Ok(Guard(guard)); + }; + + permit.forget(); } + + let permit = InitPermit(sem); + return Err(permit); } } @@ -142,6 +194,14 @@ impl OnceCell { } } + /// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell was never + /// initialized. + pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> { + let inner = self.inner.get_mut().unwrap(); + + inner.take_and_deinit() + } + /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete. pub fn initializer_count(&self) -> usize { self.initializers.load(Ordering::Relaxed) @@ -195,30 +255,58 @@ impl<'a, T> Guard<'a, T> { /// /// The permit will be on a semaphore part of the new internal value, and any following /// [`OnceCell::get_or_init`] will wait on it to complete. 
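A sketch of the new `get_or_init_detached` above: rather than running a factory under the cell's machinery, the caller receives the unique `InitPermit` and initializes at its leisure, or drops the permit to let someone else try:

```rust
use utils::sync::heavier_once_cell::OnceCell;

#[tokio::main]
async fn main() {
    let cell: OnceCell<u32> = OnceCell::default();

    match cell.get_or_init_detached().await {
        Ok(guard) => println!("already initialized: {}", *guard),
        Err(permit) => {
            // Slow initialization can happen here without holding any
            // lock; other callers queue on the cell's semaphore meanwhile.
            cell.set(42, permit);
        }
    }

    assert_eq!(*cell.get().unwrap(), 42);
}
```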
- pub fn take_and_deinit(&mut self) -> (T, InitPermit) { - let mut swapped = Inner::default(); - let permit = swapped - .init_semaphore - .clone() - .try_acquire_owned() - .expect("we just created this"); - std::mem::swap(&mut *self.0, &mut swapped); - swapped - .value - .map(|v| (v, InitPermit(permit))) + pub fn take_and_deinit(mut self) -> (T, InitPermit) { + self.0 + .take_and_deinit() .expect("guard is not created unless value has been initialized") } } +impl Inner { + pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> { + let value = self.value.take()?; + + let mut swapped = Inner::default(); + let sem = swapped.init_semaphore.clone(); + // acquire and forget right away, moving the control over to InitPermit + sem.try_acquire().expect("we just created this").forget(); + let permit = InitPermit(sem); + std::mem::swap(self, &mut swapped); + Some((value, permit)) + } +} + /// Type held by OnceCell (de)initializing task. -pub struct InitPermit(tokio::sync::OwnedSemaphorePermit); +/// +/// On drop, this type will return the permit. +pub struct InitPermit(Arc); + +impl std::fmt::Debug for InitPermit { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let ptr = Arc::as_ptr(&self.0) as *const (); + f.debug_tuple("InitPermit").field(&ptr).finish() + } +} + +impl Drop for InitPermit { + fn drop(&mut self) { + assert_eq!( + self.0.available_permits(), + 0, + "InitPermit should only exist as the unique permit" + ); + self.0.add_permits(1); + } +} #[cfg(test)] mod tests { + use futures::Future; + use super::*; use std::{ convert::Infallible, - sync::atomic::{AtomicUsize, Ordering}, + pin::{pin, Pin}, time::Duration, }; @@ -380,4 +468,138 @@ mod tests { .unwrap(); assert_eq!(*g, "now initialized"); } + + #[tokio::test(start_paused = true)] + async fn reproduce_init_take_deinit_race() { + init_take_deinit_scenario(|cell, factory| { + Box::pin(async { + cell.get_or_init(factory).await.unwrap(); + }) + }) + .await; + } + + type BoxedInitFuture = Pin>>>; + type BoxedInitFunction = Box BoxedInitFuture>; + + /// Reproduce an assertion failure. + /// + /// This has interesting generics to be generic between `get_or_init` and `get_mut_or_init`. + /// We currently only have one, but the structure is kept. + async fn init_take_deinit_scenario(init_way: F) + where + F: for<'a> Fn( + &'a OnceCell<&'static str>, + BoxedInitFunction<&'static str, Infallible>, + ) -> Pin + 'a>>, + { + let cell = OnceCell::default(); + + // acquire the init_semaphore only permit to drive initializing tasks in order to waiting + // on the same semaphore. + let permit = cell + .inner + .lock() + .unwrap() + .init_semaphore + .clone() + .try_acquire_owned() + .unwrap(); + + let mut t1 = pin!(init_way( + &cell, + Box::new(|permit| Box::pin(async move { Ok(("t1", permit)) })), + )); + + let mut t2 = pin!(init_way( + &cell, + Box::new(|permit| Box::pin(async move { Ok(("t2", permit)) })), + )); + + // drive t2 first to the init_semaphore -- the timeout will be hit once t2 future can + // no longer make progress + tokio::select! { + _ = &mut t2 => unreachable!("it cannot get permit"), + _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} + } + + // followed by t1 in the init_semaphore + tokio::select! 
{ + _ = &mut t1 => unreachable!("it cannot get permit"), + _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} + } + + // now let t2 proceed and initialize + drop(permit); + t2.await; + + let (s, permit) = { cell.get().unwrap().take_and_deinit() }; + assert_eq!("t2", s); + + // now originally t1 would see the semaphore it has as closed. it cannot yet get a permit from + // the new one. + tokio::select! { + _ = &mut t1 => unreachable!("it cannot get permit"), + _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} + } + + // only now we get to initialize it + drop(permit); + t1.await; + + assert_eq!("t1", *cell.get().unwrap()); + } + + #[tokio::test(start_paused = true)] + async fn detached_init_smoke() { + let target = OnceCell::default(); + + let Err(permit) = target.get_or_init_detached().await else { + unreachable!("it is not initialized") + }; + + tokio::time::timeout( + std::time::Duration::from_secs(3600 * 24 * 7 * 365), + target.get_or_init(|permit2| async { Ok::<_, Infallible>((11, permit2)) }), + ) + .await + .expect_err("should timeout since we are already holding the permit"); + + target.set(42, permit); + + let (_answer, permit) = { + let guard = target + .get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) }) + .await + .unwrap(); + + assert_eq!(*guard, 42); + + guard.take_and_deinit() + }; + + assert!(target.get().is_none()); + + target.set(11, permit); + + assert_eq!(*target.get().unwrap(), 11); + } + + #[tokio::test] + async fn take_and_deinit_on_mut() { + use std::convert::Infallible; + + let mut target = OnceCell::::default(); + assert!(target.take_and_deinit().is_none()); + + target + .get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) }) + .await + .unwrap(); + + let again = target.take_and_deinit(); + assert!(matches!(again, Some((42, _))), "{again:?}"); + + assert!(target.take_and_deinit().is_none()); + } } diff --git a/libs/utils/src/toml_edit_ext.rs b/libs/utils/src/toml_edit_ext.rs new file mode 100644 index 0000000000..ab5f7bdd95 --- /dev/null +++ b/libs/utils/src/toml_edit_ext.rs @@ -0,0 +1,22 @@ +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("item is not a document")] + ItemIsNotADocument, + #[error(transparent)] + Serde(toml_edit::de::Error), +} + +pub fn deserialize_item(item: &toml_edit::Item) -> Result +where + T: serde::de::DeserializeOwned, +{ + let document: toml_edit::Document = match item { + toml_edit::Item::Table(toml) => toml.clone().into(), + toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { + toml.clone().into_table().into() + } + _ => return Err(Error::ItemIsNotADocument), + }; + + toml_edit::de::from_document(document).map_err(Error::Serde) +} diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs index db17f7d8cd..d24c81ad0b 100644 --- a/libs/utils/src/tracing_span_assert.rs +++ b/libs/utils/src/tracing_span_assert.rs @@ -20,13 +20,13 @@ //! //! // Then, in the main code: //! -//! let span = tracing::info_span!("TestSpan", test_id = 1); +//! let span = tracing::info_span!("TestSpan", tenant_id = 1); //! let _guard = span.enter(); //! //! // ... down the call stack //! -//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor}; -//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]); +//! use utils::tracing_span_assert::{check_fields_present, ConstExtractor}; +//! let extractor = ConstExtractor::new("tenant_id"); //! 
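A sketch of `toml_edit_ext::deserialize_item` above, which accepts both table and inline-table items; the `Endpoint` struct and TOML snippets are hypothetical:

```rust
use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct Endpoint {
    listen_addr: String, // hypothetical field
}

fn main() -> anyhow::Result<()> {
    // Both spellings deserialize identically:
    for toml in [
        "endpoint = { listen_addr = \"127.0.0.1:6400\" }",
        "[endpoint]\nlisten_addr = \"127.0.0.1:6400\"",
    ] {
        let doc: toml_edit::Document = toml.parse()?;
        let ep: Endpoint = utils::toml_edit_ext::deserialize_item(&doc["endpoint"])?;
        assert_eq!(ep.listen_addr, "127.0.0.1:6400");
    }
    Ok(())
}
```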
if let Err(missing) = check_fields_present!([&extractor]) { //! // if you copypaste this to a custom assert method, remember to add #[track_caller] //! // to get the "user" code location for the panic. @@ -45,27 +45,26 @@ pub enum ExtractionResult { } pub trait Extractor: Send + Sync + std::fmt::Debug { - fn name(&self) -> &str; + fn id(&self) -> &str; fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult; } #[derive(Debug)] -pub struct MultiNameExtractor { - name: &'static str, - field_names: [&'static str; L], +pub struct ConstExtractor { + field_name: &'static str, } -impl MultiNameExtractor { - pub fn new(name: &'static str, field_names: [&'static str; L]) -> MultiNameExtractor { - MultiNameExtractor { name, field_names } +impl ConstExtractor { + pub const fn new(field_name: &'static str) -> ConstExtractor { + ConstExtractor { field_name } } } -impl Extractor for MultiNameExtractor { - fn name(&self) -> &str { - self.name +impl Extractor for ConstExtractor { + fn id(&self) -> &str { + self.field_name } fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult { - if fields.iter().any(|f| self.field_names.contains(&f.name())) { + if fields.iter().any(|f| f.name() == self.field_name) { ExtractionResult::Present } else { ExtractionResult::Absent @@ -203,19 +202,19 @@ mod tests { } impl<'a> fmt::Debug for MemoryIdentity<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:p}: {}", self.as_ptr(), self.0.name()) + write!(f, "{:p}: {}", self.as_ptr(), self.0.id()) } } struct Setup { _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard, - tenant_extractor: MultiNameExtractor<2>, - timeline_extractor: MultiNameExtractor<2>, + tenant_extractor: ConstExtractor, + timeline_extractor: ConstExtractor, } fn setup_current_thread() -> Setup { - let tenant_extractor = MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]); - let timeline_extractor = MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]); + let tenant_extractor = ConstExtractor::new("tenant_id"); + let timeline_extractor = ConstExtractor::new("timeline_id"); let registry = tracing_subscriber::registry() .with(tracing_subscriber::fmt::layer()) @@ -343,12 +342,12 @@ mod tests { let span = tracing::info_span!("foo", e = "some value"); let _guard = span.enter(); - let extractor = MultiNameExtractor::new("E", ["e"]); + let extractor = ConstExtractor::new("e"); let res = check_fields_present0([&extractor]); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); // similarly for a not found key - let extractor = MultiNameExtractor::new("F", ["foobar"]); + let extractor = ConstExtractor::new("foobar"); let res = check_fields_present0([&extractor]); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); } @@ -368,16 +367,14 @@ mod tests { // normally this would work, but without any tracing-subscriber configured, both // check_field_present find nothing let _guard = subspan.enter(); - let extractors: [&dyn Extractor; 2] = [ - &MultiNameExtractor::new("E", ["e"]), - &MultiNameExtractor::new("F", ["f"]), - ]; + let extractors: [&dyn Extractor; 2] = + [&ConstExtractor::new("e"), &ConstExtractor::new("f")]; let res = check_fields_present0(extractors); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); // similarly for a not found key - let extractor = MultiNameExtractor::new("G", ["g"]); + let extractor = ConstExtractor::new("g"); let res = check_fields_present0([&extractor]); assert!(matches!(res, Ok(Summary::Unconfigured)), 
"{res:?}"); } @@ -410,7 +407,7 @@ mod tests { let span = tracing::info_span!("foo", e = "some value"); let _guard = span.enter(); - let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", ["e"])]; + let extractors: [&dyn Extractor; 1] = [&ConstExtractor::new("e")]; if span.is_disabled() { // the tests are running single threaded, or we got lucky and no other tests subscriber diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs index 9953b447c8..5f0028bacd 100644 --- a/libs/utils/src/vec_map.rs +++ b/libs/utils/src/vec_map.rs @@ -1,27 +1,61 @@ use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds}; +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum VecMapOrdering { + Greater, + GreaterOrEqual, +} + /// Ordered map datastructure implemented in a Vec. +/// /// Append only - can only add keys that are larger than the /// current max key. +/// Ordering can be adjusted using [`VecMapOrdering`] +/// during `VecMap` construction. #[derive(Clone, Debug)] -pub struct VecMap(Vec<(K, V)>); +pub struct VecMap { + data: Vec<(K, V)>, + ordering: VecMapOrdering, +} impl Default for VecMap { fn default() -> Self { - VecMap(Default::default()) + VecMap { + data: Default::default(), + ordering: VecMapOrdering::Greater, + } } } -#[derive(Debug)] -pub struct InvalidKey; +#[derive(thiserror::Error, Debug)] +pub enum VecMapError { + #[error("Key violates ordering constraint")] + InvalidKey, + #[error("Mismatched ordering constraints")] + ExtendOrderingError, +} impl VecMap { + pub fn new(ordering: VecMapOrdering) -> Self { + Self { + data: Vec::new(), + ordering, + } + } + + pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self { + Self { + data: Vec::with_capacity(capacity), + ordering, + } + } + pub fn is_empty(&self) -> bool { - self.0.is_empty() + self.data.is_empty() } pub fn as_slice(&self) -> &[(K, V)] { - self.0.as_slice() + self.data.as_slice() } /// This function may panic if given a range where the lower bound is @@ -29,7 +63,7 @@ impl VecMap { pub fn slice_range>(&self, range: R) -> &[(K, V)] { use std::ops::Bound::*; - let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key); + let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key); let start_idx = match range.start_bound() { Unbounded => 0, @@ -41,7 +75,7 @@ impl VecMap { }; let end_idx = match range.end_bound() { - Unbounded => self.0.len(), + Unbounded => self.data.len(), Included(k) => match binary_search(k) { Ok(idx) => idx + 1, Err(idx) => idx, @@ -49,34 +83,30 @@ impl VecMap { Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity), }; - &self.0[start_idx..end_idx] + &self.data[start_idx..end_idx] } /// Add a key value pair to the map. - /// If `key` is less than or equal to the current maximum key - /// the pair will not be added and InvalidKey error will be returned. - pub fn append(&mut self, key: K, value: V) -> Result { - if let Some((last_key, _last_value)) = self.0.last() { - if &key <= last_key { - return Err(InvalidKey); - } - } + /// If `key` is not respective of the `self` ordering the + /// pair will not be added and `InvalidKey` error will be returned. + pub fn append(&mut self, key: K, value: V) -> Result { + self.validate_key_order(&key)?; let delta_size = self.instrument_vec_op(|vec| vec.push((key, value))); Ok(delta_size) } /// Update the maximum key value pair or add a new key value pair to the map. 
- /// If `key` is less than the current maximum key no updates or additions - /// will occur and InvalidKey error will be returned. + /// If `key` does not respect the `self` ordering, no updates or additions + /// will occur and `InvalidKey` error will be returned. pub fn append_or_update_last( &mut self, key: K, mut value: V, - ) -> Result<(Option<V>, usize), InvalidKey> { - if let Some((last_key, last_value)) = self.0.last_mut() { + ) -> Result<(Option<V>, usize), VecMapError> { + if let Some((last_key, last_value)) = self.data.last_mut() { match key.cmp(last_key) { - Ordering::Less => return Err(InvalidKey), + Ordering::Less => return Err(VecMapError::InvalidKey), Ordering::Equal => { std::mem::swap(last_value, &mut value); const DELTA_SIZE: usize = 0; @@ -100,40 +130,67 @@ impl<K: Ord, V> VecMap<K, V> { V: Clone, { let split_idx = self - .0 + .data .binary_search_by_key(&cutoff, extract_key) .unwrap_or_else(std::convert::identity); ( - VecMap(self.0[..split_idx].to_vec()), - VecMap(self.0[split_idx..].to_vec()), + VecMap { + data: self.data[..split_idx].to_vec(), + ordering: self.ordering, + }, + VecMap { + data: self.data[split_idx..].to_vec(), + ordering: self.ordering, + }, ) } /// Move items from `other` to the end of `self`, leaving `other` empty. - /// If any keys in `other` is less than or equal to any key in `self`, - /// `InvalidKey` error will be returned and no mutation will occur. - pub fn extend(&mut self, other: &mut Self) -> Result<usize, InvalidKey> { - let self_last_opt = self.0.last().map(extract_key); - let other_first_opt = other.0.first().map(extract_key); + /// If the `other` ordering differs from the `self` ordering, + /// `ExtendOrderingError` will be returned. + /// If any key in `other` does not respect the ordering defined in + /// `self`, `InvalidKey` error will be returned and no mutation will occur. + pub fn extend(&mut self, other: &mut Self) -> Result<usize, VecMapError> { + if self.ordering != other.ordering { + return Err(VecMapError::ExtendOrderingError); + } - if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) { - if self_last >= other_first { - return Err(InvalidKey); + let other_first_opt = other.data.first().map(extract_key); + if let Some(other_first) = other_first_opt { + self.validate_key_order(other_first)?; + } + + let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data)); + Ok(delta_size) + } + + /// Validate the key being inserted against the current last key in + /// `self`, according to the ordering defined in `self`. + fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> { + if let Some(last_key) = self.data.last().map(extract_key) { + match (&self.ordering, &key.cmp(last_key)) { + (VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => { + return Err(VecMapError::InvalidKey); + } + (VecMapOrdering::Greater, Ordering::Greater) => {} + (VecMapOrdering::GreaterOrEqual, Ordering::Less) => { + return Err(VecMapError::InvalidKey); + } + (VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {} } } - let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.0)); - Ok(delta_size) + Ok(()) }
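For readers of this hunk, the behavioural difference between the two orderings is easiest to see in a couple of appends. A minimal standalone sketch (hypothetical `main`; it assumes only the `utils::vec_map` API introduced above):

```rust
use utils::vec_map::{VecMap, VecMapOrdering};

fn main() {
    // Strictly increasing keys: appending a duplicate is rejected.
    let mut strict: VecMap<u32, &str> = VecMap::new(VecMapOrdering::Greater);
    strict.append(1, "a").unwrap();
    assert!(strict.append(1, "b").is_err()); // InvalidKey: 1 is not > 1

    // Non-decreasing keys: duplicates are accepted.
    let mut relaxed: VecMap<u32, &str> = VecMap::new(VecMapOrdering::GreaterOrEqual);
    relaxed.append(1, "a").unwrap();
    relaxed.append(1, "b").unwrap(); // ok: 1 >= 1

    // extend() refuses to mix the two orderings.
    assert!(strict.extend(&mut relaxed).is_err()); // ExtendOrderingError
}
```

Note that `extend` checks the ordering of the source map first, so two maps built with different orderings can never be spliced together, even when their keys happen to be compatible.

/// Instrument an operation on the underlying [`Vec`]. /// Will panic if the operation decreases capacity. /// Returns the increase in memory usage caused by the op.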
fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize { - let old_cap = self.0.capacity(); - op(&mut self.0); - let new_cap = self.0.capacity(); + let old_cap = self.data.capacity(); + op(&mut self.data); + let new_cap = self.data.capacity(); match old_cap.cmp(&new_cap) { Ordering::Less => { @@ -145,6 +202,36 @@ impl VecMap { Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"), } } + + /// Similar to `from_iter` defined in `FromIter` trait except + /// that it accepts an [`VecMapOrdering`] + pub fn from_iter>(iter: I, ordering: VecMapOrdering) -> Self { + let iter = iter.into_iter(); + let initial_capacity = { + match iter.size_hint() { + (lower_bound, None) => lower_bound, + (_, Some(upper_bound)) => upper_bound, + } + }; + + let mut vec_map = VecMap::with_capacity(initial_capacity, ordering); + for (key, value) in iter { + vec_map + .append(key, value) + .expect("The passed collection needs to be sorted!"); + } + + vec_map + } +} + +impl IntoIterator for VecMap { + type Item = (K, V); + type IntoIter = std::vec::IntoIter<(K, V)>; + + fn into_iter(self) -> Self::IntoIter { + self.data.into_iter() + } } fn extract_key(entry: &(K, V)) -> &K { @@ -155,7 +242,7 @@ fn extract_key(entry: &(K, V)) -> &K { mod tests { use std::{collections::BTreeMap, ops::Bound}; - use super::VecMap; + use super::{VecMap, VecMapOrdering}; #[test] fn unbounded_range() { @@ -310,5 +397,59 @@ mod tests { left.extend(&mut one_map).unwrap_err(); assert_eq!(left.as_slice(), &[(0, ()), (1, ())]); assert_eq!(one_map.as_slice(), &[(1, ())]); + + let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual); + map_greater_or_equal.append(2, ()).unwrap(); + map_greater_or_equal.append(2, ()).unwrap(); + + left.extend(&mut map_greater_or_equal).unwrap_err(); + assert_eq!(left.as_slice(), &[(0, ()), (1, ())]); + assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]); + } + + #[test] + fn extend_with_ordering() { + let mut left = VecMap::new(VecMapOrdering::GreaterOrEqual); + left.append(0, ()).unwrap(); + assert_eq!(left.as_slice(), &[(0, ())]); + + let mut greater_right = VecMap::new(VecMapOrdering::Greater); + greater_right.append(0, ()).unwrap(); + left.extend(&mut greater_right).unwrap_err(); + assert_eq!(left.as_slice(), &[(0, ())]); + + let mut greater_or_equal_right = VecMap::new(VecMapOrdering::GreaterOrEqual); + greater_or_equal_right.append(2, ()).unwrap(); + greater_or_equal_right.append(2, ()).unwrap(); + left.extend(&mut greater_or_equal_right).unwrap(); + assert_eq!(left.as_slice(), &[(0, ()), (2, ()), (2, ())]); + } + + #[test] + fn vec_map_from_sorted() { + let vec = vec![(1, ()), (2, ()), (3, ()), (6, ())]; + let vec_map = VecMap::from_iter(vec, VecMapOrdering::Greater); + assert_eq!(vec_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (6, ())]); + + let vec = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())]; + let vec_map = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual); + assert_eq!( + vec_map.as_slice(), + &[(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())] + ); + } + + #[test] + #[should_panic] + fn vec_map_from_unsorted_greater() { + let vec = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())]; + let _ = VecMap::from_iter(vec, VecMapOrdering::Greater); + } + + #[test] + #[should_panic] + fn vec_map_from_unsorted_greater_or_equal() { + let vec = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())]; + let _ = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual); } } diff --git a/libs/utils/src/yielding_loop.rs 
b/libs/utils/src/yielding_loop.rs index 963279eb4c..68274f0631 100644 --- a/libs/utils/src/yielding_loop.rs +++ b/libs/utils/src/yielding_loop.rs @@ -6,9 +6,10 @@ pub enum YieldingLoopError { Cancelled, } -/// Helper for long synchronous loops, e.g. over all tenants in the system. Periodically -/// yields to avoid blocking the executor, and after resuming checks the provided -/// cancellation token to drop out promptly on shutdown. +/// Helper for long synchronous loops, e.g. over all tenants in the system. +/// +/// Periodically yields to avoid blocking the executor, and after resuming +/// checks the provided cancellation token to drop out promptly on shutdown. #[inline(always)] pub async fn yielding_loop( interval: usize, @@ -23,7 +24,7 @@ where for (i, item) in iter.enumerate() { visitor(item); - if i + 1 % interval == 0 { + if (i + 1) % interval == 0 { tokio::task::yield_now().await; if cancel.is_cancelled() { return Err(YieldingLoopError::Cancelled); diff --git a/libs/utils/src/zstd.rs b/libs/utils/src/zstd.rs new file mode 100644 index 0000000000..be2dcc00f5 --- /dev/null +++ b/libs/utils/src/zstd.rs @@ -0,0 +1,78 @@ +use std::io::SeekFrom; + +use anyhow::{Context, Result}; +use async_compression::{ + tokio::{bufread::ZstdDecoder, write::ZstdEncoder}, + zstd::CParameter, + Level, +}; +use camino::Utf8Path; +use nix::NixPath; +use tokio::{ + fs::{File, OpenOptions}, + io::AsyncBufRead, + io::AsyncSeekExt, + io::AsyncWriteExt, +}; +use tokio_tar::{Archive, Builder, HeaderMode}; +use walkdir::WalkDir; + +/// Creates a Zstandard tarball. +pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> { + let file = OpenOptions::new() + .create(true) + .truncate(true) + .read(true) + .write(true) + .open(&tarball) + .await + .with_context(|| format!("tempfile creation {tarball}"))?; + + let mut paths = Vec::new(); + for entry in WalkDir::new(path) { + let entry = entry?; + let metadata = entry.metadata().expect("error getting dir entry metadata"); + // Also allow directories so that we also get empty directories + if !(metadata.is_file() || metadata.is_dir()) { + continue; + } + let path = entry.into_path(); + paths.push(path); + } + // Do a sort to get a more consistent listing + paths.sort_unstable(); + let zstd = ZstdEncoder::with_quality_and_params( + file, + Level::Default, + &[CParameter::enable_long_distance_matching(true)], + ); + let mut builder = Builder::new(zstd); + // Use reproducible header mode + builder.mode(HeaderMode::Deterministic); + for p in paths { + let rel_path = p.strip_prefix(path)?; + if rel_path.is_empty() { + // The top directory should not be compressed, + // the tar crate doesn't like that + continue; + } + builder.append_path_with_name(&p, rel_path).await?; + } + let mut zstd = builder.into_inner().await?; + zstd.shutdown().await?; + let mut compressed = zstd.into_inner(); + let compressed_len = compressed.metadata().await?.len(); + compressed.seek(SeekFrom::Start(0)).await?; + Ok((compressed, compressed_len)) +} + +/// Extracts a Zstandard tarball.
+pub async fn extract_zst_tarball( + path: &Utf8Path, + tarball: impl AsyncBufRead + Unpin, +) -> Result<()> { + let decoder = Box::pin(ZstdDecoder::new(tarball)); + let mut archive = Archive::new(decoder); + archive.unpack(path).await?; + Ok(()) +} diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs index 7160a42df2..3223765016 100644 --- a/libs/vm_monitor/src/cgroup.rs +++ b/libs/vm_monitor/src/cgroup.rs @@ -25,6 +25,8 @@ pub struct Config { /// /// For simplicity, this value must be greater than or equal to `memory_history_len`. memory_history_log_interval: usize, + /// The max number of iterations to skip before logging the next iteration + memory_history_log_noskip_interval: Duration, } impl Default for Config { @@ -33,6 +35,7 @@ impl Default for Config { memory_poll_interval: Duration::from_millis(100), memory_history_len: 5, // use 500ms of history for decision-making memory_history_log_interval: 20, // but only log every ~2s (otherwise it's spammy) + memory_history_log_noskip_interval: Duration::from_secs(15), // but only if it's changed, or 60 seconds have passed } } } @@ -85,7 +88,12 @@ impl CgroupWatcher { // buffer for samples that will be logged. once full, it remains so. let history_log_len = self.config.memory_history_log_interval; + let max_skip = self.config.memory_history_log_noskip_interval; let mut history_log_buf = vec![MemoryStatus::zeroed(); history_log_len]; + let mut last_logged_memusage = MemoryStatus::zeroed(); + + // Ensure that we're tracking a value that's definitely in the past, as Instant::now is only guaranteed to be non-decreasing on Rust's T1-supported systems. + let mut can_skip_logs_until = Instant::now() - max_skip; for t in 0_u64.. { ticker.tick().await; @@ -115,12 +123,24 @@ impl CgroupWatcher { // equal to the logging interval, we can just log the entire buffer every time we set // the last entry, which also means that for this log line, we can ignore that it's a // ring buffer (because all the entries are in order of increasing time). - if i == history_log_len - 1 { + // + // We skip logging the data if data hasn't meaningfully changed in a while, unless + // we've already ignored previous iterations for the last max_skip period. + if i == history_log_len - 1 + && (now > can_skip_logs_until + || !history_log_buf + .iter() + .all(|usage| last_logged_memusage.status_is_close_or_similar(usage))) + { info!( history = ?MemoryStatus::debug_slice(&history_log_buf), summary = ?summary, "Recent cgroup memory statistics history" ); + + can_skip_logs_until = now + max_skip; + + last_logged_memusage = *history_log_buf.last().unwrap(); } updates @@ -232,6 +252,24 @@ impl MemoryStatus { DS(slice) } + + /// Check if the other memory status is a close or similar result. + /// Returns true if the larger value is not larger than the smaller value + /// by 1/8 of the smaller value, and within 128MiB. 
+ /// See tests::check_similarity_behaviour for examples of behaviour + fn status_is_close_or_similar(&self, other: &MemoryStatus) -> bool { + let margin; + let diff; + if self.non_reclaimable >= other.non_reclaimable { + margin = other.non_reclaimable / 8; + diff = self.non_reclaimable - other.non_reclaimable; + } else { + margin = self.non_reclaimable / 8; + diff = other.non_reclaimable - self.non_reclaimable; + } + + diff < margin && diff < 128 * 1024 * 1024 + } } #[cfg(test)] @@ -261,4 +299,65 @@ mod tests { assert_eq!(values(2, 4), [9, 0, 1, 2]); assert_eq!(values(2, 10), [3, 4, 5, 6, 7, 8, 9, 0, 1, 2]); } + + #[test] + fn check_similarity_behaviour() { + // This all accesses private methods, so we can't actually run this + // as doctests, because doctests run as an external crate. + let mut small = super::MemoryStatus { + non_reclaimable: 1024, + }; + let mut large = super::MemoryStatus { + non_reclaimable: 1024 * 1024 * 1024 * 1024, + }; + + // objects are self-similar, no matter the size + assert!(small.status_is_close_or_similar(&small)); + assert!(large.status_is_close_or_similar(&large)); + + // inequality is symmetric + assert!(!small.status_is_close_or_similar(&large)); + assert!(!large.status_is_close_or_similar(&small)); + + small.non_reclaimable = 64; + large.non_reclaimable = (small.non_reclaimable / 8) * 9; + + // objects are self-similar, no matter the size + assert!(small.status_is_close_or_similar(&small)); + assert!(large.status_is_close_or_similar(&large)); + + // values are similar if the larger value is larger by less than + // 12.5%, i.e. 1/8 of the smaller value. + // In the example above, large is exactly 12.5% larger, so this doesn't + // match. + assert!(!small.status_is_close_or_similar(&large)); + assert!(!large.status_is_close_or_similar(&small)); + + large.non_reclaimable -= 1; + assert!(large.status_is_close_or_similar(&large)); + + assert!(small.status_is_close_or_similar(&large)); + assert!(large.status_is_close_or_similar(&small)); + + // The 1/8 rule only applies up to 128MiB of difference + small.non_reclaimable = 1024 * 1024 * 1024 * 1024; + large.non_reclaimable = small.non_reclaimable / 8 * 9; + assert!(small.status_is_close_or_similar(&small)); + assert!(large.status_is_close_or_similar(&large)); + + assert!(!small.status_is_close_or_similar(&large)); + assert!(!large.status_is_close_or_similar(&small)); + // the large value is put just above the threshold + large.non_reclaimable = small.non_reclaimable + 128 * 1024 * 1024; + assert!(large.status_is_close_or_similar(&large)); + + assert!(!small.status_is_close_or_similar(&large)); + assert!(!large.status_is_close_or_similar(&small)); + // now below + large.non_reclaimable -= 1; + assert!(large.status_is_close_or_similar(&large)); + + assert!(small.status_is_close_or_similar(&large)); + assert!(large.status_is_close_or_similar(&small)); + } } diff --git a/libs/vm_monitor/src/dispatcher.rs b/libs/vm_monitor/src/dispatcher.rs index c76baf04e7..6a965ace9b 100644 --- a/libs/vm_monitor/src/dispatcher.rs +++ b/libs/vm_monitor/src/dispatcher.rs @@ -12,11 +12,11 @@ use futures::{ stream::{SplitSink, SplitStream}, SinkExt, StreamExt, }; -use tracing::info; +use tracing::{debug, info}; use crate::protocol::{ - OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, PROTOCOL_MAX_VERSION, - PROTOCOL_MIN_VERSION, + OutboundMsg, OutboundMsgKind, ProtocolRange, ProtocolResponse, ProtocolVersion, + PROTOCOL_MAX_VERSION, PROTOCOL_MIN_VERSION, }; /// The central handler for all communications in the monitor. 
@@ -118,7 +118,12 @@ impl Dispatcher { /// serialize the wrong thing and send it, since `self.sink.send` will take /// any string. pub async fn send(&mut self, message: OutboundMsg) -> anyhow::Result<()> { - info!(?message, "sending message"); + if matches!(&message.inner, OutboundMsgKind::HealthCheck { .. }) { + debug!(?message, "sending message"); + } else { + info!(?message, "sending message"); + } + let json = serde_json::to_string(&message).context("failed to serialize message")?; self.sink .send(Message::Text(json)) diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index ba37966476..36f8573a38 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -12,7 +12,7 @@ use axum::extract::ws::{Message, WebSocket}; use futures::StreamExt; use tokio::sync::{broadcast, watch}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, warn}; +use tracing::{debug, error, info, warn}; use crate::cgroup::{self, CgroupWatcher}; use crate::dispatcher::Dispatcher; @@ -69,7 +69,7 @@ pub struct Config { /// should be removed once we have a better solution there. sys_buffer_bytes: u64, - /// Minimum fraction of total system memory reserved *before* the the cgroup threshold; in + /// Minimum fraction of total system memory reserved *before* the cgroup threshold; in /// other words, providing a ceiling for the highest value of the threshold by enforcing that /// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the /// threshold. @@ -474,26 +474,29 @@ impl Runner { // there is a message from the agent msg = self.dispatcher.source.next() => { if let Some(msg) = msg { - // Don't use 'message' as a key as the string also uses - // that for its key - info!(?msg, "received message"); - match msg { + match &msg { Ok(msg) => { let message: InboundMsg = match msg { Message::Text(text) => { - serde_json::from_str(&text).context("failed to deserialize text message")? + serde_json::from_str(text).context("failed to deserialize text message")? } other => { warn!( // Don't use 'message' as a key as the // string also uses that for its key msg = ?other, - "agent should only send text messages but received different type" + "problem processing incoming message: agent should only send text messages but received different type" ); continue }, }; + if matches!(&message.inner, InboundMsgKind::HealthCheck { .. 
}) { + debug!(?msg, "received message"); + } else { + info!(?msg, "received message"); + } + let out = match self.process_message(message.clone()).await { Ok(Some(out)) => out, Ok(None) => continue, @@ -517,7 +520,11 @@ impl Runner { .await .context("failed to send message")?; } - Err(e) => warn!("{e}"), + Err(e) => warn!( + error = format!("{e}"), + msg = ?msg, + "received error message" + ), } } else { anyhow::bail!("dispatcher connection closed") diff --git a/libs/walproposer/Cargo.toml b/libs/walproposer/Cargo.toml index 73aa073c44..2d442dc429 100644 --- a/libs/walproposer/Cargo.toml +++ b/libs/walproposer/Cargo.toml @@ -9,8 +9,6 @@ anyhow.workspace = true utils.workspace = true postgres_ffi.workspace = true -workspace_hack.workspace = true - [build-dependencies] anyhow.workspace = true bindgen.workspace = true diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index fd09030dbd..28547f52bf 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -4,7 +4,6 @@ use std::{env, path::PathBuf, process::Command}; use anyhow::{anyhow, Context}; -use bindgen::CargoCallbacks; fn main() -> anyhow::Result<()> { // Tell cargo to invalidate the built crate whenever the wrapper changes @@ -34,6 +33,9 @@ fn main() -> anyhow::Result<()> { println!("cargo:rustc-link-lib=static=walproposer"); println!("cargo:rustc-link-search={walproposer_lib_search_str}"); + // Rebuild crate when libwalproposer.a changes + println!("cargo:rerun-if-changed={walproposer_lib_search_str}/libwalproposer.a"); + let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config"); let inc_server_path: String = if pg_config_bin.exists() { let output = Command::new(pg_config_bin) @@ -61,16 +63,25 @@ fn main() -> anyhow::Result<()> { .map_err(|s| anyhow!("Bad postgres server path {s:?}"))? }; + let unwind_abi_functions = [ + "log_internal", + "recovery_download", + "start_streaming", + "finish_sync_safekeepers", + "wait_event_set", + "WalProposerStart", + ]; + // The bindgen::Builder is the main entry point // to bindgen, and lets you build up options for // the resulting bindings. - let bindings = bindgen::Builder::default() + let mut builder = bindgen::Builder::default() // The input header we would like to generate // bindings for. .header("bindgen_deps.h") // Tell cargo to invalidate the built crate whenever any of the // included header files changed. 
- .parse_callbacks(Box::new(CargoCallbacks)) + .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) .allowlist_type("WalProposer") .allowlist_type("WalProposerConfig") .allowlist_type("walproposer_api") @@ -79,6 +90,7 @@ fn main() -> anyhow::Result<()> { .allowlist_function("WalProposerBroadcast") .allowlist_function("WalProposerPoll") .allowlist_function("WalProposerFree") + .allowlist_function("SafekeeperStateDesiredEvents") .allowlist_var("DEBUG5") .allowlist_var("DEBUG4") .allowlist_var("DEBUG3") @@ -91,6 +103,7 @@ fn main() -> anyhow::Result<()> { .allowlist_var("ERROR") .allowlist_var("FATAL") .allowlist_var("PANIC") + .allowlist_var("PG_VERSION_NUM") .allowlist_var("WPEVENT") .allowlist_var("WL_LATCH_SET") .allowlist_var("WL_SOCKET_READABLE") @@ -100,7 +113,12 @@ fn main() -> anyhow::Result<()> { .allowlist_var("WL_SOCKET_MASK") .clang_arg("-DWALPROPOSER_LIB") .clang_arg(format!("-I{pgxn_neon}")) - .clang_arg(format!("-I{inc_server_path}")) + .clang_arg(format!("-I{inc_server_path}")); + + for name in unwind_abi_functions { + builder = builder.override_abi(bindgen::Abi::CUnwind, name); + } + let bindings = builder // Finish the builder and generate the bindings. .generate() // Unwrap the Result and panic on failure. diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 1f7bf952dc..2fbea3fe45 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -22,6 +22,7 @@ use crate::bindings::WalProposerExecStatusType; use crate::bindings::WalproposerShmemState; use crate::bindings::XLogRecPtr; use crate::walproposer::ApiImpl; +use crate::walproposer::StreamingCallback; use crate::walproposer::WaitResult; extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState { @@ -32,11 +33,12 @@ extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemStat } } -extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) { +extern "C-unwind" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; - (*api).start_streaming(startpos) + let callback = StreamingCallback::new(wp); + (*api).start_streaming(startpos, &callback); } } @@ -48,6 +50,14 @@ extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr { } } +extern "C" fn update_donor(wp: *mut WalProposer, donor: *mut Safekeeper, donor_lsn: XLogRecPtr) { + unsafe { + let callback_data = (*(*wp).config).callback_data; + let api = callback_data as *mut Box; + (*api).update_donor(&mut (*donor), donor_lsn) + } +} + extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz { unsafe { let callback_data = (*(*wp).config).callback_data; @@ -134,19 +144,18 @@ extern "C" fn conn_async_read( unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; - let (res, result) = (*api).conn_async_read(&mut (*sk)); // This function has guarantee that returned buf will be valid until // the next call. So we can store a Vec in each Safekeeper and reuse // it on the next call. let mut inbuf = take_vec_u8(&mut (*sk).inbuf).unwrap_or_default(); - inbuf.clear(); - inbuf.extend_from_slice(res); + + let result = (*api).conn_async_read(&mut (*sk), &mut inbuf); // Put a Vec back to sk->inbuf and return data ptr. 
+ *amount = inbuf.len() as i32; *buf = store_vec_u8(&mut (*sk).inbuf, inbuf); - *amount = res.len() as i32; result } @@ -178,10 +187,14 @@ extern "C" fn conn_blocking_write( } } -extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool { +extern "C-unwind" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; + + // currently `recovery_download` is always called right after election + (*api).after_election(&mut (*wp)); + (*api).recovery_download(&mut (*wp), &mut (*sk)) } } @@ -259,7 +272,7 @@ extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) { } } -extern "C" fn wait_event_set( +extern "C-unwind" fn wait_event_set( wp: *mut WalProposer, timeout: ::std::os::raw::c_long, event_sk: *mut *mut Safekeeper, @@ -277,7 +290,8 @@ extern "C" fn wait_event_set( } WaitResult::Timeout => { *event_sk = std::ptr::null_mut(); - *events = crate::bindings::WL_TIMEOUT; + // WaitEventSetWait returns 0 for timeout. + *events = 0; 0 } WaitResult::Network(sk, event_mask) => { @@ -310,7 +324,7 @@ extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr { } } -extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) { +extern "C-unwind" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; @@ -318,15 +332,15 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) { } } -extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) { +extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekeeper) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; - (*api).process_safekeeper_feedback(&mut (*wp), commit_lsn) + (*api).process_safekeeper_feedback(&mut (*wp), &mut (*sk)); } } -extern "C" fn log_internal( +extern "C-unwind" fn log_internal( wp: *mut WalProposer, level: ::std::os::raw::c_int, line: *const ::std::os::raw::c_char, @@ -340,7 +354,7 @@ extern "C" fn log_internal( } } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum Level { Debug5, Debug4, @@ -385,6 +399,7 @@ pub(crate) fn create_api() -> walproposer_api { get_shmem_state: Some(get_shmem_state), start_streaming: Some(start_streaming), get_flush_rec_ptr: Some(get_flush_rec_ptr), + update_donor: Some(update_donor), get_current_timestamp: Some(get_current_timestamp), conn_error_message: Some(conn_error_message), conn_status: Some(conn_status), @@ -415,6 +430,32 @@ pub(crate) fn create_api() -> walproposer_api { } } +pub fn empty_shmem() -> crate::bindings::WalproposerShmemState { + let empty_feedback = crate::bindings::PageserverFeedback { + present: false, + currentClusterSize: 0, + last_received_lsn: 0, + disk_consistent_lsn: 0, + remote_consistent_lsn: 0, + replytime: 0, + shard_number: 0, + }; + + crate::bindings::WalproposerShmemState { + propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 }, + donor_name: [0; 64], + donor_conninfo: [0; 1024], + donor_lsn: 0, + mutex: 0, + mineLastElectedTerm: crate::bindings::pg_atomic_uint64 { value: 0 }, + backpressureThrottlingTime: crate::bindings::pg_atomic_uint64 { value: 0 }, + currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 }, + shard_ps_feedback: [empty_feedback; 128], + num_shards: 0, + min_ps_feedback: empty_feedback, + } +} + impl std::fmt::Display for Level { fn 
fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "{:?}", self) diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 7251545792..ba75171db2 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -1,26 +1,27 @@ -use std::ffi::CString; +#![allow(clippy::todo)] -use postgres_ffi::WAL_SEGMENT_SIZE; -use utils::id::TenantTimelineId; +use std::ffi::CString; use crate::{ api_bindings::{create_api, take_vec_u8, Level}, bindings::{ - NeonWALReadResult, Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, - WalProposerFree, WalProposerStart, + NeonWALReadResult, Safekeeper, WalProposer, WalProposerBroadcast, WalProposerConfig, + WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart, }, }; +use postgres_ffi::WAL_SEGMENT_SIZE; +use utils::{id::TenantTimelineId, lsn::Lsn}; /// Rust high-level wrapper for C walproposer API. Many methods are not required /// for simple cases, hence todo!() in default implementations. /// /// Refer to `pgxn/neon/walproposer.h` for documentation. pub trait ApiImpl { - fn get_shmem_state(&self) -> &mut crate::bindings::WalproposerShmemState { + fn get_shmem_state(&self) -> *mut crate::bindings::WalproposerShmemState { todo!() } - fn start_streaming(&self, _startpos: u64) { + fn start_streaming(&self, _startpos: u64, _callback: &StreamingCallback) { todo!() } @@ -28,6 +29,10 @@ pub trait ApiImpl { todo!() } + fn update_donor(&self, _donor: &mut Safekeeper, _donor_lsn: u64) { + todo!() + } + fn get_current_timestamp(&self) -> i64 { todo!() } @@ -70,7 +75,11 @@ pub trait ApiImpl { todo!() } - fn conn_async_read(&self, _sk: &mut Safekeeper) -> (&[u8], crate::bindings::PGAsyncReadResult) { + fn conn_async_read( + &self, + _sk: &mut Safekeeper, + _vec: &mut Vec, + ) -> crate::bindings::PGAsyncReadResult { todo!() } @@ -138,7 +147,7 @@ pub trait ApiImpl { todo!() } - fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) { + fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer, _sk: &mut Safekeeper) { todo!() } @@ -151,12 +160,14 @@ pub trait ApiImpl { } } +#[derive(Debug)] pub enum WaitResult { Latch, Timeout, Network(*mut Safekeeper, u32), } +#[derive(Clone)] pub struct Config { /// Tenant and timeline id pub ttid: TenantTimelineId, @@ -242,6 +253,24 @@ impl Drop for Wrapper { } } +pub struct StreamingCallback { + wp: *mut WalProposer, +} + +impl StreamingCallback { + pub fn new(wp: *mut WalProposer) -> StreamingCallback { + StreamingCallback { wp } + } + + pub fn broadcast(&self, startpos: Lsn, endpos: Lsn) { + unsafe { WalProposerBroadcast(self.wp, startpos.0, endpos.0) } + } + + pub fn poll(&self) { + unsafe { WalProposerPoll(self.wp) } + } +} + #[cfg(test)] mod tests { use core::panic; @@ -250,9 +279,14 @@ mod tests { sync::{atomic::AtomicUsize, mpsc::sync_channel}, }; + use std::cell::UnsafeCell; use utils::id::TenantTimelineId; - use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper}; + use crate::{ + api_bindings::Level, + bindings::{NeonWALReadResult, PG_VERSION_NUM}, + walproposer::Wrapper, + }; use super::ApiImpl; @@ -273,6 +307,8 @@ mod tests { replies_ptr: AtomicUsize, // channel to send LSN to the main thread sync_channel: std::sync::mpsc::SyncSender, + // Shmem state, used for storing donor info + shmem: UnsafeCell, } impl MockImpl { @@ -303,11 +339,22 @@ mod tests { } impl ApiImpl for MockImpl { + fn get_shmem_state(&self) -> *mut crate::bindings::WalproposerShmemState { + 
self.shmem.get() + } + fn get_current_timestamp(&self) -> i64 { println!("get_current_timestamp"); 0 } + fn update_donor(&self, donor: &mut crate::bindings::Safekeeper, donor_lsn: u64) { + let mut shmem = unsafe { *self.get_shmem_state() }; + shmem.propEpochStartLsn.value = donor_lsn; + shmem.donor_conninfo = donor.conninfo; + shmem.donor_lsn = donor_lsn; + } + fn conn_status( &self, _: &mut crate::bindings::Safekeeper, @@ -344,14 +391,13 @@ mod tests { fn conn_async_read( &self, _: &mut crate::bindings::Safekeeper, - ) -> (&[u8], crate::bindings::PGAsyncReadResult) { + vec: &mut Vec, + ) -> crate::bindings::PGAsyncReadResult { println!("conn_async_read"); let reply = self.next_safekeeper_reply(); println!("conn_async_read result: {:?}", reply); - ( - reply, - crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS, - ) + vec.extend_from_slice(reply); + crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS } fn conn_blocking_write(&self, _: &mut crate::bindings::Safekeeper, buf: &[u8]) -> bool { @@ -447,40 +493,82 @@ mod tests { let (sender, receiver) = sync_channel(1); + // Messages definitions are at walproposer.h + // xxx: it would be better to extract them from safekeeper crate and + // use serialization/deserialization here. + let greeting_tag = (b'g' as u64).to_ne_bytes(); + let proto_version = 2_u32.to_ne_bytes(); + let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes(); + let proposer_id = [0; 16]; + let system_id = 0_u64.to_ne_bytes(); + let tenant_id = ttid.tenant_id.as_arr(); + let timeline_id = ttid.timeline_id.as_arr(); + let pg_tli = 1_u32.to_ne_bytes(); + let wal_seg_size = 16777216_u32.to_ne_bytes(); + let proposer_greeting = [ + greeting_tag.as_slice(), + proto_version.as_slice(), + pg_version.as_slice(), + proposer_id.as_slice(), + system_id.as_slice(), + tenant_id.as_slice(), + timeline_id.as_slice(), + pg_tli.as_slice(), + wal_seg_size.as_slice(), + ] + .concat(); + + let voting_tag = (b'v' as u64).to_ne_bytes(); + let vote_request_term = 3_u64.to_ne_bytes(); + let proposer_id = [0; 16]; + let vote_request = [ + voting_tag.as_slice(), + vote_request_term.as_slice(), + proposer_id.as_slice(), + ] + .concat(); + + let acceptor_greeting_term = 2_u64.to_ne_bytes(); + let acceptor_greeting_node_id = 1_u64.to_ne_bytes(); + let acceptor_greeting = [ + greeting_tag.as_slice(), + acceptor_greeting_term.as_slice(), + acceptor_greeting_node_id.as_slice(), + ] + .concat(); + + let vote_response_term = 3_u64.to_ne_bytes(); + let vote_given = 1_u64.to_ne_bytes(); + let flush_lsn = 0x539_u64.to_ne_bytes(); + let truncate_lsn = 0x539_u64.to_ne_bytes(); + let th_len = 1_u32.to_ne_bytes(); + let th_term = 2_u64.to_ne_bytes(); + let th_lsn = 0x539_u64.to_ne_bytes(); + let timeline_start_lsn = 0x539_u64.to_ne_bytes(); + let vote_response = [ + voting_tag.as_slice(), + vote_response_term.as_slice(), + vote_given.as_slice(), + flush_lsn.as_slice(), + truncate_lsn.as_slice(), + th_len.as_slice(), + th_term.as_slice(), + th_lsn.as_slice(), + timeline_start_lsn.as_slice(), + ] + .concat(); + let my_impl: Box = Box::new(MockImpl { wait_events: Cell::new(WaitEventsData { sk: std::ptr::null_mut(), event_mask: 0, }), - expected_messages: vec![ - // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160001, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 }) - vec![ - 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 113, 2, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110, - 147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147, - 188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1, - ], - // VoteRequest(VoteRequest { term: 3 }) - vec![ - 118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - ], - ], + expected_messages: vec![proposer_greeting, vote_request], expected_ptr: AtomicUsize::new(0), - safekeeper_replies: vec![ - // Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) }) - vec![ - 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, - ], - // VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 }) - vec![ - 118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57, - 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, - 0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, - ], - ], + safekeeper_replies: vec![acceptor_greeting, vote_response], replies_ptr: AtomicUsize::new(0), sync_channel: sender, + shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()), }); let config = crate::walproposer::Config { ttid, diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 980fbab22e..24373afca3 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -8,20 +8,21 @@ license.workspace = true default = [] # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions -testing = ["fail/failpoints"] +testing = ["fail/failpoints", "pageserver_api/testing" ] [dependencies] anyhow.workspace = true +arc-swap.workspace = true async-compression.workspace = true async-stream.workspace = true async-trait.workspace = true +bit_field.workspace = true byteorder.workspace = true bytes.workspace = true camino.workspace = true camino-tempfile.workspace = true chrono = { workspace = true, features = ["serde"] } clap = { workspace = true, features = ["string"] } -close_fds.workspace = true const_format.workspace = true consumption_metrics.workspace = true crc32c.workspace = true @@ -48,8 +49,10 @@ postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true rand.workspace = true +range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true scopeguard.workspace = true +send-future.workspace = true serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } serde_path_to_error.workspace = true @@ -58,19 +61,24 @@ signal-hook.workspace = true smallvec = { workspace = true, features = ["write"] } svg_fmt.workspace = true sync_wrapper.workspace = true +sysinfo.workspace = true tokio-tar.workspace = true thiserror.workspace = true +tikv-jemallocator.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } +tokio-epoll-uring.workspace = true tokio-io-timeout.workspace = true tokio-postgres.workspace = true tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true +twox-hash.workspace = true url.workspace = true walkdir.workspace = true metrics.workspace = true pageserver_api.workspace = true +pageserver_compaction.workspace = true postgres_connection.workspace = true postgres_ffi.workspace = true pq_proto.workspace = true @@ -82,14 
+90,18 @@ workspace_hack.workspace = true reqwest.workspace = true rpds.workspace = true enum-map.workspace = true -enumset.workspace = true +enumset = { workspace = true, features = ["serde"]} strum.workspace = true strum_macros.workspace = true +[target.'cfg(target_os = "linux")'.dependencies] +procfs.workspace = true + [dev-dependencies] criterion.workspace = true hex-literal.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] } +indoc.workspace = true [[bench]] name = "bench_layer_map" @@ -98,3 +110,7 @@ harness = false [[bench]] name = "bench_walredo" harness = false + +[[bench]] +name = "bench_ingest" +harness = false diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs new file mode 100644 index 0000000000..72cbb6beab --- /dev/null +++ b/pageserver/benches/bench_ingest.rs @@ -0,0 +1,258 @@ +use std::{env, num::NonZeroUsize}; + +use bytes::Bytes; +use camino::Utf8PathBuf; +use criterion::{criterion_group, criterion_main, Criterion}; +use pageserver::{ + config::PageServerConf, + context::{DownloadBehavior, RequestContext}, + l0_flush::{L0FlushConfig, L0FlushGlobalState}, + page_cache, + repository::Value, + task_mgr::TaskKind, + tenant::storage_layer::inmemory_layer::SerializedBatch, + tenant::storage_layer::InMemoryLayer, + virtual_file, +}; +use pageserver_api::{key::Key, shard::TenantShardId}; +use utils::{ + bin_ser::BeSer, + id::{TenantId, TimelineId}, +}; + +// A very cheap hash for generating non-sequential keys. +fn murmurhash32(mut h: u32) -> u32 { + h ^= h >> 16; + h = h.wrapping_mul(0x85ebca6b); + h ^= h >> 13; + h = h.wrapping_mul(0xc2b2ae35); + h ^= h >> 16; + h +} + +enum KeyLayout { + /// Sequential unique keys + Sequential, + /// Random unique keys + Random, + /// Random keys, but only use the bits from the mask of them + RandomReuse(u32), +} + +enum WriteDelta { + Yes, + No, +} + +async fn ingest( + conf: &'static PageServerConf, + put_size: usize, + put_count: usize, + key_layout: KeyLayout, + write_delta: WriteDelta, +) -> anyhow::Result<()> { + let mut lsn = utils::lsn::Lsn(1000); + let mut key = Key::from_i128(0x0); + + let timeline_id = TimelineId::generate(); + let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?; + + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + + let gate = utils::sync::gate::Gate::default(); + let entered = gate.enter().unwrap(); + + let layer = + InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?; + + let data = Value::Image(Bytes::from(vec![0u8; put_size])); + let data_ser_size = data.serialized_size().unwrap() as usize; + let ctx = RequestContext::new( + pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler, + pageserver::context::DownloadBehavior::Download, + ); + + const BATCH_SIZE: usize = 16; + let mut batch = Vec::new(); + + for i in 0..put_count { + lsn += put_size as u64; + + // Generate lots of keys within a single relation, which simulates the typical bulk ingest case: people + // usually care the most about write performance when they're blasting a huge batch of data into a huge table. + match key_layout { + KeyLayout::Sequential => { + // Use sequential order to illustrate the experience a user is likely to have + // when ingesting bulk data. 
+ key.field6 = i as u32; + } + KeyLayout::Random => { + // Use random-order keys to avoid giving a false advantage to data structures that are + // faster when inserting on the end. + key.field6 = murmurhash32(i as u32); + } + KeyLayout::RandomReuse(mask) => { + // Use low bits only, to limit cardinality + key.field6 = murmurhash32(i as u32) & mask; + } + } + + batch.push((key.to_compact(), lsn, data_ser_size, data.clone())); + if batch.len() >= BATCH_SIZE { + let this_batch = std::mem::take(&mut batch); + let serialized = SerializedBatch::from_values(this_batch).unwrap(); + layer.put_batch(serialized, &ctx).await?; + } + } + if !batch.is_empty() { + let this_batch = std::mem::take(&mut batch); + let serialized = SerializedBatch::from_values(this_batch).unwrap(); + layer.put_batch(serialized, &ctx).await?; + } + layer.freeze(lsn + 1).await; + + if matches!(write_delta, WriteDelta::Yes) { + let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct { + max_concurrency: NonZeroUsize::new(1).unwrap(), + }); + let (_desc, path) = layer + .write_to_disk(&ctx, None, l0_flush_state.inner()) + .await? + .unwrap(); + tokio::fs::remove_file(path).await?; + } + + Ok(()) +} + +/// Wrapper to instantiate a tokio runtime +fn ingest_main( + conf: &'static PageServerConf, + put_size: usize, + put_count: usize, + key_layout: KeyLayout, + write_delta: WriteDelta, +) { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + runtime.block_on(async move { + let r = ingest(conf, put_size, put_count, key_layout, write_delta).await; + if let Err(e) = r { + panic!("{e:?}"); + } + }); +} + +/// Declare a series of benchmarks for the Pageserver's ingest write path. +/// +/// This benchmark does not include WAL decode: it starts at InMemoryLayer::put_value, and ends either +/// at freezing the ephemeral layer, or writing the ephemeral layer out to an L0 (depending on whether WriteDelta is set). +/// +/// Genuine disk I/O is used, so expect results to differ depending on storage. However, when running on +/// a fast disk, CPU is the bottleneck at time of writing. 
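For reference while reading the benchmark declarations below: the three `KeyLayout` modes above derive `key.field6` from the loop index as follows. A standalone sketch (hypothetical `main`) reusing the bench's `murmurhash32` finalizer:

```rust
// Same finalizer as in bench_ingest.rs; it is bijective on u32, so the
// Random layout still produces unique keys, just in scattered order.
fn murmurhash32(mut h: u32) -> u32 {
    h ^= h >> 16;
    h = h.wrapping_mul(0x85ebca6b);
    h ^= h >> 13;
    h = h.wrapping_mul(0xc2b2ae35);
    h ^= h >> 16;
    h
}

fn main() {
    for i in 0u32..4 {
        let sequential = i; // KeyLayout::Sequential uses the index directly
        let random = murmurhash32(i); // KeyLayout::Random scatters it
        let reused = murmurhash32(i) & 0x3ff; // KeyLayout::RandomReuse(0x3ff)
        println!("i={i}: seq={sequential} rand={random:#010x} reuse={reused}");
    }
}
```

Because the finalizer is invertible, `Random` keeps keys unique while destroying locality, and the `0x3ff` mask in `RandomReuse` caps the key space at 1024 distinct values.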
+fn criterion_benchmark(c: &mut Criterion) { + let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into().unwrap(); + let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent).unwrap(); + eprintln!("Data directory: {}", temp_dir.path()); + + let conf: &'static PageServerConf = Box::leak(Box::new( + pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()), + )); + virtual_file::init( + 16384, + virtual_file::io_engine_for_bench(), + pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, + ); + page_cache::init(conf.page_cache_size); + + { + let mut group = c.benchmark_group("ingest-small-values"); + let put_size = 100usize; + let put_count = 128 * 1024 * 1024 / put_size; + group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64)); + group.sample_size(10); + group.bench_function("ingest 128MB/100b seq", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/100b rand", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Random, + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/100b rand-1024keys", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::RandomReuse(0x3ff), + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/100b seq, no delta", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::No, + ) + }) + }); + } + + { + let mut group = c.benchmark_group("ingest-big-values"); + let put_size = 8192usize; + let put_count = 128 * 1024 * 1024 / put_size; + group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64)); + group.sample_size(10); + group.bench_function("ingest 128MB/8k seq", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::Yes, + ) + }) + }); + group.bench_function("ingest 128MB/8k seq, no delta", |b| { + b.iter(|| { + ingest_main( + conf, + put_size, + put_count, + KeyLayout::Sequential, + WriteDelta::No, + ) + }) + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 5d05af0c00..1353e79f7c 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,7 +1,8 @@ +use criterion::measurement::WallTime; use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::storage_layer::PersistentLayerDesc; use pageserver_api::shard::TenantShardId; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; @@ -15,7 +16,11 @@ use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion}; + +fn fixture_path(relative: &str) -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative) +} fn build_layer_map(filename_dump: PathBuf) -> LayerMap { let mut layer_map = LayerMap::default(); @@ -28,7 +33,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { let mut updates = layer_map.batch_update(); for fname in filenames { let fname = fname.unwrap(); - let fname = 
LayerFileName::from_str(&fname).unwrap(); + let fname = LayerName::from_str(&fname).unwrap(); let layer = PersistentLayerDesc::from(fname); let lsn_range = layer.get_lsn_range(); @@ -109,7 +114,7 @@ fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning // between each test run. fn bench_from_captest_env(c: &mut Criterion) { // TODO consider compressing this file - let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map); // Test with uniform query pattern @@ -139,7 +144,7 @@ fn bench_from_captest_env(c: &mut Criterion) { fn bench_from_real_project(c: &mut Criterion) { // Init layer map let now = Instant::now(); - let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); println!("Finished layer map init in {:?}", now.elapsed()); // Choose uniformly distributed queries @@ -242,7 +247,72 @@ fn bench_sequential(c: &mut Criterion) { group.finish(); } +fn bench_visibility_with_map( + group: &mut BenchmarkGroup, + layer_map: LayerMap, + read_points: Vec, + bench_name: &str, +) { + group.bench_function(bench_name, |b| { + b.iter(|| black_box(layer_map.get_visibility(read_points.clone()))); + }); +} + +// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines. +fn bench_visibility(c: &mut Criterion) { + let mut group = c.benchmark_group("visibility"); + { + // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines. + let now = Instant::now(); + let mut layer_map = LayerMap::default(); + let mut updates = layer_map.batch_update(); + for i in 0..100_000 { + let i32 = (i as u32) % 100; + let zero = Key::from_hex("000000000000000000000000000000000000").unwrap(); + let layer = PersistentLayerDesc::new_img( + TenantShardId::unsharded(TenantId::generate()), + TimelineId::generate(), + zero.add(10 * i32)..zero.add(10 * i32 + 1), + Lsn(i), + 0, + ); + updates.insert_historic(layer); + } + updates.flush(); + println!("Finished layer map init in {:?}", now.elapsed()); + + let mut read_points = Vec::new(); + for i in (0..100_000).step_by(1000) { + read_points.push(Lsn(i)); + } + + bench_visibility_with_map(&mut group, layer_map, read_points, "sequential"); + } + + { + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); + let read_points = vec![Lsn(0x1C760FA190)]; + bench_visibility_with_map(&mut group, layer_map, read_points, "real_map"); + + let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt")); + let read_points = vec![ + Lsn(0x1C760FA190), + Lsn(0x000000931BEAD539), + Lsn(0x000000931BF63011), + Lsn(0x000000931B33AE68), + Lsn(0x00000038E67ABFA0), + Lsn(0x000000931B33AE68), + Lsn(0x000000914E3F38F0), + Lsn(0x000000931B33AE68), + ]; + bench_visibility_with_map(&mut group, layer_map, read_points, "real_map_many_branches"); + } + + group.finish(); +} + criterion_group!(group_1, bench_from_captest_env); criterion_group!(group_2, bench_from_real_project); criterion_group!(group_3, bench_sequential); -criterion_main!(group_1, group_2, group_3); +criterion_group!(group_4, bench_visibility); +criterion_main!(group_1, group_2, group_3, group_4); diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 4837626086..edc09d0bf2 100644 --- a/pageserver/benches/bench_walredo.rs +++ 
b/pageserver/benches/bench_walredo.rs @@ -1,191 +1,157 @@ -//! Simple benchmarking around walredo. //! -//! Right now they hope to just set a baseline. Later we can try to expand into latency and -//! throughput after figuring out the coordinated omission problems below. +//! The benchmark implementation ([`bench_impl`]) is parametrized by +//! - `redo_work` => [`Request::short_input`] or [`Request::medium_input`] +//! - `n_redos` => number of times the benchmark shall execute the `redo_work` +//! - `nclients` => number of clients (more on this shortly). //! -//! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by -//! logging what happens when a sequential scan is requested on a small table, then picking out two -//! suitable from logs. - -use std::sync::{Arc, Barrier}; +//! The benchmark impl sets up a multi-threaded tokio runtime with default parameters. +//! It spawns `nclients` times [`client`] tokio tasks. +//! Each task executes the `redo_work` `n_redos/nclients` times. +//! +//! We exercise the following combinations: +//! - `redo_work = short / medium` +//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]` +//! +//! We let `criterion` determine the `n_redos` using `iter_custom`. +//! The idea is that for each `(redo_work, nclients)` combination, +//! criterion will run the `bench_impl` multiple times with different `n_redos`. +//! The `bench_impl` reports the aggregate wall clock time from the clients' perspective. +//! Criterion will divide that by `n_redos` to compute the "time per iteration". +//! In our case, "time per iteration" means "time per redo_work execution". +//! +//! NB: the way by which `iter_custom` determines the "number of iterations" +//! is called sampling. Apparently the idea here is to detect outliers. +//! We're not sure whether the current choice of sampling method makes sense. +//! See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples +//! +//! # Reference Numbers +//! +//! 2024-04-15 on i3en.3xlarge +//! +//! ```text +//! short/1 time: [24.584 µs 24.737 µs 24.922 µs] +//! short/2 time: [33.479 µs 33.660 µs 33.888 µs] +//! short/4 time: [42.713 µs 43.046 µs 43.440 µs] +//! short/8 time: [71.814 µs 72.478 µs 73.240 µs] +//! short/16 time: [132.73 µs 134.45 µs 136.22 µs] +//! short/32 time: [258.31 µs 260.73 µs 263.27 µs] +//! short/64 time: [511.61 µs 514.44 µs 517.51 µs] +//! short/128 time: [992.64 µs 998.23 µs 1.0042 ms] +//! medium/1 time: [110.11 µs 110.50 µs 110.96 µs] +//! medium/2 time: [153.06 µs 153.85 µs 154.99 µs] +//! medium/4 time: [317.51 µs 319.92 µs 322.85 µs] +//! medium/8 time: [638.30 µs 644.68 µs 652.12 µs] +//! medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms] +//! medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms] +//! medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms] +//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] +//!
``` +use anyhow::Context; use bytes::{Buf, Bytes}; -use pageserver::{ - config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager, +use criterion::{BenchmarkId, Criterion}; +use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; +use pageserver_api::{key::Key, shard::TenantShardId}; +use std::{ + sync::Arc, + time::{Duration, Instant}, }; -use pageserver_api::shard::TenantShardId; +use tokio::{sync::Barrier, task::JoinSet}; use utils::{id::TenantId, lsn::Lsn}; -use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; - -fn redo_scenarios(c: &mut Criterion) { - // logging should be enabled when adding more inputs, since walredo will only report malformed - // input to the stderr. - // utils::logging::init(utils::logging::LogFormat::Plain).unwrap(); +fn bench(c: &mut Criterion) { + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group("short"); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::short_input()); + b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); + }, + ); + } + } + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group("medium"); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::medium_input()); + b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); + }, + ); + } + } +} +criterion::criterion_group!(benches, bench); +criterion::criterion_main!(benches); +// Returns the sum of each client's wall-clock time spent executing their share of the n_redos. +fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration { let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap(); let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = Box::leak(Box::new(conf)); let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); - let manager = PostgresRedoManager::new(conf, tenant_shard_id); + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + let start = Arc::new(Barrier::new(nclients as usize)); + + let mut tasks = JoinSet::new(); + + let manager = PostgresRedoManager::new(conf, tenant_shard_id); let manager = Arc::new(manager); - { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - tracing::info!("executing first"); - short().execute(rt.handle(), &manager).unwrap(); - tracing::info!("first executed"); + // divide the amount of work equally among the clients. 
+ let nredos_per_client = n_redos / nclients; + for _ in 0..nclients { + rt.block_on(async { + tasks.spawn(client( + Arc::clone(&manager), + Arc::clone(&start), + Arc::clone(&redo_work), + nredos_per_client, + )) + }); } - let thread_counts = [1, 2, 4, 8, 16]; - - let mut group = c.benchmark_group("short"); - group.sampling_mode(criterion::SamplingMode::Flat); - - for thread_count in thread_counts { - group.bench_with_input( - BenchmarkId::new("short", thread_count), - &thread_count, - |b, thread_count| { - add_multithreaded_walredo_requesters(b, *thread_count, &manager, short); - }, - ); - } - drop(group); - - let mut group = c.benchmark_group("medium"); - group.sampling_mode(criterion::SamplingMode::Flat); - - for thread_count in thread_counts { - group.bench_with_input( - BenchmarkId::new("medium", thread_count), - &thread_count, - |b, thread_count| { - add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium); - }, - ); - } - drop(group); -} - -/// Sets up `threads` number of requesters to `request_redo`, with the given input. -fn add_multithreaded_walredo_requesters( - b: &mut criterion::Bencher, - threads: u32, - manager: &Arc, - input_factory: fn() -> Request, -) { - assert_ne!(threads, 0); - - if threads == 1 { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - let handle = rt.handle(); - b.iter_batched_ref( - || Some(input_factory()), - |input| execute_all(input.take(), handle, manager), - criterion::BatchSize::PerIteration, - ); - } else { - let (work_tx, work_rx) = std::sync::mpsc::sync_channel(threads as usize); - - let work_rx = std::sync::Arc::new(std::sync::Mutex::new(work_rx)); - - let barrier = Arc::new(Barrier::new(threads as usize + 1)); - - let jhs = (0..threads) - .map(|_| { - std::thread::spawn({ - let manager = manager.clone(); - let barrier = barrier.clone(); - let work_rx = work_rx.clone(); - move || { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - let handle = rt.handle(); - loop { - // queue up and wait if we want to go another round - if work_rx.lock().unwrap().recv().is_err() { - break; - } - - let input = Some(input_factory()); - - barrier.wait(); - - execute_all(input, handle, &manager).unwrap(); - - barrier.wait(); - } - } - }) - }) - .collect::>(); - - let _jhs = JoinOnDrop(jhs); - - b.iter_batched( - || { - for _ in 0..threads { - work_tx.send(()).unwrap() - } - }, - |()| { - // start the work - barrier.wait(); - - // wait for work to complete - barrier.wait(); - }, - criterion::BatchSize::PerIteration, - ); - - drop(work_tx); - } -} - -struct JoinOnDrop(Vec>); - -impl Drop for JoinOnDrop { - // it's not really needless because we want join all then check for panicks - #[allow(clippy::needless_collect)] - fn drop(&mut self) { - // first join all - let results = self.0.drain(..).map(|jh| jh.join()).collect::>(); - // then check the results; panicking here is not great, but it does get the message across - // to the user, and sets an exit value. 
- results.into_iter().try_for_each(|res| res).unwrap(); - } -} - -fn execute_all( - input: I, - handle: &tokio::runtime::Handle, - manager: &PostgresRedoManager, -) -> anyhow::Result<()> -where - I: IntoIterator, -{ - // just fire all requests as fast as possible - input.into_iter().try_for_each(|req| { - let page = req.execute(handle, manager)?; - assert_eq!(page.remaining(), 8192); - anyhow::Ok(()) + rt.block_on(async move { + let mut total_wallclock_time = Duration::ZERO; + while let Some(res) = tasks.join_next().await { + total_wallclock_time += res.unwrap(); + } + total_wallclock_time }) } -criterion_group!(benches, redo_scenarios); -criterion_main!(benches); +async fn client( + mgr: Arc, + start: Arc, + redo_work: Arc, + n_redos: u64, +) -> Duration { + start.wait().await; + let start = Instant::now(); + for _ in 0..n_redos { + let page = redo_work.execute(&mgr).await.unwrap(); + assert_eq!(page.remaining(), 8192); + // The real pageserver will rarely if ever do 2 walredos in a row without + // yielding to the executor. + tokio::task::yield_now().await; + } + start.elapsed() +} macro_rules! lsn { ($input:expr) => {{ @@ -197,12 +163,47 @@ macro_rules! lsn { }}; } -/// Short payload, 1132 bytes. -// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0 -// for null bytes. -#[allow(clippy::octal_escapes)] -fn short() -> Request { - Request { +/// Simple wrapper around `WalRedoManager::request_redo`. +/// +/// In benchmarks this is cloned around. +#[derive(Clone)] +struct Request { + key: Key, + lsn: Lsn, + base_img: Option<(Lsn, Bytes)>, + records: Vec<(Lsn, NeonWalRecord)>, + pg_version: u32, +} + +impl Request { + async fn execute(&self, manager: &PostgresRedoManager) -> anyhow::Result { + let Request { + key, + lsn, + base_img, + records, + pg_version, + } = self; + + // TODO: avoid these clones + manager + .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version) + .await + .context("request_redo") + } + + fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord { + let rec = Bytes::from_static(bytes); + NeonWalRecord::Postgres { will_init, rec } + } + + /// Short payload, 1132 bytes. + // pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0 + // for null bytes. + #[allow(clippy::octal_escapes)] + pub fn short_input() -> Request { + let pg_record = Self::pg_record; + Request { key: Key { field1: 0, field2: 1663, @@ -225,13 +226,14 @@ fn short() -> Request { ], pg_version: 14, } -} + } -/// Medium sized payload, serializes as 26393 bytes. -// see [`short`] -#[allow(clippy::octal_escapes)] -fn medium() -> Request { - Request { + /// Medium sized payload, serializes as 26393 bytes. + // see [`short`] + #[allow(clippy::octal_escapes)] + pub fn medium_input() -> Request { + let pg_record = Self::pg_record; + Request { key: Key { field1: 0, field2: 1663, @@ -473,39 +475,5 @@ fn medium() -> Request { ], pg_version: 14, } -} - -fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord { - let rec = Bytes::from_static(bytes); - NeonWalRecord::Postgres { will_init, rec } -} - -/// Simple wrapper around `WalRedoManager::request_redo`. -/// -/// In benchmarks this is cloned around. 
-#[derive(Clone)]
-struct Request {
-    key: Key,
-    lsn: Lsn,
-    base_img: Option<(Lsn, Bytes)>,
-    records: Vec<(Lsn, NeonWalRecord)>,
-    pg_version: u32,
-}
-
-impl Request {
-    fn execute(
-        self,
-        rt: &tokio::runtime::Handle,
-        manager: &PostgresRedoManager,
-    ) -> anyhow::Result<Bytes> {
-        let Request {
-            key,
-            lsn,
-            base_img,
-            records,
-            pg_version,
-        } = self;
-
-        rt.block_on(manager.request_redo(key, lsn, base_img, records, pg_version))
     }
 }
diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml
index 0ed27602cd..d9b36bf3d4 100644
--- a/pageserver/client/Cargo.toml
+++ b/pageserver/client/Cargo.toml
@@ -7,8 +7,7 @@ license.workspace = true
 [dependencies]
 pageserver_api.workspace = true
 thiserror.workspace = true
-async-trait.workspace = true
-reqwest.workspace = true
+reqwest = { workspace = true, features = [ "stream" ] }
 utils.workspace = true
 serde.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 077c3909e1..737cb00835 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,13 +1,20 @@
+use std::collections::HashMap;
+
+use bytes::Bytes;
+use detach_ancestor::AncestorDetached;
 use pageserver_api::{models::*, shard::TenantShardId};
 use reqwest::{IntoUrl, Method, StatusCode};
 use utils::{
     http::error::HttpErrorBody,
     id::{TenantId, TimelineId},
+    lsn::Lsn,
 };
+
+pub use reqwest::Body as ReqwestBody;
+
 pub mod util;
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct Client {
     mgmt_api_endpoint: String,
     authorization_header: Option<String>,
@@ -16,6 +23,9 @@ pub struct Client {
 
 #[derive(thiserror::Error, Debug)]
 pub enum Error {
+    #[error("send request: {0}")]
+    SendRequest(reqwest::Error),
+
     #[error("receive body: {0}")]
     ReceiveBody(reqwest::Error),
 
@@ -24,6 +34,9 @@ pub enum Error {
 
     #[error("pageserver API: {1}")]
     ApiError(StatusCode, String),
+
+    #[error("Cancelled")]
+    Cancelled,
 }
 
 pub type Result<T> = std::result::Result<T, Error>;
 
@@ -56,10 +69,18 @@ pub enum ForceAwaitLogicalSize {
 impl Client {
     pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
+        Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt)
+    }
+
+    pub fn from_client(
+        client: reqwest::Client,
+        mgmt_api_endpoint: String,
+        jwt: Option<&str>,
+    ) -> Self {
         Self {
             mgmt_api_endpoint,
             authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")),
-            client: reqwest::Client::new(),
+            client,
         }
     }
 
@@ -69,6 +90,25 @@ impl Client {
         resp.json().await.map_err(Error::ReceiveBody)
     }
 
+    /// Get an arbitrary path and return a streaming Response. This function is suitable
+    /// for pass-through/proxy use cases where we don't care what the response content looks
+    /// like.
+    ///
+    /// Use/add one of the properly typed methods below if you know you aren't proxying, and
+    /// know what kind of response you expect.
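+    ///
+    /// A hedged usage sketch (the `proxy_get` caller is hypothetical, not part
+    /// of this crate):
+    ///
+    /// ```ignore
+    /// async fn proxy_get(client: &Client, path: String) -> Result<bytes::Bytes> {
+    ///     let resp = client.get_raw(path).await?;
+    ///     // Forward the body unchanged; we never deserialize it.
+    ///     resp.bytes().await.map_err(Error::ReceiveBody)
+    /// }
+    /// ```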
+ pub async fn get_raw(&self, path: String) -> Result { + debug_assert!(path.starts_with('/')); + let uri = format!("{}{}", self.mgmt_api_endpoint, path); + + let req = self.client.request(Method::GET, uri); + let req = if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value) + } else { + req + }; + req.send().await.map_err(Error::ReceiveBody) + } + pub async fn tenant_details( &self, tenant_shard_id: TenantShardId, @@ -98,12 +138,12 @@ impl Client { pub async fn timeline_info( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, force_await_logical_size: ForceAwaitLogicalSize, ) -> Result { let uri = format!( - "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}", + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}", self.mgmt_api_endpoint ); @@ -121,11 +161,11 @@ impl Client { pub async fn keyspace( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result { let uri = format!( - "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace", + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace", self.mgmt_api_endpoint ); self.get(&uri) @@ -139,19 +179,39 @@ impl Client { self.request(Method::GET, uri, ()).await } + fn start_request( + &self, + method: Method, + uri: U, + ) -> reqwest::RequestBuilder { + let req = self.client.request(method, uri); + if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value) + } else { + req + } + } + + async fn request_noerror( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + self.start_request(method, uri) + .json(&body) + .send() + .await + .map_err(Error::ReceiveBody) + } + async fn request( &self, method: Method, uri: U, body: B, ) -> Result { - let req = self.client.request(method, uri); - let req = if let Some(value) = &self.authorization_header { - req.header(reqwest::header::AUTHORIZATION, value) - } else { - req - }; - let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?; + let res = self.request_noerror(method, uri, body).await?; let response = res.error_from_body().await?; Ok(response) } @@ -162,13 +222,50 @@ impl Client { Ok(()) } - pub async fn tenant_create(&self, req: &TenantCreateRequest) -> Result { - let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint); - self.request(Method::POST, &uri, req) - .await? - .json() - .await - .map_err(Error::ReceiveBody) + /// The tenant deletion API can return 202 if deletion is incomplete, or + /// 404 if it is complete. Callers are responsible for checking the status + /// code and retrying. Error codes other than 404 will return Err(). 
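+    ///
+    /// A hedged sketch of the retry loop this contract implies (the polling
+    /// interval is illustrative only):
+    ///
+    /// ```ignore
+    /// loop {
+    ///     if client.tenant_delete(tenant_shard_id).await? == StatusCode::NOT_FOUND {
+    ///         break; // deletion has completed
+    ///     }
+    ///     tokio::time::sleep(std::time::Duration::from_millis(500)).await;
+    /// }
+    /// ```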
+ pub async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result { + let uri = format!("{}/v1/tenant/{tenant_shard_id}", self.mgmt_api_endpoint); + + match self.request(Method::DELETE, &uri, ()).await { + Err(Error::ApiError(status_code, msg)) => { + if status_code == StatusCode::NOT_FOUND { + Ok(StatusCode::NOT_FOUND) + } else { + Err(Error::ApiError(status_code, msg)) + } + } + Err(e) => Err(e), + Ok(response) => Ok(response.status()), + } + } + + pub async fn tenant_time_travel_remote_storage( + &self, + tenant_shard_id: TenantShardId, + timestamp: &str, + done_if_after: &str, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/time_travel_remote_storage?travel_to={timestamp}&done_if_after={done_if_after}", + self.mgmt_api_endpoint + ); + self.request(Method::PUT, &uri, ()).await?; + Ok(()) + } + + pub async fn tenant_scan_remote_storage( + &self, + tenant_id: TenantId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_id}/scan_remote_storage", + self.mgmt_api_endpoint + ); + let response = self.request(Method::GET, &uri, ()).await?; + let body = response.json().await.map_err(Error::ReceiveBody)?; + Ok(body) } pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> { @@ -177,12 +274,53 @@ impl Client { Ok(()) } - pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> { - let uri = format!( + pub async fn tenant_secondary_download( + &self, + tenant_id: TenantShardId, + wait: Option, + ) -> Result<(StatusCode, SecondaryProgress)> { + let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/secondary/download", self.mgmt_api_endpoint, tenant_id - ); - self.request(Method::POST, &uri, ()).await?; + )) + .expect("Cannot build URL"); + + if let Some(wait) = wait { + path.query_pairs_mut() + .append_pair("wait_ms", &format!("{}", wait.as_millis())); + } + + let response = self.request(Method::POST, path, ()).await?; + let status = response.status(); + let progress: SecondaryProgress = response.json().await.map_err(Error::ReceiveBody)?; + Ok((status, progress)) + } + + pub async fn tenant_secondary_status( + &self, + tenant_shard_id: TenantShardId, + ) -> Result { + let path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{}/secondary/status", + self.mgmt_api_endpoint, tenant_shard_id + )) + .expect("Cannot build URL"); + + self.request(Method::GET, path, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> { + let path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{}/heatmap_upload", + self.mgmt_api_endpoint, tenant_id + )) + .expect("Cannot build URL"); + + self.request(Method::POST, path, ()).await?; Ok(()) } @@ -191,21 +329,27 @@ impl Client { tenant_shard_id: TenantShardId, config: LocationConfig, flush_ms: Option, + lazy: bool, ) -> Result<()> { - let req_body = TenantLocationConfigRequest { - tenant_id: tenant_shard_id, - config, - }; - let path = format!( + let req_body = TenantLocationConfigRequest { config }; + + let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/location_config", self.mgmt_api_endpoint, tenant_shard_id - ); - let path = if let Some(flush_ms) = flush_ms { - format!("{}?flush_ms={}", path, flush_ms.as_millis()) - } else { - path - }; - self.request(Method::PUT, &path, &req_body).await?; + )) + // Should always work: mgmt_api_endpoint is configuration, not user input. 
+        .expect("Cannot build URL");
+
+        if lazy {
+            path.query_pairs_mut().append_pair("lazy", "true");
+        }
+
+        if let Some(flush_ms) = flush_ms {
+            path.query_pairs_mut()
+                .append_pair("flush_ms", &format!("{}", flush_ms.as_millis()));
+        }
+
+        self.request(Method::PUT, path, &req_body).await?;
         Ok(())
     }
 
@@ -218,6 +362,21 @@ impl Client {
             .map_err(Error::ReceiveBody)
     }
 
+    pub async fn get_location_config(
+        &self,
+        tenant_shard_id: TenantShardId,
+    ) -> Result<Option<LocationConfig>> {
+        let path = format!(
+            "{}/v1/location_config/{tenant_shard_id}",
+            self.mgmt_api_endpoint
+        );
+        self.request(Method::GET, &path, ())
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
     pub async fn timeline_create(
         &self,
         tenant_shard_id: TenantShardId,
@@ -234,6 +393,67 @@ impl Client {
             .map_err(Error::ReceiveBody)
     }
 
+    /// The timeline deletion API can return 202 if deletion is incomplete, or
+    /// 404 if it is complete. Callers are responsible for checking the status
+    /// code and retrying. Error codes other than 404 will return Err().
+    pub async fn timeline_delete(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+    ) -> Result<StatusCode> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
+            self.mgmt_api_endpoint
+        );
+
+        match self.request(Method::DELETE, &uri, ()).await {
+            Err(Error::ApiError(status_code, msg)) => {
+                if status_code == StatusCode::NOT_FOUND {
+                    Ok(StatusCode::NOT_FOUND)
+                } else {
+                    Err(Error::ApiError(status_code, msg))
+                }
+            }
+            Err(e) => Err(e),
+            Ok(response) => Ok(response.status()),
+        }
+    }
+
+    pub async fn timeline_archival_config(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        req: &TimelineArchivalConfigRequest,
+    ) -> Result<()> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config",
+            self.mgmt_api_endpoint
+        );
+
+        self.request(Method::POST, &uri, req)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
+    pub async fn timeline_detach_ancestor(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+    ) -> Result<AncestorDetached> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor",
+            self.mgmt_api_endpoint
+        );
+
+        self.request(Method::PUT, &uri, ())
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
     pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
         let uri = format!(
             "{}/v1/tenant/{}/reset",
@@ -246,6 +466,22 @@ impl Client {
             .map_err(Error::ReceiveBody)
     }
 
+    pub async fn tenant_shard_split(
+        &self,
+        tenant_shard_id: TenantShardId,
+        req: TenantShardSplitRequest,
+    ) -> Result<TenantShardSplitResponse> {
+        let uri = format!(
+            "{}/v1/tenant/{}/shard_split",
+            self.mgmt_api_endpoint, tenant_shard_id
+        );
+        self.request(Method::PUT, &uri, req)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
     pub async fn timeline_list(
         &self,
         tenant_shard_id: &TenantShardId,
@@ -275,4 +511,213 @@ impl Client {
             .await
             .map_err(Error::ReceiveBody)
     }
+
+    pub async fn put_io_engine(
+        &self,
+        engine: &pageserver_api::models::virtual_file::IoEngineKind,
+    ) -> Result<()> {
+        let uri = format!("{}/v1/io_engine", self.mgmt_api_endpoint);
+        self.request(Method::PUT, uri, engine)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
+    /// Configures the io buffer alignment at runtime.
+    pub async fn put_io_alignment(&self, align: usize) -> Result<()> {
+        let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint);
+        self.request(Method::PUT, uri, align)
+            .await?
+ .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn get_utilization(&self) -> Result { + let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint); + self.get(uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn top_tenant_shards( + &self, + request: TopTenantShardsRequest, + ) -> Result { + let uri = format!("{}/v1/top_tenants", self.mgmt_api_endpoint); + self.request(Method::POST, uri, request) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn layer_map_info( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/layer", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id, + ); + self.get(&uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn layer_evict( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + layer_file_name: &str, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/layer/{}", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name + ); + let resp = self.request_noerror(Method::DELETE, &uri, ()).await?; + match resp.status() { + StatusCode::OK => Ok(true), + StatusCode::NOT_MODIFIED => Ok(false), + // TODO: dedupe this pattern / introduce separate error variant? + status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } + + pub async fn layer_ondemand_download( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + layer_file_name: &str, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/layer/{}", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name + ); + let resp = self.request_noerror(Method::GET, &uri, ()).await?; + match resp.status() { + StatusCode::OK => Ok(true), + StatusCode::NOT_MODIFIED => Ok(false), + // TODO: dedupe this pattern / introduce separate error variant? 
+ status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } + + pub async fn ingest_aux_files( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + aux_files: HashMap, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/ingest_aux_files", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id + ); + let resp = self + .request_noerror(Method::POST, &uri, IngestAuxFilesRequest { aux_files }) + .await?; + match resp.status() { + StatusCode::OK => Ok(true), + status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } + + pub async fn list_aux_files( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, + ) -> Result> { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/list_aux_files", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id + ); + let resp = self + .request_noerror(Method::POST, &uri, ListAuxFilesRequest { lsn }) + .await?; + match resp.status() { + StatusCode::OK => { + let resp: HashMap = resp.json().await.map_err(|e| { + Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}")) + })?; + Ok(resp) + } + status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } + + pub async fn import_basebackup( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + base_lsn: Lsn, + end_lsn: Lsn, + pg_version: u32, + basebackup_tarball: ReqwestBody, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}", + self.mgmt_api_endpoint, + ); + self.start_request(Method::PUT, uri) + .body(basebackup_tarball) + .send() + .await + .map_err(Error::SendRequest)? + .error_from_body() + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn import_wal( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + start_lsn: Lsn, + end_lsn: Lsn, + wal_tarball: ReqwestBody, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_wal?start_lsn={start_lsn}&end_lsn={end_lsn}", + self.mgmt_api_endpoint, + ); + self.start_request(Method::PUT, uri) + .body(wal_tarball) + .send() + .await + .map_err(Error::SendRequest)? + .error_from_body() + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } } diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index ff542670f1..f9507fc47a 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -60,7 +60,7 @@ impl Client { ) -> anyhow::Result { let copy_both: tokio_postgres::CopyBothDuplex = self .client - .copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}")) + .copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}")) .await?; let Client { cancel_on_client_drop, @@ -156,7 +156,8 @@ impl PagestreamClient { PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e), PagestreamBeMessage::Exists(_) | PagestreamBeMessage::Nblocks(_) - | PagestreamBeMessage::DbSize(_) => { + | PagestreamBeMessage::DbSize(_) + | PagestreamBeMessage::GetSlruSegment(_) => { anyhow::bail!( "unexpected be message kind in response to getpage request: {}", msg.kind() diff --git a/pageserver/compaction/Cargo.toml b/pageserver/compaction/Cargo.toml new file mode 100644 index 0000000000..0fd1d81845 --- /dev/null +++ b/pageserver/compaction/Cargo.toml @@ -0,0 +1,53 @@ +[package] +name = "pageserver_compaction" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[features] +default = [] + +[dependencies] +anyhow.workspace = true +async-compression.workspace = true +async-stream.workspace = true +byteorder.workspace = true +bytes.workspace = true +chrono = { workspace = true, features = ["serde"] } +clap = { workspace = true, features = ["string"] } +const_format.workspace = true +consumption_metrics.workspace = true +crossbeam-utils.workspace = true +either.workspace = true +flate2.workspace = true +fail.workspace = true +futures.workspace = true +git-version.workspace = true +hex.workspace = true +humantime.workspace = true +humantime-serde.workspace = true +itertools.workspace = true +once_cell.workspace = true +pageserver_api.workspace = true +pin-project-lite.workspace = true +rand.workspace = true +smallvec = { workspace = true, features = ["write"] } +svg_fmt.workspace = true +sync_wrapper.workspace = true +thiserror.workspace = true +tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } +tokio-io-timeout.workspace = true +tokio-util.workspace = true +tracing.workspace = true +tracing-error.workspace = true +tracing-subscriber.workspace = true +url.workspace = true +walkdir.workspace = true +metrics.workspace = true +utils.workspace = true +workspace_hack.workspace = true + +[dev-dependencies] +criterion.workspace = true +hex-literal.workspace = true +tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] } diff --git a/pageserver/compaction/TODO.md b/pageserver/compaction/TODO.md new file mode 100644 index 0000000000..85523ad5b3 --- /dev/null +++ b/pageserver/compaction/TODO.md @@ -0,0 +1,51 @@ +# TODO + +- If the key space can be perfectly partitioned at some key, perform planning on each + partition separately. For example, if we are compacting a level with layers like this: + + ``` + : + +--+ +----+ : +------+ + | | | | : | | + +--+ +----+ : +------+ + : + +-----+ +-+ : +--------+ + | | | | : | | + +-----+ +-+ : +--------+ + : + ``` + + At the dotted line, there is a natural split in the key space, such that all + layers are either on the left or the right of it. We can compact the + partitions separately. 
  We could choose to create image layers for one
  partition but not the other one, for example.

- All the layers don't have to be exactly the same size; we can choose to cut a
  layer short or stretch it a little larger than the target size, if it helps
  the overall system. We can make perfect partitions (see the previous bullet
  point) happen more frequently by choosing the cut points wisely. For example,
  try to cut layers at boundaries of underlying image layers. And "snap to grid",
  i.e. don't cut layers at just any key, but e.g. only when key % 10000 = 0.

- Avoid rewriting layers when we'd just create an identical layer to an input
  layer.

- Parallelism. The code is already split up into planning and execution, so that
  we first split up the compaction work into "Jobs", and then execute them.
  It would be straightforward to execute multiple jobs in parallel.

- Materialize extra pages in delta layers during compaction. This would reduce
  read amplification. There has been the idea of partial image layers; materializing
  extra pages in the delta layers achieves the same goal, without introducing a new
  concept.

## Simulator

- Expand the simulator for more workloads
- Automate a test suite that runs the simulator with different workloads and
  spits out a table of results
- Model read amplification
- More sanity checking. One idea is to keep a reference count of each
  MockRecord, i.e. use Arc<MockRecord> instead of plain MockRecord, and panic if
  a MockRecord that is newer than the PITR horizon is completely dropped. That
  would indicate that the record was lost.
diff --git a/pageserver/compaction/src/bin/compaction-simulator.rs b/pageserver/compaction/src/bin/compaction-simulator.rs
new file mode 100644
index 0000000000..c308694ae1
--- /dev/null
+++ b/pageserver/compaction/src/bin/compaction-simulator.rs
@@ -0,0 +1,215 @@
+use clap::{Parser, Subcommand};
+use pageserver_compaction::helpers::PAGE_SZ;
+use pageserver_compaction::simulator::MockTimeline;
+use rand::Rng;
+use std::io::Write;
+use std::path::{Path, PathBuf};
+use std::sync::OnceLock;
+
+use utils::project_git_version;
+
+project_git_version!(GIT_VERSION);
+
+#[derive(Parser)]
+#[command(
+    version = GIT_VERSION,
+    about = "Neon Pageserver compaction simulator",
+    long_about = "A developer tool to visualize and test compaction"
+)]
+#[command(propagate_version = true)]
+struct CliOpts {
+    #[command(subcommand)]
+    command: Commands,
+}
+
+#[derive(Subcommand)]
+enum Commands {
+    RunSuite,
+    Simulate(SimulateCmd),
+}
+
+#[derive(Clone, clap::ValueEnum)]
+enum Distribution {
+    Uniform,
+    HotCold,
+}
+
+/// Simulate ingesting a workload and compacting it
+#[derive(Parser)]
+struct SimulateCmd {
+    distribution: Distribution,
+
+    /// Number of records to digest
+    num_records: u64,
+    /// Record length
+    record_len: u64,
+
+    // Logical database size in MB
+    logical_size: u64,
+}
+
+async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> {
+    let mut executor = MockTimeline::new();
+
+    // Convert the logical size in MB into a key range.
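+    // (E.g. with logical_size = 5_000 MB and PAGE_SZ = 8192, this comes to
+    // 5_000 * 1024 * 1024 / 8192 = 640_000 keys, i.e. one key per 8 KiB page.)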
+ let key_range = 0..((cmd.logical_size * 1024 * 1024) / PAGE_SZ); + //let key_range = u64::MIN..u64::MAX; + println!( + "starting simulation with key range {:016X}-{:016X}", + key_range.start, key_range.end + ); + + // helper function to print progress indicator + let print_progress = |i| -> anyhow::Result<()> { + if i == 0 || (i + 1) % 10000 == 0 || i == cmd.num_records - 1 { + print!( + "\ringested {} / {} records, {} MiB / {} MiB...", + i + 1, + cmd.num_records, + (i + 1) * cmd.record_len / (1_000_000), + cmd.num_records * cmd.record_len / (1_000_000), + ); + std::io::stdout().flush()?; + } + Ok(()) + }; + + match cmd.distribution { + Distribution::Uniform => { + for i in 0..cmd.num_records { + executor.ingest_uniform(1, cmd.record_len, &key_range)?; + executor.compact_if_needed().await?; + + print_progress(i)?; + } + } + Distribution::HotCold => { + let splitpoint = key_range.start + (key_range.end - key_range.start) / 10; + let hot_key_range = 0..splitpoint; + let cold_key_range = splitpoint..key_range.end; + + for i in 0..cmd.num_records { + let chosen_range = if rand::thread_rng().gen_bool(0.9) { + &hot_key_range + } else { + &cold_key_range + }; + executor.ingest_uniform(1, cmd.record_len, chosen_range)?; + executor.compact_if_needed().await?; + + print_progress(i)?; + } + } + } + println!("done!"); + executor.flush_l0(); + executor.compact_if_needed().await?; + let stats = executor.stats()?; + + // Print the stats to stdout, and also to a file + print!("{stats}"); + std::fs::write(results_path.join("stats.txt"), stats)?; + + let animation_path = results_path.join("compaction-animation.html"); + executor.draw_history(std::fs::File::create(&animation_path)?)?; + println!( + "animation: file://{}", + animation_path.canonicalize()?.display() + ); + + Ok(()) +} + +async fn run_suite_cmd(results_path: &Path, workload: &SimulateCmd) -> anyhow::Result<()> { + std::fs::create_dir(results_path)?; + + set_log_file(File::create(results_path.join("log"))?); + let result = simulate(workload, results_path).await; + set_log_stdout(); + result +} + +async fn run_suite() -> anyhow::Result<()> { + let top_results_path = PathBuf::from(format!( + "compaction-suite-results.{}", + std::time::SystemTime::UNIX_EPOCH.elapsed()?.as_secs() + )); + std::fs::create_dir(&top_results_path)?; + + let workload = SimulateCmd { + distribution: Distribution::Uniform, + // Generate 20 GB of WAL + record_len: 1_000, + num_records: 20_000_000, + // Logical size 5 GB + logical_size: 5_000, + }; + + run_suite_cmd(&top_results_path.join("uniform-20GB-5GB"), &workload).await?; + + println!( + "All tests finished. Results in {}", + top_results_path.display() + ); + Ok(()) +} + +use std::fs::File; +use std::io::Stdout; +use std::sync::Mutex; +use tracing_subscriber::fmt::writer::EitherWriter; +use tracing_subscriber::fmt::MakeWriter; + +static LOG_FILE: OnceLock>> = OnceLock::new(); +fn get_log_output() -> &'static Mutex> { + LOG_FILE.get_or_init(|| std::sync::Mutex::new(EitherWriter::B(std::io::stdout()))) +} + +fn set_log_file(f: File) { + *get_log_output().lock().unwrap() = EitherWriter::A(f); +} + +fn set_log_stdout() { + *get_log_output().lock().unwrap() = EitherWriter::B(std::io::stdout()); +} + +fn init_logging() -> anyhow::Result<()> { + // We fall back to printing all spans at info-level or above if + // the RUST_LOG environment variable is not set. 
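+    // E.g. `RUST_LOG=pageserver_compaction=debug cargo run --bin compaction-simulator -- run-suite`
+    // (an illustrative invocation) restricts verbose logging to this crate.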
+    let rust_log_env_filter = || {
+        tracing_subscriber::EnvFilter::try_from_default_env()
+            .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
+    };
+
+    // NB: the order of the with() calls does not matter.
+    // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
+    use tracing_subscriber::prelude::*;
+    tracing_subscriber::registry()
+        .with({
+            let log_layer = tracing_subscriber::fmt::layer()
+                .with_target(false)
+                .with_ansi(false)
+                .with_writer(|| get_log_output().make_writer());
+            log_layer.with_filter(rust_log_env_filter())
+        })
+        .init();
+
+    Ok(())
+}
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    let cli = CliOpts::parse();
+
+    init_logging()?;
+
+    match cli.command {
+        Commands::Simulate(cmd) => {
+            simulate(&cmd, &PathBuf::from("/tmp/compactions.html")).await?;
+        }
+        Commands::RunSuite => {
+            run_suite().await?;
+        }
+    };
+    Ok(())
+}
diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs
new file mode 100644
index 0000000000..20f88868f9
--- /dev/null
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -0,0 +1,940 @@
+//! # Tiered compaction algorithm.
+//!
+//! Read all the input delta files, and write a new set of delta files that
+//! include all the input WAL records. See retile_deltas().
+//!
+//! In a "normal" LSM tree, you get to remove any values that are overwritten by
+//! later values, but in our system, we keep all the history. So the reshuffling
+//! doesn't remove any garbage, it just reshuffles the records to reduce read
+//! amplification, i.e. the number of files that you need to access to find the
+//! WAL records for a given key.
+//!
+//! If the new delta files would be very "narrow", i.e. each file would cover
+//! only a narrow key range, then we create a new set of image files
+//! instead. The current threshold is that if the estimated total size of the
+//! image layers is smaller than the size of the deltas, then we create image
+//! layers. That amounts to 2x storage amplification, and it means that the
+//! distance of image layers in the LSN dimension is roughly equal to the logical
+//! database size. For example, if the logical database size is 10 GB, we would
+//! generate new image layers every 10 GB of WAL.
+use futures::StreamExt;
+use pageserver_api::shard::ShardIdentity;
+use tracing::{debug, info};
+
+use std::collections::{HashSet, VecDeque};
+use std::ops::Range;
+
+use crate::helpers::{
+    accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ,
+};
+use crate::interface::*;
+use utils::lsn::Lsn;
+
+use crate::identify_levels::identify_level;
+
+/// Main entry point to compaction.
+///
+/// The starting point is a cutoff LSN (`end_lsn`). The compaction is run on
+/// everything below that point that needs compaction. The cutoff LSN must
+/// partition the layers so that there are no layers that span across that
+/// LSN. To start compaction at the top of the tree, pass the end LSN of the
+/// last written L0 layer.
+pub async fn compact_tiered<E: CompactionJobExecutor>(
+    executor: &mut E,
+    end_lsn: Lsn,
+    target_file_size: u64,
+    fanout: u64,
+    ctx: &E::RequestContext,
+) -> anyhow::Result<()> {
+    assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}");
+    let exp_base = fanout.max(2);
+    // Start at L0
+    let mut current_level_no = 0;
+    let mut current_level_target_height = target_file_size;
+    loop {
+        // end LSN +1 to include possible image layers exactly at 'end_lsn'.
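+        // (The range below is half-open, hence `end_lsn + 1` via `Lsn`'s
+        // `Add<u64>` impl; a layer sitting exactly at `end_lsn` stays visible
+        // to this pass.)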
+ let all_layers = executor + .get_layers( + &(E::Key::MIN..E::Key::MAX), + &(Lsn(u64::MIN)..end_lsn + 1), + ctx, + ) + .await?; + info!( + "Compacting L{}, total # of layers: {}", + current_level_no, + all_layers.len() + ); + + // Identify the range of LSNs that belong to this level. We assume that + // each file in this level spans an LSN range up to 1.75x target file + // size. That should give us enough slop that if we created a slightly + // oversized L0 layer, e.g. because flushing the in-memory layer was + // delayed for some reason, we don't consider the oversized layer to + // belong to L1. But not too much slop, that we don't accidentally + // "skip" levels. + let max_height = (current_level_target_height as f64 * 1.75) as u64; + let Some(level) = identify_level(all_layers, end_lsn, max_height).await? else { + break; + }; + + // Calculate the height of this level. If the # of tiers exceeds the + // fanout parameter, it's time to compact it. + let depth = level.depth(); + info!( + "Level {} identified as LSN range {}-{}: depth {}", + current_level_no, level.lsn_range.start, level.lsn_range.end, depth + ); + for l in &level.layers { + debug!("LEVEL {} layer: {}", current_level_no, l.short_id()); + } + if depth < fanout { + debug!( + level = current_level_no, + depth = depth, + fanout, + "too few deltas to compact" + ); + break; + } + + compact_level( + &level.lsn_range, + &level.layers, + executor, + target_file_size, + ctx, + ) + .await?; + if current_level_target_height == u64::MAX { + // our target height includes all possible lsns + info!( + level = current_level_no, + depth = depth, + "compaction loop reached max current_level_target_height" + ); + break; + } + current_level_no += 1; + current_level_target_height = current_level_target_height.saturating_mul(exp_base); + } + Ok(()) +} + +async fn compact_level( + lsn_range: &Range, + layers: &[E::Layer], + executor: &mut E, + target_file_size: u64, + ctx: &E::RequestContext, +) -> anyhow::Result { + let mut layer_fragments = Vec::new(); + for l in layers { + layer_fragments.push(LayerFragment::new(l.clone())); + } + + let mut state = LevelCompactionState { + shard_identity: *executor.get_shard_identity(), + target_file_size, + _lsn_range: lsn_range.clone(), + layers: layer_fragments, + jobs: Vec::new(), + job_queue: Vec::new(), + next_level: false, + executor, + }; + + let first_job = CompactionJob { + key_range: E::Key::MIN..E::Key::MAX, + lsn_range: lsn_range.clone(), + strategy: CompactionStrategy::Divide, + input_layers: state + .layers + .iter() + .enumerate() + .map(|i| LayerId(i.0)) + .collect(), + completed: false, + }; + + state.jobs.push(first_job); + state.job_queue.push(JobId(0)); + state.execute(ctx).await?; + + info!( + "compaction completed! 
Need to process next level: {}", + state.next_level + ); + + Ok(state.next_level) +} + +/// Blackboard that keeps track of the state of all the jobs and work remaining +struct LevelCompactionState<'a, E> +where + E: CompactionJobExecutor, +{ + shard_identity: ShardIdentity, + + // parameters + target_file_size: u64, + + _lsn_range: Range, + layers: Vec>, + + // job queue + jobs: Vec>, + job_queue: Vec, + + /// If false, no need to compact levels below this + next_level: bool, + + /// Interface to the outside world + executor: &'a mut E, +} + +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +struct LayerId(usize); +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +struct JobId(usize); + +struct PendingJobSet { + pending: HashSet, + completed: HashSet, +} + +impl PendingJobSet { + fn new() -> Self { + PendingJobSet { + pending: HashSet::new(), + completed: HashSet::new(), + } + } + + fn complete_job(&mut self, job_id: JobId) { + self.pending.remove(&job_id); + self.completed.insert(job_id); + } + + fn all_completed(&self) -> bool { + self.pending.is_empty() + } +} + +// When we decide to rewrite a set of layers, LayerFragment is used to keep +// track which new layers supersede an old layer. When all the stakeholder jobs +// have completed, this layer can be deleted. +struct LayerFragment +where + E: CompactionJobExecutor, +{ + layer: E::Layer, + + // If we will write new layers to replace this one, this keeps track of the + // jobs that need to complete before this layer can be deleted. As the jobs + // complete, they are moved from 'pending' to 'completed' set. Once the + // 'pending' set becomes empty, the layer can be deleted. + // + // If None, this layer is not rewritten and must not be deleted. + deletable_after: Option, + + deleted: bool, +} + +impl LayerFragment +where + E: CompactionJobExecutor, +{ + fn new(layer: E::Layer) -> Self { + LayerFragment { + layer, + deletable_after: None, + deleted: false, + } + } +} + +#[derive(PartialEq)] +enum CompactionStrategy { + Divide, + CreateDelta, + CreateImage, +} + +struct CompactionJob { + key_range: Range, + lsn_range: Range, + + strategy: CompactionStrategy, + + input_layers: Vec, + + completed: bool, +} + +impl<'a, E> LevelCompactionState<'a, E> +where + E: CompactionJobExecutor, +{ + /// Main loop of the executor. + /// + /// In each iteration, we take the next job from the queue, and execute it. + /// The execution might add new jobs to the queue. Keep going until the + /// queue is empty. + /// + /// Initially, the job queue consists of one Divide job over the whole + /// level. On first call, it is divided into smaller jobs. + async fn execute(&mut self, ctx: &E::RequestContext) -> anyhow::Result<()> { + // TODO: this would be pretty straightforward to parallelize with FuturesUnordered + while let Some(next_job_id) = self.job_queue.pop() { + info!("executing job {}", next_job_id.0); + self.execute_job(next_job_id, ctx).await?; + } + + // all done! + Ok(()) + } + + async fn execute_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> { + let job = &self.jobs[job_id.0]; + match job.strategy { + CompactionStrategy::Divide => { + self.divide_job(job_id, ctx).await?; + Ok(()) + } + CompactionStrategy::CreateDelta => { + let mut deltas: Vec = Vec::new(); + let mut layer_ids: Vec = Vec::new(); + for layer_id in &job.input_layers { + let layer = &self.layers[layer_id.0].layer; + if let Some(dl) = self.executor.downcast_delta_layer(layer).await? 
{ + deltas.push(dl.clone()); + layer_ids.push(*layer_id); + } + } + + self.executor + .create_delta(&job.lsn_range, &job.key_range, &deltas, ctx) + .await?; + self.jobs[job_id.0].completed = true; + + // did we complete any fragments? + for layer_id in layer_ids { + let l = &mut self.layers[layer_id.0]; + if let Some(deletable_after) = l.deletable_after.as_mut() { + deletable_after.complete_job(job_id); + if deletable_after.all_completed() { + self.executor.delete_layer(&l.layer, ctx).await?; + l.deleted = true; + } + } + } + + self.next_level = true; + + Ok(()) + } + CompactionStrategy::CreateImage => { + self.executor + .create_image(job.lsn_range.end, &job.key_range, ctx) + .await?; + self.jobs[job_id.0].completed = true; + + // TODO: we could check if any layers < PITR horizon became deletable + Ok(()) + } + } + } + + fn push_job(&mut self, job: CompactionJob) -> JobId { + let job_id = JobId(self.jobs.len()); + self.jobs.push(job); + self.job_queue.push(job_id); + job_id + } + + /// Take a partition of the key space, and decide how to compact it. + /// + /// TODO: Currently, this is called exactly once for the level, and we + /// decide whether to create new image layers to cover the whole level, or + /// write a new set of deltas. In the future, this should try to partition + /// the key space, and make the decision separately for each partition. + async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> { + let job = &self.jobs[job_id.0]; + assert!(job.strategy == CompactionStrategy::Divide); + + // Check for dummy cases + if job.input_layers.is_empty() { + return Ok(()); + } + + let job = &self.jobs[job_id.0]; + assert!(job.strategy == CompactionStrategy::Divide); + + // Would it be better to create images for this partition? + // Decide based on the average density of the level + let keyspace_size = keyspace_total_size( + &self + .executor + .get_keyspace(&job.key_range, job.lsn_range.end, ctx) + .await?, + &self.shard_identity, + ) * PAGE_SZ; + + let wal_size = job + .input_layers + .iter() + .filter(|layer_id| self.layers[layer_id.0].layer.is_delta()) + .map(|layer_id| self.layers[layer_id.0].layer.file_size()) + .sum::(); + if keyspace_size < wal_size { + // seems worth it + info!( + "covering with images, because keyspace_size is {}, size of deltas between {}-{} is {}", + keyspace_size, job.lsn_range.start, job.lsn_range.end, wal_size + ); + self.cover_with_images(job_id, ctx).await + } else { + // do deltas + info!( + "coverage not worth it, keyspace_size {}, wal_size {}", + keyspace_size, wal_size + ); + self.retile_deltas(job_id, ctx).await + } + } + + // LSN + // ^ + // | + // | ###|###|##### + // | +--+-----+--+ +--+-----+--+ + // | | | | | | | | | + // | +--+--+--+--+ +--+--+--+--+ + // | | | | | | | + // | +---+-+-+---+ ==> +---+-+-+---+ + // | | | | | | | | | + // | +---+-+-++--+ +---+-+-++--+ + // | | | | | | | | | + // | +-----+--+--+ +-----+--+--+ + // | + // +--------------> key + // + async fn cover_with_images( + &mut self, + job_id: JobId, + ctx: &E::RequestContext, + ) -> anyhow::Result<()> { + let job = &self.jobs[job_id.0]; + assert!(job.strategy == CompactionStrategy::Divide); + + // XXX: do we still need the "holes" stuff? 
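+        // (Sizing note: the `KeyspaceWindow` constructed below counts keyspace
+        // in pages, not bytes, hence the `self.target_file_size / PAGE_SZ`
+        // conversion at the call site.)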
+ + let mut new_jobs = Vec::new(); + + // Slide a window through the keyspace + let keyspace = self + .executor + .get_keyspace(&job.key_range, job.lsn_range.end, ctx) + .await?; + + let mut window = KeyspaceWindow::new( + E::Key::MIN..E::Key::MAX, + keyspace, + self.target_file_size / PAGE_SZ, + ); + while let Some(key_range) = window.choose_next_image(&self.shard_identity) { + new_jobs.push(CompactionJob:: { + key_range, + lsn_range: job.lsn_range.clone(), + strategy: CompactionStrategy::CreateImage, + input_layers: Vec::new(), // XXX: Is it OK for this to be empty for image layer? + completed: false, + }); + } + + for j in new_jobs.into_iter().rev() { + let _job_id = self.push_job(j); + + // TODO: image layers don't let us delete anything. unless < PITR horizon + //let j = &self.jobs[job_id.0]; + // for layer_id in j.input_layers.iter() { + // self.layers[layer_id.0].pending_stakeholders.insert(job_id); + //} + } + + Ok(()) + } + + // Merge the contents of all the input delta layers into a new set + // of delta layers, based on the current partitioning. + // + // We split the new delta layers on the key dimension. We iterate through + // the key space, and for each key, check if including the next key to the + // current output layer we're building would cause the layer to become too + // large. If so, dump the current output layer and start new one. It's + // possible that there is a single key with so many page versions that + // storing all of them in a single layer file would be too large. In that + // case, we also split on the LSN dimension. + // + // LSN + // ^ + // | + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // + // + // If one key (X) has a lot of page versions: + // + // LSN + // ^ + // | (X) + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | +--+ | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | +--+ | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // + // TODO: this actually divides the layers into fixed-size chunks, not + // based on the partitioning. + // + // TODO: we should also opportunistically materialize and + // garbage collect what we can. + async fn retile_deltas( + &mut self, + job_id: JobId, + ctx: &E::RequestContext, + ) -> anyhow::Result<()> { + let job = &self.jobs[job_id.0]; + assert!(job.strategy == CompactionStrategy::Divide); + + // Sweep the key space left to right, running an estimate of how much + // disk size and keyspace we have accumulated + // + // Once the disk size reaches the target threshold, stop and think. + // If we have accumulated only a narrow band of keyspace, create an + // image layer. Otherwise write a delta layer. + + // FIXME: we are ignoring images here. Did we already divide the work + // so that we won't encounter them here? + + let mut deltas: Vec = Vec::new(); + for layer_id in &job.input_layers { + let l = &self.layers[layer_id.0]; + if let Some(dl) = self.executor.downcast_delta_layer(&l.layer).await? { + deltas.push(dl.clone()); + } + } + // Open stream + let key_value_stream = + std::pin::pin!(merge_delta_keys_buffered::(deltas.as_slice(), ctx) + .await? 
+ .map(Result::<_, anyhow::Error>::Ok)); + let mut new_jobs = Vec::new(); + + // Slide a window through the keyspace + let mut key_accum = + std::pin::pin!(accum_key_values(key_value_stream, self.target_file_size)); + let mut all_in_window: bool = false; + let mut window = Window::new(); + + // Helper function to create a job for a new delta layer with given key-lsn + // rectangle. + let create_delta_job = |key_range, lsn_range: &Range, new_jobs: &mut Vec<_>| { + // The inputs for the job are all the input layers of the original job that + // overlap with the rectangle. + let batch_layers: Vec = job + .input_layers + .iter() + .filter(|layer_id| { + overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range) + }) + .cloned() + .collect(); + assert!(!batch_layers.is_empty()); + new_jobs.push(CompactionJob { + key_range, + lsn_range: lsn_range.clone(), + strategy: CompactionStrategy::CreateDelta, + input_layers: batch_layers, + completed: false, + }); + }; + + loop { + if all_in_window && window.is_empty() { + // All done! + break; + } + + // If we now have enough keyspace for next delta layer in the window, create a + // new delta layer + if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window) + { + create_delta_job(key_range, &job.lsn_range, &mut new_jobs); + continue; + } + assert!(!all_in_window); + + // Process next key in the key space + match key_accum.next().await.transpose()? { + None => { + all_in_window = true; + } + Some(next_key) if next_key.partition_lsns.is_empty() => { + // Normal case: extend the window by the key + window.feed(next_key.key, next_key.size); + } + Some(next_key) => { + // A key with too large size impact for a single delta layer. This + // case occurs if you make a huge number of updates for a single key. + // + // Drain the window with has_more = false to make a clean cut before + // the key, and then make dedicated delta layers for the single key. + // + // We cannot cluster the key with the others, because we don't want + // layer files to overlap with each other in the lsn,key space (no + // overlaps for the rectangles). + let key = next_key.key; + debug!("key {key} with size impact larger than the layer size"); + while !window.is_empty() { + let has_more = false; + let key_range = window.choose_next_delta(self.target_file_size, has_more) + .expect("with has_more==false, choose_next_delta always returns something for a non-empty Window"); + create_delta_job(key_range, &job.lsn_range, &mut new_jobs); + } + + // Not really required: but here for future resilience: + // We make a "gap" here, so any structure the window holds should + // probably be reset. + window = Window::new(); + + let mut prior_lsn = job.lsn_range.start; + let mut lsn_ranges = Vec::new(); + for (lsn, _size) in next_key.partition_lsns.iter() { + lsn_ranges.push(prior_lsn..*lsn); + prior_lsn = *lsn; + } + lsn_ranges.push(prior_lsn..job.lsn_range.end); + for lsn_range in lsn_ranges { + let key_range = key..key.next(); + create_delta_job(key_range, &lsn_range, &mut new_jobs); + } + } + } + } + + // All the input files are rewritten. Set up the tracking for when they can + // be deleted. 
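+        // (Lifecycle sketch: each input layer gets a `PendingJobSet` naming
+        // every new job that reads it; `execute_job` moves finished jobs from
+        // `pending` to `completed`, and once `pending` drains, the layer is
+        // deleted through the executor.)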
+ for layer_id in job.input_layers.iter() { + let l = &mut self.layers[layer_id.0]; + assert!(l.deletable_after.is_none()); + l.deletable_after = Some(PendingJobSet::new()); + } + for j in new_jobs.into_iter().rev() { + let job_id = self.push_job(j); + let j = &self.jobs[job_id.0]; + for layer_id in j.input_layers.iter() { + self.layers[layer_id.0] + .deletable_after + .as_mut() + .unwrap() + .pending + .insert(job_id); + } + } + + Ok(()) + } +} + +/// Sliding window through keyspace and values for image layer +/// This is used by [`LevelCompactionState::cover_with_images`] to decide on good split points +struct KeyspaceWindow { + head: KeyspaceWindowHead, + + start_pos: KeyspaceWindowPos, +} +struct KeyspaceWindowHead { + // overall key range to cover + key_range: Range, + + keyspace: Vec>, + target_keysize: u64, +} + +#[derive(Clone)] +struct KeyspaceWindowPos { + end_key: K, + + keyspace_idx: usize, + + accum_keysize: u64, +} +impl KeyspaceWindowPos { + fn reached_end(&self, w: &KeyspaceWindowHead) -> bool { + self.keyspace_idx == w.keyspace.len() + } + + // Advance the cursor until it reaches 'target_keysize'. + fn advance_until_size( + &mut self, + w: &KeyspaceWindowHead, + max_size: u64, + shard_identity: &ShardIdentity, + ) { + while self.accum_keysize < max_size && !self.reached_end(w) { + let curr_range = &w.keyspace[self.keyspace_idx]; + if self.end_key < curr_range.start { + // skip over any unused space + self.end_key = curr_range.start; + } + + // We're now within 'curr_range'. Can we advance past it completely? + let distance = K::key_range_size(&(self.end_key..curr_range.end), shard_identity); + if (self.accum_keysize + distance as u64) < max_size { + // oh yeah, it fits + self.end_key = curr_range.end; + self.keyspace_idx += 1; + self.accum_keysize += distance as u64; + } else { + // advance within the range + let skip_key = self.end_key.skip_some(); + let distance = K::key_range_size(&(self.end_key..skip_key), shard_identity); + if (self.accum_keysize + distance as u64) < max_size { + self.end_key = skip_key; + self.accum_keysize += distance as u64; + } else { + self.end_key = self.end_key.next(); + self.accum_keysize += 1; + } + } + } + } +} + +impl KeyspaceWindow +where + K: CompactionKey, +{ + fn new(key_range: Range, keyspace: CompactionKeySpace, target_keysize: u64) -> Self { + assert!(keyspace.first().unwrap().start >= key_range.start); + + let start_key = key_range.start; + let start_pos = KeyspaceWindowPos:: { + end_key: start_key, + keyspace_idx: 0, + accum_keysize: 0, + }; + Self { + head: KeyspaceWindowHead:: { + key_range, + keyspace, + target_keysize, + }, + start_pos, + } + } + + fn choose_next_image(&mut self, shard_identity: &ShardIdentity) -> Option> { + if self.start_pos.keyspace_idx == self.head.keyspace.len() { + // we've reached the end + return None; + } + + let mut next_pos = self.start_pos.clone(); + next_pos.advance_until_size( + &self.head, + self.start_pos.accum_keysize + self.head.target_keysize, + shard_identity, + ); + + // See if we can gobble up the rest of the keyspace if we stretch out the layer, up to + // 1.25x target size + let mut end_pos = next_pos.clone(); + end_pos.advance_until_size( + &self.head, + self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4), + shard_identity, + ); + if end_pos.reached_end(&self.head) { + // gobble up any unused keyspace between the last used key and end of the range + assert!(end_pos.end_key <= self.head.key_range.end); + end_pos.end_key = self.head.key_range.end; + next_pos = end_pos; + } + 
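+        // (The 1.25x stretch above trades a slightly oversized final image
+        // layer for not leaving a tiny "tail" layer at the end of the keyspace.)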
+ let start_key = self.start_pos.end_key; + self.start_pos = next_pos; + Some(start_key..self.start_pos.end_key) + } +} + +// Take previous partitioning, based on the image layers below. +// +// Candidate is at the front: +// +// Consider stretching an image layer to next divider? If it's close enough, +// that's the image candidate +// +// If it's too far, consider splitting at a reasonable point +// +// Is the image candidate smaller than the equivalent delta? If so, +// split off the image. Otherwise, split off one delta. +// Try to snap off the delta at a reasonable point + +struct WindowElement { + start_key: K, // inclusive + last_key: K, // inclusive + accum_size: u64, +} + +/// Sliding window through keyspace and values for delta layer tiling +/// +/// This is used to decide which delta layer to write next. +struct Window { + elems: VecDeque>, + + // last key that was split off, inclusive + splitoff_key: Option, + splitoff_size: u64, +} + +impl Window +where + K: CompactionKey, +{ + fn new() -> Self { + Self { + elems: VecDeque::new(), + splitoff_key: None, + splitoff_size: 0, + } + } + + fn feed(&mut self, key: K, size: u64) { + let last_size; + if let Some(last) = self.elems.back_mut() { + // We require the keys to be strictly increasing for the window. + // Keys should already have been deduplicated by `accum_key_values` + assert!( + last.last_key < key, + "last_key(={}) >= key(={key})", + last.last_key + ); + last_size = last.accum_size; + } else { + last_size = 0; + } + // This is a new key. + let elem = WindowElement { + start_key: key, + last_key: key, + accum_size: last_size + size, + }; + self.elems.push_back(elem); + } + + fn remain_size(&self) -> u64 { + self.elems.back().unwrap().accum_size - self.splitoff_size + } + + fn peek_size(&self) -> u64 { + self.elems.front().unwrap().accum_size - self.splitoff_size + } + + fn is_empty(&self) -> bool { + self.elems.is_empty() + } + + fn commit_upto(&mut self, mut upto: usize) { + while upto > 1 { + let popped = self.elems.pop_front().unwrap(); + self.elems.front_mut().unwrap().start_key = popped.start_key; + upto -= 1; + } + } + + fn find_size_split(&self, target_size: u64) -> usize { + self.elems + .partition_point(|elem| elem.accum_size - self.splitoff_size < target_size) + } + + fn pop(&mut self) { + let first = self.elems.pop_front().unwrap(); + self.splitoff_size = first.accum_size; + + self.splitoff_key = Some(first.last_key); + } + + // the difference between delta and image is that an image covers + // any unused keyspace before and after, while a delta tries to + // minimize that. TODO: difference not implemented + fn pop_delta(&mut self) -> Range { + let first = self.elems.front().unwrap(); + let key_range = first.start_key..first.last_key.next(); + + self.pop(); + key_range + } + + // Prerequisite: we have enough input in the window + // + // On return None, the caller should feed more data and call again + fn choose_next_delta(&mut self, target_size: u64, has_more: bool) -> Option> { + if has_more && self.elems.is_empty() { + // Starting up + return None; + } + + // If we still have an undersized candidate, just keep going + while self.peek_size() < target_size { + if self.elems.len() > 1 { + self.commit_upto(2); + } else if has_more { + return None; + } else { + break; + } + } + + // Ensure we have enough input in the window to make a good decision + if has_more && self.remain_size() < target_size * 5 / 4 { + return None; + } + + // The candidate on the front is now large enough, for a delta. 
+ // And we have enough data in the window to decide. + + // If we're willing to stretch it up to 1.25 target size, could we + // gobble up the rest of the work? This avoids creating very small + // "tail" layers at the end of the keyspace + if !has_more && self.remain_size() < target_size * 5 / 4 { + self.commit_upto(self.elems.len()); + } else { + let delta_split_at = self.find_size_split(target_size); + self.commit_upto(delta_split_at); + + // If it's still not large enough, request the caller to fill the window + if self.elems.len() == 1 && has_more { + return None; + } + } + Some(self.pop_delta()) + } +} diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs new file mode 100644 index 0000000000..8ed1d16082 --- /dev/null +++ b/pageserver/compaction/src/helpers.rs @@ -0,0 +1,295 @@ +//! This file contains generic utility functions over the interface types, +//! which could be handy for any compaction implementation. +use crate::interface::*; + +use futures::future::BoxFuture; +use futures::{Stream, StreamExt}; +use itertools::Itertools; +use pageserver_api::shard::ShardIdentity; +use pin_project_lite::pin_project; +use std::collections::BinaryHeap; +use std::collections::VecDeque; +use std::fmt::Display; +use std::future::Future; +use std::ops::{DerefMut, Range}; +use std::pin::Pin; +use std::task::{ready, Poll}; +use utils::lsn::Lsn; + +pub const PAGE_SZ: u64 = 8192; + +pub fn keyspace_total_size( + keyspace: &CompactionKeySpace, + shard_identity: &ShardIdentity, +) -> u64 +where + K: CompactionKey, +{ + keyspace + .iter() + .map(|r| K::key_range_size(r, shard_identity) as u64) + .sum() +} + +pub fn overlaps_with(a: &Range, b: &Range) -> bool { + !(a.end <= b.start || b.end <= a.start) +} + +pub fn union_to_keyspace(a: &mut CompactionKeySpace, b: CompactionKeySpace) { + let x = std::mem::take(a); + let mut all_ranges_iter = [x.into_iter(), b.into_iter()] + .into_iter() + .kmerge_by(|a, b| a.start < b.start); + let mut ranges = Vec::new(); + if let Some(first) = all_ranges_iter.next() { + let (mut start, mut end) = (first.start, first.end); + + for r in all_ranges_iter { + assert!(r.start >= start); + if r.start > end { + ranges.push(start..end); + start = r.start; + end = r.end; + } else if r.end > end { + end = r.end; + } + } + ranges.push(start..end); + } + *a = ranges +} + +pub fn intersect_keyspace( + a: &CompactionKeySpace, + r: &Range, +) -> CompactionKeySpace { + let mut ranges: Vec> = Vec::new(); + + for x in a.iter() { + if x.end <= r.start { + continue; + } + if x.start >= r.end { + break; + } + ranges.push(x.clone()) + } + + // trim the ends + if let Some(first) = ranges.first_mut() { + first.start = std::cmp::max(first.start, r.start); + } + if let Some(last) = ranges.last_mut() { + last.end = std::cmp::min(last.end, r.end); + } + ranges +} + +/// Create a stream that iterates through all DeltaEntrys among all input +/// layers, in key-lsn order. +/// +/// This is public because the create_delta() implementation likely wants to use this too +/// TODO: move to a more shared place +pub fn merge_delta_keys<'a, E: CompactionJobExecutor>( + layers: &'a [E::DeltaLayer], + ctx: &'a E::RequestContext, +) -> MergeDeltaKeys<'a, E> { + // Use a binary heap to merge the layers. Each input layer is initially + // represented by a LazyLoadLayer::Unloaded element, which uses the start of + // the layer's key range as the key. The first time a layer reaches the top + // of the heap, all the keys of the layer are loaded into a sorted vector. 
+ // + // This helps to keep the memory usage reasonable: we only need to hold in + // memory the DeltaEntrys of the layers that overlap with the "current" key. + let mut heap: BinaryHeap> = BinaryHeap::new(); + for l in layers { + heap.push(LazyLoadLayer::Unloaded(l)); + } + MergeDeltaKeys { + heap, + ctx, + load_future: None, + } +} + +pub async fn merge_delta_keys_buffered<'a, E: CompactionJobExecutor + 'a>( + layers: &'a [E::DeltaLayer], + ctx: &'a E::RequestContext, +) -> anyhow::Result>::DeltaEntry<'a>>> +{ + let mut keys = Vec::new(); + for l in layers { + // Boxing and casting to LoadFuture is required to obtain the right Sync bound. + // If we do l.load_keys(ctx).await? directly, there is a compilation error. + let load_future: LoadFuture<'a, _> = Box::pin(l.load_keys(ctx)); + keys.extend(load_future.await?.into_iter()); + } + keys.sort_by_key(|k| (k.key(), k.lsn())); + let stream = futures::stream::iter(keys.into_iter()); + Ok(stream) +} + +enum LazyLoadLayer<'a, E: CompactionJobExecutor> { + Loaded(VecDeque<>::DeltaEntry<'a>>), + Unloaded(&'a E::DeltaLayer), +} +impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> { + fn min_key(&self) -> E::Key { + match self { + Self::Loaded(entries) => entries.front().unwrap().key(), + Self::Unloaded(dl) => dl.key_range().start, + } + } + fn min_lsn(&self) -> Lsn { + match self { + Self::Loaded(entries) => entries.front().unwrap().lsn(), + Self::Unloaded(dl) => dl.lsn_range().start, + } + } +} +impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} +impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // reverse order so that we get a min-heap + (other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn())) + } +} +impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == std::cmp::Ordering::Equal + } +} +impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {} + +type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result>>; + +// Stream returned by `merge_delta_keys` +pin_project! { +#[allow(clippy::type_complexity)] +pub struct MergeDeltaKeys<'a, E: CompactionJobExecutor> { + heap: BinaryHeap>, + + #[pin] + load_future: Option>::DeltaEntry<'a>>>, + + ctx: &'a E::RequestContext, +} +} + +impl<'a, E> Stream for MergeDeltaKeys<'a, E> +where + E: CompactionJobExecutor + 'a, +{ + type Item = anyhow::Result<>::DeltaEntry<'a>>; + + fn poll_next( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll::Item>> { + let mut this = self.project(); + loop { + if let Some(mut load_future) = this.load_future.as_mut().as_pin_mut() { + // We are waiting for loading the keys to finish + match ready!(load_future.as_mut().poll(cx)) { + Ok(entries) => { + this.load_future.set(None); + *this.heap.peek_mut().unwrap() = + LazyLoadLayer::Loaded(VecDeque::from(entries)); + } + Err(e) => { + return Poll::Ready(Some(Err(e))); + } + } + } + + // If the topmost layer in the heap hasn't been loaded yet, start + // loading it. Otherwise return the next entry from it and update + // the layer's position in the heap (this decreaseKey operation is + // performed implicitly when `top` is dropped). 
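+            // The heap's Ord implementation is reversed, so peek_mut() returns
+            // the layer with the smallest (key, lsn) first (a min-heap).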
+ if let Some(mut top) = this.heap.peek_mut() { + match top.deref_mut() { + LazyLoadLayer::Unloaded(ref mut l) => { + let fut = l.load_keys(this.ctx); + this.load_future.set(Some(Box::pin(fut))); + continue; + } + LazyLoadLayer::Loaded(ref mut entries) => { + let result = entries.pop_front().unwrap(); + if entries.is_empty() { + std::collections::binary_heap::PeekMut::pop(top); + } + return Poll::Ready(Some(Ok(result))); + } + } + } else { + return Poll::Ready(None); + } + } + } +} + +// Accumulate values at key boundaries +pub struct KeySize { + pub key: K, + pub num_values: u64, + pub size: u64, + /// The lsns to partition at (if empty then no per-lsn partitioning) + pub partition_lsns: Vec<(Lsn, u64)>, +} + +pub fn accum_key_values<'a, I, K, D, E>( + input: I, + target_size: u64, +) -> impl Stream, E>> +where + K: Eq + PartialOrd + Display + Copy, + I: Stream>, + D: CompactionDeltaEntry<'a, K>, +{ + async_stream::try_stream! { + // Initialize the state from the first value + let mut input = std::pin::pin!(input); + + if let Some(first) = input.next().await { + let first = first?; + let mut part_size = first.size(); + let mut accum: KeySize = KeySize { + key: first.key(), + num_values: 1, + size: part_size, + partition_lsns: Vec::new(), + }; + let mut last_key = accum.key; + while let Some(this) = input.next().await { + let this = this?; + if this.key() == accum.key { + let add_size = this.size(); + if part_size + add_size > target_size { + accum.partition_lsns.push((this.lsn(), part_size)); + part_size = 0; + } + part_size += add_size; + accum.size += add_size; + accum.num_values += 1; + } else { + assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key); + last_key = accum.key; + yield accum; + part_size = this.size(); + accum = KeySize { + key: this.key(), + num_values: 1, + size: part_size, + partition_lsns: Vec::new(), + }; + } + } + assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key); + yield accum; + } + } +} diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs new file mode 100644 index 0000000000..1853afffdd --- /dev/null +++ b/pageserver/compaction/src/identify_levels.rs @@ -0,0 +1,381 @@ +//! An LSM tree consists of multiple levels, each exponentially larger than the +//! previous level. And each level consists of multiple "tiers". With tiered +//! compaction, a level is compacted when it has accumulated more than N tiers, +//! forming one tier on the next level. +//! +//! In the pageserver, we don't explicitly track the levels and tiers. Instead, +//! we identify them by looking at the shapes of the layers. It's an easy task +//! for a human, but it's not straightforward to come up with the exact +//! rules. Especially if there are cases like interrupted, half-finished +//! compactions, or highly skewed data distributions that have let us "skip" +//! some levels. It's not critical to classify all cases correctly; at worst we +//! delay some compaction work, and suffer from more read amplification, or we +//! perform some unnecessary compaction work. +//! +//! `identify_level` performs that shape-matching. +//! +//! It returns a Level struct, which has `depth()` function to count the number +//! of "tiers" in the level. The tier count is the max depth of stacked layers +//! within the level. That's a good measure, because the point of compacting is +//! to reduce read amplification, and the depth is what determines that. +//! +//! 
One interesting effect of this is that if we generate very small delta
+//! layers at L0, e.g. because the L0 layers are flushed by timeout rather than
+//! because they reach the target size, the L0 compaction will combine them to
+//! one larger file. But if the combined file is still smaller than the target
+//! file size, the file will still be considered to be part of L0 at the next
+//! iteration.

+use anyhow::bail;
+use std::collections::BTreeSet;
+use std::ops::Range;
+use utils::lsn::Lsn;
+
+use crate::interface::*;
+
+use tracing::{info, trace};
+
+pub struct Level<L> {
+    pub lsn_range: Range<Lsn>,
+    pub layers: Vec<L>,
+}
+
+/// Identify an LSN < `end_lsn` that partitions the LSN space, so that there are
+/// no layers that cross the boundary LSN.
+///
+/// A further restriction is that all layers in the returned partition cover at
+/// most 'lsn_max_size' LSN bytes.
+pub async fn identify_level<K, L>(
+    all_layers: Vec<L>,
+    end_lsn: Lsn,
+    lsn_max_size: u64,
+) -> anyhow::Result<Option<Level<L>>>
+where
+    K: CompactionKey,
+    L: CompactionLayer<K> + Clone,
+{
+    // filter out layers that are above the `end_lsn`; they are completely irrelevant.
+    let mut layers = Vec::new();
+    for l in all_layers {
+        if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn {
+            // shouldn't happen. Indicates that the caller passed a bogus
+            // end_lsn.
+            bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id());
+        }
+        // include image layers sitting exactly at `end_lsn`.
+        let is_image = !l.is_delta();
+        if (is_image && l.lsn_range().start > end_lsn)
+            || (!is_image && l.lsn_range().start >= end_lsn)
+        {
+            continue;
+        }
+        layers.push(l);
+    }
+    // All the remaining layers either belong to this level, or are below it.
+    info!(
+        "identify level at {}, size {}, num layers below: {}",
+        end_lsn,
+        lsn_max_size,
+        layers.len()
+    );
+    if layers.is_empty() {
+        return Ok(None);
+    }
+
+    // Walk the ranges in LSN order.
+    //
+    // ----- end_lsn
+    //    |
+    //    |
+    //    v
+    //
+    layers.sort_by_key(|l| l.lsn_range().end);
+    let mut candidate_start_lsn = end_lsn;
+    let mut candidate_layers: Vec<L> = Vec::new();
+    let mut current_best_start_lsn = end_lsn;
+    let mut current_best_layers: Vec<L> = Vec::new();
+    let mut iter = layers.into_iter();
+    loop {
+        let Some(l) = iter.next_back() else {
+            // Reached end. Accept the last candidate
+            current_best_start_lsn = candidate_start_lsn;
+            current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
+            break;
+        };
+        trace!(
+            "inspecting {} for candidate {}, current best {}",
+            l.short_id(),
+            candidate_start_lsn,
+            current_best_start_lsn
+        );
+
+        let r = l.lsn_range();
+
+        // Image layers don't restrict our choice of cutoff LSN
+        if l.is_delta() {
+            // Is this candidate workable? In other words, are there any
+            // delta layers that span across this LSN
+            //
+            // Valid:                Not valid:
+            //  +                     +
+            //  |                     |  +
+            //  +  <- candidate       +  |  <- candidate
+            //  +                        +
+            //  |
+            //  +
+            if r.end <= candidate_start_lsn {
+                // Hooray, there are no crossing LSNs. And we have visited
+                // through all the layers within candidate..end_lsn. The
+                // current candidate can be accepted.
+                current_best_start_lsn = r.end;
+                current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
+                candidate_start_lsn = r.start;
+            }
+
+            // Is it small enough to be considered part of this level?
+            if r.end.0 - r.start.0 > lsn_max_size {
+                // Too large, this layer belongs to next level. Stop.
+ trace!( + "too large {}, size {} vs {}", + l.short_id(), + r.end.0 - r.start.0, + lsn_max_size + ); + break; + } + + // If this crosses the candidate lsn, push it down. + if r.start < candidate_start_lsn { + trace!( + "layer {} prevents from stopping at {}", + l.short_id(), + candidate_start_lsn + ); + candidate_start_lsn = r.start; + } + } + + // Include this layer in our candidate + candidate_layers.push(l); + } + + Ok(if current_best_start_lsn == end_lsn { + // empty level + None + } else { + Some(Level { + lsn_range: current_best_start_lsn..end_lsn, + layers: current_best_layers, + }) + }) +} + +impl Level { + /// Count the number of deltas stacked on each other. + pub fn depth(&self) -> u64 + where + K: CompactionKey, + L: CompactionLayer, + { + struct Event { + key: K, + layer_idx: usize, + start: bool, + } + let mut events: Vec> = Vec::new(); + for (idx, l) in self.layers.iter().enumerate() { + let key_range = l.key_range(); + if key_range.end == key_range.start.next() && l.is_delta() { + // Ignore single-key delta layers as they can be stacked on top of each other + // as that is the only way to cut further. + continue; + } + events.push(Event { + key: l.key_range().start, + layer_idx: idx, + start: true, + }); + events.push(Event { + key: l.key_range().end, + layer_idx: idx, + start: false, + }); + } + events.sort_by_key(|e| (e.key, e.start)); + + // Sweep the key space left to right. Stop at each distinct key, and + // count the number of deltas on top of the highest image at that key. + // + // This is a little inefficient, as we walk through the active_set on + // every key. We could increment/decrement a counter on each step + // instead, but that'd require a bit more complex bookkeeping. + let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new(); + let mut max_depth = 0; + let mut events_iter = events.iter().peekable(); + while let Some(e) = events_iter.next() { + let l = &self.layers[e.layer_idx]; + let is_image = !l.is_delta(); + + // update the active set + if e.start { + active_set.insert((l.lsn_range().end, is_image, e.layer_idx)); + } else { + active_set.remove(&(l.lsn_range().end, is_image, e.layer_idx)); + } + + // recalculate depth if this was the last event at this point + let more_events_at_this_key = events_iter + .peek() + .map_or(false, |next_e| next_e.key == e.key); + if !more_events_at_this_key { + let mut active_depth = 0; + for (_end_lsn, is_image, _idx) in active_set.iter().rev() { + if *is_image { + break; + } + active_depth += 1; + } + if active_depth > max_depth { + max_depth = active_depth; + } + } + } + debug_assert_eq!(active_set, BTreeSet::new()); + max_depth + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer}; + use std::sync::{Arc, Mutex}; + + fn delta(key_range: Range, lsn_range: Range) -> MockLayer { + MockLayer::Delta(Arc::new(MockDeltaLayer { + key_range, + lsn_range, + // identify_level() doesn't pay attention to the rest of the fields + file_size: 0, + deleted: Mutex::new(false), + records: vec![], + })) + } + + fn image(key_range: Range, lsn: Lsn) -> MockLayer { + MockLayer::Image(Arc::new(MockImageLayer { + key_range, + lsn_range: lsn..(lsn + 1), + // identify_level() doesn't pay attention to the rest of the fields + file_size: 0, + deleted: Mutex::new(false), + })) + } + + #[tokio::test] + async fn test_identify_level() -> anyhow::Result<()> { + let layers = vec![ + delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)), + delta(Key::MIN..Key::MAX, 
Lsn(0x5000)..Lsn(0x7000)), + delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)), + delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), + delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), + delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)), + ]; + + // All layers fit in the max file size + let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000) + .await? + .unwrap(); + assert_eq!(level.depth(), 6); + + // Same LSN with smaller max file size. The second layer from the top is larger + // and belongs to next level. + let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000) + .await? + .unwrap(); + assert_eq!(level.depth(), 1); + + // Call with a smaller LSN + let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000) + .await? + .unwrap(); + assert_eq!(level.depth(), 2); + + // Call with an LSN that doesn't partition the space + let result = identify_level(layers, Lsn(0x6000), 0x1000).await; + assert!(result.is_err()); + Ok(()) + } + + #[tokio::test] + async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> { + // The files LSN ranges overlap, so even though there are more files that + // fit under the file size, they are not included in the level because they + // overlap so that we'd need to include the oldest file, too, which is + // larger + let layers = vec![ + delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)), + delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap + delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap + delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap + delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger + ]; + + let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000) + .await? + .unwrap(); + assert_eq!(level.depth(), 1); + + Ok(()) + } + + #[tokio::test] + async fn test_depth_nonoverlapping() -> anyhow::Result<()> { + // The key ranges don't overlap, so depth is only 1. + let layers = vec![ + delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)), + delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)), + delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)), + ]; + + let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000) + .await? + .unwrap(); + assert_eq!(level.layers.len(), 3); + assert_eq!(level.depth(), 1); + + // Staggered. The 1st and 3rd layer don't overlap with each other. + let layers = vec![ + delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)), + delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)), + delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)), + ]; + + let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000) + .await? + .unwrap(); + assert_eq!(level.layers.len(), 3); + assert_eq!(level.depth(), 2); + Ok(()) + } + + #[tokio::test] + async fn test_depth_images() -> anyhow::Result<()> { + let layers: Vec = vec![ + delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)), + delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)), + delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)), + // This covers the same key range as the 2nd delta layer. The depth + // in that key range is therefore 0. + image(1500..2500, Lsn(0x9000)), + ]; + + let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000) + .await? + .unwrap(); + assert_eq!(level.layers.len(), 4); + assert_eq!(level.depth(), 1); + Ok(()) + } +} diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs new file mode 100644 index 0000000000..5bc9b5ca1d --- /dev/null +++ b/pageserver/compaction/src/interface.rs @@ -0,0 +1,165 @@ +//! This is what the compaction implementation needs to know about +//! layers, keyspace etc. +//! +//! 
All the heavy lifting is done by the create_image and create_delta +//! functions that the implementor provides. +use futures::Future; +use pageserver_api::{key::Key, keyspace::ShardedRange, shard::ShardIdentity}; +use std::ops::Range; +use utils::lsn::Lsn; + +/// Public interface. This is the main thing that the implementor needs to provide +pub trait CompactionJobExecutor { + // Type system. + // + // We assume that there are two kinds of layers, deltas and images. The + // compaction doesn't distinguish whether they are stored locally or + // remotely. + // + // The keyspace is defined by the CompactionKey trait. + type Key: CompactionKey; + + type Layer: CompactionLayer + Clone; + type DeltaLayer: CompactionDeltaLayer + Clone; + type ImageLayer: CompactionImageLayer + Clone; + + // This is passed through to all the interface functions. The compaction + // implementation doesn't do anything with it, but it might be useful for + // the interface implementation. + type RequestContext: CompactionRequestContext; + + // ---- + // Functions that the planner uses to support its decisions + // ---- + + fn get_shard_identity(&self) -> &ShardIdentity; + + /// Return all layers that overlap the given bounding box. + fn get_layers( + &mut self, + key_range: &Range, + lsn_range: &Range, + ctx: &Self::RequestContext, + ) -> impl Future>> + Send; + + fn get_keyspace( + &mut self, + key_range: &Range, + lsn: Lsn, + ctx: &Self::RequestContext, + ) -> impl Future>> + Send; + + /// NB: This is a pretty expensive operation. In the real pageserver + /// implementation, it downloads the layer, and keeps it resident + /// until the DeltaLayer is dropped. + fn downcast_delta_layer( + &self, + layer: &Self::Layer, + ) -> impl Future>> + Send; + + // ---- + // Functions to execute the plan + // ---- + + /// Create a new image layer, materializing all the values in the key range, + /// at given 'lsn'. + fn create_image( + &mut self, + lsn: Lsn, + key_range: &Range, + ctx: &Self::RequestContext, + ) -> impl Future> + Send; + + /// Create a new delta layer, containing all the values from 'input_layers' + /// in the given key and LSN range. + fn create_delta( + &mut self, + lsn_range: &Range, + key_range: &Range, + input_layers: &[Self::DeltaLayer], + ctx: &Self::RequestContext, + ) -> impl Future> + Send; + + /// Delete a layer. The compaction implementation will call this only after + /// all the create_image() or create_delta() calls that deletion of this + /// layer depends on have finished. But if the implementor has extra lazy + /// background tasks, like uploading the index json file to remote storage. + /// it is the implementation's responsibility to track those. + fn delete_layer( + &mut self, + layer: &Self::Layer, + ctx: &Self::RequestContext, + ) -> impl Future> + Send; +} + +pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display { + const MIN: Self; + const MAX: Self; + + /// Calculate distance between key_range.start and key_range.end. + /// + /// This returns u32, for compatibility with Repository::key. If the + /// distance is larger, return u32::MAX. + fn key_range_size(key_range: &Range, shard_identity: &ShardIdentity) -> u32; + + // return "self + 1" + fn next(&self) -> Self; + + // return "self + ". The amount to skip + // is left to the implementation. + // FIXME: why not just "add(u32)" ? 
This is hard to use + fn skip_some(&self) -> Self; +} + +impl CompactionKey for Key { + const MIN: Self = Self::MIN; + const MAX: Self = Self::MAX; + + fn key_range_size(r: &std::ops::Range, shard_identity: &ShardIdentity) -> u32 { + ShardedRange::new(r.clone(), shard_identity).page_count() + } + fn next(&self) -> Key { + (self as &Key).next() + } + fn skip_some(&self) -> Key { + self.add(128) + } +} + +/// Contiguous ranges of keys that belong to the key space. In key order, and +/// with no overlap. +pub type CompactionKeySpace = Vec>; + +/// Functions needed from all layers. +pub trait CompactionLayer { + fn key_range(&self) -> &Range; + fn lsn_range(&self) -> &Range; + + fn file_size(&self) -> u64; + + /// For debugging, short human-readable representation of the layer. E.g. filename. + fn short_id(&self) -> String; + + fn is_delta(&self) -> bool; +} +pub trait CompactionDeltaLayer: CompactionLayer { + type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key> + where + Self: 'a; + + /// Return all keys in this delta layer. + fn load_keys<'a>( + &self, + ctx: &E::RequestContext, + ) -> impl Future>>> + Send; +} + +pub trait CompactionImageLayer: CompactionLayer {} + +pub trait CompactionDeltaEntry<'a, K> { + fn key(&self) -> K; + fn lsn(&self) -> Lsn; + fn size(&self) -> u64; +} + +pub trait CompactionRequestContext {} diff --git a/pageserver/compaction/src/lib.rs b/pageserver/compaction/src/lib.rs new file mode 100644 index 0000000000..2d6d673de5 --- /dev/null +++ b/pageserver/compaction/src/lib.rs @@ -0,0 +1,12 @@ +// The main module implementing the compaction algorithm +pub mod compact_tiered; +pub(crate) mod identify_levels; + +// Traits that the caller of the compaction needs to implement +pub mod interface; + +// Utility functions, useful for the implementation +pub mod helpers; + +// A simulator with mock implementations of 'interface' +pub mod simulator; diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs new file mode 100644 index 0000000000..776c537d03 --- /dev/null +++ b/pageserver/compaction/src/simulator.rs @@ -0,0 +1,617 @@ +mod draw; + +use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; + +use futures::StreamExt; +use pageserver_api::shard::ShardIdentity; +use rand::Rng; +use tracing::info; + +use utils::lsn::Lsn; + +use std::fmt::Write; +use std::ops::Range; +use std::sync::Arc; +use std::sync::Mutex; + +use crate::helpers::PAGE_SZ; +use crate::helpers::{merge_delta_keys, overlaps_with}; + +use crate::interface; +use crate::interface::CompactionLayer; + +// +// Implementation for the CompactionExecutor interface +// +pub struct MockTimeline { + // Parameters for the compaction algorithm + pub target_file_size: u64, + tiers_per_level: u64, + + num_l0_flushes: u64, + last_compact_at_flush: u64, + last_flush_lsn: Lsn, + + // In-memory layer + records: Vec, + total_len: u64, + start_lsn: Lsn, + end_lsn: Lsn, + + // Current keyspace at `end_lsn`. This is updated on every ingested record. + keyspace: KeySpace, + + // historic keyspaces + old_keyspaces: Vec<(Lsn, KeySpace)>, + + // "on-disk" layers + pub live_layers: Vec, + + num_deleted_layers: u64, + + // Statistics + wal_ingested: u64, + bytes_written: u64, + bytes_deleted: u64, + layers_created: u64, + layers_deleted: u64, + + // All the events - creation and deletion of files - are collected + // in 'history'. It is used to draw the SVG animation at the end. 
+ time: u64, + history: Vec, +} + +type KeySpace = interface::CompactionKeySpace; + +pub struct MockRequestContext {} +impl interface::CompactionRequestContext for MockRequestContext {} + +pub type Key = u64; + +impl interface::CompactionKey for Key { + const MIN: Self = u64::MIN; + const MAX: Self = u64::MAX; + + fn key_range_size(key_range: &Range, _shard_identity: &ShardIdentity) -> u32 { + std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32 + } + + fn next(&self) -> Self { + self + 1 + } + fn skip_some(&self) -> Self { + // round up to next xx + self + 100 + } +} + +#[derive(Clone)] +pub struct MockRecord { + lsn: Lsn, + key: Key, + len: u64, +} + +impl interface::CompactionDeltaEntry<'_, Key> for MockRecord { + fn key(&self) -> Key { + self.key + } + fn lsn(&self) -> Lsn { + self.lsn + } + fn size(&self) -> u64 { + self.len + } +} + +pub struct MockDeltaLayer { + pub key_range: Range, + pub lsn_range: Range, + + pub file_size: u64, + + pub deleted: Mutex, + + pub records: Vec, +} + +impl interface::CompactionLayer for Arc { + fn key_range(&self) -> &Range { + &self.key_range + } + fn lsn_range(&self) -> &Range { + &self.lsn_range + } + + fn file_size(&self) -> u64 { + self.file_size + } + + fn short_id(&self) -> String { + format!( + "{:016X}-{:016X}__{:08X}-{:08X}", + self.key_range.start, self.key_range.end, self.lsn_range.start.0, self.lsn_range.end.0 + ) + } + + fn is_delta(&self) -> bool { + true + } +} + +impl interface::CompactionDeltaLayer for Arc { + type DeltaEntry<'a> = MockRecord; + + async fn load_keys<'a>(&self, _ctx: &MockRequestContext) -> anyhow::Result> { + Ok(self.records.clone()) + } +} + +pub struct MockImageLayer { + pub key_range: Range, + pub lsn_range: Range, + + pub file_size: u64, + + pub deleted: Mutex, +} + +impl interface::CompactionImageLayer for Arc {} + +impl interface::CompactionLayer for Arc { + fn key_range(&self) -> &Range { + &self.key_range + } + fn lsn_range(&self) -> &Range { + &self.lsn_range + } + + fn file_size(&self) -> u64 { + self.file_size + } + + fn short_id(&self) -> String { + format!( + "{:016X}-{:016X}__{:08X}", + self.key_range.start, self.key_range.end, self.lsn_range.start.0, + ) + } + + fn is_delta(&self) -> bool { + false + } +} + +impl MockTimeline { + pub fn new() -> Self { + MockTimeline { + target_file_size: 256 * 1024 * 1024, + tiers_per_level: 4, + + num_l0_flushes: 0, + last_compact_at_flush: 0, + last_flush_lsn: Lsn(0), + + records: Vec::new(), + total_len: 0, + start_lsn: Lsn(1000), + end_lsn: Lsn(1000), + keyspace: KeySpace::new(), + + old_keyspaces: vec![], + + live_layers: vec![], + + num_deleted_layers: 0, + + wal_ingested: 0, + bytes_written: 0, + bytes_deleted: 0, + layers_created: 0, + layers_deleted: 0, + + time: 0, + history: Vec::new(), + } + } + + pub async fn compact(&mut self) -> anyhow::Result<()> { + let ctx = MockRequestContext {}; + + crate::compact_tiered::compact_tiered( + self, + self.last_flush_lsn, + self.target_file_size, + self.tiers_per_level, + &ctx, + ) + .await?; + + Ok(()) + } + + // Ingest one record to the timeline + pub fn ingest_record(&mut self, key: Key, len: u64) { + self.records.push(MockRecord { + lsn: self.end_lsn, + key, + len, + }); + self.total_len += len; + self.end_lsn += len; + + if self.total_len > self.target_file_size { + self.flush_l0(); + } + } + + pub async fn compact_if_needed(&mut self) -> anyhow::Result<()> { + if self.num_l0_flushes - self.last_compact_at_flush >= self.tiers_per_level { + self.compact().await?; + self.last_compact_at_flush = 
self.num_l0_flushes; + } + Ok(()) + } + + pub fn flush_l0(&mut self) { + if self.records.is_empty() { + return; + } + + let mut records = std::mem::take(&mut self.records); + records.sort_by_key(|rec| rec.key); + + let lsn_range = self.start_lsn..self.end_lsn; + let new_layer = Arc::new(MockDeltaLayer { + key_range: Key::MIN..Key::MAX, + lsn_range: lsn_range.clone(), + file_size: self.total_len, + records, + deleted: Mutex::new(false), + }); + info!("flushed L0 layer {}", new_layer.short_id()); + self.live_layers.push(MockLayer::from(&new_layer)); + + // reset L0 + self.start_lsn = self.end_lsn; + self.total_len = 0; + self.records = Vec::new(); + + self.layers_created += 1; + self.bytes_written += new_layer.file_size; + + self.time += 1; + self.history.push(LayerTraceEvent { + time_rel: self.time, + op: LayerTraceOp::Flush, + file: LayerTraceFile { + filename: new_layer.short_id(), + key_range: new_layer.key_range.clone(), + lsn_range: new_layer.lsn_range.clone(), + }, + }); + + self.num_l0_flushes += 1; + self.last_flush_lsn = self.end_lsn; + } + + // Ingest `num_records' records to the timeline, with random keys + // uniformly distributed in `key_range` + pub fn ingest_uniform( + &mut self, + num_records: u64, + len: u64, + key_range: &Range, + ) -> anyhow::Result<()> { + crate::helpers::union_to_keyspace(&mut self.keyspace, vec![key_range.clone()]); + let mut rng = rand::thread_rng(); + for _ in 0..num_records { + self.ingest_record(rng.gen_range(key_range.clone()), len); + self.wal_ingested += len; + } + Ok(()) + } + + pub fn stats(&self) -> anyhow::Result { + let mut s = String::new(); + + writeln!(s, "STATISTICS:")?; + writeln!( + s, + "WAL ingested: {:>10} MB", + self.wal_ingested / (1024 * 1024) + )?; + writeln!( + s, + "size created: {:>10} MB", + self.bytes_written / (1024 * 1024) + )?; + writeln!( + s, + "size deleted: {:>10} MB", + self.bytes_deleted / (1024 * 1024) + )?; + writeln!(s, "files created: {:>10}", self.layers_created)?; + writeln!(s, "files deleted: {:>10}", self.layers_deleted)?; + writeln!( + s, + "write amp: {:>10.2}", + self.bytes_written as f64 / self.wal_ingested as f64 + )?; + writeln!( + s, + "storage amp: {:>10.2}", + (self.bytes_written - self.bytes_deleted) as f64 / self.wal_ingested as f64 + )?; + + Ok(s) + } + + pub fn draw_history(&self, output: W) -> anyhow::Result<()> { + draw::draw_history(&self.history, output) + } +} + +impl Default for MockTimeline { + fn default() -> Self { + Self::new() + } +} + +#[derive(Clone)] +pub enum MockLayer { + Delta(Arc), + Image(Arc), +} + +impl interface::CompactionLayer for MockLayer { + fn key_range(&self) -> &Range { + match self { + MockLayer::Delta(this) => this.key_range(), + MockLayer::Image(this) => this.key_range(), + } + } + fn lsn_range(&self) -> &Range { + match self { + MockLayer::Delta(this) => this.lsn_range(), + MockLayer::Image(this) => this.lsn_range(), + } + } + fn file_size(&self) -> u64 { + match self { + MockLayer::Delta(this) => this.file_size, + MockLayer::Image(this) => this.file_size, + } + } + fn short_id(&self) -> String { + match self { + MockLayer::Delta(this) => this.short_id(), + MockLayer::Image(this) => this.short_id(), + } + } + + fn is_delta(&self) -> bool { + match self { + MockLayer::Delta(_) => true, + MockLayer::Image(_) => false, + } + } +} + +impl MockLayer { + fn is_deleted(&self) -> bool { + let guard = match self { + MockLayer::Delta(this) => this.deleted.lock().unwrap(), + MockLayer::Image(this) => this.deleted.lock().unwrap(), + }; + *guard + } + fn 
mark_deleted(&self) { + let mut deleted_guard = match self { + MockLayer::Delta(this) => this.deleted.lock().unwrap(), + MockLayer::Image(this) => this.deleted.lock().unwrap(), + }; + assert!(!*deleted_guard, "layer already deleted"); + *deleted_guard = true; + } +} + +impl From<&Arc> for MockLayer { + fn from(l: &Arc) -> Self { + MockLayer::Delta(l.clone()) + } +} + +impl From<&Arc> for MockLayer { + fn from(l: &Arc) -> Self { + MockLayer::Image(l.clone()) + } +} + +impl interface::CompactionJobExecutor for MockTimeline { + type Key = Key; + type Layer = MockLayer; + type DeltaLayer = Arc; + type ImageLayer = Arc; + type RequestContext = MockRequestContext; + + fn get_shard_identity(&self) -> &ShardIdentity { + static IDENTITY: ShardIdentity = ShardIdentity::unsharded(); + &IDENTITY + } + + async fn get_layers( + &mut self, + key_range: &Range, + lsn_range: &Range, + _ctx: &Self::RequestContext, + ) -> anyhow::Result> { + // Clear any deleted layers from our vec + self.live_layers.retain(|l| !l.is_deleted()); + + let layers: Vec = self + .live_layers + .iter() + .filter(|l| { + overlaps_with(l.lsn_range(), lsn_range) && overlaps_with(l.key_range(), key_range) + }) + .cloned() + .collect(); + + Ok(layers) + } + + async fn get_keyspace( + &mut self, + key_range: &Range, + _lsn: Lsn, + _ctx: &Self::RequestContext, + ) -> anyhow::Result> { + // find it in the levels + if self.old_keyspaces.is_empty() { + Ok(crate::helpers::intersect_keyspace( + &self.keyspace, + key_range, + )) + } else { + // not implemented + + // The mock implementation only allows requesting the + // keyspace at the level's end LSN. That's all that the + // current implementation needs. + panic!("keyspace not available for requested lsn"); + } + } + + async fn downcast_delta_layer( + &self, + layer: &MockLayer, + ) -> anyhow::Result>> { + Ok(match layer { + MockLayer::Delta(l) => Some(l.clone()), + MockLayer::Image(_) => None, + }) + } + + async fn create_image( + &mut self, + lsn: Lsn, + key_range: &Range, + ctx: &MockRequestContext, + ) -> anyhow::Result<()> { + let keyspace = self.get_keyspace(key_range, lsn, ctx).await?; + + let mut accum_size: u64 = 0; + for r in keyspace { + accum_size += r.end - r.start; + } + + let new_layer = Arc::new(MockImageLayer { + key_range: key_range.clone(), + lsn_range: lsn..lsn, + file_size: accum_size * PAGE_SZ, + deleted: Mutex::new(false), + }); + info!( + "created image layer, size {}: {}", + new_layer.file_size, + new_layer.short_id() + ); + self.live_layers.push(MockLayer::Image(new_layer.clone())); + + // update stats + self.bytes_written += new_layer.file_size; + self.layers_created += 1; + + self.time += 1; + self.history.push(LayerTraceEvent { + time_rel: self.time, + op: LayerTraceOp::CreateImage, + file: LayerTraceFile { + filename: new_layer.short_id(), + key_range: new_layer.key_range.clone(), + lsn_range: new_layer.lsn_range.clone(), + }, + }); + + Ok(()) + } + + async fn create_delta( + &mut self, + lsn_range: &Range, + key_range: &Range, + input_layers: &[Arc], + ctx: &MockRequestContext, + ) -> anyhow::Result<()> { + let mut key_value_stream = + std::pin::pin!(merge_delta_keys::(input_layers, ctx)); + let mut records: Vec = Vec::new(); + let mut total_len = 2; + while let Some(delta_entry) = key_value_stream.next().await { + let delta_entry: MockRecord = delta_entry?; + if key_range.contains(&delta_entry.key) && lsn_range.contains(&delta_entry.lsn) { + total_len += delta_entry.len; + records.push(delta_entry); + } + } + let total_records = records.len(); + let 
new_layer = Arc::new(MockDeltaLayer { + key_range: key_range.clone(), + lsn_range: lsn_range.clone(), + file_size: total_len, + records, + deleted: Mutex::new(false), + }); + info!( + "created delta layer, recs {}, size {}: {}", + total_records, + total_len, + new_layer.short_id() + ); + self.live_layers.push(MockLayer::Delta(new_layer.clone())); + + // update stats + self.bytes_written += total_len; + self.layers_created += 1; + + self.time += 1; + self.history.push(LayerTraceEvent { + time_rel: self.time, + op: LayerTraceOp::CreateDelta, + file: LayerTraceFile { + filename: new_layer.short_id(), + key_range: new_layer.key_range.clone(), + lsn_range: new_layer.lsn_range.clone(), + }, + }); + + Ok(()) + } + + async fn delete_layer( + &mut self, + layer: &Self::Layer, + _ctx: &MockRequestContext, + ) -> anyhow::Result<()> { + let layer = std::pin::pin!(layer); + info!("deleting layer: {}", layer.short_id()); + self.num_deleted_layers += 1; + self.bytes_deleted += layer.file_size(); + layer.mark_deleted(); + + self.time += 1; + self.history.push(LayerTraceEvent { + time_rel: self.time, + op: LayerTraceOp::Delete, + file: LayerTraceFile { + filename: layer.short_id(), + key_range: layer.key_range().clone(), + lsn_range: layer.lsn_range().clone(), + }, + }); + + Ok(()) + } +} diff --git a/pageserver/compaction/src/simulator/draw.rs b/pageserver/compaction/src/simulator/draw.rs new file mode 100644 index 0000000000..997925067f --- /dev/null +++ b/pageserver/compaction/src/simulator/draw.rs @@ -0,0 +1,411 @@ +use super::Key; +use anyhow::Result; +use std::cmp::Ordering; +use std::{ + collections::{BTreeMap, BTreeSet, HashSet}, + fmt::Write, + ops::Range, +}; +use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style}; +use utils::lsn::Lsn; + +// Map values to their compressed coordinate - the index the value +// would have in a sorted and deduplicated list of all values. +struct CoordinateMap { + map: BTreeMap, + stretch: f32, +} + +impl CoordinateMap { + fn new(coords: Vec, stretch: f32) -> Self { + let set: BTreeSet = coords.into_iter().collect(); + + let mut map: BTreeMap = BTreeMap::new(); + for (i, e) in set.iter().enumerate() { + map.insert(*e, i); + } + + Self { map, stretch } + } + + // This assumes that the map contains an exact point for this. 
+ // Use map_inexact for values inbetween + fn map(&self, val: T) -> f32 { + *self.map.get(&val).unwrap() as f32 * self.stretch + } + + // the value is still assumed to be within the min/max bounds + // (this is currently unused) + fn _map_inexact(&self, val: T) -> f32 { + let prev = *self.map.range(..=val).next().unwrap().1; + let next = *self.map.range(val..).next().unwrap().1; + + // interpolate + (prev as f32 + (next - prev) as f32) * self.stretch + } + + fn max(&self) -> f32 { + self.map.len() as f32 * self.stretch + } +} + +#[derive(PartialEq, Hash, Eq)] +pub enum LayerTraceOp { + Flush, + CreateDelta, + CreateImage, + Delete, +} + +impl std::fmt::Display for LayerTraceOp { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + let op_str = match self { + LayerTraceOp::Flush => "flush", + LayerTraceOp::CreateDelta => "create_delta", + LayerTraceOp::CreateImage => "create_image", + LayerTraceOp::Delete => "delete", + }; + f.write_str(op_str) + } +} + +#[derive(PartialEq, Hash, Eq, Clone)] +pub struct LayerTraceFile { + pub filename: String, + pub key_range: Range, + pub lsn_range: Range, +} + +impl LayerTraceFile { + fn is_image(&self) -> bool { + self.lsn_range.end == self.lsn_range.start + } +} + +pub struct LayerTraceEvent { + pub time_rel: u64, + pub op: LayerTraceOp, + pub file: LayerTraceFile, +} + +pub fn draw_history(history: &[LayerTraceEvent], mut output: W) -> Result<()> { + let mut files: Vec = Vec::new(); + + for event in history { + files.push(event.file.clone()); + } + let last_time_rel = history.last().unwrap().time_rel; + + // Collect all coordinates + let mut keys: Vec = vec![]; + let mut lsns: Vec = vec![]; + for f in files.iter() { + keys.push(f.key_range.start); + keys.push(f.key_range.end); + lsns.push(f.lsn_range.start); + lsns.push(f.lsn_range.end); + } + + // Analyze + let key_map = CoordinateMap::new(keys, 2.0); + // Stretch out vertically for better visibility + let lsn_map = CoordinateMap::new(lsns, 3.0); + + let mut svg = String::new(); + + // Draw + writeln!( + svg, + "{}", + BeginSvg { + w: key_map.max(), + h: lsn_map.max(), + } + )?; + let lsn_max = lsn_map.max(); + + // Sort the files by LSN, but so that image layers go after all delta layers + // The SVG is painted in the order the elements appear, and we want to draw + // image layers on top of the delta layers if they overlap + // + // (This could also be implemented via z coordinates: image layers get one z + // coord, delta layers get another z coord.) + let mut files_sorted: Vec = files.into_iter().collect(); + files_sorted.sort_by(|a, b| { + if a.is_image() && !b.is_image() { + Ordering::Greater + } else if !a.is_image() && b.is_image() { + Ordering::Less + } else { + a.lsn_range.end.cmp(&b.lsn_range.end) + } + }); + + writeln!(svg, "")?; + let mut files_seen = HashSet::new(); + for f in files_sorted { + if files_seen.contains(&f) { + continue; + } + let key_start = key_map.map(f.key_range.start); + let key_end = key_map.map(f.key_range.end); + let key_diff = key_end - key_start; + + if key_start >= key_end { + panic!("Invalid key range {}-{}", key_start, key_end); + } + + let lsn_start = lsn_map.map(f.lsn_range.start); + let lsn_end = lsn_map.map(f.lsn_range.end); + + // Fill in and thicken rectangle if it's an + // image layer so that we can see it. 
+ let mut style = Style::default(); + style.fill = Fill::Color(rgb(0x80, 0x80, 0x80)); + style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5); + + let y_start = lsn_max - lsn_start; + let y_end = lsn_max - lsn_end; + + let x_margin = 0.25; + let y_margin = 0.5; + + match f.lsn_range.start.cmp(&f.lsn_range.end) { + Ordering::Less => { + write!( + svg, + r#" "#, + f.filename, + key_start + x_margin, + y_end + y_margin, + key_diff - x_margin * 2.0, + y_start - y_end - y_margin * 2.0, + 1.0, // border_radius, + style, + )?; + write!(svg, "{}", f.filename)?; + writeln!(svg, "")?; + } + Ordering::Equal => { + //lsn_diff = 0.3; + //lsn_offset = -lsn_diff / 2.0; + //margin = 0.05; + style.fill = Fill::Color(rgb(0x80, 0, 0x80)); + style.stroke = Stroke::Color(rgb(0x80, 0, 0x80), 3.0); + write!( + svg, + r#" "#, + f.filename, + key_start + x_margin, + y_end, + key_end - x_margin, + y_end, + style, + )?; + write!( + svg, + "{}<br>{} - {}", + f.filename, lsn_end, y_end + )?; + writeln!(svg, "")?; + } + Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end), + } + files_seen.insert(f); + } + + let mut record_style = Style::default(); + record_style.fill = Fill::Color(rgb(0x80, 0x80, 0x80)); + record_style.stroke = Stroke::None; + + writeln!(svg, "{}", EndSvg)?; + + let mut layer_events_str = String::new(); + let mut first = true; + for e in history { + if !first { + writeln!(layer_events_str, ",")?; + } + write!( + layer_events_str, + r#" {{"time_rel": {}, "filename": "{}", "op": "{}"}}"#, + e.time_rel, e.file.filename, e.op + )?; + first = false; + } + writeln!(layer_events_str)?; + + writeln!( + output, + r#" + + + + + + + + +

+<!-- page scaffolding and animation-control script elided; the SVG is embedded below -->
+{svg}
+"#
+    )?;
+
+    Ok(())
+}
diff --git a/pageserver/compaction/tests/tests.rs b/pageserver/compaction/tests/tests.rs
new file mode 100644
index 0000000000..bd8b54a286
--- /dev/null
+++ b/pageserver/compaction/tests/tests.rs
@@ -0,0 +1,70 @@
+use once_cell::sync::OnceCell;
+use pageserver_compaction::interface::CompactionLayer;
+use pageserver_compaction::simulator::MockTimeline;
+use utils::logging;
+
+static LOG_HANDLE: OnceCell<()> = OnceCell::new();
+
+pub(crate) fn setup_logging() {
+    LOG_HANDLE.get_or_init(|| {
+        logging::init(
+            logging::LogFormat::Test,
+            logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
+            logging::Output::Stdout,
+        )
+        .expect("Failed to init test logging")
+    });
+}
+
+/// Test the extreme case that there are so many updates for a single key that
+/// even if we produce an extremely narrow delta layer, spanning just that one
+/// key, we still have too many records to fit in the target file size. We need
+/// to split in the LSN dimension too in that case.
+#[tokio::test]
+async fn test_many_updates_for_single_key() {
+    setup_logging();
+    let mut executor = MockTimeline::new();
+    executor.target_file_size = 1_000_000; // 1 MB
+
+    // Ingest 10 MB of updates to a single key.
+    for _ in 1..1000 {
+        executor.ingest_uniform(100, 10, &(0..100_000)).unwrap();
+        executor.ingest_uniform(1000, 10, &(0..1)).unwrap();
+        executor.compact().await.unwrap();
+    }
+
+    // Check that all the layers are smaller than the target size (with some slop)
+    for l in executor.live_layers.iter() {
+        println!("layer {}: {}", l.short_id(), l.file_size());
+    }
+    for l in executor.live_layers.iter() {
+        assert!(l.file_size() < executor.target_file_size * 2);
+        // Sanity check that none of the delta layers are empty either.
+        if l.is_delta() {
+            assert!(l.file_size() > 0);
+        }
+    }
+}
+
+#[tokio::test]
+async fn test_simple_updates() {
+    setup_logging();
+    let mut executor = MockTimeline::new();
+    executor.target_file_size = 500_000; // 500 KB
+
+    // Ingest some traffic.
+    for _ in 1..400 {
+        executor.ingest_uniform(100, 500, &(0..100_000)).unwrap();
+    }
+
+    for l in executor.live_layers.iter() {
+        println!("layer {}: {}", l.short_id(), l.file_size());
+    }
+
+    println!("Running compaction...");
+    executor.compact().await.unwrap();
+
+    for l in executor.live_layers.iter() {
+        println!("layer {}: {}", l.short_id(), l.file_size());
+    }
+}
diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml
index c5cd451e8d..be5626040b 100644
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -12,9 +12,15 @@ bytes.workspace = true
 camino.workspace = true
 clap = { workspace = true, features = ["string"] }
 git-version.workspace = true
+humantime.workspace = true
 pageserver = { path = ".." }
+pageserver_api.workspace = true
+remote_storage = { path = "../../libs/remote_storage" }
 postgres_ffi.workspace = true
+thiserror.workspace = true
 tokio.workspace = true
+tokio-util.workspace = true
+toml_edit.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs
index 0e77ef0563..bc939f9688 100644
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -9,21 +9,49 @@
 //! Coordinates in both axis are compressed for better readability.
 //! (see )
 //!
-//! Example use:
+//! The plain text API was chosen so that we can easily work with filenames from various
+//! sources; see the Usage section below for examples.
+//!
+//!
# Usage +//! +//! ## Producing the SVG +//! //! ```bash -//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ -//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg -//! $ firefox out.svg +//! +//! # local timeline dir +//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ +//! grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg +//! +//! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer` +//! (jq -r '.historic_layers[] | .layer_file_name' | cargo run -p pagectl draw-timeline) < layer-map.json > out.svg +//! +//! # From an `index_part.json` in S3 +//! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg +//! +//! # enrich with lines for gc_cutoff and a child branch point +//! cat <(jq -r '.historic_layers[] | .layer_file_name' < layers.json) <(echo -e 'gc_cutoff:0000001CE3FE32C9\nbranch:0000001DE3FE32C9') | cargo run --bin pagectl draw-timeline >| out.svg //! ``` //! -//! This API was chosen so that we can easily work with filenames extracted from ssh, -//! or from pageserver log files. +//! ## Viewing //! -//! TODO Consider shipping this as a grafana panel plugin: -//! -use anyhow::Result; +//! **Inkscape** is better than the built-in viewers in browsers. +//! +//! After selecting a layer file rectangle, use "Open XML Editor" (Ctrl|Cmd + Shift + X) +//! to see the layer file name in the comment field. +//! +//! ```bash +//! +//! # Linux +//! inkscape out.svg +//! +//! # macOS +//! /Applications/Inkscape.app/Contents/MacOS/inkscape out.svg +//! +//! ``` +//! + +use anyhow::{Context, Result}; use pageserver::repository::Key; -use pageserver::METADATA_FILE_NAME; use std::cmp::Ordering; use std::io::{self, BufRead}; use std::path::PathBuf; @@ -54,6 +82,19 @@ fn parse_filename(name: &str) -> (Range, Range) { let split: Vec<&str> = name.split("__").collect(); let keys: Vec<&str> = split[0].split('-').collect(); let mut lsns: Vec<&str> = split[1].split('-').collect(); + + // The current format of the layer file name: 000000067F0000000400000B150100000000-000000067F0000000400000D350100000000__00000000014B7AC8-v1-00000001 + + // Handle generation number `-00000001` part + if lsns.last().expect("should").len() == 8 { + lsns.pop(); + } + + // Handle version number `-v1` part + if lsns.last().expect("should").starts_with('v') { + lsns.pop(); + } + if lsns.len() == 1 { lsns.push(lsns[0]); } @@ -63,33 +104,94 @@ fn parse_filename(name: &str) -> (Range, Range) { (keys, lsns) } +#[derive(Clone, Copy)] +enum LineKind { + GcCutoff, + Branch, +} + +impl From for Fill { + fn from(value: LineKind) -> Self { + match value { + LineKind::GcCutoff => Fill::Color(rgb(255, 0, 0)), + LineKind::Branch => Fill::Color(rgb(0, 255, 0)), + } + } +} + +impl FromStr for LineKind { + type Err = anyhow::Error; + + fn from_str(s: &str) -> std::prelude::v1::Result { + Ok(match s { + "gc_cutoff" => LineKind::GcCutoff, + "branch" => LineKind::Branch, + _ => anyhow::bail!("unsupported linekind: {s}"), + }) + } +} + pub fn main() -> Result<()> { // Parse layer filenames from stdin - let mut ranges: Vec<(Range, Range)> = vec![]; + struct Layer { + filename: String, + key_range: Range, + lsn_range: Range, + } + let mut files: Vec = vec![]; let stdin = io::stdin(); - for line in stdin.lock().lines() { + + let mut lines: Vec<(Lsn, LineKind)> = vec![]; + + for (lineno, line) in stdin.lock().lines().enumerate() { + let lineno = lineno + 1; + 
let line = line.unwrap(); + if let Some((kind, lsn)) = line.split_once(':') { + let (kind, lsn) = LineKind::from_str(kind) + .context("parse kind") + .and_then(|kind| { + if lsn.contains('/') { + Lsn::from_str(lsn) + } else { + Lsn::from_hex(lsn) + } + .map(|lsn| (kind, lsn)) + .context("parse lsn") + }) + .with_context(|| format!("parse {line:?} on {lineno}"))?; + lines.push((lsn, kind)); + continue; + } let line = PathBuf::from_str(&line).unwrap(); let filename = line.file_name().unwrap(); let filename = filename.to_str().unwrap(); - if filename == METADATA_FILE_NAME { - // Don't try and parse "metadata" like a key-lsn range - continue; - } - let range = parse_filename(filename); - ranges.push(range); + let (key_range, lsn_range) = parse_filename(filename); + files.push(Layer { + filename: filename.to_owned(), + key_range, + lsn_range, + }); } // Collect all coordinates - let mut keys: Vec = vec![]; - let mut lsns: Vec = vec![]; - for (keyr, lsnr) in &ranges { + let mut keys: Vec = Vec::with_capacity(files.len()); + let mut lsns: Vec = Vec::with_capacity(files.len() + lines.len()); + + for Layer { + key_range: keyr, + lsn_range: lsnr, + .. + } in &files + { keys.push(keyr.start); keys.push(keyr.end); lsns.push(lsnr.start); lsns.push(lsnr.end); } + lsns.extend(lines.iter().map(|(lsn, _)| *lsn)); + // Analyze let key_map = build_coordinate_compression_map(keys); let lsn_map = build_coordinate_compression_map(lsns); @@ -103,11 +205,19 @@ pub fn main() -> Result<()> { println!( "{}", BeginSvg { - w: key_map.len() as f32, + w: (key_map.len() + 10) as f32, h: stretch * lsn_map.len() as f32 } ); - for (keyr, lsnr) in &ranges { + + let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas + + for Layer { + filename, + key_range: keyr, + lsn_range: lsnr, + } in &files + { let key_start = *key_map.get(&keyr.start).unwrap(); let key_end = *key_map.get(&keyr.end).unwrap(); let key_diff = key_end - key_start; @@ -123,7 +233,6 @@ pub fn main() -> Result<()> { let mut lsn_diff = (lsn_end - lsn_start) as f32; let mut fill = Fill::None; let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas - let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas let mut lsn_offset = 0.0; // Fill in and thicken rectangle if it's an @@ -143,7 +252,7 @@ pub fn main() -> Result<()> { println!( " {}", rectangle( - key_start as f32 + stretch * xmargin, + 5.0 + key_start as f32 + stretch * xmargin, stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)), key_diff as f32 - stretch * 2.0 * xmargin, stretch * (lsn_diff - 2.0 * ymargin) @@ -151,8 +260,29 @@ pub fn main() -> Result<()> { .fill(fill) .stroke(Stroke::Color(rgb(0, 0, 0), 0.1)) .border_radius(0.4) + .comment(filename) ); } + + for (lsn, kind) in lines { + let lsn_start = *lsn_map.get(&lsn).unwrap(); + let lsn_end = lsn_start; + let stretch = 2.0; + let lsn_diff = 0.3; + let lsn_offset = -lsn_diff / 2.0; + let ymargin = 0.05; + println!( + "{}", + rectangle( + 0.0f32 + stretch * xmargin, + stretch * (lsn_map.len() as f32 - (lsn_end as f32 - ymargin - lsn_offset)), + (key_map.len() + 10) as f32, + stretch * (lsn_diff - 2.0 * ymargin) + ) + .fill(kind) + ); + } + println!("{}", EndSvg); eprintln!("num_images: {}", num_images); diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index 20e5572914..20018846f8 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -1,11 +1,6 @@ -use std::collections::HashMap; - 
use anyhow::Context; use camino::Utf8PathBuf; -use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; -use pageserver::tenant::storage_layer::LayerFileName; -use pageserver::tenant::{metadata::TimelineMetadata, IndexPart}; -use utils::lsn::Lsn; +use pageserver::tenant::IndexPart; #[derive(clap::Subcommand)] pub(crate) enum IndexPartCmd { @@ -17,20 +12,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { IndexPartCmd::Dump { path } => { let bytes = tokio::fs::read(path).await.context("read file")?; let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?; - #[derive(serde::Serialize)] - struct Output<'a> { - layer_metadata: &'a HashMap, - disk_consistent_lsn: Lsn, - timeline_metadata: &'a TimelineMetadata, - } - - let output = Output { - layer_metadata: &des.layer_metadata, - disk_consistent_lsn: des.get_disk_consistent_lsn(), - timeline_metadata: &des.metadata, - }; - - let output = serde_json::to_string_pretty(&output).context("serialize output")?; + let output = serde_json::to_string_pretty(&des).context("serialize output")?; println!("{output}"); Ok(()) } diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs new file mode 100644 index 0000000000..af4b5a21ab --- /dev/null +++ b/pageserver/ctl/src/key.rs @@ -0,0 +1,475 @@ +use anyhow::Context; +use clap::Parser; +use pageserver_api::{ + key::Key, + reltag::{BlockNumber, RelTag, SlruKind}, + shard::{ShardCount, ShardStripeSize}, +}; +use std::str::FromStr; + +#[derive(Parser)] +pub(super) struct DescribeKeyCommand { + /// Key material in one of the forms: hex, span attributes captured from log, reltag blocknum + input: Vec, + + /// The number of shards to calculate what Keys placement would be. + #[arg(long)] + shard_count: Option, + + /// The sharding stripe size. + /// + /// The default is hardcoded. It makes no sense to provide this without providing + /// `--shard-count`. + #[arg(long, requires = "shard_count")] + stripe_size: Option, +} + +/// Sharded shard count without unsharded count, which the actual ShardCount supports. +#[derive(Clone, Copy)] +pub(super) struct CustomShardCount(std::num::NonZeroU8); + +#[derive(Debug, thiserror::Error)] +pub(super) enum InvalidShardCount { + #[error(transparent)] + ParsingFailed(#[from] std::num::ParseIntError), + #[error("too few shards")] + TooFewShards, +} + +impl FromStr for CustomShardCount { + type Err = InvalidShardCount; + + fn from_str(s: &str) -> Result { + let inner: std::num::NonZeroU8 = s.parse()?; + if inner.get() < 2 { + Err(InvalidShardCount::TooFewShards) + } else { + Ok(CustomShardCount(inner)) + } + } +} + +impl From for ShardCount { + fn from(value: CustomShardCount) -> Self { + ShardCount::new(value.0.get()) + } +} + +impl DescribeKeyCommand { + pub(super) fn execute(self) { + let DescribeKeyCommand { + input, + shard_count, + stripe_size, + } = self; + + let material = KeyMaterial::try_from(input.as_slice()).unwrap(); + let kind = material.kind(); + let key = Key::from(material); + + println!("parsed from {kind}: {key}:"); + println!(); + println!("{key:?}"); + + macro_rules! kind_query { + ([$($name:ident),*$(,)?]) => {{[$(kind_query!($name)),*]}}; + ($name:ident) => {{ + let s: &'static str = stringify!($name); + let s = s.strip_prefix("is_").unwrap_or(s); + let s = s.strip_suffix("_key").unwrap_or(s); + + #[allow(clippy::needless_borrow)] + (s, key.$name()) + }}; + } + + // the current characterization is a mess of these boolean queries and separate + // "recognization". 
I think it accurately represents how strictly we model the Key + // right now, but could of course be made less confusing. + + let queries = kind_query!([ + is_rel_block_key, + is_rel_vm_block_key, + is_rel_fsm_block_key, + is_slru_block_key, + is_inherited_key, + is_rel_size_key, + is_slru_segment_size_key, + ]); + + let recognized_kind = "recognized kind"; + let metadata_key = "metadata key"; + let shard_placement = "shard placement"; + + let longest = queries + .iter() + .map(|t| t.0) + .chain([recognized_kind, metadata_key, shard_placement]) + .map(|s| s.len()) + .max() + .unwrap(); + + let colon = 1; + let padding = 1; + + for (name, is) in queries { + let width = longest - name.len() + colon + padding; + println!("{}{:width$}{}", name, ":", is); + } + + let width = longest - recognized_kind.len() + colon + padding; + println!( + "{}{:width$}{:?}", + recognized_kind, + ":", + RecognizedKeyKind::new(key), + ); + + if let Some(shard_count) = shard_count { + // seeing the sharding placement might be confusing, so leave it out unless shard + // count was given. + + let stripe_size = stripe_size.map(ShardStripeSize).unwrap_or_default(); + println!( + "# placement with shard_count: {} and stripe_size: {}:", + shard_count.0, stripe_size.0 + ); + let width = longest - shard_placement.len() + colon + padding; + println!( + "{}{:width$}{:?}", + shard_placement, + ":", + pageserver_api::shard::describe(&key, shard_count.into(), stripe_size) + ); + } + } +} + +/// Hand-wavy "inputs we accept" for a key. +#[derive(Debug)] +pub(super) enum KeyMaterial { + Hex(Key), + String(SpanAttributesFromLogs), + Split(RelTag, BlockNumber), +} + +impl KeyMaterial { + fn kind(&self) -> &'static str { + match self { + KeyMaterial::Hex(_) => "hex", + KeyMaterial::String(_) | KeyMaterial::Split(_, _) => "split", + } + } +} + +impl From for Key { + fn from(value: KeyMaterial) -> Self { + match value { + KeyMaterial::Hex(key) => key, + KeyMaterial::String(SpanAttributesFromLogs(rt, blocknum)) + | KeyMaterial::Split(rt, blocknum) => { + pageserver_api::key::rel_block_to_key(rt, blocknum) + } + } + } +} + +impl> TryFrom<&[S]> for KeyMaterial { + type Error = anyhow::Error; + + fn try_from(value: &[S]) -> Result { + match value { + [] => anyhow::bail!( + "need 1..N positional arguments describing the key, try hex or a log line" + ), + [one] => { + let one = one.as_ref(); + + let key = Key::from_hex(one).map(KeyMaterial::Hex); + + let attrs = SpanAttributesFromLogs::from_str(one).map(KeyMaterial::String); + + match (key, attrs) { + (Ok(key), _) => Ok(key), + (_, Ok(s)) => Ok(s), + (Err(e1), Err(e2)) => anyhow::bail!( + "failed to parse {one:?} as hex or span attributes:\n- {e1:#}\n- {e2:#}" + ), + } + } + more => { + // assume going left to right one of these is a reltag and then we find a blocknum + // this works, because we don't have plain numbers at least right after reltag in + // logs. for some definition of "works". 
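+                // Worked example (mirroring the `multiple_spanlike_args` test below, so an
+                // illustration rather than new behavior): given arguments like
+                // ["tenant_id=C", "...{rel=1663/208101/2620_fsm", "blkno=2"], the RelTag
+                // parses out of the argument containing "rel=", and the blocknum search
+                // then continues from that same argument index onward.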
+ + let Some((reltag_at, reltag)) = more + .iter() + .map(AsRef::as_ref) + .enumerate() + .find_map(|(i, s)| { + s.split_once("rel=") + .map(|(_garbage, actual)| actual) + .unwrap_or(s) + .parse::() + .ok() + .map(|rt| (i, rt)) + }) + else { + anyhow::bail!("found no RelTag in arguments"); + }; + + let Some(blocknum) = more + .iter() + .map(AsRef::as_ref) + .skip(reltag_at) + .find_map(|s| { + s.split_once("blkno=") + .map(|(_garbage, actual)| actual) + .unwrap_or(s) + .parse::() + .ok() + }) + else { + anyhow::bail!("found no blocknum in arguments"); + }; + + Ok(KeyMaterial::Split(reltag, blocknum)) + } + } + } +} + +#[derive(Debug)] +pub(super) struct SpanAttributesFromLogs(RelTag, BlockNumber); + +impl std::str::FromStr for SpanAttributesFromLogs { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + // accept the span separator but do not require or fail if either is missing + // "whatever{rel=1663/16389/24615 blkno=1052204 req_lsn=FFFFFFFF/FFFFFFFF}" + let (_, reltag) = s + .split_once("rel=") + .ok_or_else(|| anyhow::anyhow!("cannot find 'rel='"))?; + let reltag = reltag.split_whitespace().next().unwrap(); + + let (_, blocknum) = s + .split_once("blkno=") + .ok_or_else(|| anyhow::anyhow!("cannot find 'blkno='"))?; + let blocknum = blocknum.split_whitespace().next().unwrap(); + + let reltag = reltag + .parse() + .with_context(|| format!("parse reltag from {reltag:?}"))?; + let blocknum = blocknum + .parse() + .with_context(|| format!("parse blocknum from {blocknum:?}"))?; + + Ok(Self(reltag, blocknum)) + } +} + +#[derive(Debug)] +#[allow(dead_code)] // debug print is used +enum RecognizedKeyKind { + DbDir, + ControlFile, + Checkpoint, + AuxFilesV1, + SlruDir(Result), + RelMap(RelTagish<2>), + RelDir(RelTagish<2>), + AuxFileV2(Result>), +} + +#[derive(Debug, PartialEq)] +#[allow(unused)] +enum AuxFileV2 { + Recognized(&'static str, utils::Hex<[u8; 13]>), + OtherWithPrefix(&'static str, utils::Hex<[u8; 13]>), + Other(utils::Hex<[u8; 13]>), +} + +impl RecognizedKeyKind { + fn new(key: Key) -> Option { + use RecognizedKeyKind::{ + AuxFilesV1, Checkpoint, ControlFile, DbDir, RelDir, RelMap, SlruDir, + }; + + let slru_dir_kind = pageserver_api::key::slru_dir_kind(&key); + + Some(match key { + pageserver_api::key::DBDIR_KEY => DbDir, + pageserver_api::key::CONTROLFILE_KEY => ControlFile, + pageserver_api::key::CHECKPOINT_KEY => Checkpoint, + pageserver_api::key::AUX_FILES_KEY => AuxFilesV1, + _ if slru_dir_kind.is_some() => SlruDir(slru_dir_kind.unwrap()), + _ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 0 => { + RelMap([key.field2, key.field3].into()) + } + _ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 1 => { + RelDir([key.field2, key.field3].into()) + } + _ if key.is_metadata_key() => RecognizedKeyKind::AuxFileV2( + AuxFileV2::new(key).ok_or_else(|| utils::Hex(key.to_i128().to_be_bytes())), + ), + _ => return None, + }) + } +} + +impl AuxFileV2 { + fn new(key: Key) -> Option { + const EMPTY_HASH: [u8; 13] = { + let mut out = [0u8; 13]; + let hash = pageserver::aux_file::fnv_hash(b"").to_be_bytes(); + let mut i = 3; + while i < 16 { + out[i - 3] = hash[i]; + i += 1; + } + out + }; + + let bytes = key.to_i128().to_be_bytes(); + let hash = utils::Hex(<[u8; 13]>::try_from(&bytes[3..]).unwrap()); + + assert_eq!(EMPTY_HASH.len(), hash.0.len()); + + // TODO: we could probably find the preimages for the hashes + + Some(match (bytes[1], bytes[2]) { + (1, 1) => AuxFileV2::Recognized("pg_logical/mappings/", hash), + (1, 2) => 
AuxFileV2::Recognized("pg_logical/snapshots/", hash), + (1, 3) if hash.0 == EMPTY_HASH => { + AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash) + } + (2, 1) => AuxFileV2::Recognized("pg_replslot/", hash), + (1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash), + (0xff, 0xff) => AuxFileV2::Other(hash), + _ => return None, + }) + } +} + +/// Prefix of RelTag, currently only known use cases are the two item versions. +/// +/// Renders like a reltag with `/`, nothing else. +struct RelTagish([u32; N]); + +impl From<[u32; N]> for RelTagish { + fn from(val: [u32; N]) -> Self { + RelTagish(val) + } +} + +impl std::fmt::Debug for RelTagish { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use std::fmt::Write as _; + let mut first = true; + self.0.iter().try_for_each(|x| { + if !first { + f.write_char('/')?; + } + first = false; + write!(f, "{}", x) + }) + } +} + +#[cfg(test)] +mod tests { + use pageserver::aux_file::encode_aux_file_key; + + use super::*; + + #[test] + fn hex_is_key_material() { + let m = KeyMaterial::try_from(&["000000067F0000400200DF927900FFFFFFFF"][..]).unwrap(); + assert!(matches!(m, KeyMaterial::Hex(_)), "{m:?}"); + } + + #[test] + fn single_positional_spanalike_is_key_material() { + // why is this needed? if you are checking many, then copypaste starts to appeal + let strings = [ + (line!(), "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0"), + (line!(), "rel=1663/208101/2620_fsm blkno=2"), + (line!(), "rel=1663/208101/2620.1 blkno=2"), + ]; + + let mut first: Option = None; + + for (line, example) in strings { + let m = KeyMaterial::try_from(&[example][..]) + .unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}")); + let key = Key::from(m); + if let Some(first) = first { + assert_eq!(first, key); + } else { + first = Some(key); + } + } + + // not supporting this is rather accidential, but I think the input parsing is lenient + // enough already + KeyMaterial::try_from(&["1663/208101/2620_fsm 2"][..]).unwrap_err(); + } + + #[test] + fn multiple_spanlike_args() { + let strings = [ + (line!(), &["process_query{tenant_id=C", "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", "blkno=2", "req_lsn=0/238D98C8}"][..]), + (line!(), &["rel=1663/208101/2620_fsm", "blkno=2"][..]), + (line!(), &["1663/208101/2620_fsm", "2"][..]), + ]; + + let mut first: Option = None; + + for (line, example) in strings { + let m = KeyMaterial::try_from(example) + .unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}")); + let key = Key::from(m); + if let Some(first) = first { + assert_eq!(first, key); + } else { + first = Some(key); + } + } + } + #[test] + fn recognized_auxfiles() { + use AuxFileV2::*; + + let empty = [ + 0x2e, 0x07, 0xbb, 0x01, 0x42, 0x62, 0xb8, 0x21, 0x75, 0x62, 0x95, 0xc5, 0x8d, + ]; + let foobar = [ + 0x62, 0x79, 0x3c, 0x64, 0xbf, 0x6f, 0x0d, 0x35, 0x97, 0xba, 0x44, 0x6f, 0x18, + ]; + + #[rustfmt::skip] + let examples = [ + (line!(), "pg_logical/mappings/foobar", Recognized("pg_logical/mappings/", utils::Hex(foobar))), + (line!(), "pg_logical/snapshots/foobar", 
Recognized("pg_logical/snapshots/", utils::Hex(foobar))), + (line!(), "pg_logical/replorigin_checkpoint", Recognized("pg_logical/replorigin_checkpoint", utils::Hex(empty))), + (line!(), "pg_logical/foobar", OtherWithPrefix("pg_logical/", utils::Hex(foobar))), + (line!(), "pg_replslot/foobar", Recognized("pg_replslot/", utils::Hex(foobar))), + (line!(), "foobar", Other(utils::Hex(foobar))), + ]; + + for (line, path, expected) in examples { + let key = encode_aux_file_key(path); + let recognized = + AuxFileV2::new(key).unwrap_or_else(|| panic!("line {line} example failed")); + + assert_eq!(recognized, expected); + } + + assert_eq!( + AuxFileV2::new(Key::from_hex("600000102000000000000000000000000000").unwrap()), + None, + "example key has one too few 0 after 6 before 1" + ); + } +} diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 15d4eb09e0..a07107753e 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -12,13 +12,13 @@ use std::collections::BinaryHeap; use std::ops::Range; use std::{fs, str}; -use pageserver::page_cache::PAGE_SZ; +use pageserver::page_cache::{self, PAGE_SZ}; use pageserver::repository::{Key, KEY_SIZE}; use pageserver::tenant::block_io::FileBlockReader; use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection}; use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE}; use pageserver::tenant::storage_layer::range_overlaps; -use pageserver::virtual_file::VirtualFile; +use pageserver::virtual_file::{self, VirtualFile}; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -100,13 +100,15 @@ pub(crate) fn parse_filename(name: &str) -> Option { // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH" async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result> { - let file = FileBlockReader::new(VirtualFile::open(path).await?); - let summary_blk = file.read_blk(0, ctx).await?; + let file = VirtualFile::open(path, ctx).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( actual_summary.index_start_blk, actual_summary.index_root_blk, - file, + block_reader, ); // min-heap (reserve space for one more element added before eviction) let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); @@ -142,7 +144,11 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. 
- pageserver::virtual_file::init(10); + pageserver::virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, + ); pageserver::page_cache::init(100); let mut total_delta_layers = 0usize; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index ebf4a4bec3..dd753398e2 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -59,15 +59,17 @@ pub(crate) enum LayerCmd { async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); - virtual_file::init(10); + virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1); page_cache::init(100); - let file = FileBlockReader::new(VirtualFile::open(path).await?); - let summary_blk = file.read_blk(0, ctx).await?; + let file = VirtualFile::open(path, ctx).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( actual_summary.index_start_blk, actual_summary.index_root_blk, - &file, + &block_reader, ); // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API. let mut all = vec![]; @@ -83,10 +85,11 @@ async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result ctx, ) .await?; - let cursor = BlockCursor::new_fileblockreader(&file); + let cursor = BlockCursor::new_fileblockreader(&block_reader); for (k, v) in all { let value = cursor.read_blob(v.pos(), ctx).await?; println!("key:{} value_len:{}", k, value.len()); + assert!(k.is_i128_representable(), "invalid key: "); } // TODO(chi): special handling for last key? 
     Ok(())
@@ -187,7 +190,11 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
             new_tenant_id,
             new_timeline_id,
         } => {
-            pageserver::virtual_file::init(10);
+            pageserver::virtual_file::init(
+                10,
+                virtual_file::api::IoEngineKind::StdFs,
+                pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
+            );
             pageserver::page_cache::init(100);
             let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs
index fb42d6d2f1..3b66b0c4aa 100644
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -6,9 +6,15 @@
 mod draw_timeline_dir;
 mod index_part;
+mod key;
 mod layer_map_analyzer;
 mod layers;
 
+use std::{
+    str::FromStr,
+    time::{Duration, SystemTime},
+};
+
 use camino::{Utf8Path, Utf8PathBuf};
 use clap::{Parser, Subcommand};
 use index_part::IndexPartCmd;
@@ -20,8 +26,16 @@ use pageserver::{
     tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
     virtual_file,
 };
+use pageserver_api::{config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, shard::TenantShardId};
 use postgres_ffi::ControlFileData;
-use utils::{lsn::Lsn, project_git_version};
+use remote_storage::{RemotePath, RemoteStorageConfig};
+use tokio_util::sync::CancellationToken;
+use utils::{
+    id::TimelineId,
+    logging::{self, LogFormat, TracingErrorLayerEnablement},
+    lsn::Lsn,
+    project_git_version,
+};
 
 project_git_version!(GIT_VERSION);
@@ -43,10 +57,13 @@
     #[command(subcommand)]
     IndexPart(IndexPartCmd),
     PrintLayerFile(PrintLayerFileCmd),
+    TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd),
     DrawTimeline {},
     AnalyzeLayerMap(AnalyzeLayerMapCmd),
     #[command(subcommand)]
     Layer(LayerCmd),
+    /// Debug print a hex key found from logs
+    Key(key::DescribeKeyCommand),
 }
 
 /// Read and update pageserver metadata file
@@ -68,6 +85,26 @@
 struct PrintLayerFileCmd {
     path: Utf8PathBuf,
 }
 
+/// Roll back the time for the specified prefix using S3 history.
+///
+/// The command is fairly low level and powerful. Validation is only very light,
+/// which makes it all the more powerful, and thus potentially more dangerous.
+#[derive(Parser)]
+struct TimeTravelRemotePrefixCmd {
+    /// A configuration string for the remote_storage configuration.
+    ///
+    /// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }`
+    config_toml_str: String,
+    /// Remote prefix to time-travel recover. For safety reasons, we require it to contain
+    /// a timeline or tenant ID in the prefix.
+    prefix: String,
+    /// Timestamp to travel to. Given in a format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy.
+    travel_to: String,
+    /// Timestamp of the start of the operation; it must be after any changes we want to roll back.
+    /// You can use a timestamp from a few seconds before invoking the command. Same format as `travel_to`.
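+    ///
+    /// (Illustration, hypothetical timestamps: if bad writes landed at 10:45:45Z and the
+    /// command is invoked at 10:50:00Z, `travel_to` would be a point before 10:45:45Z and
+    /// `done_if_after` a few seconds before 10:50:00Z.)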
+    done_if_after: Option<String>,
+}
+
 #[derive(Parser)]
 struct AnalyzeLayerMapCmd {
     /// Pageserver data path
@@ -78,6 +115,14 @@
 
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
+    logging::init(
+        LogFormat::Plain,
+        TracingErrorLayerEnablement::EnableWithRustLogFilter,
+        logging::Output::Stdout,
+    )?;
+
+    logging::replace_panic_hook_with_tracing_panic_hook().forget();
+
     let cli = CliOpts::parse();
 
     match cli.command {
@@ -105,6 +150,43 @@
                 print_layerfile(&cmd.path).await?;
             }
         }
+        Commands::TimeTravelRemotePrefix(cmd) => {
+            let timestamp = humantime::parse_rfc3339(&cmd.travel_to)
+                .map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?;
+
+            let done_if_after = if let Some(done_if_after) = &cmd.done_if_after {
+                humantime::parse_rfc3339(done_if_after).map_err(|_e| {
+                    anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after)
+                })?
+            } else {
+                const SAFETY_MARGIN: Duration = Duration::from_secs(3);
+                tokio::time::sleep(SAFETY_MARGIN).await;
+                // Convert to string representation and back to get rid of sub-second values
+                let done_if_after = SystemTime::now();
+                tokio::time::sleep(SAFETY_MARGIN).await;
+                done_if_after
+            };
+
+            let timestamp = strip_subsecond(timestamp);
+            let done_if_after = strip_subsecond(done_if_after);
+
+            let Some(prefix) = validate_prefix(&cmd.prefix) else {
+                println!("specified prefix '{}' failed validation", cmd.prefix);
+                return Ok(());
+            };
+            let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?;
+            let toml_item = toml_document
+                .get("remote_storage")
+                .expect("need remote_storage");
+            let config = RemoteStorageConfig::from_toml(toml_item)?;
+            let storage = remote_storage::GenericRemoteStorage::from_config(&config).await;
+            let cancel = CancellationToken::new();
+            storage
+                .unwrap()
+                .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
+                .await?;
+        }
+        Commands::Key(dkc) => dkc.execute(),
     };
     Ok(())
 }
@@ -123,7 +205,11 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
 
 async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
     // Basic initialization of things that don't change after startup
-    virtual_file::init(10);
+    virtual_file::init(
+        10,
+        virtual_file::api::IoEngineKind::StdFs,
+        DEFAULT_IO_BUFFER_ALIGNMENT,
+    );
     page_cache::init(100);
     let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
     dump_layerfile_from_path(path, true, &ctx).await
@@ -141,6 +227,7 @@ fn handle_metadata(
     let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
     println!("Current metadata:\n{meta:?}");
     let mut update_meta = false;
+    // TODO: simplify this part
     if let Some(disk_consistent_lsn) = disk_consistent_lsn {
         meta = TimelineMetadata::new(
             *disk_consistent_lsn,
@@ -185,3 +272,89 @@
 
     Ok(())
 }
+
+/// Ensures that the given S3 prefix is sufficiently constrained.
+/// The command is very risky already and we don't want to expose something
+/// that allows usually unintentional and quite catastrophic time travel of
+/// an entire bucket, which would be a major catastrophe, only one character
+/// change away (similar to "rm -r /home /username/foobar").
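+///
+/// Accepted/rejected examples, mirroring `test_validate_prefix` below:
+/// "wal/3aa8fcc61f6d357410b7de754b1d9001/" is accepted (it ends in a tenant ID), while
+/// "", "wal", and partial IDs like "wal/3aa8fcc61f6d357410b7d" are rejected.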
+fn validate_prefix(prefix: &str) -> Option { + if prefix.is_empty() { + // Empty prefix means we want to specify the *whole* bucket + return None; + } + let components = prefix.split('/').collect::>(); + let (last, components) = { + let last = components.last()?; + if last.is_empty() { + ( + components.iter().nth_back(1)?, + &components[..(components.len() - 1)], + ) + } else { + (last, &components[..]) + } + }; + 'valid: { + if let Ok(_timeline_id) = TimelineId::from_str(last) { + // Ends in either a tenant or timeline ID + break 'valid; + } + if *last == "timelines" { + if let Some(before_last) = components.iter().nth_back(1) { + if let Ok(_tenant_id) = TenantShardId::from_str(before_last) { + // Has a valid tenant id + break 'valid; + } + } + } + + return None; + } + RemotePath::from_string(prefix).ok() +} + +fn strip_subsecond(timestamp: SystemTime) -> SystemTime { + let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string(); + humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_validate_prefix() { + assert_eq!(validate_prefix(""), None); + assert_eq!(validate_prefix("/"), None); + #[track_caller] + fn assert_valid(prefix: &str) { + let remote_path = RemotePath::from_string(prefix).unwrap(); + assert_eq!(validate_prefix(prefix), Some(remote_path)); + } + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"); + // Path is not relative but absolute + assert_eq!( + validate_prefix( + "/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/" + ), + None + ); + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/"); + // Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix + assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None); + assert_eq!(validate_prefix("wal"), None); + assert_eq!(validate_prefix("/wal/"), None); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001"); + // Partial tenant ID + assert_eq!( + validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"), + None + ); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683"); + assert_eq!(validate_prefix("pageserver/v1/tenants/"), None); + } +} diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs new file mode 100644 index 0000000000..bce3285606 --- /dev/null +++ b/pageserver/pagebench/src/cmd/aux_files.rs @@ -0,0 +1,105 @@ +use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest}; +use pageserver_api::shard::TenantShardId; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Instant; + +/// Ingest aux files into the pageserver. 
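+///
+/// Rough flow, as implemented in `main_impl` below: switch the target tenant to aux
+/// file policy V2, ingest 100 batches of 100 files under "pg_logical/mappings/", then
+/// time 100 `list_aux_files` listings of the ingested data.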
+#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long, default_value = "postgres://postgres@localhost:64000")] + page_service_connstring: String, + #[clap(long)] + pageserver_jwt: Option, + + targets: Option>, +} + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + let main_task = rt.spawn(main_impl(args)); + rt.block_on(main_task).unwrap() +} + +async fn main_impl(args: Args) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: None, + targets: { + if let Some(targets) = &args.targets { + if targets.len() != 1 { + anyhow::bail!("must specify exactly one target"); + } + Some(targets.clone()) + } else { + None + } + }, + }, + ) + .await?; + + let timeline = timelines[0]; + let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id); + let timeline_id = timeline.timeline_id; + + println!("operating on timeline {}", timeline); + + mgmt_api_client + .tenant_config(&TenantConfigRequest { + tenant_id: timeline.tenant_id, + config: TenantConfig { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + }) + .await?; + + for batch in 0..100 { + let items = (0..100) + .map(|id| { + ( + format!("pg_logical/mappings/{:03}.{:03}", batch, id), + format!("{:08}", id), + ) + }) + .collect::>(); + let file_cnt = items.len(); + mgmt_api_client + .ingest_aux_files(tenant_shard_id, timeline_id, items) + .await?; + println!("ingested {file_cnt} files"); + } + + for _ in 0..100 { + let start = Instant::now(); + let files = mgmt_api_client + .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1)) + .await?; + println!( + "{} files found in {}s", + files.len(), + start.elapsed().as_secs_f64() + ); + } + + anyhow::Ok(()) +} diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 2d61b0e252..3ae6d99aa7 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -1,4 +1,5 @@ use anyhow::Context; +use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ForceAwaitLogicalSize; use pageserver_client::page_service::BasebackupRequest; @@ -8,7 +9,7 @@ use utils::lsn::Lsn; use rand::prelude::*; use tokio::sync::Barrier; use tokio::task::JoinSet; -use tracing::{debug, info, instrument}; +use tracing::{info, instrument}; use std::collections::HashMap; use std::num::NonZeroUsize; @@ -25,8 +26,8 @@ use crate::util::{request_stats, tokio_thread_local_stats}; pub(crate) struct Args { #[clap(long, default_value = "http://localhost:9898")] mgmt_api_endpoint: String, - #[clap(long, default_value = "localhost:64000")] - page_service_host_port: String, + #[clap(long, default_value = "postgres://postgres@localhost:64000")] + page_service_connstring: String, #[clap(long)] pageserver_jwt: Option, #[clap(long, default_value = "1")] @@ -95,7 +96,7 @@ async fn main_impl( let timeline = *timeline; let info = mgmt_api_client .timeline_info( - timeline.tenant_id, + TenantShardId::unsharded(timeline.tenant_id), timeline.timeline_id, 
ForceAwaitLogicalSize::No, ) @@ -230,12 +231,9 @@ async fn client( ) { start_work_barrier.wait().await; - let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring( - &args.page_service_host_port, - args.pageserver_jwt.as_deref(), - )) - .await - .unwrap(); + let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) + .await + .unwrap(); while let Some(Work { lsn, gzip }) = work.recv().await { let start = Instant::now(); @@ -263,7 +261,7 @@ async fn client( } }) .await; - debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed)); + info!("basebackup size is {} bytes", size.load(Ordering::Relaxed)); let elapsed = start.elapsed(); live_stats.inc(); STATS.with(|stats| { diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 400b5476b7..ac4a732377 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -1,20 +1,19 @@ use anyhow::Context; use camino::Utf8PathBuf; -use futures::future::join_all; -use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key}; +use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpaceAccum; use pageserver_api::models::PagestreamGetPageRequest; +use pageserver_api::shard::TenantShardId; use tokio_util::sync::CancellationToken; use utils::id::TenantTimelineId; use utils::lsn::Lsn; use rand::prelude::*; -use tokio::sync::Barrier; use tokio::task::JoinSet; -use tracing::{info, instrument}; +use tracing::info; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::future::Future; use std::num::NonZeroUsize; use std::pin::Pin; @@ -38,8 +37,12 @@ pub(crate) struct Args { num_clients: NonZeroUsize, #[clap(long)] runtime: Option, + /// Each client sends requests at the given rate. + /// + /// If a request takes too long and we should be issuing a new request already, + /// we skip that request and account it as `MISSED`. #[clap(long)] - per_target_rate_limit: Option, + per_client_rate: Option, /// Probability for sending `latest=true` in the request (uniform distribution). #[clap(long, default_value = "1")] req_latest_probability: f64, @@ -51,18 +54,31 @@ pub(crate) struct Args { /// It doesn't get invalidated if the keyspace changes under the hood, e.g., due to new ingested data or compaction. #[clap(long)] keyspace_cache: Option, + /// Before starting the benchmark, live-reconfigure the pageserver to use the given + /// [`pageserver_api::models::virtual_file::IoEngineKind`]. + #[clap(long)] + set_io_engine: Option, + + /// Before starting the benchmark, live-reconfigure the pageserver to use specified alignment for io buffers. 
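+    ///
+    /// (The value is forwarded to the pageserver via the management API's
+    /// `put_io_alignment` call in `main_impl` below.)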
+ #[clap(long)] + set_io_alignment: Option, + targets: Option>, } #[derive(Debug, Default)] struct LiveStats { completed_requests: AtomicU64, + missed: AtomicU64, } impl LiveStats { - fn inc(&self) { + fn request_done(&self) { self.completed_requests.fetch_add(1, Ordering::Relaxed); } + fn missed(&self, n: u64) { + self.missed.fetch_add(n, Ordering::Relaxed); + } } #[derive(Clone, serde::Serialize, serde::Deserialize)] @@ -79,6 +95,12 @@ impl KeyRange { } } +#[derive(PartialEq, Eq, Hash, Copy, Clone)] +struct WorkerId { + timeline: TenantTimelineId, + num_client: usize, // from 0..args.num_clients +} + #[derive(serde::Serialize)] struct Output { total: request_stats::Output, @@ -103,6 +125,14 @@ async fn main_impl( args.pageserver_jwt.as_deref(), )); + if let Some(engine_str) = &args.set_io_engine { + mgmt_api_client.put_io_engine(engine_str).await?; + } + + if let Some(align) = args.set_io_alignment { + mgmt_api_client.put_io_alignment(align).await?; + } + // discover targets let timelines: Vec = crate::util::cli::targets::discover( &mgmt_api_client, @@ -153,7 +183,10 @@ async fn main_impl( let timeline = *timeline; async move { let partitioning = mgmt_api_client - .keyspace(timeline.tenant_id, timeline.timeline_id) + .keyspace( + TenantShardId::unsharded(timeline.tenant_id), + timeline.timeline_id, + ) .await?; let lsn = partitioning.at_lsn; let start = Instant::now(); @@ -163,7 +196,7 @@ async fn main_impl( for r in partitioning.keys.ranges.iter() { let mut i = r.start; while i != r.end { - if is_rel_block_key(&i) { + if i.is_rel_block_key() { filtered.add_key(i); } i = i.next(); @@ -206,13 +239,12 @@ async fn main_impl( let live_stats = Arc::new(LiveStats::default()); - let num_client_tasks = timelines.len(); let num_live_stats_dump = 1; - let num_work_sender_tasks = 1; + let num_work_sender_tasks = args.num_clients.get() * timelines.len(); let num_main_impl = 1; let start_work_barrier = Arc::new(tokio::sync::Barrier::new( - num_client_tasks + num_live_stats_dump + num_work_sender_tasks + num_main_impl, + num_live_stats_dump + num_work_sender_tasks + num_main_impl, )); tokio::spawn({ @@ -224,10 +256,12 @@ async fn main_impl( let start = std::time::Instant::now(); tokio::time::sleep(std::time::Duration::from_secs(1)).await; let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed); + let missed = stats.missed.swap(0, Ordering::Relaxed); let elapsed = start.elapsed(); info!( - "RPS: {:.0}", - completed_requests as f64 / elapsed.as_secs_f64() + "RPS: {:.0} MISSED: {:.0}", + completed_requests as f64 / elapsed.as_secs_f64(), + missed as f64 / elapsed.as_secs_f64() ); } } @@ -235,119 +269,109 @@ async fn main_impl( let cancel = CancellationToken::new(); - let mut work_senders: HashMap = HashMap::new(); - let mut tasks = Vec::new(); - for tl in &timelines { - let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are - work_senders.insert(*tl, sender); - tasks.push(tokio::spawn(client( - args, - *tl, - Arc::clone(&start_work_barrier), - receiver, - Arc::clone(&live_stats), - cancel.clone(), - ))); - } - - let work_sender: Pin>> = { + let rps_period = args + .per_client_rate + .map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64))); + let make_worker: &dyn Fn(WorkerId) -> Pin>> = &|worker_id| { + let live_stats = live_stats.clone(); let start_work_barrier = start_work_barrier.clone(); - let cancel = cancel.clone(); - match args.per_target_rate_limit { - None => Box::pin(async move { - let weights = 
rand::distributions::weighted::WeightedIndex::new( - all_ranges.iter().map(|v| v.len()), - ) + let ranges: Vec = all_ranges + .iter() + .filter(|r| r.timeline == worker_id.timeline) + .cloned() + .collect(); + let weights = + rand::distributions::weighted::WeightedIndex::new(ranges.iter().map(|v| v.len())) .unwrap(); - start_work_barrier.wait().await; - - while !cancel.is_cancelled() { - let (timeline, req) = { - let mut rng = rand::thread_rng(); - let r = &all_ranges[weights.sample(&mut rng)]; - let key: i128 = rng.gen_range(r.start..r.end); - let key = Key::from_i128(key); - let (rel_tag, block_no) = - key_to_rel_block(key).expect("we filter non-rel-block keys out above"); - ( - r.timeline, - PagestreamGetPageRequest { - latest: rng.gen_bool(args.req_latest_probability), - lsn: r.timeline_lsn, - rel: rel_tag, - blkno: block_no, - }, - ) - }; - let sender = work_senders.get(&timeline).unwrap(); - // TODO: what if this blocks? - if sender.send(req).await.is_err() { - assert!(cancel.is_cancelled(), "client has gone away unexpectedly"); - } - } - }), - Some(rps_limit) => Box::pin(async move { - let period = Duration::from_secs_f64(1.0 / (rps_limit as f64)); - let make_timeline_task: &dyn Fn( - TenantTimelineId, - ) - -> Pin>> = &|timeline| { - let sender = work_senders.get(&timeline).unwrap(); - let ranges: Vec = all_ranges - .iter() - .filter(|r| r.timeline == timeline) - .cloned() - .collect(); - let weights = rand::distributions::weighted::WeightedIndex::new( - ranges.iter().map(|v| v.len()), - ) + let cancel = cancel.clone(); + Box::pin(async move { + let client = + pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) + .await .unwrap(); + let mut client = client + .pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id) + .await + .unwrap(); - let cancel = cancel.clone(); - Box::pin(async move { - let mut ticker = tokio::time::interval(period); - ticker.set_missed_tick_behavior( - /* TODO review this choice */ - tokio::time::MissedTickBehavior::Burst, - ); - while !cancel.is_cancelled() { - ticker.tick().await; - let req = { - let mut rng = rand::thread_rng(); - let r = &ranges[weights.sample(&mut rng)]; - let key: i128 = rng.gen_range(r.start..r.end); - let key = Key::from_i128(key); - assert!(is_rel_block_key(&key)); - let (rel_tag, block_no) = key_to_rel_block(key) - .expect("we filter non-rel-block keys out above"); - PagestreamGetPageRequest { - latest: rng.gen_bool(args.req_latest_probability), - lsn: r.timeline_lsn, - rel: rel_tag, - blkno: block_no, - } - }; - if sender.send(req).await.is_err() { - assert!(cancel.is_cancelled(), "client has gone away unexpectedly"); - } - } - }) + start_work_barrier.wait().await; + let client_start = Instant::now(); + let mut ticks_processed = 0; + while !cancel.is_cancelled() { + // Detect if a request took longer than the RPS rate + if let Some(period) = &rps_period { + let periods_passed_until_now = + usize::try_from(client_start.elapsed().as_micros() / period.as_micros()) + .unwrap(); + + if periods_passed_until_now > ticks_processed { + live_stats.missed((periods_passed_until_now - ticks_processed) as u64); + } + ticks_processed = periods_passed_until_now; + } + + let start = Instant::now(); + let req = { + let mut rng = rand::thread_rng(); + let r = &ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = Key::from_i128(key); + assert!(key.is_rel_block_key()); + let (rel_tag, block_no) = key + .to_rel_block() + .expect("we filter non-rel-block keys out 
above"); + PagestreamGetPageRequest { + request_lsn: if rng.gen_bool(args.req_latest_probability) { + Lsn::MAX + } else { + r.timeline_lsn + }, + not_modified_since: r.timeline_lsn, + rel: rel_tag, + blkno: block_no, + } }; + client.getpage(req).await.unwrap(); + let end = Instant::now(); + live_stats.request_done(); + ticks_processed += 1; + STATS.with(|stats| { + stats + .borrow() + .lock() + .unwrap() + .observe(end.duration_since(start)) + .unwrap(); + }); - let tasks: Vec<_> = work_senders - .keys() - .map(|tl| make_timeline_task(*tl)) - .collect(); - - start_work_barrier.wait().await; - - join_all(tasks).await; - }), - } + if let Some(period) = &rps_period { + let next_at = client_start + + Duration::from_micros( + (ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(), + ); + tokio::time::sleep_until(next_at.into()).await; + } + } + }) }; - let work_sender_task = tokio::spawn(work_sender); + info!("spawning workers"); + let mut workers = JoinSet::new(); + for timeline in timelines.iter().cloned() { + for num_client in 0..args.num_clients.get() { + let worker_id = WorkerId { + timeline, + num_client, + }; + workers.spawn(make_worker(worker_id)); + } + } + let workers = async move { + while let Some(res) = workers.join_next().await { + res.unwrap(); + } + }; info!("waiting for everything to become ready"); start_work_barrier.wait().await; @@ -356,20 +380,13 @@ async fn main_impl( tokio::time::sleep(runtime.into()).await; info!("runtime over, signalling cancellation"); cancel.cancel(); - work_sender_task.await.unwrap(); + workers.await; info!("work sender exited"); } else { - work_sender_task.await.unwrap(); + workers.await; unreachable!("work sender never terminates"); } - info!("joining clients"); - for t in tasks { - t.await.unwrap(); - } - - info!("all clients stopped"); - let output = Output { total: { let mut agg_stats = request_stats::Stats::new(); @@ -386,45 +403,3 @@ async fn main_impl( anyhow::Ok(()) } - -#[instrument(skip_all)] -async fn client( - args: &'static Args, - timeline: TenantTimelineId, - start_work_barrier: Arc, - mut work: tokio::sync::mpsc::Receiver, - live_stats: Arc, - cancel: CancellationToken, -) { - let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) - .await - .unwrap(); - let mut client = client - .pagestream(timeline.tenant_id, timeline.timeline_id) - .await - .unwrap(); - - let do_requests = async { - start_work_barrier.wait().await; - while let Some(req) = work.recv().await { - let start = Instant::now(); - client - .getpage(req) - .await - .with_context(|| format!("getpage for {timeline}")) - .unwrap(); - let elapsed = start.elapsed(); - live_stats.inc(); - STATS.with(|stats| { - stats.borrow().lock().unwrap().observe(elapsed).unwrap(); - }); - } - }; - tokio::select! 
{ - res = do_requests => { res }, - _ = cancel.cancelled() => { - // fallthrough to shutdown - } - } - client.shutdown().await; -} diff --git a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs new file mode 100644 index 0000000000..1bb71b9353 --- /dev/null +++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs @@ -0,0 +1,333 @@ +use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId}; + +use pageserver_client::mgmt_api; +use rand::seq::SliceRandom; +use tokio_util::sync::CancellationToken; +use tracing::{debug, info}; +use utils::id::{TenantTimelineId, TimelineId}; + +use std::{f64, sync::Arc}; +use tokio::{ + sync::{mpsc, OwnedSemaphorePermit}, + task::JoinSet, +}; + +use std::{ + num::NonZeroUsize, + sync::atomic::{AtomicU64, Ordering}, + time::{Duration, Instant}, +}; + +/// Evict & on-demand download random layers. +#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap(long)] + runtime: Option, + #[clap(long, default_value = "1")] + tasks_per_target: NonZeroUsize, + #[clap(long, default_value = "1")] + concurrency_per_target: NonZeroUsize, + /// Probability for sending `latest=true` in the request (uniform distribution). + #[clap(long)] + limit_to_first_n_targets: Option, + /// Before starting the benchmark, live-reconfigure the pageserver to use the given + /// [`pageserver_api::models::virtual_file::IoEngineKind`]. + #[clap(long)] + set_io_engine: Option, + targets: Option>, +} + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; + let task = rt.spawn(main_impl(args)); + rt.block_on(task).unwrap().unwrap(); + Ok(()) +} + +#[derive(serde::Serialize)] +struct Output { + downloads_count: u64, + downloads_bytes: u64, + evictions_count: u64, + timeline_restarts: u64, + #[serde(with = "humantime_serde")] + runtime: Duration, +} + +#[derive(Debug, Default)] +struct LiveStats { + evictions_count: AtomicU64, + downloads_count: AtomicU64, + downloads_bytes: AtomicU64, + timeline_restarts: AtomicU64, +} + +impl LiveStats { + fn eviction_done(&self) { + self.evictions_count.fetch_add(1, Ordering::Relaxed); + } + fn download_done(&self, size: u64) { + self.downloads_count.fetch_add(1, Ordering::Relaxed); + self.downloads_bytes.fetch_add(size, Ordering::Relaxed); + } + fn timeline_restart_done(&self) { + self.timeline_restarts.fetch_add(1, Ordering::Relaxed); + } +} + +async fn main_impl(args: Args) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + if let Some(engine_str) = &args.set_io_engine { + mgmt_api_client.put_io_engine(engine_str).await?; + } + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + + let token = CancellationToken::new(); + let mut tasks = JoinSet::new(); + + let periodic_stats = Arc::new(LiveStats::default()); + let total_stats = Arc::new(LiveStats::default()); + + let start = Instant::now(); + tasks.spawn({ + let periodic_stats = Arc::clone(&periodic_stats); + let total_stats = 
Arc::clone(&total_stats); + let cloned_token = token.clone(); + async move { + let mut last_at = Instant::now(); + loop { + if cloned_token.is_cancelled() { + return; + } + tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await; + let now = Instant::now(); + let delta: Duration = now - last_at; + last_at = now; + + let LiveStats { + evictions_count, + downloads_count, + downloads_bytes, + timeline_restarts, + } = &*periodic_stats; + let evictions_count = evictions_count.swap(0, Ordering::Relaxed); + let downloads_count = downloads_count.swap(0, Ordering::Relaxed); + let downloads_bytes = downloads_bytes.swap(0, Ordering::Relaxed); + let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed); + + total_stats.evictions_count.fetch_add(evictions_count, Ordering::Relaxed); + total_stats.downloads_count.fetch_add(downloads_count, Ordering::Relaxed); + total_stats.downloads_bytes.fetch_add(downloads_bytes, Ordering::Relaxed); + total_stats.timeline_restarts.fetch_add(timeline_restarts, Ordering::Relaxed); + + let evictions_per_s = evictions_count as f64 / delta.as_secs_f64(); + let downloads_per_s = downloads_count as f64 / delta.as_secs_f64(); + let downloads_mibs_per_s = downloads_bytes as f64 / delta.as_secs_f64() / ((1 << 20) as f64); + + info!("evictions={evictions_per_s:.2}/s downloads={downloads_per_s:.2}/s download_bytes={downloads_mibs_per_s:.2}MiB/s timeline_restarts={timeline_restarts}"); + } + } + }); + + for tl in timelines { + for _ in 0..args.tasks_per_target.get() { + tasks.spawn(timeline_actor( + args, + Arc::clone(&mgmt_api_client), + tl, + Arc::clone(&periodic_stats), + token.clone(), + )); + } + } + if let Some(runtime) = args.runtime { + tokio::spawn(async move { + tokio::time::sleep(runtime.into()).await; + token.cancel(); + }); + } + + while let Some(res) = tasks.join_next().await { + res.unwrap(); + } + let end = Instant::now(); + let duration: Duration = end - start; + + let output = { + let LiveStats { + evictions_count, + downloads_count, + downloads_bytes, + timeline_restarts, + } = &*total_stats; + Output { + downloads_count: downloads_count.load(Ordering::Relaxed), + downloads_bytes: downloads_bytes.load(Ordering::Relaxed), + evictions_count: evictions_count.load(Ordering::Relaxed), + timeline_restarts: timeline_restarts.load(Ordering::Relaxed), + runtime: duration, + } + }; + let output = serde_json::to_string_pretty(&output).unwrap(); + println!("{output}"); + + Ok(()) +} + +async fn timeline_actor( + args: &'static Args, + mgmt_api_client: Arc, + timeline: TenantTimelineId, + live_stats: Arc, + token: CancellationToken, +) { + // TODO: support sharding + let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id); + + struct Timeline { + joinset: JoinSet<()>, + layers: Vec>, + concurrency: Arc, + } + while !token.is_cancelled() { + debug!("restarting timeline"); + let layer_map_info = mgmt_api_client + .layer_map_info(tenant_shard_id, timeline.timeline_id) + .await + .unwrap(); + let concurrency = Arc::new(tokio::sync::Semaphore::new( + args.concurrency_per_target.get(), + )); + + let mut joinset = JoinSet::new(); + let layers = layer_map_info + .historic_layers + .into_iter() + .map(|historic_layer| { + let (tx, rx) = mpsc::channel(1); + joinset.spawn(layer_actor( + tenant_shard_id, + timeline.timeline_id, + historic_layer, + rx, + Arc::clone(&mgmt_api_client), + Arc::clone(&live_stats), + )); + tx + }) + .collect::>(); + + let mut timeline = Timeline { + joinset, + layers, + concurrency, + }; + + 
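+        // Concurrency sketch of what follows: every layer gets a dedicated actor behind
+        // a one-slot channel, and the loop below hands a semaphore permit to a randomly
+        // chosen layer actor, so at most `concurrency_per_target` evictions/downloads
+        // are in flight at a time.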
live_stats.timeline_restart_done(); + + while !token.is_cancelled() { + assert!(!timeline.joinset.is_empty()); + if let Some(res) = timeline.joinset.try_join_next() { + debug!(?res, "a layer actor exited, should not happen"); + timeline.joinset.shutdown().await; + break; + } + + let mut permit = Some( + Arc::clone(&timeline.concurrency) + .acquire_owned() + .await + .unwrap(), + ); + + loop { + let layer_tx = { + let mut rng = rand::thread_rng(); + timeline.layers.choose_mut(&mut rng).expect("no layers") + }; + match layer_tx.try_send(permit.take().unwrap()) { + Ok(_) => break, + Err(e) => match e { + mpsc::error::TrySendError::Full(back) => { + // TODO: retrying introduces bias away from slow downloaders + permit.replace(back); + } + mpsc::error::TrySendError::Closed(_) => panic!(), + }, + } + } + } + } +} + +async fn layer_actor( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + mut layer: HistoricLayerInfo, + mut rx: mpsc::Receiver, + mgmt_api_client: Arc, + live_stats: Arc, +) { + #[derive(Clone, Copy)] + enum Action { + Evict, + OnDemandDownload, + } + + while let Some(_permit) = rx.recv().await { + let action = if layer.is_remote() { + Action::OnDemandDownload + } else { + Action::Evict + }; + + let did_it = match action { + Action::Evict => { + let did_it = mgmt_api_client + .layer_evict(tenant_shard_id, timeline_id, layer.layer_file_name()) + .await + .unwrap(); + live_stats.eviction_done(); + did_it + } + Action::OnDemandDownload => { + let did_it = mgmt_api_client + .layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name()) + .await + .unwrap(); + live_stats.download_done(layer.layer_file_size()); + did_it + } + }; + if !did_it { + debug!("local copy of layer map appears out of sync, re-downloading"); + return; + } + debug!("did it"); + layer.set_remote(match action { + Action::Evict => true, + Action::OnDemandDownload => false, + }); + } +} diff --git a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs index 98938d780a..f07beeecfd 100644 --- a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs +++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use humantime::Duration; +use pageserver_api::shard::TenantShardId; use tokio::task::JoinSet; use utils::id::TenantTimelineId; @@ -59,7 +60,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { let mgmt_api_client = Arc::clone(&mgmt_api_client); js.spawn(async move { let info = mgmt_api_client - .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes) + .timeline_info( + TenantShardId::unsharded(tl.tenant_id), + tl.timeline_id, + ForceAwaitLogicalSize::Yes, + ) .await .unwrap(); @@ -74,7 +79,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { while !info.current_logical_size_is_accurate { ticker.tick().await; info = mgmt_api_client - .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes) + .timeline_info( + TenantShardId::unsharded(tl.tenant_id), + tl.timeline_id, + ForceAwaitLogicalSize::Yes, + ) .await .unwrap(); } diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs index 9fa77f0671..5527557450 100644 --- a/pageserver/pagebench/src/main.rs +++ b/pageserver/pagebench/src/main.rs @@ -3,7 +3,6 @@ use utils::logging; /// Re-usable pieces of code that aren't CLI-specific. 
 mod util {
-    pub(crate) mod connstring;
     pub(crate) mod request_stats;
     #[macro_use]
     pub(crate) mod tokio_thread_local_stats;
@@ -15,8 +14,10 @@ mod util {
 
 /// The pagebench CLI sub-commands, dispatched in [`main`] below.
 mod cmd {
+    pub(super) mod aux_files;
     pub(super) mod basebackup;
     pub(super) mod getpage_latest_lsn;
+    pub(super) mod ondemand_download_churn;
     pub(super) mod trigger_initial_size_calculation;
 }
 
@@ -26,6 +27,8 @@ enum Args {
     Basebackup(cmd::basebackup::Args),
     GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
     TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
+    OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
+    AuxFiles(cmd::aux_files::Args),
 }
 
 fn main() {
@@ -44,6 +47,8 @@ fn main() {
         Args::TriggerInitialSizeCalculation(args) => {
            cmd::trigger_initial_size_calculation::main(args)
        }
+        Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
+        Args::AuxFiles(args) => cmd::aux_files::main(args),
    }
    .unwrap()
}
diff --git a/pageserver/pagebench/src/util/connstring.rs b/pageserver/pagebench/src/util/connstring.rs
deleted file mode 100644
index 07a0ff042d..0000000000
--- a/pageserver/pagebench/src/util/connstring.rs
+++ /dev/null
@@ -1,8 +0,0 @@
-pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String {
-    let colon_and_jwt = if let Some(jwt) = jwt {
-        format!(":{jwt}") // TODO: urlescape
-    } else {
-        String::new()
-    };
-    format!("postgres://postgres{colon_and_jwt}@{host_port}")
-}
diff --git a/pageserver/pagebench/src/util/request_stats.rs b/pageserver/pagebench/src/util/request_stats.rs
index 5ecf1cbf24..4aa6950782 100644
--- a/pageserver/pagebench/src/util/request_stats.rs
+++ b/pageserver/pagebench/src/util/request_stats.rs
@@ -66,13 +66,10 @@ impl serde::Serialize for LatencyPercentiles {
     {
         use serde::ser::SerializeMap;
         let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?;
-        for p in LATENCY_PERCENTILES {
+        for (p, v) in LATENCY_PERCENTILES.iter().zip(&self.latency_percentiles) {
             ser.serialize_entry(
                 &format!("p{p}"),
-                &format!(
-                    "{}",
-                    &humantime::format_duration(self.latency_percentiles[0])
-                ),
+                &format!("{}", humantime::format_duration(*v)),
             )?;
         }
         ser.end()
diff --git a/pageserver/src/assert_u64_eq_usize.rs b/pageserver/src/assert_u64_eq_usize.rs
new file mode 100644
index 0000000000..66ca7fd057
--- /dev/null
+++ b/pageserver/src/assert_u64_eq_usize.rs
@@ -0,0 +1,39 @@
+//! `u64` and `usize` aren't guaranteed to be identical in Rust, but life is much simpler if that's the case.
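+//!
+//! Usage sketch with the helpers defined below:
+//! `let n: usize = 42u64.into_usize();` and `let m: u64 = n.into_u64();`, both backed
+//! by the compile-time size check in `_ASSERT_U64_EQ_USIZE`.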
+ +pub(crate) const _ASSERT_U64_EQ_USIZE: () = { + if std::mem::size_of::() != std::mem::size_of::() { + panic!("the traits defined in this module assume that usize and u64 can be converted to each other without loss of information"); + } +}; + +pub(crate) trait U64IsUsize { + fn into_usize(self) -> usize; +} + +impl U64IsUsize for u64 { + #[inline(always)] + fn into_usize(self) -> usize { + #[allow(clippy::let_unit_value)] + let _ = _ASSERT_U64_EQ_USIZE; + self as usize + } +} + +pub(crate) trait UsizeIsU64 { + fn into_u64(self) -> u64; +} + +impl UsizeIsU64 for usize { + #[inline(always)] + fn into_u64(self) -> u64 { + #[allow(clippy::let_unit_value)] + let _ = _ASSERT_U64_EQ_USIZE; + self as u64 + } +} + +pub const fn u64_to_usize(x: u64) -> usize { + #[allow(clippy::let_unit_value)] + let _ = _ASSERT_U64_EQ_USIZE; + x as usize +} diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs index 2cb661863d..9e3dedb75a 100644 --- a/pageserver/src/auth.rs +++ b/pageserver/src/auth.rs @@ -14,8 +14,14 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope - (Scope::SafekeeperData, _) => Err(AuthError( - "SafekeeperData scope makes no sense for Pageserver".into(), - )), + (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Scrubber, _) => { + Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Pageserver auth", + claims.scope + ) + .into(), + )) + } } } diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs new file mode 100644 index 0000000000..5e527b7d61 --- /dev/null +++ b/pageserver/src/aux_file.rs @@ -0,0 +1,286 @@ +use std::sync::Arc; + +use ::metrics::IntGauge; +use bytes::{Buf, BufMut, Bytes}; +use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; +use tracing::warn; + +// BEGIN Copyright (c) 2017 Servo Contributors + +/// Const version of FNV hash. +#[inline] +#[must_use] +pub const fn fnv_hash(bytes: &[u8]) -> u128 { + const INITIAL_STATE: u128 = 0x6c62272e07bb014262b821756295c58d; + const PRIME: u128 = 0x0000000001000000000000000000013B; + + let mut hash = INITIAL_STATE; + let mut i = 0; + while i < bytes.len() { + hash ^= bytes[i] as u128; + hash = hash.wrapping_mul(PRIME); + i += 1; + } + hash +} + +// END Copyright (c) 2017 Servo Contributors + +/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, least significant 13B of FNV hash]. +fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key { + let mut key: [u8; 16] = [0; METADATA_KEY_SIZE]; + let hash = fnv_hash(data).to_be_bytes(); + key[0] = AUX_KEY_PREFIX; + key[1] = dir_level1; + key[2] = dir_level2; + key[3..16].copy_from_slice(&hash[3..16]); + Key::from_metadata_key_fixed_size(&key) +} + +const AUX_DIR_PG_LOGICAL: u8 = 0x01; +const AUX_DIR_PG_REPLSLOT: u8 = 0x02; +const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; + +/// Encode the aux file into a fixed-size key. +/// +/// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type. +/// We have one-to-one mapping for each of the aux file that we support. We hash the remaining part of the path +/// (usually a single file name, or several components) into 13-byte hash. The way we determine the 2-byte prefix +/// is roughly based on the first two components of the path, one unique number for one component. 
+/// +/// * pg_logical/mappings -> 0x0101 +/// * pg_logical/snapshots -> 0x0102 +/// * pg_logical/replorigin_checkpoint -> 0x0103 +/// * pg_logical/others -> 0x01FF +/// * pg_replslot/ -> 0x0201 +/// * others -> 0xFFFF +/// +/// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`. +/// The new file type must have never been written to the storage before. Otherwise, there could be data +/// corruptions as the new file belongs to a new prefix but it might have been stored under the `others` prefix. +pub fn encode_aux_file_key(path: &str) -> Key { + if let Some(fname) = path.strip_prefix("pg_logical/mappings/") { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes()) + } else if path == "pg_logical/replorigin_checkpoint" { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"") + } else if let Some(fname) = path.strip_prefix("pg_logical/") { + if cfg!(debug_assertions) { + warn!( + "unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning", + path + ); + } + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_replslot/") { + aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes()) + } else { + if cfg!(debug_assertions) { + warn!( + "unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning", + path + ); + } + aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes()) + } +} + +const AUX_FILE_ENCODING_VERSION: u8 = 0x01; + +pub fn decode_file_value(val: &[u8]) -> anyhow::Result> { + let mut ptr = val; + if ptr.is_empty() { + // empty value = no files + return Ok(Vec::new()); + } + assert_eq!( + ptr.get_u8(), + AUX_FILE_ENCODING_VERSION, + "unsupported aux file value" + ); + let mut files = vec![]; + while ptr.has_remaining() { + let key_len = ptr.get_u32() as usize; + let key = &ptr[..key_len]; + ptr.advance(key_len); + let val_len = ptr.get_u32() as usize; + let content = &ptr[..val_len]; + ptr.advance(val_len); + + let path = std::str::from_utf8(key)?; + files.push((path, content)); + } + Ok(files) +} + +/// Decode an aux file key-value pair into a list of files. The returned `Bytes` contains reference +/// to the original value slice. Be cautious about memory consumption. 
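+///
+/// Value layout reminder (see `encode_file_value` below): one version byte (0x01)
+/// followed by repeated `(u32 path_len, path, u32 content_len, content)` records;
+/// an empty value encodes "no files".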
+/// Decode an aux file key-value pair into a list of files. The returned `Bytes` contains a reference
+/// to the original value slice. Be cautious about memory consumption.
+pub fn decode_file_value_bytes(val: &Bytes) -> anyhow::Result<Vec<(String, Bytes)>> {
+    let mut ptr = val.clone();
+    if ptr.is_empty() {
+        // empty value = no files
+        return Ok(Vec::new());
+    }
+    assert_eq!(
+        ptr.get_u8(),
+        AUX_FILE_ENCODING_VERSION,
+        "unsupported aux file value"
+    );
+    let mut files = vec![];
+    while ptr.has_remaining() {
+        let key_len = ptr.get_u32() as usize;
+        let key = ptr.slice(..key_len);
+        ptr.advance(key_len);
+        let val_len = ptr.get_u32() as usize;
+        let content = ptr.slice(..val_len);
+        ptr.advance(val_len);
+
+        let path = std::str::from_utf8(&key)?.to_string();
+        files.push((path, content));
+    }
+    Ok(files)
+}
+
+pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result<Vec<u8>> {
+    if files.is_empty() {
+        // no files = empty value
+        return Ok(Vec::new());
+    }
+    let mut encoded = vec![];
+    encoded.put_u8(AUX_FILE_ENCODING_VERSION);
+    for (path, content) in files {
+        if path.len() > u32::MAX as usize {
+            anyhow::bail!("{} exceeds path size limit", path);
+        }
+        encoded.put_u32(path.len() as u32);
+        encoded.put_slice(path.as_bytes());
+        if content.len() > u32::MAX as usize {
+            anyhow::bail!("{} exceeds content size limit", path);
+        }
+        encoded.put_u32(content.len() as u32);
+        encoded.put_slice(content);
+    }
+    Ok(encoded)
+}
+
+/// An estimation of the size of aux files.
+pub struct AuxFileSizeEstimator {
+    aux_file_size_gauge: IntGauge,
+    size: Arc<std::sync::Mutex<Option<isize>>>,
+}
+
+impl AuxFileSizeEstimator {
+    pub fn new(aux_file_size_gauge: IntGauge) -> Self {
+        Self {
+            aux_file_size_gauge,
+            size: Arc::new(std::sync::Mutex::new(None)),
+        }
+    }
+
+    /// When generating base backup or doing initial logical size calculation
+    pub fn on_initial(&self, new_size: usize) {
+        let mut guard = self.size.lock().unwrap();
+        *guard = Some(new_size as isize);
+        self.report(new_size as isize);
+    }
+
+    pub fn on_add(&self, file_size: usize) {
+        let mut guard = self.size.lock().unwrap();
+        if let Some(size) = &mut *guard {
+            *size += file_size as isize;
+            self.report(*size);
+        }
+    }
+
+    pub fn on_remove(&self, file_size: usize) {
+        let mut guard = self.size.lock().unwrap();
+        if let Some(size) = &mut *guard {
+            *size -= file_size as isize;
+            self.report(*size);
+        }
+    }
+
+    pub fn on_update(&self, old_size: usize, new_size: usize) {
+        let mut guard = self.size.lock().unwrap();
+        if let Some(size) = &mut *guard {
+            *size += new_size as isize - old_size as isize;
+            self.report(*size);
+        }
+    }
+
+    pub fn report(&self, size: isize) {
+        self.aux_file_size_gauge.set(size as i64);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_hash_portable() {
+        // AUX file encoding requires the hash to be portable across all platforms. This test case checks
+        // if the algorithm produces the same hash across different environments.
+
+        assert_eq!(
+            265160408618497461376862998434862070044,
+            super::fnv_hash("test1".as_bytes())
+        );
+        assert_eq!(
+            295486155126299629456360817749600553988,
+            super::fnv_hash("test/test2".as_bytes())
+        );
+        assert_eq!(
+            144066263297769815596495629667062367629,
+            super::fnv_hash("".as_bytes())
+        );
+    }
+
+    #[test]
+    fn test_encoding_portable() {
+        // To correctly retrieve AUX files, the generated keys for the same file must be the same for all versions
+        // of the page server.
+ assert_eq!( + "62000001017F8B83D94F7081693471ABF91C", + encode_aux_file_key("pg_logical/mappings/test1").to_string(), + ); + assert_eq!( + "62000001027F8E83D94F7081693471ABFCCD", + encode_aux_file_key("pg_logical/snapshots/test2").to_string(), + ); + assert_eq!( + "62000001032E07BB014262B821756295C58D", + encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string(), + ); + assert_eq!( + "62000001FF4F38E1C74754E7D03C1A660178", + encode_aux_file_key("pg_logical/unsupported").to_string(), + ); + assert_eq!( + "62000002017F8D83D94F7081693471ABFB92", + encode_aux_file_key("pg_replslot/test3").to_string() + ); + assert_eq!( + "620000FFFF2B6ECC8AEF93F643DC44F15E03", + encode_aux_file_key("other_file_not_supported").to_string(), + ); + } + + #[test] + fn test_value_encoding() { + let files = vec![ + ("pg_logical/1.file", "1111".as_bytes()), + ("pg_logical/2.file", "2222".as_bytes()), + ]; + assert_eq!( + files, + decode_file_value(&encode_file_value(&files).unwrap()).unwrap() + ); + let files = vec![]; + assert_eq!( + files, + decode_file_value(&encode_file_value(&files).unwrap()).unwrap() + ); + } +} diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 009deff0aa..207f781e1b 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,10 +10,10 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! -use anyhow::{anyhow, bail, ensure, Context}; +use anyhow::{anyhow, Context}; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; -use pageserver_api::key::{key_to_slru_block, Key}; +use pageserver_api::key::Key; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::SystemTime; @@ -38,6 +38,14 @@ use postgres_ffi::PG_TLI; use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; +#[derive(Debug, thiserror::Error)] +pub enum BasebackupError { + #[error("basebackup pageserver error {0:#}")] + Server(#[from] anyhow::Error), + #[error("basebackup client error {0:#}")] + Client(#[source] io::Error), +} + /// Create basebackup with non-rel data in it. /// Only include relational data if 'full_backup' is true. 
 ///
@@ -53,7 +61,7 @@ pub async fn send_basebackup_tarball<'a, W>(
     prev_lsn: Option<Lsn>,
     full_backup: bool,
     ctx: &'a RequestContext,
-) -> anyhow::Result<()>
+) -> Result<(), BasebackupError>
 where
     W: AsyncWrite + Send + Sync + Unpin,
 {
@@ -92,8 +100,10 @@ where
     // Consolidate the derived and the provided prev_lsn values
     let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
-        if backup_prev != Lsn(0) {
-            ensure!(backup_prev == provided_prev_lsn);
+        if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn {
+            return Err(BasebackupError::Server(anyhow!(
+                "backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}"
+            )));
         }
         provided_prev_lsn
     } else {
@@ -143,6 +153,7 @@ where
     ar: &'a mut Builder<&'b mut W>,
     buf: Vec<u8>,
     current_segment: Option<(SlruKind, u32)>,
+    total_blocks: usize,
 }
 
 impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W>
@@ -154,18 +165,30 @@ where
             ar,
             buf: Vec::new(),
             current_segment: None,
+            total_blocks: 0,
         }
     }
 
-    async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> {
-        let (kind, segno, _) = key_to_slru_block(*key)?;
+    async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> {
+        let (kind, segno, _) = key.to_slru_block()?;
 
         match kind {
             SlruKind::Clog => {
-                ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8);
+                if !(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8) {
+                    return Err(BasebackupError::Server(anyhow!(
+                        "invalid SlruKind::Clog record: block.len()={}",
+                        block.len()
+                    )));
+                }
             }
             SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => {
-                ensure!(block.len() == BLCKSZ as usize);
+                if block.len() != BLCKSZ as usize {
+                    return Err(BasebackupError::Server(anyhow!(
+                        "invalid {:?} record: block.len()={}",
+                        kind,
+                        block.len()
+                    )));
+                }
             }
         }
 
@@ -192,26 +215,34 @@ where
         Ok(())
     }
 
-    async fn flush(&mut self) -> anyhow::Result<()> {
+    async fn flush(&mut self) -> Result<(), BasebackupError> {
         let nblocks = self.buf.len() / BLCKSZ as usize;
         let (kind, segno) = self.current_segment.take().unwrap();
         let segname = format!("{}/{:>04X}", kind.to_str(), segno);
         let header = new_tar_header(&segname, self.buf.len() as u64)?;
-        self.ar.append(&header, self.buf.as_slice()).await?;
+        self.ar
+            .append(&header, self.buf.as_slice())
+            .await
+            .map_err(BasebackupError::Client)?;
 
-        trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
+        self.total_blocks += nblocks;
+        debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
 
         self.buf.clear();
 
         Ok(())
     }
 
-    async fn finish(mut self) -> anyhow::Result<()> {
-        if self.current_segment.is_none() || self.buf.is_empty() {
-            return Ok(());
-        }
+    async fn finish(mut self) -> Result<(), BasebackupError> {
+        let res = if self.current_segment.is_none() || self.buf.is_empty() {
+            Ok(())
+        } else {
+            self.flush().await
+        };
 
-        self.flush().await
+        info!("Collected {} SLRU blocks", self.total_blocks);
+
+        res
     }
 }
 
@@ -219,9 +250,11 @@ impl<'a, W> Basebackup<'a, W>
 where
     W: AsyncWrite + Send + Sync + Unpin,
 {
-    async fn send_tarball(mut self) -> anyhow::Result<()> {
+    async fn send_tarball(mut self) -> Result<(), BasebackupError> {
         // TODO include checksum
 
+        let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
+
         // Create pgdata subdirs structure
         for dir in PGDATA_SUBDIRS.iter() {
             let header = new_tar_header_dir(dir)?;
@@ -248,33 +281,42 @@ where
                     .context("could not add config file to basebackup tarball")?;
             }
         }
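One pattern worth highlighting in this refactor: every `ensure!`/`?` site is being reclassified as either a pageserver-side failure (`Server`) or a failed write to the client connection (`Client`), so callers can log and meter the two differently. A compact sketch of the same split, using the enum from the diff plus an illustrative `send` helper that is not in the diff (requires the `anyhow` and `thiserror` crates):

```rust
use std::io;

#[derive(Debug, thiserror::Error)]
enum BasebackupError {
    /// The pageserver failed to produce the data.
    #[error("basebackup pageserver error {0:#}")]
    Server(#[from] anyhow::Error),
    /// The client connection failed while we were writing to it.
    #[error("basebackup client error {0:#}")]
    Client(#[source] io::Error),
}

fn send(data: &[u8], mut out: impl io::Write) -> Result<(), BasebackupError> {
    if data.is_empty() {
        // Bad input is attributed to the server side...
        return Err(BasebackupError::Server(anyhow::anyhow!("empty payload")));
    }
    // ...while a failed write is attributed to the client connection.
    out.write_all(data).map_err(BasebackupError::Client)
}

fn main() {
    let mut sink = Vec::new();
    assert!(send(b"hello", &mut sink).is_ok());
    assert!(matches!(send(b"", &mut sink), Err(BasebackupError::Server(_))));
}
```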
-
-        // Gather non-relational files from object storage pages.
-        let slru_partitions = self
-            .timeline
-            .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
-            .await?
-            .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);
-
-        let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
-
-        for part in slru_partitions.parts {
-            let blocks = self
+        if !lazy_slru_download {
+            // Gather non-relational files from object storage pages.
+            let slru_partitions = self
                 .timeline
-                .get_vectored(&part.ranges, self.lsn, self.ctx)
-                .await?;
+                .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
+                .await
+                .map_err(|e| BasebackupError::Server(e.into()))?
+                .partition(
+                    self.timeline.get_shard_identity(),
+                    Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
+                );
 
-            for (key, block) in blocks {
-                slru_builder.add_block(&key, block?).await?;
+            let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
+
+            for part in slru_partitions.parts {
+                let blocks = self
+                    .timeline
+                    .get_vectored(part, self.lsn, self.ctx)
+                    .await
+                    .map_err(|e| BasebackupError::Server(e.into()))?;
+
+                for (key, block) in blocks {
+                    let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
+                    slru_builder.add_block(&key, block).await?;
+                }
             }
+            slru_builder.finish().await?;
         }
-        slru_builder.finish().await?;
-
         let mut min_restart_lsn: Lsn = Lsn::MAX;
         // Create tablespace directories
-        for ((spcnode, dbnode), has_relmap_file) in
-            self.timeline.list_dbdirs(self.lsn, self.ctx).await?
+        for ((spcnode, dbnode), has_relmap_file) in self
+            .timeline
+            .list_dbdirs(self.lsn, self.ctx)
+            .await
+            .map_err(|e| BasebackupError::Server(e.into()))?
         {
             self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
 
@@ -283,7 +325,8 @@ where
             let rels = self
                 .timeline
                 .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                .await?;
+                .await
+                .map_err(|e| BasebackupError::Server(e.into()))?;
             for &rel in rels.iter() {
                 // Send init fork as main fork to provide well formed empty
                 // contents of UNLOGGED relations. Postgres copies it in
@@ -305,23 +348,36 @@ where
                     self.add_rel(rel, rel).await?;
                 }
             }
-
-            for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? {
-                if path.starts_with("pg_replslot") {
-                    let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
-                    let restart_lsn = Lsn(u64::from_le_bytes(
-                        content[offs..offs + 8].try_into().unwrap(),
-                    ));
-                    info!("Replication slot {} restart LSN={}", path, restart_lsn);
-                    min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
-                }
-                let header = new_tar_header(&path, content.len() as u64)?;
-                self.ar
-                    .append(&header, &*content)
-                    .await
-                    .context("could not add aux file to basebackup tarball")?;
-            }
         }
+
+        for (path, content) in self
+            .timeline
+            .list_aux_files(self.lsn, self.ctx)
+            .await
+            .map_err(|e| BasebackupError::Server(e.into()))?
+        {
+            if path.starts_with("pg_replslot") {
+                let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
+                let restart_lsn = Lsn(u64::from_le_bytes(
+                    content[offs..offs + 8].try_into().unwrap(),
+                ));
+                info!("Replication slot {} restart LSN={}", path, restart_lsn);
+                min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
+            } else if path == "pg_logical/replorigin_checkpoint" {
+                // replorigin_checkpoint is written only on compute shutdown, so it contains
+                // deteriorated values. So we generate our own version of this file for the particular LSN
+                // based on information about replorigins extracted from transaction commit records.
+ // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, + // but now we should handle (skip) it for backward compatibility. + continue; + } + let header = new_tar_header(&path, content.len() as u64)?; + self.ar + .append(&header, &*content) + .await + .context("could not add aux file to basebackup tarball")?; + } + if min_restart_lsn != Lsn::MAX { info!( "Min restart LSN for logical replication is {}", @@ -337,34 +393,67 @@ where for xid in self .timeline .list_twophase_files(self.lsn, self.ctx) - .await? + .await + .map_err(|e| BasebackupError::Server(e.into()))? { self.add_twophase_file(xid).await?; } + let repl_origins = self + .timeline + .get_replorigins(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; + let n_origins = repl_origins.len(); + if n_origins != 0 { + // + // Construct "pg_logical/replorigin_checkpoint" file based on information about replication origins + // extracted from transaction commit record. We are using this file to pass information about replication + // origins to compute to allow logical replication to restart from proper point. + // + let mut content = Vec::with_capacity(n_origins * 16 + 8); + content.extend_from_slice(&pg_constants::REPLICATION_STATE_MAGIC.to_le_bytes()); + for (origin_id, origin_lsn) in repl_origins { + content.extend_from_slice(&origin_id.to_le_bytes()); + content.extend_from_slice(&[0u8; 6]); // align to 8 bytes + content.extend_from_slice(&origin_lsn.0.to_le_bytes()); + } + let crc32 = crc32c::crc32c(&content); + content.extend_from_slice(&crc32.to_le_bytes()); + let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?; + self.ar.append(&header, &*content).await.context( + "could not add pg_logical/replorigin_checkpoint file to basebackup tarball", + )?; + } fail_point!("basebackup-before-control-file", |_| { - bail!("failpoint basebackup-before-control-file") + Err(BasebackupError::Server(anyhow!( + "failpoint basebackup-before-control-file" + ))) }); // Generate pg_control and bootstrap WAL segment. self.add_pgcontrol_file().await?; - self.ar.finish().await?; + self.ar.finish().await.map_err(BasebackupError::Client)?; debug!("all tarred up!"); Ok(()) } /// Add contents of relfilenode `src`, naming it as `dst`. 
- async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> { + async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> { let nblocks = self .timeline - .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx) - .await?; + .get_rel_size(src, Version::Lsn(self.lsn), self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; // If the relation is empty, create an empty file if nblocks == 0 { let file_name = dst.to_segfile_name(0); let header = new_tar_header(&file_name, 0)?; - self.ar.append(&header, &mut io::empty()).await?; + self.ar + .append(&header, &mut io::empty()) + .await + .map_err(BasebackupError::Client)?; return Ok(()); } @@ -378,14 +467,18 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx) - .await?; + .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; segment_data.extend_from_slice(&img[..]); } let file_name = dst.to_segfile_name(seg as u32); let header = new_tar_header(&file_name, segment_data.len() as u64)?; - self.ar.append(&header, segment_data.as_slice()).await?; + self.ar + .append(&header, segment_data.as_slice()) + .await + .map_err(BasebackupError::Client)?; seg += 1; startblk = endblk; @@ -405,20 +498,22 @@ where spcnode: u32, dbnode: u32, has_relmap_file: bool, - ) -> anyhow::Result<()> { + ) -> Result<(), BasebackupError> { let relmap_img = if has_relmap_file { let img = self .timeline .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; - ensure!( - img.len() - == dispatch_pgversion!( - self.timeline.pg_version, - pgv::bindings::SIZEOF_RELMAPFILE - ) - ); + if img.len() + != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE) + { + return Err(BasebackupError::Server(anyhow!( + "img.len() != SIZE_OF_RELMAPFILE, img.len()={}", + img.len(), + ))); + } Some(img) } else { @@ -431,14 +526,20 @@ where ver => format!("{ver}\x0A"), }; let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?; - self.ar.append(&header, pg_version_str.as_bytes()).await?; + self.ar + .append(&header, pg_version_str.as_bytes()) + .await + .map_err(BasebackupError::Client)?; info!("timeline.pg_version {}", self.timeline.pg_version); if let Some(img) = relmap_img { // filenode map for global tablespace let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?; - self.ar.append(&header, &img[..]).await?; + self.ar + .append(&header, &img[..]) + .await + .map_err(BasebackupError::Client)?; } else { warn!("global/pg_filenode.map is missing"); } @@ -457,18 +558,26 @@ where && self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await? + .await + .map_err(|e| BasebackupError::Server(e.into()))? 
.is_empty() { return Ok(()); } // User defined tablespaces are not supported - ensure!(spcnode == DEFAULTTABLESPACE_OID); + if spcnode != DEFAULTTABLESPACE_OID { + return Err(BasebackupError::Server(anyhow!( + "spcnode != DEFAULTTABLESPACE_OID, spcnode={spcnode}" + ))); + } // Append dir path for each database let path = format!("base/{}", dbnode); let header = new_tar_header_dir(&path)?; - self.ar.append(&header, &mut io::empty()).await?; + self.ar + .append(&header, &mut io::empty()) + .await + .map_err(BasebackupError::Client)?; if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); @@ -478,11 +587,17 @@ where ver => format!("{ver}\x0A"), }; let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?; - self.ar.append(&header, pg_version_str.as_bytes()).await?; + self.ar + .append(&header, pg_version_str.as_bytes()) + .await + .map_err(BasebackupError::Client)?; let relmap_path = format!("base/{}/pg_filenode.map", dbnode); let header = new_tar_header(&relmap_path, img.len() as u64)?; - self.ar.append(&header, &img[..]).await?; + self.ar + .append(&header, &img[..]) + .await + .map_err(BasebackupError::Client)?; } }; Ok(()) @@ -491,11 +606,12 @@ where // // Extract twophase state files // - async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { + async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> { let img = self .timeline .get_twophase_file(xid, self.lsn, self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -503,7 +619,10 @@ where buf.put_u32_le(crc); let path = format!("pg_twophase/{:>08X}", xid); let header = new_tar_header(&path, buf.len() as u64)?; - self.ar.append(&header, &buf[..]).await?; + self.ar + .append(&header, &buf[..]) + .await + .map_err(BasebackupError::Client)?; Ok(()) } @@ -512,24 +631,28 @@ where // Add generated pg_control file and bootstrap WAL segment. // Also send zenith.signal file with extra bootstrap data. 
// - async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> { + async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> { // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { - if self.lsn == self.timeline.get_ancestor_lsn() { - write!(zenith_signal, "PREV LSN: none")?; + if self.timeline.is_ancestor_lsn(self.lsn) { + write!(zenith_signal, "PREV LSN: none") + .map_err(|e| BasebackupError::Server(e.into()))?; } else { - write!(zenith_signal, "PREV LSN: invalid")?; + write!(zenith_signal, "PREV LSN: invalid") + .map_err(|e| BasebackupError::Server(e.into()))?; } } else { - write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?; + write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn) + .map_err(|e| BasebackupError::Server(e.into()))?; } self.ar .append( &new_tar_header("zenith.signal", zenith_signal.len() as u64)?, zenith_signal.as_bytes(), ) - .await?; + .await + .map_err(BasebackupError::Client)?; let checkpoint_bytes = self .timeline @@ -551,7 +674,10 @@ where //send pg_control let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; - self.ar.append(&header, &pg_control_bytes[..]).await?; + self.ar + .append(&header, &pg_control_bytes[..]) + .await + .map_err(BasebackupError::Client)?; //send wal segment let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE); @@ -566,8 +692,16 @@ where self.lsn, ) .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; - ensure!(wal_seg.len() == WAL_SEGMENT_SIZE); - self.ar.append(&header, &wal_seg[..]).await?; + if wal_seg.len() != WAL_SEGMENT_SIZE { + return Err(BasebackupError::Server(anyhow!( + "wal_seg.len() != WAL_SEGMENT_SIZE, wal_seg.len()={}", + wal_seg.len() + ))); + } + self.ar + .append(&header, &wal_seg[..]) + .await + .map_err(BasebackupError::Client)?; Ok(()) } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 15e3359c06..59194ab4bd 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -1,50 +1,57 @@ +#![recursion_limit = "300"] + //! Main entry point for the Page Server executable. 
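Ahead of the `main.rs` changes below: startup now forces durability of everything under the tenants directory before trusting its contents, by opening the directory and calling `syncfs` on its filesystem (Linux only). A condensed sketch of that barrier, assuming a `nix` version that exposes `Dir::open` and `unistd::syncfs` with the signatures the diff uses:

```rust
/// Make everything on the filesystem containing `path` durable (Linux only).
#[cfg(target_os = "linux")]
fn make_dir_contents_durable(path: &std::path::Path) -> nix::Result<()> {
    use std::os::fd::AsRawFd;
    let dir = nix::dir::Dir::open(
        path,
        nix::fcntl::OFlag::O_DIRECTORY | nix::fcntl::OFlag::O_RDONLY,
        nix::sys::stat::Mode::empty(),
    )?;
    // syncfs() flushes the whole filesystem this fd lives on, so one call
    // covers every file under the directory, not just the directory entry.
    nix::unistd::syncfs(dir.as_raw_fd())
}

#[cfg(target_os = "linux")]
fn main() -> nix::Result<()> {
    make_dir_contents_durable(std::path::Path::new("/tmp"))
}

#[cfg(not(target_os = "linux"))]
fn main() {}
```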
+use std::env; use std::env::{var, VarError}; +use std::io::Read; +use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use std::{env, ops::ControlFlow, str::FromStr}; use anyhow::{anyhow, Context}; use camino::Utf8Path; use clap::{Arg, ArgAction, Command}; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; +use pageserver::config::PageserverIdentity; use pageserver::control_plane_client::ControlPlaneClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; -use pageserver::task_mgr::WALRECEIVER_RUNTIME; +use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME}; use pageserver::tenant::{secondary, TenantSharedResources}; +use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener}; use remote_storage::GenericRemoteStorage; +use tokio::signal::unix::SignalKind; use tokio::time::Instant; +use tokio_util::sync::CancellationToken; use tracing::*; use metrics::set_build_info_metric; use pageserver::{ - config::{defaults::*, PageServerConf}, - context::{DownloadBehavior, RequestContext}, + config::PageServerConf, deletion_queue::DeletionQueue, http, page_cache, page_service, task_mgr, - task_mgr::TaskKind, - task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME}, + task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME}, tenant::mgr, virtual_file, }; use postgres_backend::AuthType; use utils::failpoint_support; use utils::logging::TracingErrorLayerEnablement; -use utils::signals::ShutdownSignals; use utils::{ auth::{JwtAuth, SwappableJwtAuth}, logging, project_build_tag, project_git_version, sentry_init::init_sentry, - signals::Signal, tcp_listener, }; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + const PID_FILE_NAME: &str = "pageserver.pid"; const FEATURES: &[&str] = &[ @@ -79,18 +86,13 @@ fn main() -> anyhow::Result<()> { .with_context(|| format!("Error opening workdir '{workdir}'"))?; let cfg_file_path = workdir.join("pageserver.toml"); + let identity_file_path = workdir.join("identity.toml"); // Set CWD to workdir for non-daemon modes env::set_current_dir(&workdir) .with_context(|| format!("Failed to set application's current dir to '{workdir}'"))?; - let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? { - ControlFlow::Continue(conf) => conf, - ControlFlow::Break(()) => { - info!("Pageserver config init successful"); - return Ok(()); - } - }; + let conf = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?; // Initialize logging. // @@ -120,17 +122,72 @@ fn main() -> anyhow::Result<()> { &[("node_id", &conf.id.to_string())], ); + // after setting up logging, log the effective IO engine choice and read path implementations + info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); + info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings"); + info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment"); + + // The tenants directory contains all the pageserver local disk state. + // Create if not exists and make sure all the contents are durable before proceeding. + // Ensuring durability eliminates a whole bug class where we come up after an unclean shutdown. 
+    // After unclean shutdown, we don't know if all the filesystem content we can read via syscalls is actually durable or not.
+    // Examples for that: OOM kill, systemd killing us during shutdown, self abort due to unrecoverable IO error.
     let tenants_path = conf.tenants_path();
-    if !tenants_path.exists() {
-        utils::crashsafe::create_dir_all(conf.tenants_path())
-            .with_context(|| format!("Failed to create tenants root dir at '{tenants_path}'"))?;
+    {
+        let open = || {
+            nix::dir::Dir::open(
+                tenants_path.as_std_path(),
+                nix::fcntl::OFlag::O_DIRECTORY | nix::fcntl::OFlag::O_RDONLY,
+                nix::sys::stat::Mode::empty(),
+            )
+        };
+        let dirfd = match open() {
+            Ok(dirfd) => dirfd,
+            Err(e) => match e {
+                nix::errno::Errno::ENOENT => {
+                    utils::crashsafe::create_dir_all(&tenants_path).with_context(|| {
+                        format!("Failed to create tenants root dir at '{tenants_path}'")
+                    })?;
+                    open().context("open tenants dir after creating it")?
+                }
+                e => anyhow::bail!(e),
+            },
+        };
+
+        let started = Instant::now();
+        // Linux guarantees durability for syncfs.
+        // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
+        #[cfg(target_os = "linux")]
+        {
+            use std::os::fd::AsRawFd;
+            nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?;
+        }
+        #[cfg(target_os = "macos")]
+        {
+            // macOS is not a production platform for Neon, don't even bother.
+            drop(dirfd);
+        }
+        #[cfg(not(any(target_os = "linux", target_os = "macos")))]
+        {
+            compile_error!("Unsupported OS");
+        }
+
+        let elapsed = started.elapsed();
+        info!(
+            elapsed_ms = elapsed.as_millis(),
+            "made tenant directory contents durable"
+        );
     }
 
     // Initialize up failpoints support
     let scenario = failpoint_support::init();
 
     // Basic initialization of things that don't change after startup
-    virtual_file::init(conf.max_file_descriptors);
+    virtual_file::init(
+        conf.max_file_descriptors,
+        conf.virtual_file_io_engine,
+        conf.io_buffer_alignment,
+    );
     page_cache::init(conf.page_cache_size);
 
     start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
@@ -140,74 +197,43 @@ fn main() -> anyhow::Result<()> {
 }
 
 fn initialize_config(
+    identity_file_path: &Utf8Path,
     cfg_file_path: &Utf8Path,
-    arg_matches: clap::ArgMatches,
     workdir: &Utf8Path,
-) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
-    let init = arg_matches.get_flag("init");
-    let update_config = init || arg_matches.get_flag("update-config");
+) -> anyhow::Result<&'static PageServerConf> {
+    // The deployment orchestrator writes out an identity file containing the node id
+    // for all pageservers. This file is the source of truth for the node id. In order
+    // to allow for rolling back pageserver releases, the node id is also included in
+    // the pageserver config that the deployment orchestrator writes to disk for the pageserver.
+    // A rolled back version of the pageserver will get the node id from the pageserver.toml
+    // config file.
+    let identity = match std::fs::File::open(identity_file_path) {
+        Ok(mut f) => {
+            let md = f.metadata().context("stat config file")?;
+            if !md.is_file() {
+                anyhow::bail!("Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ...");
+            }
 
-    let (mut toml, config_file_exists) = if cfg_file_path.is_file() {
-        if init {
-            anyhow::bail!(
-                "Config file '{cfg_file_path}' already exists, cannot init it, use --update-config to update it",
-            );
+            let mut s = String::new();
+            f.read_to_string(&mut s).context("read identity file")?;
+            toml_edit::de::from_str::<PageserverIdentity>(&s)?
+ } + Err(e) => { + anyhow::bail!("Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ..."); } - // Supplement the CLI arguments with the config file - let cfg_file_contents = std::fs::read_to_string(cfg_file_path) - .with_context(|| format!("Failed to read pageserver config at '{cfg_file_path}'"))?; - ( - cfg_file_contents - .parse::() - .with_context(|| { - format!("Failed to parse '{cfg_file_path}' as pageserver config") - })?, - true, - ) - } else if cfg_file_path.exists() { - anyhow::bail!("Config file '{cfg_file_path}' exists but is not a regular file"); - } else { - // We're initializing the tenant, so there's no config file yet - ( - DEFAULT_CONFIG_FILE - .parse::() - .context("could not parse built-in config file")?, - false, - ) }; - if let Some(values) = arg_matches.get_many::("config-override") { - for option_line in values { - let doc = toml_edit::Document::from_str(option_line).with_context(|| { - format!("Option '{option_line}' could not be parsed as a toml document") - })?; + let config_file_contents = + std::fs::read_to_string(cfg_file_path).context("read config file from filesystem")?; + let config_toml = serde_path_to_error::deserialize( + toml_edit::de::Deserializer::from_str(&config_file_contents) + .context("build toml deserializer")?, + ) + .context("deserialize config toml")?; + let conf = PageServerConf::parse_and_validate(identity.id, config_toml, workdir) + .context("runtime-validation of config toml")?; - for (key, item) in doc.iter() { - if config_file_exists && update_config && key == "id" && toml.contains_key(key) { - anyhow::bail!("Pageserver config file exists at '{cfg_file_path}' and has node id already, it cannot be overridden"); - } - toml.insert(key, item.clone()); - } - } - } - - debug!("Resulting toml: {toml}"); - let conf = PageServerConf::parse_and_validate(&toml, workdir) - .context("Failed to parse pageserver configuration")?; - - if update_config { - info!("Writing pageserver config to '{cfg_file_path}'"); - - std::fs::write(cfg_file_path, toml.to_string()) - .with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?; - info!("Config successfully written to '{cfg_file_path}'") - } - - Ok(if init { - ControlFlow::Break(()) - } else { - ControlFlow::Continue(Box::leak(Box::new(conf))) - }) + Ok(Box::leak(Box::new(conf))) } struct WaitForPhaseResult { @@ -274,6 +300,12 @@ fn start_pageserver( ); set_build_info_metric(GIT_VERSION, BUILD_TAG); set_launch_timestamp_metric(launch_ts); + #[cfg(target_os = "linux")] + metrics::register_internal(Box::new(metrics::more_process_metrics::Collector::new())).unwrap(); + metrics::register_internal(Box::new( + pageserver::metrics::tokio_epoll_uring::Collector::new(), + )) + .unwrap(); pageserver::preinitialize_metrics(); // If any failpoints were set from FAILPOINTS environment variable, @@ -293,6 +325,7 @@ fn start_pageserver( // Create and lock PID file. This ensures that there cannot be more than one // pageserver process running at the same time. 
let lock_file_path = conf.workdir.join(PID_FILE_NAME); + info!("Claiming pid file at {lock_file_path:?}..."); let lock_file = utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?; info!("Claimed pid file at {lock_file_path:?}"); @@ -308,6 +341,7 @@ fn start_pageserver( let http_listener = tcp_listener::bind(http_addr)?; let pg_addr = &conf.listen_pg_addr; + info!("Starting pageserver pg protocol handler on {pg_addr}"); let pageserver_listener = tcp_listener::bind(pg_addr)?; @@ -372,7 +406,7 @@ fn start_pageserver( let shutdown_pageserver = tokio_util::sync::CancellationToken::new(); // Set up remote storage client - let remote_storage = create_remote_storage_client(conf)?; + let remote_storage = BACKGROUND_RUNTIME.block_on(create_remote_storage_client(conf))?; // Set up deletion queue let (deletion_queue, deletion_workers) = DeletionQueue::new( @@ -411,14 +445,21 @@ fn start_pageserver( background_jobs_can_start: background_jobs_barrier.clone(), }; + info!(config=?conf.l0_flush, "using l0_flush config"); + let l0_flush_global_state = + pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone()); + // Scan the local 'tenants/' directory and start loading the tenants let deletion_queue_client = deletion_queue.new_client(); + let background_purges = mgr::BackgroundPurges::default(); let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( conf, + background_purges.clone(), TenantSharedResources { broker_client: broker_client.clone(), remote_storage: remote_storage.clone(), deletion_queue_client, + l0_flush_global_state, }, order, shutdown_pageserver.clone(), @@ -505,16 +546,12 @@ fn start_pageserver( } }); - let secondary_controller = if let Some(remote_storage) = &remote_storage { - secondary::spawn_tasks( - tenant_manager.clone(), - remote_storage.clone(), - background_jobs_barrier.clone(), - shutdown_pageserver.clone(), - ) - } else { - secondary::null_controller() - }; + let (secondary_controller, secondary_controller_tasks) = secondary::spawn_tasks( + tenant_manager.clone(), + remote_storage.clone(), + background_jobs_barrier.clone(), + shutdown_pageserver.clone(), + ); // shared state between the disk-usage backed eviction background task and the http endpoint // that allows triggering disk-usage based eviction manually. note that the http endpoint @@ -522,25 +559,24 @@ fn start_pageserver( // been configured. let disk_usage_eviction_state: Arc = Arc::default(); - if let Some(remote_storage) = &remote_storage { - launch_disk_usage_global_eviction_task( - conf, - remote_storage.clone(), - disk_usage_eviction_state.clone(), - tenant_manager.clone(), - background_jobs_barrier.clone(), - )?; - } + let disk_usage_eviction_task = launch_disk_usage_global_eviction_task( + conf, + remote_storage.clone(), + disk_usage_eviction_state.clone(), + tenant_manager.clone(), + background_jobs_barrier.clone(), + ); // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. - { - let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); + let http_endpoint_listener = { + let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); // for hyper + let cancel = CancellationToken::new(); let router_state = Arc::new( http::routes::State::new( conf, - tenant_manager, + tenant_manager.clone(), http_auth.clone(), remote_storage.clone(), broker_client.clone(), @@ -556,148 +592,102 @@ fn start_pageserver( let service = utils::http::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)? 
.serve(service) - .with_graceful_shutdown(task_mgr::shutdown_watcher()); + .with_graceful_shutdown({ + let cancel = cancel.clone(); + async move { cancel.clone().cancelled().await } + }); - task_mgr::spawn( - MGMT_REQUEST_RUNTIME.handle(), - TaskKind::HttpEndpointListener, - None, - None, + let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "http endpoint listener", - true, - async { - server.await?; - Ok(()) - }, - ); - } + server, + )); + HttpEndpointListener(CancellableTask { task, cancel }) + }; - if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { - let metrics_ctx = RequestContext::todo_child( - TaskKind::MetricsCollection, - // This task itself shouldn't download anything. - // The actual size calculation does need downloads, and - // creates a child context with the right DownloadBehavior. - DownloadBehavior::Error, - ); - - let local_disk_storage = conf.workdir.join("last_consumption_metrics.json"); - - task_mgr::spawn( - crate::BACKGROUND_RUNTIME.handle(), - TaskKind::MetricsCollection, - None, - None, - "consumption metrics collection", - true, + let consumption_metrics_tasks = { + let cancel = shutdown_pageserver.child_token(); + let task = crate::BACKGROUND_RUNTIME.spawn({ + let tenant_manager = tenant_manager.clone(); + let cancel = cancel.clone(); async move { // first wait until background jobs are cleared to launch. // // this is because we only process active tenants and timelines, and the // Timeline::get_current_logical_size will spawn the logical size calculation, // which will not be rate-limited. - let cancel = task_mgr::shutdown_token(); - tokio::select! { - _ = cancel.cancelled() => { return Ok(()); }, + _ = cancel.cancelled() => { return; }, _ = background_jobs_barrier.wait() => {} }; - pageserver::consumption_metrics::collect_metrics( - metric_collection_endpoint, - conf.metric_collection_interval, - conf.cached_metric_collection_interval, - conf.synthetic_size_calculation_interval, - conf.id, - local_disk_storage, - cancel, - metrics_ctx, - ) - .instrument(info_span!("metrics_collection")) - .await?; - Ok(()) - }, - ); - } + pageserver::consumption_metrics::run(conf, tenant_manager, cancel).await; + } + }); + ConsumptionMetricsTasks(CancellableTask { task, cancel }) + }; // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. We created the listener earlier already. - { - let libpq_ctx = RequestContext::todo_child( - TaskKind::LibpqEndpointListener, - // listener task shouldn't need to download anything. (We will - // create a separate sub-contexts for each connection, with their - // own download behavior. This context is used only to listen and - // accept connections.) - DownloadBehavior::Error, - ); - task_mgr::spawn( - COMPUTE_REQUEST_RUNTIME.handle(), - TaskKind::LibpqEndpointListener, - None, - None, - "libpq endpoint listener", - true, - async move { - page_service::libpq_listener_main( - conf, - broker_client, - pg_auth, - pageserver_listener, - conf.pg_auth_type, - libpq_ctx, - task_mgr::shutdown_token(), - ) - .await - }, - ); - } + let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, { + let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it + pageserver_listener + .set_nonblocking(true) + .context("set listener to nonblocking")?; + tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")? 
+ }); let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); // All started up! Now just sit and wait for shutdown signal. - ShutdownSignals::handle(|signal| match signal { - Signal::Quit => { - info!( - "Got {}. Terminating in immediate shutdown mode", - signal.name() - ); - std::process::exit(111); - } - Signal::Interrupt | Signal::Terminate => { - info!( - "Got {}. Terminating gracefully in fast shutdown mode", - signal.name() - ); + { + BACKGROUND_RUNTIME.block_on(async move { + let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap(); + let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); + let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap(); + let signal = tokio::select! { + _ = sigquit.recv() => { + info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",); + std::process::exit(111); + } + _ = sigint.recv() => { "SIGINT" }, + _ = sigterm.recv() => { "SIGTERM" }, + }; + + info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); // This cancels the `shutdown_pageserver` cancellation tree. // Right now that tree doesn't reach very far, and `task_mgr` is used instead. // The plan is to change that over time. shutdown_pageserver.take(); - let bg_remote_storage = remote_storage.clone(); - let bg_deletion_queue = deletion_queue.clone(); - BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( - bg_remote_storage.map(|_| bg_deletion_queue), + pageserver::shutdown_pageserver( + http_endpoint_listener, + page_service, + consumption_metrics_tasks, + disk_usage_eviction_task, + &tenant_manager, + background_purges, + deletion_queue.clone(), + secondary_controller_tasks, 0, - )); + ) + .await; unreachable!() - } - }) + }) + } } -fn create_remote_storage_client( +async fn create_remote_storage_client( conf: &'static PageServerConf, -) -> anyhow::Result> { +) -> anyhow::Result { let config = if let Some(config) = &conf.remote_storage_config { config } else { - tracing::warn!("no remote storage configured, this is a deprecated configuration"); - return Ok(None); + anyhow::bail!("no remote storage configured, this is a deprecated configuration"); }; // Create the client - let mut remote_storage = GenericRemoteStorage::from_config(config)?; + let mut remote_storage = GenericRemoteStorage::from_config(config).await?; // If `test_remote_failures` is non-zero, wrap the client with a // wrapper that simulates failures. @@ -713,40 +703,19 @@ fn create_remote_storage_client( GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures); } - Ok(Some(remote_storage)) + Ok(remote_storage) } fn cli() -> Command { Command::new("Neon page server") .about("Materializes WAL stream to pages and serves them to the postgres") .version(version()) - .arg( - Arg::new("init") - .long("init") - .action(ArgAction::SetTrue) - .help("Initialize pageserver with all given config overrides"), - ) .arg( Arg::new("workdir") .short('D') .long("workdir") .help("Working directory for the pageserver"), ) - // See `settings.md` for more details on the extra configuration patameters pageserver can process - .arg( - Arg::new("config-override") - .short('c') - .num_args(1) - .action(ArgAction::Append) - .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). 
\ - Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), - ) - .arg( - Arg::new("update-config") - .long("update-config") - .action(ArgAction::SetTrue) - .help("Update the config file when started"), - ) .arg( Arg::new("enabled-features") .long("enabled-features") diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 52277d7f24..29a98855d3 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -4,24 +4,23 @@ //! file, or on the command line. //! See also `settings.md` for better description on every parameter. -use anyhow::{anyhow, bail, ensure, Context, Result}; -use pageserver_api::shard::TenantShardId; +use anyhow::{bail, ensure, Context}; +use pageserver_api::models::ImageCompressionAlgorithm; +use pageserver_api::{ + config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes}, + shard::TenantShardId, +}; use remote_storage::{RemotePath, RemoteStorageConfig}; -use serde::de::IntoDeserializer; use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; -use utils::id::ConnectionId; use utils::logging::SecretString; use once_cell::sync::OnceCell; use reqwest::Url; use std::num::NonZeroUsize; -use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use toml_edit; -use toml_edit::{Document, Item}; use camino::{Utf8Path, Utf8PathBuf}; use postgres_backend::AuthType; @@ -30,115 +29,27 @@ use utils::{ logging::LogFormat, }; -use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig; -use crate::tenant::config::TenantConf; -use crate::tenant::config::TenantConfOpt; -use crate::tenant::{ - TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, -}; -use crate::{ - IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, - TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, -}; - -use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; - -pub mod defaults { - use crate::tenant::config::defaults::*; - use const_format::formatcp; - - pub use pageserver_api::{ - DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, - DEFAULT_PG_LISTEN_PORT, - }; - pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; - - pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; - pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; - - pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; - - pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; - pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; - - pub const DEFAULT_LOG_FORMAT: &str = "plain"; - - pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8; - - pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = - super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); - - pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min"; - pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s"; - pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; - pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; - pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; - - pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8; - pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1; - - pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; - - /// - /// Default built-in configuration file. 
- /// - pub const DEFAULT_CONFIG_FILE: &str = formatcp!( - r#" -# Initial configuration file created by 'pageserver --init' -#listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}' -#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}' - -#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}' -#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}' - -#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE} -#max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS} - -# initial superuser role name to use when creating a new tenant -#initial_superuser_name = '{DEFAULT_SUPERUSER}' - -#broker_endpoint = '{BROKER_DEFAULT_ENDPOINT}' - -#log_format = '{DEFAULT_LOG_FORMAT}' - -#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}' -#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}' - -#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}' -#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}' -#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}' - -#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} - -#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}' - -#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE} - -[tenant_config] -#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes -#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} -#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes -#compaction_period = '{DEFAULT_COMPACTION_PERIOD}' -#compaction_threshold = {DEFAULT_COMPACTION_THRESHOLD} - -#gc_period = '{DEFAULT_GC_PERIOD}' -#gc_horizon = {DEFAULT_GC_HORIZON} -#image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD} -#pitr_interval = '{DEFAULT_PITR_INTERVAL}' - -#min_resident_size_override = .. # in bytes -#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' -#gc_feedback = false - -#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY} -#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY} - -[remote_storage] - -"# - ); -} +use crate::tenant::storage_layer::inmemory_layer::IndexEntry; +use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; +use crate::virtual_file; +use crate::virtual_file::io_engine; +use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX}; +/// Global state of pageserver. +/// +/// It's mostly immutable configuration, but some semaphores and the +/// like crept in over time and the name stuck. +/// +/// Instantiated by deserializing `pageserver.toml` into [`pageserver_api::config::ConfigToml`] +/// and passing that to [`PageServerConf::parse_and_validate`]. +/// +/// # Adding a New Field +/// +/// 1. Add the field to `pageserver_api::config::ConfigToml`. +/// 2. Fix compiler errors (exhaustive destructuring will guide you). +/// +/// For fields that require additional validation or filling in of defaults at runtime, +/// check for examples in the [`PageServerConf::parse_and_validate`] method. #[derive(Debug, Clone, PartialEq, Eq)] pub struct PageServerConf { // Identifier of that particular pageserver so e g safekeepers @@ -184,7 +95,7 @@ pub struct PageServerConf { pub remote_storage_config: Option, - pub default_tenant_conf: TenantConf, + pub default_tenant_conf: crate::tenant::config::TenantConf, /// Storage broker endpoints to connect to. 
pub broker_endpoint: Uri, @@ -192,9 +103,9 @@ pub struct PageServerConf { pub log_format: LogFormat, - /// Number of tenants which will be concurrently loaded from remote storage proactively on startup, - /// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes - /// loading such tenants, vs. other work in the system. + /// Number of tenants which will be concurrently loaded from remote storage proactively on startup or attach. + /// + /// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system. pub concurrent_tenant_warmup: ConfigurableSemaphore, /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed. @@ -209,8 +120,8 @@ pub struct PageServerConf { // How often to collect metrics and send them to the metrics endpoint. pub metric_collection_interval: Duration, // How often to send unchanged cached metrics to the metrics endpoint. - pub cached_metric_collection_interval: Duration, pub metric_collection_endpoint: Option, + pub metric_collection_bucket: Option, pub synthetic_size_calculation_interval: Duration, pub disk_usage_based_eviction: Option, @@ -247,8 +158,30 @@ pub struct PageServerConf { /// Maximum number of WAL records to be ingested and committed at the same time pub ingest_batch_size: u64, + + pub virtual_file_io_engine: virtual_file::IoEngineKind, + + pub max_vectored_read_bytes: MaxVectoredReadBytes, + + pub image_compression: ImageCompressionAlgorithm, + + /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this + /// is exceeded, we start proactively closing ephemeral layers to limit the total amount + /// of ephemeral data. + /// + /// Setting this to zero disables limits on total ephemeral layer size. + pub ephemeral_bytes_per_memory_kb: usize, + + pub l0_flush: crate::l0_flush::L0FlushConfig, + + /// Direct IO settings + pub virtual_file_direct_io: virtual_file::DirectIoMode, + + pub io_buffer_alignment: usize, } +/// Token for authentication to safekeepers +/// /// We do not want to store this in a PageServerConf because the latter may be logged /// and/or serialized at a whim, while the token is secret. Currently this token is the /// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in @@ -257,422 +190,6 @@ pub struct PageServerConf { /// startup code to the connection code through a dozen layers. 
pub static SAFEKEEPER_AUTH_TOKEN: OnceCell> = OnceCell::new(); -// use dedicated enum for builder to better indicate the intention -// and avoid possible confusion with nested options -pub enum BuilderValue { - Set(T), - NotSet, -} - -impl BuilderValue { - pub fn ok_or(self, err: E) -> Result { - match self { - Self::Set(v) => Ok(v), - Self::NotSet => Err(err), - } - } -} - -// needed to simplify config construction -struct PageServerConfigBuilder { - listen_pg_addr: BuilderValue, - - listen_http_addr: BuilderValue, - - availability_zone: BuilderValue>, - - wait_lsn_timeout: BuilderValue, - wal_redo_timeout: BuilderValue, - - superuser: BuilderValue, - - page_cache_size: BuilderValue, - max_file_descriptors: BuilderValue, - - workdir: BuilderValue, - - pg_distrib_dir: BuilderValue, - - http_auth_type: BuilderValue, - pg_auth_type: BuilderValue, - - // - auth_validation_public_key_path: BuilderValue>, - remote_storage_config: BuilderValue>, - - id: BuilderValue, - - broker_endpoint: BuilderValue, - broker_keepalive_interval: BuilderValue, - - log_format: BuilderValue, - - concurrent_tenant_warmup: BuilderValue, - concurrent_tenant_size_logical_size_queries: BuilderValue, - - metric_collection_interval: BuilderValue, - cached_metric_collection_interval: BuilderValue, - metric_collection_endpoint: BuilderValue>, - synthetic_size_calculation_interval: BuilderValue, - - disk_usage_based_eviction: BuilderValue>, - - test_remote_failures: BuilderValue, - - ondemand_download_behavior_treat_error_as_warn: BuilderValue, - - background_task_maximum_delay: BuilderValue, - - control_plane_api: BuilderValue>, - control_plane_api_token: BuilderValue>, - control_plane_emergency_mode: BuilderValue, - - heatmap_upload_concurrency: BuilderValue, - secondary_download_concurrency: BuilderValue, - - ingest_batch_size: BuilderValue, -} - -impl Default for PageServerConfigBuilder { - fn default() -> Self { - use self::BuilderValue::*; - use defaults::*; - Self { - listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()), - listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()), - availability_zone: Set(None), - wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT) - .expect("cannot parse default wait lsn timeout")), - wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) - .expect("cannot parse default wal redo timeout")), - superuser: Set(DEFAULT_SUPERUSER.to_string()), - page_cache_size: Set(DEFAULT_PAGE_CACHE_SIZE), - max_file_descriptors: Set(DEFAULT_MAX_FILE_DESCRIPTORS), - workdir: Set(Utf8PathBuf::new()), - pg_distrib_dir: Set(Utf8PathBuf::from_path_buf( - env::current_dir().expect("cannot access current directory"), - ) - .expect("non-Unicode path") - .join("pg_install")), - http_auth_type: Set(AuthType::Trust), - pg_auth_type: Set(AuthType::Trust), - auth_validation_public_key_path: Set(None), - remote_storage_config: Set(None), - id: NotSet, - broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT - .parse() - .expect("failed to parse default broker endpoint")), - broker_keepalive_interval: Set(humantime::parse_duration( - storage_broker::DEFAULT_KEEPALIVE_INTERVAL, - ) - .expect("cannot parse default keepalive interval")), - log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), - - concurrent_tenant_warmup: Set(NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) - .expect("Invalid default constant")), - concurrent_tenant_size_logical_size_queries: Set( - ConfigurableSemaphore::DEFAULT_INITIAL, - ), - metric_collection_interval: 
Set(humantime::parse_duration( - DEFAULT_METRIC_COLLECTION_INTERVAL, - ) - .expect("cannot parse default metric collection interval")), - cached_metric_collection_interval: Set(humantime::parse_duration( - DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL, - ) - .expect("cannot parse default cached_metric_collection_interval")), - synthetic_size_calculation_interval: Set(humantime::parse_duration( - DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL, - ) - .expect("cannot parse default synthetic size calculation interval")), - metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), - - disk_usage_based_eviction: Set(None), - - test_remote_failures: Set(0), - - ondemand_download_behavior_treat_error_as_warn: Set(false), - - background_task_maximum_delay: Set(humantime::parse_duration( - DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY, - ) - .unwrap()), - - control_plane_api: Set(None), - control_plane_api_token: Set(None), - control_plane_emergency_mode: Set(false), - - heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), - secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), - - ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE), - } - } -} - -impl PageServerConfigBuilder { - pub fn listen_pg_addr(&mut self, listen_pg_addr: String) { - self.listen_pg_addr = BuilderValue::Set(listen_pg_addr) - } - - pub fn listen_http_addr(&mut self, listen_http_addr: String) { - self.listen_http_addr = BuilderValue::Set(listen_http_addr) - } - - pub fn availability_zone(&mut self, availability_zone: Option) { - self.availability_zone = BuilderValue::Set(availability_zone) - } - - pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) { - self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout) - } - - pub fn wal_redo_timeout(&mut self, wal_redo_timeout: Duration) { - self.wal_redo_timeout = BuilderValue::Set(wal_redo_timeout) - } - - pub fn superuser(&mut self, superuser: String) { - self.superuser = BuilderValue::Set(superuser) - } - - pub fn page_cache_size(&mut self, page_cache_size: usize) { - self.page_cache_size = BuilderValue::Set(page_cache_size) - } - - pub fn max_file_descriptors(&mut self, max_file_descriptors: usize) { - self.max_file_descriptors = BuilderValue::Set(max_file_descriptors) - } - - pub fn workdir(&mut self, workdir: Utf8PathBuf) { - self.workdir = BuilderValue::Set(workdir) - } - - pub fn pg_distrib_dir(&mut self, pg_distrib_dir: Utf8PathBuf) { - self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir) - } - - pub fn http_auth_type(&mut self, auth_type: AuthType) { - self.http_auth_type = BuilderValue::Set(auth_type) - } - - pub fn pg_auth_type(&mut self, auth_type: AuthType) { - self.pg_auth_type = BuilderValue::Set(auth_type) - } - - pub fn auth_validation_public_key_path( - &mut self, - auth_validation_public_key_path: Option, - ) { - self.auth_validation_public_key_path = BuilderValue::Set(auth_validation_public_key_path) - } - - pub fn remote_storage_config(&mut self, remote_storage_config: Option) { - self.remote_storage_config = BuilderValue::Set(remote_storage_config) - } - - pub fn broker_endpoint(&mut self, broker_endpoint: Uri) { - self.broker_endpoint = BuilderValue::Set(broker_endpoint) - } - - pub fn broker_keepalive_interval(&mut self, broker_keepalive_interval: Duration) { - self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval) - } - - pub fn id(&mut self, node_id: NodeId) { - self.id = BuilderValue::Set(node_id) - } - - pub fn log_format(&mut self, log_format: LogFormat) { - self.log_format = 
BuilderValue::Set(log_format) - } - - pub fn concurrent_tenant_warmup(&mut self, u: NonZeroUsize) { - self.concurrent_tenant_warmup = BuilderValue::Set(u); - } - - pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) { - self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u); - } - - pub fn metric_collection_interval(&mut self, metric_collection_interval: Duration) { - self.metric_collection_interval = BuilderValue::Set(metric_collection_interval) - } - - pub fn cached_metric_collection_interval( - &mut self, - cached_metric_collection_interval: Duration, - ) { - self.cached_metric_collection_interval = - BuilderValue::Set(cached_metric_collection_interval) - } - - pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option) { - self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint) - } - - pub fn synthetic_size_calculation_interval( - &mut self, - synthetic_size_calculation_interval: Duration, - ) { - self.synthetic_size_calculation_interval = - BuilderValue::Set(synthetic_size_calculation_interval) - } - - pub fn test_remote_failures(&mut self, fail_first: u64) { - self.test_remote_failures = BuilderValue::Set(fail_first); - } - - pub fn disk_usage_based_eviction(&mut self, value: Option) { - self.disk_usage_based_eviction = BuilderValue::Set(value); - } - - pub fn ondemand_download_behavior_treat_error_as_warn( - &mut self, - ondemand_download_behavior_treat_error_as_warn: bool, - ) { - self.ondemand_download_behavior_treat_error_as_warn = - BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn); - } - - pub fn background_task_maximum_delay(&mut self, delay: Duration) { - self.background_task_maximum_delay = BuilderValue::Set(delay); - } - - pub fn control_plane_api(&mut self, api: Option) { - self.control_plane_api = BuilderValue::Set(api) - } - - pub fn control_plane_api_token(&mut self, token: Option) { - self.control_plane_api_token = BuilderValue::Set(token) - } - - pub fn control_plane_emergency_mode(&mut self, enabled: bool) { - self.control_plane_emergency_mode = BuilderValue::Set(enabled) - } - - pub fn heatmap_upload_concurrency(&mut self, value: usize) { - self.heatmap_upload_concurrency = BuilderValue::Set(value) - } - - pub fn secondary_download_concurrency(&mut self, value: usize) { - self.secondary_download_concurrency = BuilderValue::Set(value) - } - - pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) { - self.ingest_batch_size = BuilderValue::Set(ingest_batch_size) - } - - pub fn build(self) -> anyhow::Result { - let concurrent_tenant_warmup = self - .concurrent_tenant_warmup - .ok_or(anyhow!("missing concurrent_tenant_warmup"))?; - let concurrent_tenant_size_logical_size_queries = self - .concurrent_tenant_size_logical_size_queries - .ok_or(anyhow!( - "missing concurrent_tenant_size_logical_size_queries" - ))?; - Ok(PageServerConf { - listen_pg_addr: self - .listen_pg_addr - .ok_or(anyhow!("missing listen_pg_addr"))?, - listen_http_addr: self - .listen_http_addr - .ok_or(anyhow!("missing listen_http_addr"))?, - availability_zone: self - .availability_zone - .ok_or(anyhow!("missing availability_zone"))?, - wait_lsn_timeout: self - .wait_lsn_timeout - .ok_or(anyhow!("missing wait_lsn_timeout"))?, - wal_redo_timeout: self - .wal_redo_timeout - .ok_or(anyhow!("missing wal_redo_timeout"))?, - superuser: self.superuser.ok_or(anyhow!("missing superuser"))?, - page_cache_size: self - .page_cache_size - .ok_or(anyhow!("missing page_cache_size"))?, - max_file_descriptors: 
self - .max_file_descriptors - .ok_or(anyhow!("missing max_file_descriptors"))?, - workdir: self.workdir.ok_or(anyhow!("missing workdir"))?, - pg_distrib_dir: self - .pg_distrib_dir - .ok_or(anyhow!("missing pg_distrib_dir"))?, - http_auth_type: self - .http_auth_type - .ok_or(anyhow!("missing http_auth_type"))?, - pg_auth_type: self.pg_auth_type.ok_or(anyhow!("missing pg_auth_type"))?, - auth_validation_public_key_path: self - .auth_validation_public_key_path - .ok_or(anyhow!("missing auth_validation_public_key_path"))?, - remote_storage_config: self - .remote_storage_config - .ok_or(anyhow!("missing remote_storage_config"))?, - id: self.id.ok_or(anyhow!("missing id"))?, - // TenantConf is handled separately - default_tenant_conf: TenantConf::default(), - broker_endpoint: self - .broker_endpoint - .ok_or(anyhow!("No broker endpoints provided"))?, - broker_keepalive_interval: self - .broker_keepalive_interval - .ok_or(anyhow!("No broker keepalive interval provided"))?, - log_format: self.log_format.ok_or(anyhow!("missing log_format"))?, - concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( - concurrent_tenant_size_logical_size_queries, - ), - eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( - concurrent_tenant_size_logical_size_queries, - ), - metric_collection_interval: self - .metric_collection_interval - .ok_or(anyhow!("missing metric_collection_interval"))?, - cached_metric_collection_interval: self - .cached_metric_collection_interval - .ok_or(anyhow!("missing cached_metric_collection_interval"))?, - metric_collection_endpoint: self - .metric_collection_endpoint - .ok_or(anyhow!("missing metric_collection_endpoint"))?, - synthetic_size_calculation_interval: self - .synthetic_size_calculation_interval - .ok_or(anyhow!("missing synthetic_size_calculation_interval"))?, - disk_usage_based_eviction: self - .disk_usage_based_eviction - .ok_or(anyhow!("missing disk_usage_based_eviction"))?, - test_remote_failures: self - .test_remote_failures - .ok_or(anyhow!("missing test_remote_failuers"))?, - ondemand_download_behavior_treat_error_as_warn: self - .ondemand_download_behavior_treat_error_as_warn - .ok_or(anyhow!( - "missing ondemand_download_behavior_treat_error_as_warn" - ))?, - background_task_maximum_delay: self - .background_task_maximum_delay - .ok_or(anyhow!("missing background_task_maximum_delay"))?, - control_plane_api: self - .control_plane_api - .ok_or(anyhow!("missing control_plane_api"))?, - control_plane_api_token: self - .control_plane_api_token - .ok_or(anyhow!("missing control_plane_api_token"))?, - control_plane_emergency_mode: self - .control_plane_emergency_mode - .ok_or(anyhow!("missing control_plane_emergency_mode"))?, - heatmap_upload_concurrency: self - .heatmap_upload_concurrency - .ok_or(anyhow!("missing heatmap_upload_concurrency"))?, - secondary_download_concurrency: self - .secondary_download_concurrency - .ok_or(anyhow!("missing secondary_download_concurrency"))?, - ingest_batch_size: self - .ingest_batch_size - .ok_or(anyhow!("missing ingest_batch_size"))?, - }) - } -} - impl PageServerConf { // // Repository paths, relative to workdir. 
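For readers skimming the large deletion above: the removed builder hinged on a dedicated two-state enum rather than Option, so that "field was never set" stays distinguishable from "optional field was explicitly set to None". A minimal, self-contained sketch of that pattern with the generic parameters written out (an abbreviated illustration, not the deleted code verbatim):

enum BuilderValue<T> {
    Set(T),
    NotSet,
}

impl<T> BuilderValue<T> {
    fn ok_or<E>(self, err: E) -> Result<T, E> {
        match self {
            Self::Set(v) => Ok(v),
            Self::NotSet => Err(err),
        }
    }
}

struct ExampleBuilder {
    // For an optional setting, Set(None) means "explicitly disabled",
    // while NotSet means "the caller never touched it". A bare
    // Option<Option<String>> would make these two cases easy to mix up.
    endpoint: BuilderValue<Option<String>>,
}

fn main() {
    let b = ExampleBuilder {
        endpoint: BuilderValue::Set(None),
    };
    // Still a successful, explicit "no endpoint", not a missing field:
    assert_eq!(b.endpoint.ok_or("missing endpoint"), Ok(None));
}

The same ok_or shape is what let the deleted build() method turn every remaining NotSet field into a "missing <field>" error.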
@@ -686,6 +203,10 @@ impl PageServerConf { self.workdir.join("deletion") } + pub fn metadata_path(&self) -> Utf8PathBuf { + self.workdir.join("metadata.json") + } + pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf { // Encode a version in the filename, so that if we ever switch away from JSON we can // increment this. @@ -707,21 +228,12 @@ impl PageServerConf { self.tenants_path().join(tenant_shard_id.to_string()) } - pub fn tenant_ignore_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { - self.tenant_path(tenant_shard_id) - .join(IGNORED_TENANT_FILE_NAME) - } - /// Points to a place in pageserver's local directory, - /// where certain tenant's tenantconf file should be located. - /// - /// Legacy: superseded by tenant_location_config_path. Eventually - /// remove this function. - pub fn tenant_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { - self.tenant_path(tenant_shard_id).join(TENANT_CONFIG_NAME) - } - - pub fn tenant_location_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + /// where a certain tenant's LocationConf should be stored. + pub(crate) fn tenant_location_config_path( + &self, + tenant_shard_id: &TenantShardId, + ) -> Utf8PathBuf { self.tenant_path(tenant_shard_id) .join(TENANT_LOCATION_CONFIG_NAME) } @@ -745,18 +257,7 @@ impl PageServerConf { .join(timeline_id.to_string()) } - pub fn timeline_uninit_mark_file_path( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - ) -> Utf8PathBuf { - path_with_suffix_extension( - self.timeline_path(&tenant_shard_id, &timeline_id), - TIMELINE_UNINIT_MARK_SUFFIX, - ) - } - - pub fn timeline_delete_mark_file_path( + pub(crate) fn timeline_delete_mark_file_path( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, @@ -767,38 +268,6 @@ impl PageServerConf { ) } - pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { - self.tenant_path(tenant_shard_id) - .join(TENANT_DELETED_MARKER_FILE_NAME) - } - - pub fn traces_path(&self) -> Utf8PathBuf { - self.workdir.join("traces") - } - - pub fn trace_path( - &self, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, - connection_id: &ConnectionId, - ) -> Utf8PathBuf { - self.traces_path() - .join(tenant_shard_id.to_string()) - .join(timeline_id.to_string()) - .join(connection_id.to_string()) - } - - /// Points to a place in pageserver's local directory, - /// where certain timeline's metadata file should be located. - pub fn metadata_path( - &self, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, - ) -> Utf8PathBuf { - self.timeline_path(tenant_shard_id, timeline_id) - .join(METADATA_FILE_NAME) - } - /// Turns storage remote path of a file into its local path. pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf { remote_path.with_base(&self.workdir) @@ -828,103 +297,135 @@ impl PageServerConf { /// validating the input and failing on errors. /// /// This leaves any options not present in the file in the built-in defaults.
- pub fn parse_and_validate(toml: &Document, workdir: &Utf8Path) -> anyhow::Result { - let mut builder = PageServerConfigBuilder::default(); - builder.workdir(workdir.to_owned()); + pub fn parse_and_validate( + id: NodeId, + config_toml: pageserver_api::config::ConfigToml, + workdir: &Utf8Path, + ) -> anyhow::Result { + let pageserver_api::config::ConfigToml { + listen_pg_addr, + listen_http_addr, + availability_zone, + wait_lsn_timeout, + wal_redo_timeout, + superuser, + page_cache_size, + max_file_descriptors, + pg_distrib_dir, + http_auth_type, + pg_auth_type, + auth_validation_public_key_path, + remote_storage, + broker_endpoint, + broker_keepalive_interval, + log_format, + metric_collection_interval, + metric_collection_endpoint, + metric_collection_bucket, + synthetic_size_calculation_interval, + disk_usage_based_eviction, + test_remote_failures, + ondemand_download_behavior_treat_error_as_warn, + background_task_maximum_delay, + control_plane_api, + control_plane_api_token, + control_plane_emergency_mode, + heatmap_upload_concurrency, + secondary_download_concurrency, + ingest_batch_size, + max_vectored_read_bytes, + image_compression, + ephemeral_bytes_per_memory_kb, + compact_level0_phase1_value_access: _, + l0_flush, + virtual_file_direct_io, + concurrent_tenant_warmup, + concurrent_tenant_size_logical_size_queries, + virtual_file_io_engine, + io_buffer_alignment, + tenant_config, + } = config_toml; - let mut t_conf = TenantConfOpt::default(); + let mut conf = PageServerConf { + // ------------------------------------------------------------ + // fields that are already fully validated by the ConfigToml Deserialize impl + // ------------------------------------------------------------ + listen_pg_addr, + listen_http_addr, + availability_zone, + wait_lsn_timeout, + wal_redo_timeout, + superuser, + page_cache_size, + max_file_descriptors, + http_auth_type, + pg_auth_type, + auth_validation_public_key_path, + remote_storage_config: remote_storage, + broker_endpoint, + broker_keepalive_interval, + log_format, + metric_collection_interval, + metric_collection_endpoint, + metric_collection_bucket, + synthetic_size_calculation_interval, + disk_usage_based_eviction, + test_remote_failures, + ondemand_download_behavior_treat_error_as_warn, + background_task_maximum_delay, + control_plane_api, + control_plane_emergency_mode, + heatmap_upload_concurrency, + secondary_download_concurrency, + ingest_batch_size, + max_vectored_read_bytes, + image_compression, + ephemeral_bytes_per_memory_kb, + virtual_file_direct_io, + io_buffer_alignment, - for (key, item) in toml.iter() { - match key { - "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?), - "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?), - "availability_zone" => builder.availability_zone(Some(parse_toml_string(key, item)?)), - "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?), - "wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?), - "initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?), - "page_cache_size" => builder.page_cache_size(parse_toml_u64(key, item)? as usize), - "max_file_descriptors" => { - builder.max_file_descriptors(parse_toml_u64(key, item)? 
as usize) - } - "pg_distrib_dir" => { - builder.pg_distrib_dir(Utf8PathBuf::from(parse_toml_string(key, item)?)) - } - "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some( - Utf8PathBuf::from(parse_toml_string(key, item)?), - )), - "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?), - "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?), - "remote_storage" => { - builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?) - } - "tenant_config" => { - t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; - } - "id" => builder.id(NodeId(parse_toml_u64(key, item)?)), - "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), - "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?), - "log_format" => builder.log_format( - LogFormat::from_config(&parse_toml_string(key, item)?)? - ), - "concurrent_tenant_warmup" => builder.concurrent_tenant_warmup({ - let input = parse_toml_string(key, item)?; - let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; - NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? - }), - "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({ - let input = parse_toml_string(key, item)?; - let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; - NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? - }), - "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?), - "cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?), - "metric_collection_endpoint" => { - let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?; - builder.metric_collection_endpoint(Some(endpoint)); - }, - "synthetic_size_calculation_interval" => - builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), - "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), - "disk_usage_based_eviction" => { - tracing::info!("disk_usage_based_eviction: {:#?}", &item); - builder.disk_usage_based_eviction( - deserialize_from_item("disk_usage_based_eviction", item) - .context("parse disk_usage_based_eviction")? 
- ) - }, - "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), - "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?), - "control_plane_api" => { - let parsed = parse_toml_string(key, item)?; - if parsed.is_empty() { - builder.control_plane_api(None) - } else { - builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?)) + // ------------------------------------------------------------ + // fields that require additional validation or custom handling + // ------------------------------------------------------------ + workdir: workdir.to_owned(), + pg_distrib_dir: pg_distrib_dir.unwrap_or_else(|| { + std::env::current_dir() + .expect("current_dir() failed") + .try_into() + .expect("current_dir() is not a valid Utf8Path") + }), + control_plane_api_token: control_plane_api_token.map(SecretString::from), + id, + default_tenant_conf: tenant_config, + concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup), + concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( + concurrent_tenant_size_logical_size_queries, + ), + eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( + // re-use `concurrent_tenant_size_logical_size_queries` + concurrent_tenant_size_logical_size_queries, + ), + virtual_file_io_engine: match virtual_file_io_engine { + Some(v) => v, + None => match crate::virtual_file::io_engine_feature_test() + .context("auto-detect virtual_file_io_engine")? + { + io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise + io_engine::FeatureTestResult::Worse { engine, remark } => { + // TODO: bubble this up to the caller so we can tracing::warn! it. + eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"); + engine } }, - "control_plane_api_token" => { - let parsed = parse_toml_string(key, item)?; - if parsed.is_empty() { - builder.control_plane_api_token(None) - } else { - builder.control_plane_api_token(Some(parsed.into())) - } - }, - "control_plane_emergency_mode" => { - builder.control_plane_emergency_mode(parse_toml_bool(key, item)?) - }, - "heatmap_upload_concurrency" => { - builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize) - }, - "secondary_download_concurrency" => { - builder.secondary_download_concurrency(parse_toml_u64(key, item)? 
as usize) - }, - "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?), - _ => bail!("unrecognized pageserver option '{key}'"), - } - } + }, + l0_flush: l0_flush + .map(crate::l0_flush::L0FlushConfig::from) + .unwrap_or_default(), + }; - let mut conf = builder.build().context("invalid config")?; + // ------------------------------------------------------------ + // custom validation code that covers more than one field in isolation + // ------------------------------------------------------------ if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT { let auth_validation_public_key_path = conf @@ -938,7 +439,14 @@ impl PageServerConf { ); } - conf.default_tenant_conf = t_conf.merge(TenantConf::default()); + IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance) + .map_err(anyhow::Error::msg) + .with_context(|| { + format!( + "effective checkpoint distance is unsupported: {}", + conf.default_tenant_conf.checkpoint_distance + ) + })?; Ok(conf) } @@ -952,111 +460,23 @@ impl PageServerConf { pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self { let pg_distrib_dir = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install"); - PageServerConf { - id: NodeId(0), + let config_toml = pageserver_api::config::ConfigToml { wait_lsn_timeout: Duration::from_secs(60), wal_redo_timeout: Duration::from_secs(60), - page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE, - max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, - listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), - listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - availability_zone: None, - superuser: "cloud_admin".to_string(), - workdir: repo_dir, - pg_distrib_dir, - http_auth_type: AuthType::Trust, - pg_auth_type: AuthType::Trust, - auth_validation_public_key_path: None, - remote_storage_config: None, - default_tenant_conf: TenantConf::default(), - broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), - broker_keepalive_interval: Duration::from_secs(5000), - log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), - concurrent_tenant_warmup: ConfigurableSemaphore::new( - NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) - .expect("Invalid default constant"), - ), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), - eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default( - ), + pg_distrib_dir: Some(pg_distrib_dir), metric_collection_interval: Duration::from_secs(60), - cached_metric_collection_interval: Duration::from_secs(60 * 60), - metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, synthetic_size_calculation_interval: Duration::from_secs(60), - disk_usage_based_eviction: None, - test_remote_failures: 0, - ondemand_download_behavior_treat_error_as_warn: false, background_task_maximum_delay: Duration::ZERO, - control_plane_api: None, - control_plane_api_token: None, - control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, - secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, - ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, - } + ..Default::default() + }; + PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap() } } -// Helper functions to parse a toml Item - -fn parse_toml_string(name: &str, item: &Item) -> Result { - let s = item - .as_str() - .with_context(|| format!("configure 
option {name} is not a string"))?; - Ok(s.to_string()) -} - -fn parse_toml_u64(name: &str, item: &Item) -> Result { - // A toml integer is signed, so it cannot represent the full range of an u64. That's OK - // for our use, though. - let i: i64 = item - .as_integer() - .with_context(|| format!("configure option {name} is not an integer"))?; - if i < 0 { - bail!("configure option {name} cannot be negative"); - } - Ok(i as u64) -} - -fn parse_toml_bool(name: &str, item: &Item) -> Result { - item.as_bool() - .with_context(|| format!("configure option {name} is not a bool")) -} - -fn parse_toml_duration(name: &str, item: &Item) -> Result { - let s = item - .as_str() - .with_context(|| format!("configure option {name} is not a string"))?; - - Ok(humantime::parse_duration(s)?) -} - -fn parse_toml_from_str(name: &str, item: &Item) -> anyhow::Result -where - T: FromStr, - ::Err: std::fmt::Display, -{ - let v = item - .as_str() - .with_context(|| format!("configure option {name} is not a string"))?; - T::from_str(v).map_err(|e| { - anyhow!( - "Failed to parse string as {parse_type} for configure option {name}: {e}", - parse_type = stringify!(T) - ) - }) -} - -fn deserialize_from_item(name: &str, item: &Item) -> anyhow::Result -where - T: serde::de::DeserializeOwned, -{ - // ValueDeserializer::new is not public, so use the ValueDeserializer's documented way - let deserializer = match item.clone().into_value() { - Ok(value) => value.into_deserializer(), - Err(item) => anyhow::bail!("toml_edit::Item '{item}' is not a toml_edit::Value"), - }; - T::deserialize(deserializer).with_context(|| format!("deserializing item for node {name}")) +#[derive(serde::Deserialize, serde::Serialize)] +#[serde(deny_unknown_fields)] +pub struct PageserverIdentity { + pub id: NodeId, } /// Configurable semaphore permits setting. 
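The PageserverIdentity struct just added is deliberately strict: deny_unknown_fields turns a typo'd key in the identity document into a hard deserialization error instead of a silently ignored setting, the same policy the unknown_fields_handling tests below enforce for the main config. A small self-contained sketch of that behavior, using a plain u64 where the real struct uses utils::id::NodeId:

use serde::Deserialize;

#[derive(Deserialize)]
#[serde(deny_unknown_fields)]
struct PageserverIdentity {
    id: u64, // stand-in for utils::id::NodeId in this sketch
}

fn main() {
    // A well-formed identity document parses:
    let ok: PageserverIdentity = toml_edit::de::from_str("id = 10").unwrap();
    assert_eq!(ok.id, 10);

    // An unknown key is a hard error rather than being silently dropped:
    let err = toml_edit::de::from_str::<PageserverIdentity>("id = 10\nnode = 11");
    assert!(err.is_err());
}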
@@ -1120,430 +540,109 @@ impl ConfigurableSemaphore { #[cfg(test)] mod tests { - use std::{ - fs, - num::{NonZeroU32, NonZeroUsize}, - }; - use camino_tempfile::{tempdir, Utf8TempDir}; - use pageserver_api::models::EvictionPolicy; - use remote_storage::{RemoteStorageKind, S3Config}; - use utils::serde_percent::Percent; + use camino::Utf8PathBuf; + use utils::id::NodeId; - use super::*; - use crate::DEFAULT_PG_VERSION; - - const ALL_BASE_VALUES_TOML: &str = r#" -# Initial configuration file created by 'pageserver --init' - -listen_pg_addr = '127.0.0.1:64000' -listen_http_addr = '127.0.0.1:9898' - -wait_lsn_timeout = '111 s' -wal_redo_timeout = '111 s' - -page_cache_size = 444 -max_file_descriptors = 333 - -# initial superuser role name to use when creating a new tenant -initial_superuser_name = 'zzzz' -id = 10 - -metric_collection_interval = '222 s' -cached_metric_collection_interval = '22200 s' -metric_collection_endpoint = 'http://localhost:80/metrics' -synthetic_size_calculation_interval = '333 s' - -log_format = 'json' -background_task_maximum_delay = '334 s' - -"#; + use super::PageServerConf; #[test] - fn parse_defaults() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - let broker_endpoint = storage_broker::DEFAULT_ENDPOINT; - // we have to create dummy values to overcome the validation errors - let config_string = format!( - "pg_distrib_dir='{pg_distrib_dir}'\nid=10\nbroker_endpoint = '{broker_endpoint}'", - ); - let toml = config_string.parse()?; - - let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); - - assert_eq!( - parsed_config, - PageServerConf { - id: NodeId(10), - listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), - listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - availability_zone: None, - wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?, - wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?, - superuser: defaults::DEFAULT_SUPERUSER.to_string(), - page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE, - max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, - workdir, - pg_distrib_dir, - http_auth_type: AuthType::Trust, - pg_auth_type: AuthType::Trust, - auth_validation_public_key_path: None, - remote_storage_config: None, - default_tenant_conf: TenantConf::default(), - broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), - broker_keepalive_interval: humantime::parse_duration( - storage_broker::DEFAULT_KEEPALIVE_INTERVAL - )?, - log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), - concurrent_tenant_warmup: ConfigurableSemaphore::new( - NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap() - ), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), - eviction_task_immitated_concurrent_logical_size_queries: - ConfigurableSemaphore::default(), - metric_collection_interval: humantime::parse_duration( - defaults::DEFAULT_METRIC_COLLECTION_INTERVAL - )?, - cached_metric_collection_interval: humantime::parse_duration( - defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL - )?, - metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, - synthetic_size_calculation_interval: humantime::parse_duration( - defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL - )?, - disk_usage_based_eviction: None, - test_remote_failures: 0, - 
ondemand_download_behavior_treat_error_as_warn: false, - background_task_maximum_delay: humantime::parse_duration( - defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY - )?, - control_plane_api: None, - control_plane_api_token: None, - control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, - secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, - ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, - }, - "Correct defaults should be used when no config values are provided" - ); - - Ok(()) + fn test_empty_config_toml_is_valid() { + // we use Default impl of everything in this situation + let input = r#" + "#; + let config_toml = toml_edit::de::from_str::(input) + .expect("empty config is valid"); + let workdir = Utf8PathBuf::from("/nonexistent"); + PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir) + .expect("parse_and_validate"); } #[test] - fn parse_basic_config() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - let broker_endpoint = storage_broker::DEFAULT_ENDPOINT; - - let config_string = format!( - "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'", - ); - let toml = config_string.parse()?; - - let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); - - assert_eq!( - parsed_config, - PageServerConf { - id: NodeId(10), - listen_pg_addr: "127.0.0.1:64000".to_string(), - listen_http_addr: "127.0.0.1:9898".to_string(), - availability_zone: None, - wait_lsn_timeout: Duration::from_secs(111), - wal_redo_timeout: Duration::from_secs(111), - superuser: "zzzz".to_string(), - page_cache_size: 444, - max_file_descriptors: 333, - workdir, - pg_distrib_dir, - http_auth_type: AuthType::Trust, - pg_auth_type: AuthType::Trust, - auth_validation_public_key_path: None, - remote_storage_config: None, - default_tenant_conf: TenantConf::default(), - broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), - broker_keepalive_interval: Duration::from_secs(5), - log_format: LogFormat::Json, - concurrent_tenant_warmup: ConfigurableSemaphore::new( - NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap() - ), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), - eviction_task_immitated_concurrent_logical_size_queries: - ConfigurableSemaphore::default(), - metric_collection_interval: Duration::from_secs(222), - cached_metric_collection_interval: Duration::from_secs(22200), - metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), - synthetic_size_calculation_interval: Duration::from_secs(333), - disk_usage_based_eviction: None, - test_remote_failures: 0, - ondemand_download_behavior_treat_error_as_warn: false, - background_task_maximum_delay: Duration::from_secs(334), - control_plane_api: None, - control_plane_api_token: None, - control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, - secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, - ingest_batch_size: 100, - }, - "Should be able to parse all basic config values correctly" - ); - - Ok(()) + fn test_compactl0_phase1_access_mode_is_ignored_silently() { + let input = indoc::indoc! 
{r#" + [compact_level0_phase1_value_access] + mode = "streaming-kmerge" + validate = "key-lsn-value" + "#}; + toml_edit::de::from_str::(input).unwrap(); } - #[test] - fn parse_remote_fs_storage_config() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - let broker_endpoint = "http://127.0.0.1:7777"; - - let local_storage_path = tempdir.path().join("local_remote_storage"); - - let identical_toml_declarations = &[ - format!( - r#"[remote_storage] -local_path = '{local_storage_path}'"#, - ), - format!("remote_storage={{local_path='{local_storage_path}'}}"), - ]; - - for remote_storage_config_str in identical_toml_declarations { - let config_string = format!( - r#"{ALL_BASE_VALUES_TOML} -pg_distrib_dir='{pg_distrib_dir}' -broker_endpoint = '{broker_endpoint}' - -{remote_storage_config_str}"#, - ); - - let toml = config_string.parse()?; - - let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for the local FS"); - - assert_eq!( - parsed_remote_storage_config, - RemoteStorageConfig { - storage: RemoteStorageKind::LocalFs(local_storage_path.clone()), - }, - "Remote storage config should correctly parse the local FS config and fill other storage defaults" - ); + /// If there's a typo in the pageserver config, we'd rather catch that typo + /// and fail pageserver startup than silently ignore it, leaving whoever + /// made it in the belief that their config change is effective. + /// + /// The default in serde is to allow unknown fields, so we rely + /// on developer+review discipline to add `deny_unknown_fields` when adding + /// new structs to the config, and on these tests as a regression test. + /// + /// The alternative to all of this would be to allow unknown fields in the config. + /// To catch them, we could have a config check tool or mgmt API endpoint that + /// compares the effective config with the TOML on disk and makes sure that + /// the on-disk TOML is a strict subset of the effective config. + mod unknown_fields_handling { + macro_rules!
test { + ($short_name:ident, $input:expr) => { + #[test] + fn $short_name() { + let input = $input; + let err = toml_edit::de::from_str::(&input) + .expect_err("some_invalid_field is an invalid field"); + dbg!(&err); + assert!(err.to_string().contains("some_invalid_field")); + } + }; } - Ok(()) - } + use indoc::indoc; - #[test] - fn parse_remote_s3_storage_config() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - - let bucket_name = "some-sample-bucket".to_string(); - let bucket_region = "eu-north-1".to_string(); - let prefix_in_bucket = "test_prefix".to_string(); - let endpoint = "http://localhost:5000".to_string(); - let max_concurrent_syncs = NonZeroUsize::new(111).unwrap(); - let max_sync_errors = NonZeroU32::new(222).unwrap(); - let s3_concurrency_limit = NonZeroUsize::new(333).unwrap(); - let broker_endpoint = "http://127.0.0.1:7777"; - - let identical_toml_declarations = &[ - format!( - r#"[remote_storage] -max_concurrent_syncs = {max_concurrent_syncs} -max_sync_errors = {max_sync_errors} -bucket_name = '{bucket_name}' -bucket_region = '{bucket_region}' -prefix_in_bucket = '{prefix_in_bucket}' -endpoint = '{endpoint}' -concurrency_limit = {s3_concurrency_limit}"# - ), - format!( - "remote_storage={{max_concurrent_syncs={max_concurrent_syncs}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\ - bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}", - ), - ]; - - for remote_storage_config_str in identical_toml_declarations { - let config_string = format!( - r#"{ALL_BASE_VALUES_TOML} -pg_distrib_dir='{pg_distrib_dir}' -broker_endpoint = '{broker_endpoint}' - -{remote_storage_config_str}"#, - ); - - let toml = config_string.parse()?; - - let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for S3"); - - assert_eq!( - parsed_remote_storage_config, - RemoteStorageConfig { - storage: RemoteStorageKind::AwsS3(S3Config { - bucket_name: bucket_name.clone(), - bucket_region: bucket_region.clone(), - prefix_in_bucket: Some(prefix_in_bucket.clone()), - endpoint: Some(endpoint.clone()), - concurrency_limit: s3_concurrency_limit, - max_keys_per_list_response: None, - }), - }, - "Remote storage config should correctly parse the S3 config" - ); - } - Ok(()) - } - - #[test] - fn parse_tenant_config() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - - let broker_endpoint = "http://127.0.0.1:7777"; - let trace_read_requests = true; - - let config_string = format!( - r#"{ALL_BASE_VALUES_TOML} -pg_distrib_dir='{pg_distrib_dir}' -broker_endpoint = '{broker_endpoint}' - -[tenant_config] -trace_read_requests = {trace_read_requests}"#, + test!( + toplevel, + indoc! {r#" + some_invalid_field = 23 + "#} ); - let toml = config_string.parse()?; - - let conf = PageServerConf::parse_and_validate(&toml, &workdir)?; - assert_eq!( - conf.default_tenant_conf.trace_read_requests, trace_read_requests, - "Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants", + test!( + toplevel_nested, + indoc! 
{r#" + [some_invalid_field] + foo = 23 + "#} ); - Ok(()) - } - - #[test] - fn parse_incorrect_tenant_config() -> anyhow::Result<()> { - let config_string = r#" - [tenant_config] - checkpoint_distance = -1 # supposed to be an u64 - "# - .to_string(); - - let toml: Document = config_string.parse()?; - let item = toml.get("tenant_config").unwrap(); - let error = TenantConfOpt::try_from(item.to_owned()).unwrap_err(); - - let expected_error_str = "checkpoint_distance: invalid value: integer `-1`, expected u64"; - assert_eq!(error.to_string(), expected_error_str); - - Ok(()) - } - - #[test] - fn parse_override_tenant_config() -> anyhow::Result<()> { - let config_string = r#"tenant_config={ min_resident_size_override = 400 }"#.to_string(); - - let toml: Document = config_string.parse()?; - let item = toml.get("tenant_config").unwrap(); - let conf = TenantConfOpt::try_from(item.to_owned()).unwrap(); - - assert_eq!(conf.min_resident_size_override, Some(400)); - - Ok(()) - } - - #[test] - fn eviction_pageserver_config_parse() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - - let pageserver_conf_toml = format!( - r#"pg_distrib_dir = "{pg_distrib_dir}" -metric_collection_endpoint = "http://sample.url" -metric_collection_interval = "10min" -id = 222 - -[disk_usage_based_eviction] -max_usage_pct = 80 -min_avail_bytes = 0 -period = "10s" - -[tenant_config] -evictions_low_residence_duration_metric_threshold = "20m" - -[tenant_config.eviction_policy] -kind = "LayerAccessThreshold" -period = "20m" -threshold = "20m" -"#, + test!( + disk_usage_based_eviction, + indoc! {r#" + [disk_usage_based_eviction] + some_invalid_field = 23 + "#} ); - let toml: Document = pageserver_conf_toml.parse()?; - let conf = PageServerConf::parse_and_validate(&toml, &workdir)?; - assert_eq!(conf.pg_distrib_dir, pg_distrib_dir); - assert_eq!( - conf.metric_collection_endpoint, - Some("http://sample.url".parse().unwrap()) + test!( + tenant_config, + indoc! {r#" + [tenant_config] + some_invalid_field = 23 + "#} ); - assert_eq!( - conf.metric_collection_interval, - Duration::from_secs(10 * 60) + + test!( + l0_flush, + indoc! 
{r#" + [l0_flush] + mode = "direct" + some_invalid_field = 23 + "#} ); - assert_eq!( - conf.default_tenant_conf - .evictions_low_residence_duration_metric_threshold, - Duration::from_secs(20 * 60) - ); - assert_eq!(conf.id, NodeId(222)); - assert_eq!( - conf.disk_usage_based_eviction, - Some(DiskUsageEvictionTaskConfig { - max_usage_pct: Percent::new(80).unwrap(), - min_avail_bytes: 0, - period: Duration::from_secs(10), - #[cfg(feature = "testing")] - mock_statvfs: None, - eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed, - }) - ); - match &conf.default_tenant_conf.eviction_policy { - EvictionPolicy::NoEviction => panic!("Unexpected eviction opolicy tenant settings"), - EvictionPolicy::LayerAccessThreshold(eviction_thresold) => { - assert_eq!(eviction_thresold.period, Duration::from_secs(20 * 60)); - assert_eq!(eviction_thresold.threshold, Duration::from_secs(20 * 60)); - } - } - Ok(()) - } - - fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> { - let tempdir_path = tempdir.path(); - - let workdir = tempdir_path.join("workdir"); - fs::create_dir_all(&workdir)?; - - let pg_distrib_dir = tempdir_path.join("pg_distrib"); - let pg_distrib_dir_versioned = pg_distrib_dir.join(format!("v{DEFAULT_PG_VERSION}")); - fs::create_dir_all(&pg_distrib_dir_versioned)?; - let postgres_bin_dir = pg_distrib_dir_versioned.join("bin"); - fs::create_dir_all(&postgres_bin_dir)?; - fs::write(postgres_bin_dir.join("postgres"), "I'm postgres, trust me")?; - - Ok((workdir, pg_distrib_dir)) + // TODO: fix this => https://github.com/neondatabase/neon/issues/8915 + // test!( + // remote_storage_config, + // indoc! {r#" + // [remote_storage_config] + // local_path = "/nonexistent" + // some_invalid_field = 23 + // "#} + // ); } } diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 012a950b60..64a267e0e4 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -1,12 +1,18 @@ //! Periodically collect consumption metrics for all active tenants //! and push them to a HTTP endpoint. +use crate::config::PageServerConf; +use crate::consumption_metrics::metrics::MetricsKey; +use crate::consumption_metrics::upload::KeyGen as _; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::size::CalculateSyntheticSizeError; use crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant}; +use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant}; use camino::Utf8PathBuf; use consumption_metrics::EventType; +use itertools::Itertools as _; use pageserver_api::models::TenantState; +use remote_storage::{GenericRemoteStorage, RemoteStorageConfig}; use reqwest::Url; use std::collections::HashMap; use std::sync::Arc; @@ -16,9 +22,8 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::NodeId; -mod metrics; -use metrics::MetricsKey; mod disk_cache; +mod metrics; mod upload; const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); @@ -37,50 +42,74 @@ type RawMetric = (MetricsKey, (EventType, u64)); /// for deduplication, but that is no longer needed. 
type Cache = HashMap; +pub async fn run( + conf: &'static PageServerConf, + tenant_manager: Arc, + cancel: CancellationToken, +) { + let Some(metric_collection_endpoint) = conf.metric_collection_endpoint.as_ref() else { + return; + }; + + let local_disk_storage = conf.workdir.join("last_consumption_metrics.json"); + + let metrics_ctx = RequestContext::todo_child( + TaskKind::MetricsCollection, + // This task itself shouldn't download anything. + // The actual size calculation does need downloads, and + // creates a child context with the right DownloadBehavior. + DownloadBehavior::Error, + ); + let collect_metrics = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "consumption metrics collection", + collect_metrics( + tenant_manager.clone(), + metric_collection_endpoint, + &conf.metric_collection_bucket, + conf.metric_collection_interval, + conf.id, + local_disk_storage, + cancel.clone(), + metrics_ctx, + ) + .instrument(info_span!("metrics_collection")), + )); + + let worker_ctx = + RequestContext::todo_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); + let synthetic_size_worker = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "synthetic size calculation", + calculate_synthetic_size_worker( + tenant_manager.clone(), + conf.synthetic_size_calculation_interval, + cancel.clone(), + worker_ctx, + ) + .instrument(info_span!("synthetic_size_worker")), + )); + + let (collect_metrics, synthetic_size_worker) = + futures::future::join(collect_metrics, synthetic_size_worker).await; + collect_metrics + .expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process"); + synthetic_size_worker + .expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process"); +} + /// Main thread that serves metrics collection #[allow(clippy::too_many_arguments)] -pub async fn collect_metrics( +async fn collect_metrics( + tenant_manager: Arc, metric_collection_endpoint: &Url, + metric_collection_bucket: &Option, metric_collection_interval: Duration, - _cached_metric_collection_interval: Duration, - synthetic_size_calculation_interval: Duration, node_id: NodeId, local_disk_storage: Utf8PathBuf, cancel: CancellationToken, ctx: RequestContext, ) -> anyhow::Result<()> { - if _cached_metric_collection_interval != Duration::ZERO { - tracing::warn!( - "cached_metric_collection_interval is no longer used, please set it to zero." - ) - } - - // spin up background worker that caclulates tenant sizes - let worker_ctx = - ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - TaskKind::CalculateSyntheticSize, - None, - None, - "synthetic size calculation", - false, - async move { - calculate_synthetic_size_worker( - synthetic_size_calculation_interval, - &cancel, - &worker_ctx, - ) - .instrument(info_span!("synthetic_size_worker")) - .await?; - Ok(()) - }, - ); - let path: Arc = Arc::new(local_disk_storage); - let cancel = task_mgr::shutdown_token(); - let restore_and_reschedule = restore_and_reschedule(&path, metric_collection_interval); let mut cached_metrics = tokio::select! 
{ @@ -94,13 +123,33 @@ pub async fn collect_metrics( .build() .expect("Failed to create http client with timeout"); + let bucket_client = if let Some(bucket_config) = metric_collection_bucket { + match GenericRemoteStorage::from_config(bucket_config).await { + Ok(client) => Some(client), + Err(e) => { + // Non-fatal error: if we were given an invalid config, we will proceed + // with sending metrics over the network, but not to S3. + tracing::warn!("Invalid configuration for metric_collection_bucket: {e}"); + None + } + } + } else { + None + }; + let node_id = node_id.to_string(); loop { let started_at = Instant::now(); // these are point in time, with variable "now" - let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await; + let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await; + + // Pre-generate event idempotency keys, to reuse them across the bucket + // and HTTP sinks. + let idempotency_keys = std::iter::repeat_with(|| node_id.as_str().generate()) + .take(metrics.len()) + .collect_vec(); let metrics = Arc::new(metrics); @@ -118,21 +167,35 @@ pub async fn collect_metrics( tracing::error!("failed to persist metrics to {path:?}: {e:#}"); } } + + if let Some(bucket_client) = &bucket_client { + let res = upload::upload_metrics_bucket( + bucket_client, + &cancel, + &node_id, + &metrics, + &idempotency_keys, + ) + .await; + if let Err(e) = res { + tracing::error!("failed to upload to S3: {e:#}"); + } + } }; let upload = async { - let res = upload::upload_metrics( + let res = upload::upload_metrics_http( &client, metric_collection_endpoint, &cancel, - &node_id, &metrics, &mut cached_metrics, + &idempotency_keys, ) .await; if let Err(e) = res { // serialization error which should never happen - tracing::error!("failed to upload due to {e:#}"); + tracing::error!("failed to upload via HTTP due to {e:#}"); } }; @@ -145,11 +208,9 @@ pub async fn collect_metrics( BackgroundLoopKind::ConsumptionMetricsCollectMetrics, ); - let res = tokio::time::timeout_at( - started_at + metric_collection_interval, - task_mgr::shutdown_token().cancelled(), - ) - .await; + let res = + tokio::time::timeout_at(started_at + metric_collection_interval, cancel.cancelled()) + .await; if res.is_ok() { return Ok(()); } @@ -247,9 +308,10 @@ async fn reschedule( /// Caclculate synthetic size for each active tenant async fn calculate_synthetic_size_worker( + tenant_manager: Arc, synthetic_size_calculation_interval: Duration, - cancel: &CancellationToken, - ctx: &RequestContext, + cancel: CancellationToken, + ctx: RequestContext, ) -> anyhow::Result<()> { info!("starting calculate_synthetic_size_worker"); scopeguard::defer! { @@ -259,7 +321,7 @@ async fn calculate_synthetic_size_worker( loop { let started_at = Instant::now(); - let tenants = match mgr::list_tenants().await { + let tenants = match tenant_manager.list_tenants() { Ok(tenants) => tenants, Err(e) => { warn!("cannot get tenant list: {e:#}"); @@ -272,20 +334,24 @@ async fn calculate_synthetic_size_worker( continue; } - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // We only send consumption metrics from shard 0, so don't waste time calculating // synthetic size on other shards. 
continue; } - let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else { + let Ok(tenant) = tenant_manager.get_attached_tenant_shard(tenant_shard_id) else { continue; }; + if !tenant.is_active() { + continue; + } + // there is never any reason to exit calculate_synthetic_size_worker following any // return value -- we don't need to care about shutdown because no tenant is found when // pageserver is shut down. - calculate_and_log(&tenant, cancel, ctx).await; + calculate_and_log(&tenant, &cancel, &ctx).await; } crate::tenant::tasks::warn_when_period_overrun( @@ -314,21 +380,12 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re // Same for the loop that fetches computed metrics. // By using the same limiter, we centralize metrics collection for "start" and "finished" counters, // which turns out is really handy to understand the system. - let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else { - return; - }; - - // this error can be returned if timeline is shutting down, but it does not - // mean the synthetic size worker should terminate. we do not need any checks - // in this function because `mgr::get_tenant` will error out after shutdown has - // progressed to shutting down tenants. - let shutting_down = matches!( - e.downcast_ref::(), - Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_)) - ); - - if !shutting_down { - let tenant_shard_id = tenant.tenant_shard_id(); - error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}"); + match tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await { + Ok(_) => {} + Err(CalculateSyntheticSizeError::Cancelled) => {} + Err(e) => { + let tenant_shard_id = tenant.tenant_shard_id(); + error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}"); + } } } diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 26b299a71d..7ba2d04c4f 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -1,3 +1,4 @@ +use crate::tenant::mgr::TenantManager; use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize}; use chrono::{DateTime, Utc}; use consumption_metrics::EventType; @@ -181,6 +182,7 @@ impl MetricsKey { } pub(super) async fn collect_all_metrics( + tenant_manager: &Arc, cached_metrics: &Cache, ctx: &RequestContext, ) -> Vec { @@ -188,7 +190,7 @@ pub(super) async fn collect_all_metrics( let started_at = std::time::Instant::now(); - let tenants = match crate::tenant::mgr::list_tenants().await { + let tenants = match tenant_manager.list_tenants() { Ok(tenants) => tenants, Err(err) => { tracing::error!("failed to list tenants: {:?}", err); @@ -197,10 +199,11 @@ pub(super) async fn collect_all_metrics( }; let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move { - if state != TenantState::Active || !id.is_zero() { + if state != TenantState::Active || !id.is_shard_zero() { None } else { - crate::tenant::mgr::get_tenant(id, true) + tenant_manager + .get_attached_tenant_shard(id) .ok() .map(|tenant| (id.tenant_id, tenant)) } diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs index 38a4c9eb5d..f9cbcea565 100644 --- a/pageserver/src/consumption_metrics/metrics/tests.rs +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -1,7 +1,5 @@ use super::*; use std::collections::HashMap; -use 
std::time::SystemTime; -use utils::lsn::Lsn; #[test] fn startup_collected_timeline_metrics_before_advancing() { diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 322ed95cc8..0325ee403a 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -1,4 +1,9 @@ +use std::time::SystemTime; + +use chrono::{DateTime, Utc}; use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE}; +use remote_storage::{GenericRemoteStorage, RemotePath}; +use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::Instrument; @@ -13,21 +18,22 @@ struct Ids { pub(super) timeline_id: Option, } +/// Serialize and write metrics to an HTTP endpoint #[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))] -pub(super) async fn upload_metrics( +pub(super) async fn upload_metrics_http( client: &reqwest::Client, metric_collection_endpoint: &reqwest::Url, cancel: &CancellationToken, - node_id: &str, metrics: &[RawMetric], cached_metrics: &mut Cache, + idempotency_keys: &[IdempotencyKey<'_>], ) -> anyhow::Result<()> { let mut uploaded = 0; let mut failed = 0; let started_at = std::time::Instant::now(); - let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, node_id); + let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys); while let Some(res) = iter.next() { let (chunk, body) = res?; @@ -74,29 +80,86 @@ pub(super) async fn upload_metrics( Ok(()) } -// The return type is quite ugly, but we gain testability in isolation -fn serialize_in_chunks<'a, F>( +/// Serialize and write metrics to a remote storage object +#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))] +pub(super) async fn upload_metrics_bucket( + client: &GenericRemoteStorage, + cancel: &CancellationToken, + node_id: &str, + metrics: &[RawMetric], + idempotency_keys: &[IdempotencyKey<'_>], +) -> anyhow::Result<()> { + if metrics.is_empty() { + // Skip uploads if we have no metrics, so that readers don't have to handle the edge case + // of an empty object. + return Ok(()); + } + + // Compose object path + let datetime: DateTime = SystemTime::now().into(); + let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/%H:%M:%SZ"); + let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?; + + // Set up a gzip writer into a buffer + let mut compressed_bytes: Vec = Vec::new(); + let compressed_writer = std::io::Cursor::new(&mut compressed_bytes); + let mut gzip_writer = async_compression::tokio::write::GzipEncoder::new(compressed_writer); + + // Serialize and write into compressed buffer + let started_at = std::time::Instant::now(); + for res in serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys) { + let (_chunk, body) = res?; + gzip_writer.write_all(&body).await?; + } + gzip_writer.flush().await?; + gzip_writer.shutdown().await?; + let compressed_length = compressed_bytes.len(); + + // Write to remote storage + client + .upload_storage_object( + futures::stream::once(futures::future::ready(Ok(compressed_bytes.into()))), + compressed_length, + &path, + cancel, + ) + .await?; + let elapsed = started_at.elapsed(); + + tracing::info!( + compressed_length, + elapsed_ms = elapsed.as_millis(), + "write metrics bucket at {path}", + ); + + Ok(()) +} + +/// Serializes the input metrics as JSON in chunks of chunk_size. 
The provided +/// idempotency keys are injected into the corresponding metric events (reused +/// across different metrics sinks), and must have the same length as input. +fn serialize_in_chunks<'a>( chunk_size: usize, input: &'a [RawMetric], - factory: F, + idempotency_keys: &'a [IdempotencyKey<'a>], ) -> impl ExactSizeIterator> + 'a -where - F: KeyGen<'a> + 'a, { use bytes::BufMut; - struct Iter<'a, F> { + assert_eq!(input.len(), idempotency_keys.len()); + + struct Iter<'a> { inner: std::slice::Chunks<'a, RawMetric>, + idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>, chunk_size: usize, // write to a BytesMut so that we can cheaply clone the frozen Bytes for retries buffer: bytes::BytesMut, // chunk amount of events are reused to produce the serialized document scratch: Vec>, - factory: F, } - impl<'a, F: KeyGen<'a>> Iterator for Iter<'a, F> { + impl<'a> Iterator for Iter<'a> { type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>; fn next(&mut self) -> Option { @@ -107,17 +170,14 @@ where self.scratch.extend( chunk .iter() - .map(|raw_metric| raw_metric.as_event(&self.factory.generate())), + .zip(&mut self.idempotency_keys) + .map(|(raw_metric, key)| raw_metric.as_event(key)), ); } else { // next rounds: update_in_place to reuse allocations assert_eq!(self.scratch.len(), self.chunk_size); - self.scratch - .iter_mut() - .zip(chunk.iter()) - .for_each(|(slot, raw_metric)| { - raw_metric.update_in_place(slot, &self.factory.generate()) - }); + itertools::izip!(self.scratch.iter_mut(), chunk, &mut self.idempotency_keys) + .for_each(|(slot, raw_metric, key)| raw_metric.update_in_place(slot, key)); } let res = serde_json::to_writer( @@ -138,18 +198,19 @@ where } } - impl<'a, F: KeyGen<'a>> ExactSizeIterator for Iter<'a, F> {} + impl<'a> ExactSizeIterator for Iter<'a> {} let buffer = bytes::BytesMut::new(); let inner = input.chunks(chunk_size); + let idempotency_keys = idempotency_keys.iter(); let scratch = Vec::new(); Iter { inner, + idempotency_keys, chunk_size, buffer, scratch, - factory, } } @@ -208,7 +269,7 @@ impl RawMetricExt for RawMetric { } } -trait KeyGen<'a>: Copy { +pub(crate) trait KeyGen<'a> { fn generate(&self) -> IdempotencyKey<'a>; } @@ -262,35 +323,33 @@ async fn upload( ) -> Result<(), UploadError> { let warn_after = 3; let max_attempts = 10; + + // this is used only with tests so far + let last_value = if is_last { "true" } else { "false" }; + let res = utils::backoff::retry( - move || { - let body = body.clone(); - async move { - let res = client - .post(metric_collection_endpoint.clone()) - .header(reqwest::header::CONTENT_TYPE, "application/json") - .header( - LAST_IN_BATCH.clone(), - if is_last { "true" } else { "false" }, - ) - .body(body) - .send() - .await; + || async { + let res = client + .post(metric_collection_endpoint.clone()) + .header(reqwest::header::CONTENT_TYPE, "application/json") + .header(LAST_IN_BATCH.clone(), last_value) + .body(body.clone()) + .send() + .await; - let res = res.and_then(|res| res.error_for_status()); + let res = res.and_then(|res| res.error_for_status()); - // 10 redirects are normally allowed, so we don't need worry about 3xx - match res { - Ok(_response) => Ok(()), - Err(e) => { - let status = e.status().filter(|s| s.is_client_error()); - if let Some(status) = status { - // rejection used to be a thing when the server could reject a - // whole batch of metrics if one metric was bad. 
-                            Err(UploadError::Rejected(status))
-                        } else {
-                            Err(UploadError::Reqwest(e))
-                        }
+            // 10 redirects are normally allowed, so we don't need to worry about 3xx
+            match res {
+                Ok(_response) => Ok(()),
+                Err(e) => {
+                    let status = e.status().filter(|s| s.is_client_error());
+                    if let Some(status) = status {
+                        // rejection used to be a thing when the server could reject a
+                        // whole batch of metrics if one metric was bad.
+                        Err(UploadError::Rejected(status))
+                    } else {
+                        Err(UploadError::Reqwest(e))
                     }
                 }
             }
@@ -299,9 +358,11 @@ async fn upload(
         warn_after,
         max_attempts,
         "upload consumption_metrics",
-        utils::backoff::Cancel::new(cancel.clone(), || UploadError::Cancelled),
+        cancel,
     )
-    .await;
+    .await
+    .ok_or_else(|| UploadError::Cancelled)
+    .and_then(|x| x);
 
     match &res {
         Ok(_) => {}
@@ -329,7 +390,10 @@ mod tests {
         let examples = metric_samples();
         assert!(examples.len() > 1);
 
-        let factory = FixedGen::new(Utc::now(), "1", 42);
+        let now = Utc::now();
+        let idempotency_keys = (0..examples.len())
+            .map(|i| FixedGen::new(now, "1", i as u16).generate())
+            .collect::<Vec<_>>();
 
         // need to use Event here because serde_json::Value uses default hashmap, not linked
        // hashmap
@@ -338,13 +402,13 @@
        #[derive(serde::Deserialize)]
        struct EventChunk {
            events: Vec<Event<Ids, Name>>,
        }
 
-        let correct = serialize_in_chunks(examples.len(), &examples, factory)
+        let correct = serialize_in_chunks(examples.len(), &examples, &idempotency_keys)
             .map(|res| res.unwrap().1)
             .flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
             .collect::<Vec<_>>();
 
         for chunk_size in 1..examples.len() {
-            let actual = serialize_in_chunks(chunk_size, &examples, factory)
+            let actual = serialize_in_chunks(chunk_size, &examples, &idempotency_keys)
                 .map(|res| res.unwrap().1)
                 .flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
                 .collect::<Vec<_>>();
diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs
index ee331ea154..7afcf52cf2 100644
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -1,7 +1,9 @@
-//! This module defines `RequestContext`, a structure that we use throughout
-//! the pageserver to propagate high-level context from places
-//! that _originate_ activity down to the shared code paths at the
-//! heart of the pageserver. It's inspired by Golang's `context.Context`.
+//! Defines [`RequestContext`].
+//!
+//! It is a structure that we use throughout the pageserver to propagate
+//! high-level context from places that _originate_ activity down to the
+//! shared code paths at the heart of the pageserver. It's inspired by
+//! Golang's `context.Context`.
 //!
 //! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions:
 //! 1. What high-level activity ([`TaskKind`]) needs this page?
@@ -59,6 +61,7 @@
 //! 1. It should be easy to forward the context to callees.
 //! 2. To propagate more data from high-level to low-level code, the functions in
 //!    the middle should not need to be modified.
+//!
 //! The solution is to have a container structure ([`RequestContext`]) that
 //! carries the information. Functions that don't care about what's in it
 //! pass it along to callees.
@@ -88,21 +91,26 @@
 
 use crate::task_mgr::TaskKind;
 
+pub(crate) mod optional_counter;
+
 // The main structure of this module, see module-level comment.
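// A hypothetical usage sketch (the builder methods below are assumed for
// illustration; only `RequestContextBuilder::extend` is visible in this hunk):
// deriving a child context that tags page-cache accesses might look like
//     let child = RequestContextBuilder::extend(&parent)
//         .page_content_kind(PageContentKind::DeltaLayerValue)
//         .build();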
-#[derive(Clone, Debug)]
+#[derive(Debug)]
 pub struct RequestContext {
     task_kind: TaskKind,
     download_behavior: DownloadBehavior,
     access_stats_behavior: AccessStatsBehavior,
     page_content_kind: PageContentKind,
+    pub micros_spent_throttled: optional_counter::MicroSecondsCounterU32,
 }
 
 /// The kind of access to the page cache.
 #[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
 pub enum PageContentKind {
     Unknown,
+    DeltaLayerSummary,
     DeltaLayerBtreeNode,
     DeltaLayerValue,
+    ImageLayerSummary,
     ImageLayerBtreeNode,
     ImageLayerValue,
     InMemoryLayer,
@@ -150,6 +158,7 @@ impl RequestContextBuilder {
                 download_behavior: DownloadBehavior::Download,
                 access_stats_behavior: AccessStatsBehavior::Update,
                 page_content_kind: PageContentKind::Unknown,
+                micros_spent_throttled: Default::default(),
             },
         }
     }
@@ -163,6 +172,7 @@ impl RequestContextBuilder {
                 download_behavior: original.download_behavior,
                 access_stats_behavior: original.access_stats_behavior,
                 page_content_kind: original.page_content_kind,
+                micros_spent_throttled: Default::default(),
             },
         }
     }
diff --git a/pageserver/src/context/optional_counter.rs b/pageserver/src/context/optional_counter.rs
new file mode 100644
index 0000000000..100c649f18
--- /dev/null
+++ b/pageserver/src/context/optional_counter.rs
@@ -0,0 +1,101 @@
+use std::{
+    sync::atomic::{AtomicU32, Ordering},
+    time::Duration,
+};
+
+#[derive(Debug)]
+pub struct CounterU32 {
+    inner: AtomicU32,
+}
+impl Default for CounterU32 {
+    fn default() -> Self {
+        Self {
+            inner: AtomicU32::new(u32::MAX),
+        }
+    }
+}
+impl CounterU32 {
+    pub fn open(&self) -> Result<(), &'static str> {
+        match self
+            .inner
+            .compare_exchange(u32::MAX, 0, Ordering::Relaxed, Ordering::Relaxed)
+        {
+            Ok(_) => Ok(()),
+            Err(_) => Err("open() called on closed state"),
+        }
+    }
+    pub fn close(&self) -> Result<u32, &'static str> {
+        match self.inner.swap(u32::MAX, Ordering::Relaxed) {
+            u32::MAX => Err("close() called on closed state"),
+            x => Ok(x),
+        }
+    }
+
+    pub fn add(&self, count: u32) -> Result<(), &'static str> {
+        if count == 0 {
+            return Ok(());
+        }
+        let mut had_err = None;
+        self.inner
+            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| match cur {
+                u32::MAX => {
+                    had_err = Some("add() called on closed state");
+                    None
+                }
+                x => {
+                    let (new, overflowed) = x.overflowing_add(count);
+                    if new == u32::MAX || overflowed {
+                        had_err = Some("add() overflowed the counter");
+                        None
+                    } else {
+                        Some(new)
+                    }
+                }
+            })
+            .map_err(|_| had_err.expect("we set it whenever the function returns None"))
+            .map(|_| ())
+    }
+}
+
+#[derive(Default, Debug)]
+pub struct MicroSecondsCounterU32 {
+    inner: CounterU32,
+}
+
+impl MicroSecondsCounterU32 {
+    pub fn open(&self) -> Result<(), &'static str> {
+        self.inner.open()
+    }
+    pub fn add(&self, duration: Duration) -> Result<(), &'static str> {
+        match duration.as_micros().try_into() {
+            Ok(x) => self.inner.add(x),
+            Err(_) => Err("add(): duration conversion error"),
+        }
+    }
+    pub fn close_and_checked_sub_from(&self, from: Duration) -> Result<Duration, &'static str> {
+        let val = self.inner.close()?;
+        let val = Duration::from_micros(val as u64);
+        let subbed = match from.checked_sub(val) {
+            Some(v) => v,
+            None => return Err("Duration::checked_sub"),
+        };
+        Ok(subbed)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+
+    #[test]
+    fn test_basic() {
+        let counter = MicroSecondsCounterU32::default();
+        counter.open().unwrap();
+        counter.add(Duration::from_micros(23)).unwrap();
+        let res = counter
+            .close_and_checked_sub_from(Duration::from_micros(42))
+            .unwrap();
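        // Worked example for the assertion that follows: 23us are recorded while
        // the counter is open, so subtracting from a 42us total must leave 19us.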
+        assert_eq!(res, Duration::from_micros(42 - 23));
+    }
+}
diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs
index 950791ea48..f6d1c35a8c 100644
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -2,17 +2,20 @@ use std::collections::HashMap;
 
 use futures::Future;
 use pageserver_api::{
-    control_api::{
-        ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
-    },
+    controller_api::NodeRegisterRequest,
     shard::TenantShardId,
+    upcall_api::{
+        ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
+        ValidateRequestTenant, ValidateResponse,
+    },
 };
 use serde::{de::DeserializeOwned, Serialize};
 use tokio_util::sync::CancellationToken;
 use url::Url;
-use utils::{backoff, generation::Generation, id::NodeId};
+use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
 
-use crate::config::PageServerConf;
+use crate::{config::PageServerConf, virtual_file::on_fatal_io_error};
+use pageserver_api::config::NodeMetadata;
 
 /// The Pageserver's client for using the control plane API: this is a small subset
 /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
@@ -32,7 +35,10 @@ pub enum RetryForeverError {
 pub trait ControlPlaneGenerationsApi {
     fn re_attach(
         &self,
-    ) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
+        conf: &PageServerConf,
+    ) -> impl Future<
+        Output = Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError>,
+    > + Send;
     fn validate(
         &self,
         tenants: Vec<(TenantShardId, Generation)>,
@@ -57,7 +63,7 @@ impl ControlPlaneClient {
         let mut client = reqwest::ClientBuilder::new();
 
         if let Some(jwt) = &conf.control_plane_api_token {
-            let mut headers = hyper::HeaderMap::new();
+            let mut headers = reqwest::header::HeaderMap::new();
             headers.insert(
                 "Authorization",
                 format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
@@ -82,61 +88,108 @@ impl ControlPlaneClient {
         R: Serialize,
         T: DeserializeOwned,
     {
-        #[derive(thiserror::Error, Debug)]
-        enum RemoteAttemptError {
-            #[error("shutdown")]
-            Shutdown,
-            #[error("remote: {0}")]
-            Remote(reqwest::Error),
-        }
-
-        match backoff::retry(
+        let res = backoff::retry(
             || async {
                 let response = self
                     .http_client
                     .post(url.clone())
                     .json(&request)
                     .send()
-                    .await
-                    .map_err(RemoteAttemptError::Remote)?;
+                    .await?;
 
-                response
-                    .error_for_status_ref()
-                    .map_err(RemoteAttemptError::Remote)?;
-                response
-                    .json::<T>()
-                    .await
-                    .map_err(RemoteAttemptError::Remote)
+                response.error_for_status_ref()?;
+                response.json::<T>().await
             },
             |_| false,
             3,
             u32::MAX,
             "calling control plane generation validation API",
-            backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown),
+            &self.cancel,
         )
         .await
-        {
-            Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown),
-            Err(RemoteAttemptError::Remote(_)) => {
-                panic!("We retry forever, this should never be reached");
-            }
-            Ok(r) => Ok(r),
-        }
+        .ok_or(RetryForeverError::ShuttingDown)?
+        .expect("We retry forever, this should never be reached");
+
+        Ok(res)
     }
 }
 
 impl ControlPlaneGenerationsApi for ControlPlaneClient {
     /// Block until we get a successful response, or error out if we are shut down
-    async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
+    async fn re_attach(
+        &self,
+        conf: &PageServerConf,
+    ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
         let re_attach_path = self
             .base_url
             .join("re-attach")
             .expect("Failed to build re-attach path");
-        let request = ReAttachRequest {
-            node_id: self.node_id,
+
+        // Include registration content in the re-attach request if a metadata file is readable
+        let metadata_path = conf.metadata_path();
+        let register = match tokio::fs::read_to_string(&metadata_path).await {
+            Ok(metadata_str) => match serde_json::from_str::<NodeMetadata>(&metadata_str) {
+                Ok(m) => {
+                    // Since we run only once at startup, be generous in our logging and
+                    // dump all metadata.
+                    tracing::info!(
+                        "Loaded node metadata: postgres {}:{}, http {}:{}, other fields: {:?}",
+                        m.postgres_host,
+                        m.postgres_port,
+                        m.http_host,
+                        m.http_port,
+                        m.other
+                    );
+
+                    let az_id = {
+                        let az_id_from_metadata = m
+                            .other
+                            .get("availability_zone_id")
+                            .and_then(|jv| jv.as_str().map(|str| str.to_owned()));
+
+                        match az_id_from_metadata {
+                            Some(az_id) => Some(az_id),
+                            None => {
+                                tracing::warn!("metadata.json does not contain an 'availability_zone_id' field");
+                                conf.availability_zone.clone()
+                            }
+                        }
+                    };
+
+                    if az_id.is_none() {
+                        panic!("Availability zone id could not be inferred from metadata.json or pageserver config");
+                    }
+
+                    Some(NodeRegisterRequest {
+                        node_id: conf.id,
+                        listen_pg_addr: m.postgres_host,
+                        listen_pg_port: m.postgres_port,
+                        listen_http_addr: m.http_host,
+                        listen_http_port: m.http_port,
+                        availability_zone_id: az_id.expect("Checked above"),
+                    })
+                }
+                Err(e) => {
+                    tracing::error!("Unreadable metadata in {metadata_path}: {e}");
+                    None
+                }
+            },
+            Err(e) => {
+                if e.kind() == std::io::ErrorKind::NotFound {
+                    // This is legal: we may have been deployed with some external script
+                    // doing registration for us.
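                    // For contrast, a metadata.json that would take the
                    // self-registration path above could look like (illustrative
                    // values only):
                    //   {"postgres_host": "10.0.0.5", "postgres_port": 5432,
                    //    "http_host": "10.0.0.5", "http_port": 9898,
                    //    "other": {"availability_zone_id": "eu-west-1a"}}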
+ tracing::info!("Metadata file not found at {metadata_path}"); + } else { + on_fatal_io_error(&e, &format!("Loading metadata at {metadata_path}")) + } + None + } }; - fail::fail_point!("control-plane-client-re-attach"); + let request = ReAttachRequest { + node_id: self.node_id, + register, + }; let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?; tracing::info!( @@ -144,10 +197,12 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { response.tenants.len() ); + failpoint_support::sleep_millis_async!("control-plane-client-re-attach"); + Ok(response .tenants .into_iter() - .map(|t| (t.id, Generation::new(t.gen))) + .map(|rart| (rart.id, rart)) .collect::>()) } @@ -173,7 +228,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { .collect(), }; - fail::fail_point!("control-plane-client-validate"); + failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel); + if self.cancel.is_cancelled() { + return Err(RetryForeverError::ShuttingDown); + } let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?; diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 6a820e1bdc..22f7d5b824 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -20,10 +20,9 @@ use remote_storage::{GenericRemoteStorage, RemotePath}; use serde::Deserialize; use serde::Serialize; use thiserror::Error; -use tokio; use tokio_util::sync::CancellationToken; use tracing::Instrument; -use tracing::{self, debug, error}; +use tracing::{debug, error}; use utils::crashsafe::path_with_suffix_extension; use utils::generation::Generation; use utils::id::TimelineId; @@ -39,7 +38,7 @@ use deleter::DeleterMessage; use list_writer::ListWriterQueueMessage; use validator::ValidatorQueueMessage; -use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName}; +use crate::{config::PageServerConf, tenant::storage_layer::LayerName}; // TODO: configurable for how long to wait before executing deletions @@ -234,7 +233,7 @@ impl DeletionHeader { let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?; let header_path = conf.deletion_header_path(); let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX); - VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes) + VirtualFile::crashsafe_overwrite(header_path, temp_path, header_bytes) .await .maybe_fatal_err("save deletion header")?; @@ -312,7 +311,7 @@ impl DeletionList { result.extend( timeline_layers .into_iter() - .map(|l| timeline_remote_path.join(&Utf8PathBuf::from(l))), + .map(|l| timeline_remote_path.join(Utf8PathBuf::from(l))), ); } } @@ -325,7 +324,8 @@ impl DeletionList { let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX); let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list"); - VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes) + + VirtualFile::crashsafe_overwrite(path, temp_path, bytes) .await .maybe_fatal_err("save deletion list") .map_err(Into::into) @@ -382,17 +382,6 @@ pub enum DeletionQueueError { } impl DeletionQueueClient { - pub(crate) fn broken() -> Self { - // Channels whose receivers are immediately dropped. - let (tx, _rx) = tokio::sync::mpsc::unbounded_channel(); - let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1); - Self { - tx, - executor_tx, - lsn_table: Arc::default(), - } - } - /// This is cancel-safe. 
If you drop the future before it completes, the message /// is not pushed, although in the context of the deletion queue it doesn't matter: once /// we decide to do a deletion the decision is always final. @@ -479,7 +468,7 @@ impl DeletionQueueClient { tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { if current_generation.is_none() { debug!("Enqueuing deletions in legacy mode, skipping queue"); @@ -511,7 +500,7 @@ impl DeletionQueueClient { tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { metrics::DELETION_QUEUE .keys_submitted @@ -632,7 +621,7 @@ impl DeletionQueue { /// /// If remote_storage is None, then the returned workers will also be None. pub fn new( - remote_storage: Option, + remote_storage: GenericRemoteStorage, control_plane_client: Option, conf: &'static PageServerConf, ) -> (Self, Option>) @@ -658,23 +647,6 @@ impl DeletionQueue { // longer to flush after Tenants have all been torn down. let cancel = CancellationToken::new(); - let remote_storage = match remote_storage { - None => { - return ( - Self { - client: DeletionQueueClient { - tx, - executor_tx, - lsn_table: lsn_table.clone(), - }, - cancel, - }, - None, - ) - } - Some(r) => r, - }; - ( Self { client: DeletionQueueClient { @@ -700,8 +672,6 @@ impl DeletionQueue { } pub async fn shutdown(&mut self, timeout: Duration) { - self.cancel.cancel(); - match tokio::time::timeout(timeout, self.client.flush()).await { Ok(Ok(())) => { tracing::info!("Deletion queue flushed successfully on shutdown") @@ -715,6 +685,10 @@ impl DeletionQueue { tracing::warn!("Timed out flushing deletion queue on shutdown") } } + + // We only cancel _after_ flushing: otherwise we would be shutting down the + // components that do the flush. + self.cancel.cancel(); } } @@ -722,7 +696,7 @@ impl DeletionQueue { mod test { use camino::Utf8Path; use hex_literal::hex; - use pageserver_api::shard::ShardIndex; + use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant}; use std::{io::ErrorKind, time::Duration}; use tracing::info; @@ -732,23 +706,20 @@ mod test { use crate::{ control_plane_client::RetryForeverError, repository::Key, - tenant::{ - harness::TenantHarness, remote_timeline_client::remote_timeline_path, - storage_layer::DeltaFileName, - }, + tenant::{harness::TenantHarness, storage_layer::DeltaLayerName}, }; use super::*; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); - pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName { + pub const EXAMPLE_LAYER_NAME: LayerName = LayerName::Delta(DeltaLayerName { key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51), }); // When you need a second layer in a test. 
- pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName { + pub const EXAMPLE_LAYER_NAME_ALT: LayerName = LayerName::Delta(DeltaLayerName { key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61), }); @@ -766,7 +737,7 @@ mod test { /// Simulate a pageserver restart by destroying and recreating the deletion queue async fn restart(&mut self) { let (deletion_queue, workers) = DeletionQueue::new( - Some(self.storage.clone()), + self.storage.clone(), Some(self.mock_control_plane.clone()), self.harness.conf, ); @@ -798,7 +769,7 @@ mod test { /// Returns remote layer file name, suitable for use in assert_remote_files fn write_remote_layer( &self, - file_name: LayerFileName, + file_name: LayerName, gen: Generation, ) -> anyhow::Result { let tenant_shard_id = self.harness.tenant_shard_id; @@ -832,10 +803,13 @@ mod test { } impl ControlPlaneGenerationsApi for MockControlPlane { - #[allow(clippy::diverging_sub_expression)] // False positive via async_trait - async fn re_attach(&self) -> Result, RetryForeverError> { + async fn re_attach( + &self, + _conf: &PageServerConf, + ) -> Result, RetryForeverError> { unimplemented!() } + async fn validate( &self, tenants: Vec<(TenantShardId, Generation)>, @@ -854,9 +828,9 @@ mod test { } } - fn setup(test_name: &str) -> anyhow::Result { + async fn setup(test_name: &str) -> anyhow::Result { let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}"))); - let harness = TenantHarness::create(test_name)?; + let harness = TenantHarness::create(test_name).await?; // We do not load() the harness: we only need its config and remote_storage @@ -865,14 +839,19 @@ mod test { std::fs::create_dir_all(remote_fs_dir)?; let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?; let storage_config = RemoteStorageConfig { - storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), + storage: RemoteStorageKind::LocalFs { + local_path: remote_fs_dir.clone(), + }, + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; - let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); + let storage = GenericRemoteStorage::from_config(&storage_config) + .await + .unwrap(); let mock_control_plane = MockControlPlane::new(); let (deletion_queue, worker) = DeletionQueue::new( - Some(storage.clone()), + storage.clone(), Some(mock_control_plane.clone()), harness.conf, ); @@ -945,11 +924,13 @@ mod test { #[tokio::test] async fn deletion_queue_smoke() -> anyhow::Result<()> { // Basic test that the deletion queue processes the deletions we pass into it - let ctx = setup("deletion_queue_smoke").expect("Failed test setup"); + let ctx = setup("deletion_queue_smoke") + .await + .expect("Failed test setup"); let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; - let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); let tenant_shard_id = ctx.harness.tenant_shard_id; let content: Vec = "victim1 contents".into(); @@ -1015,7 +996,9 @@ mod test { #[tokio::test] async fn deletion_queue_validation() -> anyhow::Result<()> { - let ctx = setup("deletion_queue_validation").expect("Failed test setup"); + let ctx = setup("deletion_queue_validation") + .await + 
.expect("Failed test setup"); let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; @@ -1074,7 +1057,9 @@ mod test { #[tokio::test] async fn deletion_queue_recovery() -> anyhow::Result<()> { // Basic test that the deletion queue processes the deletions we pass into it - let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup"); + let mut ctx = setup("deletion_queue_recovery") + .await + .expect("Failed test setup"); let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; @@ -1158,17 +1143,13 @@ mod test { pub(crate) mod mock { use tracing::info; - use crate::tenant::remote_timeline_client::remote_layer_path; - use super::*; - use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, - }; + use std::sync::atomic::{AtomicUsize, Ordering}; pub struct ConsumerState { rx: tokio::sync::mpsc::UnboundedReceiver, executor_rx: tokio::sync::mpsc::Receiver, + cancel: CancellationToken, } impl ConsumerState { @@ -1182,7 +1163,7 @@ pub(crate) mod mock { match msg { DeleterMessage::Delete(objects) => { for path in objects { - match remote_storage.delete(&path).await { + match remote_storage.delete(&path, &self.cancel).await { Ok(_) => { debug!("Deleted {path}"); } @@ -1215,7 +1196,7 @@ pub(crate) mod mock { for path in objects { info!("Executing deletion {path}"); - match remote_storage.delete(&path).await { + match remote_storage.delete(&path, &self.cancel).await { Ok(_) => { debug!("Deleted {path}"); } @@ -1265,7 +1246,11 @@ pub(crate) mod mock { executor_tx, executed, remote_storage, - consumer: std::sync::Mutex::new(ConsumerState { rx, executor_rx }), + consumer: std::sync::Mutex::new(ConsumerState { + rx, + executor_rx, + cancel: CancellationToken::new(), + }), lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())), } } diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs index 57421b1547..1f04bc0410 100644 --- a/pageserver/src/deletion_queue/deleter.rs +++ b/pageserver/src/deletion_queue/deleter.rs @@ -8,6 +8,7 @@ use remote_storage::GenericRemoteStorage; use remote_storage::RemotePath; +use remote_storage::TimeoutOrCancel; use remote_storage::MAX_KEYS_PER_DELETE; use std::time::Duration; use tokio_util::sync::CancellationToken; @@ -71,15 +72,19 @@ impl Deleter { Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute")) }); - self.remote_storage.delete_objects(&self.accumulator).await + self.remote_storage + .delete_objects(&self.accumulator, &self.cancel) + .await }, - |_| false, + TimeoutOrCancel::caused_by_cancel, 3, 10, "executing deletion batch", - backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Shutting down")), + &self.cancel, ) .await + .ok_or_else(|| anyhow::anyhow!("Shutting down")) + .and_then(|x| x) } /// Block until everything in accumulator has been executed diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs index 3a3d600ac2..ae3b2c9180 100644 --- a/pageserver/src/deletion_queue/list_writer.rs +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -34,7 +34,7 @@ use crate::deletion_queue::TEMP_SUFFIX; use crate::metrics; use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::LayerFileMetadata; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::LayerName; use crate::virtual_file::on_fatal_io_error; use crate::virtual_file::MaybeFatalIo; @@ -59,7 +59,7 @@ pub(super) struct DeletionOp { // `layers` and 
`objects` are both just lists of objects. `layers` is used if you do not // have a config object handy to project it to a remote key, and need the consuming worker // to do it for you. - pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>, + pub(super) layers: Vec<(LayerName, LayerFileMetadata)>, pub(super) objects: Vec, /// The _current_ generation of the Tenant shard attachment in which we are enqueuing diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index bf06c78e67..d215fd2b7d 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -190,7 +190,7 @@ where } } else { // If we failed validation, then do not apply any of the projected updates - warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); + info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); metrics::DELETION_QUEUE.dropped_lsn_updates.inc(); } } @@ -225,7 +225,7 @@ where && (tenant.generation == *validated_generation); if !this_list_valid { - warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation); + info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation); metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64); mutated = true; } else { diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 800e52bb51..a58fa2c0b1 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -41,79 +41,117 @@ // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl // reading these fields. We use the Debug impl for semi-structured logging, though. -use std::{ - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::{sync::Arc, time::SystemTime}; use anyhow::Context; -use pageserver_api::shard::TenantShardId; +use pageserver_api::{config::DiskUsageEvictionTaskConfig, shard::TenantShardId}; use remote_storage::GenericRemoteStorage; -use serde::{Deserialize, Serialize}; +use serde::Serialize; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn, Instrument}; -use utils::serde_percent::Percent; use utils::{completion, id::TimelineId}; use crate::{ config::PageServerConf, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, + metrics::disk_usage_based_eviction::METRICS, + task_mgr::{self, BACKGROUND_RUNTIME}, tenant::{ - self, mgr::TenantManager, remote_timeline_client::LayerFileMetadata, secondary::SecondaryTenant, - storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName}, - Timeline, + storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint}, }, + CancellableTask, DiskUsageEvictionTask, }; -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct DiskUsageEvictionTaskConfig { - pub max_usage_pct: Percent, - pub min_avail_bytes: u64, - #[serde(with = "humantime_serde")] - pub period: Duration, - #[cfg(feature = "testing")] - pub mock_statvfs: Option, - /// Select sorting for evicted layers - #[serde(default)] - pub eviction_order: EvictionOrder, -} - /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` /// partitioning. 
-#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(tag = "type", content = "args")]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum EvictionOrder {
-    /// Order the layers to be evicted by how recently they have been accessed in absolute
-    /// time.
-    ///
-    /// This strategy is unfair when some tenants grow faster than others towards the slower
-    /// growing.
-    #[default]
-    AbsoluteAccessed,
-
     /// Order the layers to be evicted by how recently they have been accessed relatively within
     /// the set of resident layers of a tenant.
-    ///
-    /// This strategy will evict layers more fairly but is untested.
     RelativeAccessed {
-        #[serde(default)]
+        /// Determines if the tenant with most layers should lose first.
+        ///
+        /// Having this enabled is currently the only reasonable option, because the order in which
+        /// we read tenants is deterministic. If we ever need to set this to `false`, we need
+        /// to ensure nondeterminism by adding in a random number to break the
+        /// `relative_last_activity==0.0` ties.
        highest_layer_count_loses_first: bool,
    },
 }
 
-impl EvictionOrder {
-    /// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer
-    /// counts should be the first ones to have their layers evicted.
-    fn highest_layer_count_loses_first(&self) -> bool {
-        match self {
-            EvictionOrder::AbsoluteAccessed => false,
-            EvictionOrder::RelativeAccessed {
+impl From<pageserver_api::config::EvictionOrder> for EvictionOrder {
+    fn from(value: pageserver_api::config::EvictionOrder) -> Self {
+        match value {
+            pageserver_api::config::EvictionOrder::RelativeAccessed {
                 highest_layer_count_loses_first,
-            } => *highest_layer_count_loses_first,
+            } => Self::RelativeAccessed {
+                highest_layer_count_loses_first,
+            },
+        }
+    }
+}
+
+impl EvictionOrder {
+    fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) {
+        use EvictionOrder::*;
+
+        match self {
+            RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
+                (*partition, candidate.relative_last_activity)
+            }),
+        }
+    }
+
+    /// Called to fill in the [`EvictionCandidate::relative_last_activity`] while iterating tenants'
+    /// layers in **most** recently used order.
+    fn relative_last_activity(&self, total: usize, index: usize) -> finite_f32::FiniteF32 {
+        use EvictionOrder::*;
+
+        match self {
+            RelativeAccessed {
+                highest_layer_count_loses_first,
+            } => {
+                // keeping the -1 or not decides if every tenant should lose their least recently accessed
+                // layer OR if this should happen in the order of having highest layer count:
+                let fudge = if *highest_layer_count_loses_first {
+                    // relative_last_activity vs. tenant layer count:
+                    // - 0.1..=1.0 (10 layers)
+                    // - 0.01..=1.0 (100 layers)
+                    // - 0.001..=1.0 (1000 layers)
+                    //
+                    // leading to evicting less of the smallest tenants.
+                    0
+                } else {
+                    // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
+                    // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
+                    // be that less than 10k layer evictions is enough, so we would not need to evict from
+                    // all tenants.
+                    //
+                    // as the tenant ordering is now deterministic this could hit the same tenants
+                    // disproportionately on multiple invocations. alternative could be to remember how many
+                    // layers did we evict last time from this tenant, and inject that as an additional
+                    // fudge here.
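                    // Worked example of the formula below: with 10 layers and this
                    // fudge of 1, scores run from 9/9 = 1.0 down to 0/9 = 0.0; with
                    // fudge 0 they run from 10/10 = 1.0 down to 1/10 = 0.1, so even
                    // the least recently used layer keeps a nonzero score.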
+ 1 + }; + + let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1); + let divider = total as f32; + + // most recently used is always (total - 0) / divider == 1.0 + // least recently used depends on the fudge: + // - (total - 1) - (total - 1) / total => 0 / total + // - total - (total - 1) / total => 1 / total + let distance = (total - index) as f32; + + finite_f32::FiniteF32::try_from_normalized(distance / divider) + .unwrap_or_else(|val| { + tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={index}, total={total}: {val}"); + finite_f32::FiniteF32::ZERO + }) + } } } } @@ -130,36 +168,34 @@ pub fn launch_disk_usage_global_eviction_task( state: Arc, tenant_manager: Arc, background_jobs_barrier: completion::Barrier, -) -> anyhow::Result<()> { +) -> Option { let Some(task_config) = &conf.disk_usage_based_eviction else { info!("disk usage based eviction task not configured"); - return Ok(()); + return None; }; info!("launching disk usage based eviction task"); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - TaskKind::DiskUsageEviction, - None, - None, + let cancel = CancellationToken::new(); + let task = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "disk usage based eviction", - false, - async move { - let cancel = task_mgr::shutdown_token(); + { + let cancel = cancel.clone(); + async move { + // wait until initial load is complete, because we cannot evict from loading tenants. + tokio::select! { + _ = cancel.cancelled() => { return anyhow::Ok(()); }, + _ = background_jobs_barrier.wait() => { } + }; - // wait until initial load is complete, because we cannot evict from loading tenants. - tokio::select! { - _ = cancel.cancelled() => { return Ok(()); }, - _ = background_jobs_barrier.wait() => { } - }; - - disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await; - Ok(()) + disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel) + .await; + anyhow::Ok(()) + } }, - ); + )); - Ok(()) + Some(DiskUsageEvictionTask(CancellableTask { cancel, task })) } #[instrument(skip_all)] @@ -240,7 +276,7 @@ async fn disk_usage_eviction_task_iteration( storage, usage_pre, tenant_manager, - task_config.eviction_order, + task_config.eviction_order.into(), cancel, ) .await; @@ -288,7 +324,6 @@ pub enum IterationOutcome { Finished(IterationOutcomeFinished), } -#[allow(dead_code)] #[derive(Debug, Serialize)] pub struct IterationOutcomeFinished { /// The actual usage observed before we started the iteration. @@ -303,7 +338,6 @@ pub struct IterationOutcomeFinished { } #[derive(Debug, Serialize)] -#[allow(dead_code)] struct AssumedUsage { /// The expected value for `after`, after phase 2. projected_after: U, @@ -311,14 +345,12 @@ struct AssumedUsage { failed: LayerCount, } -#[allow(dead_code)] #[derive(Debug, Serialize)] struct PlannedUsage { respecting_tenant_min_resident_size: U, fallback_to_global_lru: Option, } -#[allow(dead_code)] #[derive(Debug, Default, Serialize)] struct LayerCount { file_sizes: u64, @@ -350,13 +382,23 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( "running disk usage based eviction due to pressure" ); - let candidates = + let (candidates, collection_time) = { + let started_at = std::time::Instant::now(); match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? 
{ EvictionCandidates::Cancelled => { return Ok(IterationOutcome::Cancelled); } - EvictionCandidates::Finished(partitioned) => partitioned, - }; + EvictionCandidates::Finished(partitioned) => (partitioned, started_at.elapsed()), + } + }; + + METRICS.layers_collected.inc_by(candidates.len() as u64); + + tracing::info!( + elapsed_ms = collection_time.as_millis(), + total_layers = candidates.len(), + "collection completed" + ); // Debug-log the list of candidates let now = SystemTime::now(); @@ -387,55 +429,10 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( // the tenant's min-resident-size threshold, print a warning, and memorize the disk // usage at that point, in 'usage_planned_min_resident_size_respecting'. - let selection = select_victims(&candidates, usage_pre); + let (evicted_amount, usage_planned) = + select_victims(&candidates, usage_pre).into_amount_and_planned(); - let mut candidates = candidates; - - let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) { - // we currently have the layers ordered by AbsoluteAccessed so that we can get the summary - // for comparison here. this is a temporary measure to develop alternatives. - use std::fmt::Write; - - let mut summary_buf = String::with_capacity(256); - - { - let absolute_summary = candidates - .iter() - .take(selection.amount) - .map(|(_, candidate)| candidate) - .collect::(); - - write!(summary_buf, "{absolute_summary}").expect("string grows"); - - info!("absolute accessed selection summary: {summary_buf}"); - } - - candidates.sort_unstable_by_key(|(partition, candidate)| { - (*partition, candidate.relative_last_activity) - }); - - let selection = select_victims(&candidates, usage_pre); - - { - summary_buf.clear(); - - let relative_summary = candidates - .iter() - .take(selection.amount) - .map(|(_, candidate)| candidate) - .collect::(); - - write!(summary_buf, "{relative_summary}").expect("string grows"); - - info!("relative accessed selection summary: {summary_buf}"); - } - - selection - } else { - selection - }; - - let (evicted_amount, usage_planned) = selection.into_amount_and_planned(); + METRICS.layers_selected.inc_by(evicted_amount as u64); // phase2: evict layers @@ -464,9 +461,15 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( if let Some(next) = next { match next { Ok(Ok(file_size)) => { + METRICS.layers_evicted.inc(); usage_assumed.add_available_bytes(file_size); } - Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => { + Ok(Err(( + file_size, + EvictionError::NotFound + | EvictionError::Downloaded + | EvictionError::Timeout, + ))) => { evictions_failed.file_sizes += file_size; evictions_failed.count += 1; } @@ -482,7 +485,10 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( // calling again when consumed_all is fine as evicted is fused. let Some((_partition, candidate)) = evicted.next() else { - consumed_all = true; + if !consumed_all { + tracing::info!("all evictions started, waiting"); + consumed_all = true; + } continue; }; @@ -490,21 +496,24 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( EvictionLayer::Attached(layer) => { let file_size = layer.layer_desc().file_size; js.spawn(async move { - layer - .evict_and_wait() - .await - .map(|()| file_size) - .map_err(|e| (file_size, e)) + // have a low eviction waiting timeout because our LRU calculations go stale fast; + // also individual layer evictions could hang because of bugs and we do not want to + // pause disk_usage_based_eviction for such. 
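                            // Note: this 5s wait is well below the 10s "maximum_expected"
                            // guard used when awaiting `evict_layers` further down, so a
                            // stuck eviction surfaces as a timed-out wait and a log line
                            // rather than stalling the whole pass.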
+ let timeout = std::time::Duration::from_secs(5); + + match layer.evict_and_wait(timeout).await { + Ok(()) => Ok(file_size), + Err(e) => Err((file_size, e)), + } }); } EvictionLayer::Secondary(layer) => { - let file_size = layer.metadata.file_size(); - let tenant_manager = tenant_manager.clone(); + let file_size = layer.metadata.file_size; js.spawn(async move { layer .secondary_tenant - .evict_layer(tenant_manager.get_conf(), layer.timeline_id, layer.name) + .evict_layer(layer.timeline_id, layer.name) .await; Ok(file_size) }); @@ -516,6 +525,30 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( (usage_assumed, evictions_failed) }; + let started_at = std::time::Instant::now(); + + let evict_layers = async move { + let mut evict_layers = std::pin::pin!(evict_layers); + + let maximum_expected = std::time::Duration::from_secs(10); + + let res = tokio::time::timeout(maximum_expected, &mut evict_layers).await; + let tuple = if let Ok(tuple) = res { + tuple + } else { + let elapsed = started_at.elapsed(); + tracing::info!(elapsed_ms = elapsed.as_millis(), "still ongoing"); + evict_layers.await + }; + + let elapsed = started_at.elapsed(); + tracing::info!(elapsed_ms = elapsed.as_millis(), "completed"); + tuple + }; + + let evict_layers = + evict_layers.instrument(tracing::info_span!("evict_layers", layers=%evicted_amount)); + let (usage_assumed, evictions_failed) = tokio::select! { tuple = evict_layers => { tuple }, _ = cancel.cancelled() => { @@ -539,7 +572,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( pub(crate) struct EvictionSecondaryLayer { pub(crate) secondary_tenant: Arc, pub(crate) timeline_id: TimelineId, - pub(crate) name: LayerFileName, + pub(crate) name: LayerName, pub(crate) metadata: LayerFileMetadata, } @@ -548,7 +581,6 @@ pub(crate) struct EvictionSecondaryLayer { #[derive(Clone)] pub(crate) enum EvictionLayer { Attached(Layer), - #[allow(dead_code)] Secondary(EvictionSecondaryLayer), } @@ -573,9 +605,9 @@ impl EvictionLayer { } } - pub(crate) fn get_name(&self) -> LayerFileName { + pub(crate) fn get_name(&self) -> LayerName { match self { - Self::Attached(l) => l.layer_desc().filename(), + Self::Attached(l) => l.layer_desc().layer_name(), Self::Secondary(sl) => sl.name.clone(), } } @@ -583,7 +615,7 @@ impl EvictionLayer { pub(crate) fn get_file_size(&self) -> u64 { match self { Self::Attached(l) => l.layer_desc().file_size, - Self::Secondary(sl) => sl.metadata.file_size(), + Self::Secondary(sl) => sl.metadata.file_size, } } } @@ -593,6 +625,7 @@ pub(crate) struct EvictionCandidate { pub(crate) layer: EvictionLayer, pub(crate) last_activity_ts: SystemTime, pub(crate) relative_last_activity: finite_f32::FiniteF32, + pub(crate) visibility: LayerVisibilityHint, } impl std::fmt::Display for EvictionLayer { @@ -606,6 +639,7 @@ impl std::fmt::Display for EvictionLayer { } } +#[derive(Default)] pub(crate) struct DiskUsageEvictionInfo { /// Timeline's largest layer (remote or resident) pub max_layer_size: Option, @@ -633,14 +667,22 @@ impl std::fmt::Debug for EvictionCandidate { } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -enum MinResidentSizePartition { +enum EvictionPartition { + // A layer that is un-wanted by the tenant: evict all these first, before considering + // any other layers + EvictNow, + + // Above the minimum size threshold: this layer is a candidate for eviction. 
Above, + + // Below the minimum size threshold: this layer should only be evicted if all the + // tenants' layers above the minimum size threshold have already been considered. Below, } enum EvictionCandidates { Cancelled, - Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>), + Finished(Vec<(EvictionPartition, EvictionCandidate)>), } /// Gather the eviction candidates. @@ -750,9 +792,11 @@ async fn collect_eviction_candidates( eviction_order: EvictionOrder, cancel: &CancellationToken, ) -> anyhow::Result { + const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10); + // get a snapshot of the list of tenants - let tenants = tenant::mgr::list_tenants() - .await + let tenants = tenant_manager + .list_tenants() .context("get list of tenants")?; // TODO: avoid listing every layer in every tenant: this loop can block the executor, @@ -764,8 +808,12 @@ async fn collect_eviction_candidates( if cancel.is_cancelled() { return Ok(EvictionCandidates::Cancelled); } - let tenant = match tenant::mgr::get_tenant(tenant_id, true) { - Ok(tenant) => tenant, + let tenant = match tenant_manager.get_attached_tenant_shard(tenant_id) { + Ok(tenant) if tenant.is_active() => tenant, + Ok(_) => { + debug!(tenant_id=%tenant_id.tenant_id, shard_id=%tenant_id.shard_slug(), "Tenant shard is not active"); + continue; + } Err(e) => { // this can happen if tenant has lifecycle transition after we fetched it debug!("failed to get tenant: {e:#}"); @@ -778,6 +826,8 @@ async fn collect_eviction_candidates( continue; } + let started_at = std::time::Instant::now(); + // collect layers from all timelines in this tenant // // If one of the timelines becomes `!is_active()` during the iteration, @@ -792,6 +842,7 @@ async fn collect_eviction_candidates( } let info = tl.get_local_layers_for_disk_usage_eviction().await; debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len()); + tenant_candidates.extend(info.resident_layers.into_iter()); max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0)); @@ -829,68 +880,65 @@ async fn collect_eviction_candidates( max_layer_size }; - // Sort layers most-recently-used first, then partition by - // cumsum above/below min_resident_size. + // Sort layers most-recently-used first, then calculate [`EvictionPartition`] for each layer, + // where the inputs are: + // - whether the layer is visible + // - whether the layer is above/below the min_resident_size cutline tenant_candidates .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts)); let mut cumsum: i128 = 0; - // keeping the -1 or not decides if every tenant should lose their least recently accessed - // layer OR if this should happen in the order of having highest layer count: - let fudge = if eviction_order.highest_layer_count_loses_first() { - // relative_age vs. tenant layer count: - // - 0.1..=1.0 (10 layers) - // - 0.01..=1.0 (100 layers) - // - 0.001..=1.0 (1000 layers) - // - // leading to evicting less of the smallest tenants. - 0 - } else { - // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a - // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could - // be that less than 10k layer evictions is enough, so we would not need to evict from - // all tenants. 
- // - // as the tenant ordering is now deterministic this could hit the same tenants - // disproportionetly on multiple invocations. alternative could be to remember how many - // layers did we evict last time from this tenant, and inject that as an additional - // fudge here. - 1 - }; + let total = tenant_candidates.len(); - let total = tenant_candidates - .len() - .checked_sub(fudge) - .filter(|&x| x > 0) - // support 0 or 1 resident layer tenants as well - .unwrap_or(1); - let divider = total as f32; + let tenant_candidates = + tenant_candidates + .into_iter() + .enumerate() + .map(|(i, mut candidate)| { + // as we iterate this reverse sorted list, the most recently accessed layer will always + // be 1.0; this is for us to evict it last. + candidate.relative_last_activity = + eviction_order.relative_last_activity(total, i); - for (i, mut candidate) in tenant_candidates.into_iter().enumerate() { - // as we iterate this reverse sorted list, the most recently accessed layer will always - // be 1.0; this is for us to evict it last. - candidate.relative_last_activity = if matches!( - eviction_order, - EvictionOrder::RelativeAccessed { .. } - ) { - // another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or - // similarly for u16. unsure how it would help. - finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider) - .unwrap_or_else(|val| { - tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}"); - finite_f32::FiniteF32::ZERO - }) - } else { - finite_f32::FiniteF32::ZERO - }; + let partition = match candidate.visibility { + LayerVisibilityHint::Covered => { + // Covered layers are evicted first + EvictionPartition::EvictNow + } + LayerVisibilityHint::Visible => { + cumsum += i128::from(candidate.layer.get_file_size()); - let partition = if cumsum > min_resident_size as i128 { - MinResidentSizePartition::Above - } else { - MinResidentSizePartition::Below - }; - cumsum += i128::from(candidate.layer.get_file_size()); - candidates.push((partition, candidate)); + if cumsum > min_resident_size as i128 { + EvictionPartition::Above + } else { + // The most recent layers below the min_resident_size threshold + // are the last to be evicted. + EvictionPartition::Below + } + } + }; + + (partition, candidate) + }); + + METRICS + .tenant_layer_count + .observe(tenant_candidates.len() as f64); + + candidates.extend(tenant_candidates); + + let elapsed = started_at.elapsed(); + METRICS + .tenant_collection_time + .observe(elapsed.as_secs_f64()); + + if elapsed > LOG_DURATION_THRESHOLD { + tracing::info!( + tenant_id=%tenant.tenant_shard_id().tenant_id, + shard_id=%tenant.tenant_shard_id().shard_slug(), + elapsed_ms = elapsed.as_millis(), + "collection took longer than threshold" + ); } } @@ -906,31 +954,70 @@ async fn collect_eviction_candidates( }, ); - for secondary_tenant in secondary_tenants { - let mut layer_info = secondary_tenant.get_layers_for_eviction(); + for tenant in secondary_tenants { + // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is + // to prevent repeated disk usage based evictions from completely draining less often + // updating secondaries. 
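            // e.g. a secondary with 80 resident layers out of 100 total gets its
            // relative scores computed over all 100, so repeated passes cannot keep
            // draining it as if the already-evicted layers did not exist.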
+ let (mut layer_info, total_layers) = tenant.get_layers_for_eviction(); + + debug_assert!( + total_layers >= layer_info.resident_layers.len(), + "total_layers ({total_layers}) must be at least the resident_layers.len() ({})", + layer_info.resident_layers.len() + ); + + let started_at = std::time::Instant::now(); layer_info .resident_layers .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts)); - candidates.extend(layer_info.resident_layers.into_iter().map(|candidate| { - ( - // Secondary locations' layers are always considered above the min resident size, - // i.e. secondary locations are permitted to be trimmed to zero layers if all - // the layers have sufficiently old access times. - MinResidentSizePartition::Above, - candidate, - ) - })); + let tenant_candidates = + layer_info + .resident_layers + .into_iter() + .enumerate() + .map(|(i, mut candidate)| { + candidate.relative_last_activity = + eviction_order.relative_last_activity(total_layers, i); + ( + // Secondary locations' layers are always considered above the min resident size, + // i.e. secondary locations are permitted to be trimmed to zero layers if all + // the layers have sufficiently old access times. + EvictionPartition::Above, + candidate, + ) + }); + + METRICS + .tenant_layer_count + .observe(tenant_candidates.len() as f64); + candidates.extend(tenant_candidates); + + tokio::task::yield_now().await; + + let elapsed = started_at.elapsed(); + + METRICS + .tenant_collection_time + .observe(elapsed.as_secs_f64()); + + if elapsed > LOG_DURATION_THRESHOLD { + tracing::info!( + tenant_id=%tenant.tenant_shard_id().tenant_id, + shard_id=%tenant.tenant_shard_id().shard_slug(), + elapsed_ms = elapsed.as_millis(), + "collection took longer than threshold" + ); + } } - debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, + debug_assert!(EvictionPartition::Above < EvictionPartition::Below, + "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); + debug_assert!(EvictionPartition::EvictNow < EvictionPartition::Above, "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); - // always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we - // will sort later by candidate.relative_last_activity to get compare evictions. - candidates - .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts)); + eviction_order.sort(&mut candidates); Ok(EvictionCandidates::Finished(candidates)) } @@ -940,7 +1027,7 @@ async fn collect_eviction_candidates( /// /// Returns the amount of candidates selected, with the planned usage. 
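/// Illustration: with candidates partitioned as [EvictNow, Above, Above, Below, ...]
/// and pressure relieved only at the fourth entry, `usage_when_switched` is captured
/// at the first `Below` candidate, recording the usage that respecting
/// min_resident_size alone would have achieved.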
fn select_victims( - candidates: &[(MinResidentSizePartition, EvictionCandidate)], + candidates: &[(EvictionPartition, EvictionCandidate)], usage_pre: U, ) -> VictimSelection { let mut usage_when_switched = None; @@ -952,7 +1039,7 @@ fn select_victims( break; } - if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() { + if partition == &EvictionPartition::Below && usage_when_switched.is_none() { usage_when_switched = Some((usage_planned, i)); } @@ -1001,30 +1088,6 @@ impl VictimSelection { } } -struct TimelineKey(Arc); - -impl PartialEq for TimelineKey { - fn eq(&self, other: &Self) -> bool { - Arc::ptr_eq(&self.0, &other.0) - } -} - -impl Eq for TimelineKey {} - -impl std::hash::Hash for TimelineKey { - fn hash(&self, state: &mut H) { - Arc::as_ptr(&self.0).hash(state); - } -} - -impl std::ops::Deref for TimelineKey { - type Target = Timeline; - - fn deref(&self) -> &Self::Target { - self.0.as_ref() - } -} - /// A totally ordered f32 subset we can use with sorting functions. pub(crate) mod finite_f32 { @@ -1070,6 +1133,12 @@ pub(crate) mod finite_f32 { } } + impl From for f32 { + fn from(value: FiniteF32) -> f32 { + value.0 + } + } + impl FiniteF32 { pub const ZERO: FiniteF32 = FiniteF32(0.0); @@ -1082,136 +1151,9 @@ pub(crate) mod finite_f32 { Err(value) } } - } -} -mod summary { - use super::finite_f32::FiniteF32; - use super::{EvictionCandidate, LayerCount}; - use pageserver_api::shard::TenantShardId; - use std::collections::{BTreeMap, HashMap}; - use std::time::SystemTime; - - #[derive(Debug, Default)] - pub(super) struct EvictionSummary { - evicted_per_tenant: HashMap, - total: LayerCount, - - last_absolute: Option, - last_relative: Option, - } - - impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary { - fn from_iter>(iter: T) -> Self { - let mut summary = EvictionSummary::default(); - for item in iter { - let counts = summary - .evicted_per_tenant - .entry(*item.layer.get_tenant_shard_id()) - .or_default(); - - let sz = item.layer.get_file_size(); - - counts.file_sizes += sz; - counts.count += 1; - - summary.total.file_sizes += sz; - summary.total.count += 1; - - summary.last_absolute = Some(item.last_activity_ts); - summary.last_relative = Some(item.relative_last_activity); - } - - summary - } - } - - struct SiBytesAmount(u64); - - impl std::fmt::Display for SiBytesAmount { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if self.0 < 1024 { - return write!(f, "{}B", self.0); - } - - let mut tmp = self.0; - let mut ch = 0; - let suffixes = b"KMGTPE"; - - while tmp > 1024 * 1024 && ch < suffixes.len() - 1 { - tmp /= 1024; - ch += 1; - } - - let ch = suffixes[ch] as char; - - write!(f, "{:.1}{ch}iB", tmp as f64 / 1024.0) - } - } - - impl std::fmt::Display for EvictionSummary { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // wasteful, but it's for testing - - let mut sorted: BTreeMap> = BTreeMap::new(); - - for (tenant_shard_id, count) in &self.evicted_per_tenant { - sorted - .entry(count.count) - .or_default() - .push((*tenant_shard_id, count.file_sizes)); - } - - let total_file_sizes = SiBytesAmount(self.total.file_sizes); - - writeln!( - f, - "selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):", - self.total.count, self.last_absolute, self.last_relative, - )?; - - for (count, per_tenant) in sorted.iter().rev().take(10) { - write!(f, "- {count} layers: ")?; - - if per_tenant.len() < 3 { - for (i, (tenant_shard_id, bytes)) in per_tenant.iter().enumerate() { - if i > 0 { - write!(f, 
", ")?; - } - let bytes = SiBytesAmount(*bytes); - write!(f, "{tenant_shard_id} ({bytes})")?; - } - } else { - let num_tenants = per_tenant.len(); - let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::(); - let total_bytes = SiBytesAmount(total_bytes); - let layers = num_tenants * count; - - write!( - f, - "{num_tenants} tenants {total_bytes} in total {layers} layers", - )?; - } - - writeln!(f)?; - } - - if sorted.len() > 10 { - let (rem_count, rem_bytes) = sorted - .iter() - .rev() - .map(|(count, per_tenant)| { - ( - count, - per_tenant.iter().map(|(_id, bytes)| bytes).sum::(), - ) - }) - .fold((0, 0), |acc, next| (acc.0 + next.0, acc.1 + next.1)); - let rem_bytes = SiBytesAmount(rem_bytes); - writeln!(f, "- rest of tenants ({}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", sorted.len() - 10, 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?; - } - - Ok(()) + pub fn into_inner(self) -> f32 { + self.into() } } } @@ -1225,7 +1167,6 @@ mod filesystem_level_usage { use super::DiskUsageEvictionTaskConfig; #[derive(Debug, Clone, Copy)] - #[allow(dead_code)] pub struct Usage<'a> { config: &'a DiskUsageEvictionTaskConfig, @@ -1297,7 +1238,6 @@ mod filesystem_level_usage { #[test] fn max_usage_pct_pressure() { - use super::EvictionOrder; use super::Usage as _; use std::time::Duration; use utils::serde_percent::Percent; @@ -1309,7 +1249,7 @@ mod filesystem_level_usage { period: Duration::MAX, #[cfg(feature = "testing")] mock_statvfs: None, - eviction_order: EvictionOrder::default(), + eviction_order: pageserver_api::config::EvictionOrder::default(), }, total_bytes: 100_000, avail_bytes: 0, @@ -1336,3 +1276,40 @@ mod filesystem_level_usage { assert!(!usage.has_pressure()); } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn relative_equal_bounds() { + let order = EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first: false, + }; + + let len = 10; + let v = (0..len) + .map(|i| order.relative_last_activity(len, i).into_inner()) + .collect::>(); + + assert_eq!(v.first(), Some(&1.0)); + assert_eq!(v.last(), Some(&0.0)); + assert!(v.windows(2).all(|slice| slice[0] > slice[1])); + } + + #[test] + fn relative_spare_bounds() { + let order = EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first: true, + }; + + let len = 10; + let v = (0..len) + .map(|i| order.relative_last_activity(len, i).into_inner()) + .collect::>(); + + assert_eq!(v.first(), Some(&1.0)); + assert_eq!(v.last(), Some(&0.1)); + assert!(v.windows(2).all(|slice| slice[0] > slice[1])); + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index a49eef8bb9..42086dc2e6 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -58,24 +58,6 @@ paths: responses: "200": description: The reload completed successfully. 
- "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error (also hits if no keys were found) - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}: parameters: @@ -93,91 +75,46 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantInfo" - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" delete: description: | - Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved. - 404 means that deletion successfully finished" + Attempts to delete specified tenant. 500, 503 and 409 errors should be retried. Deleting + a non-existent tenant is considered successful (returns 200). responses: - "400": - description: Error when no tenant id found in path - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Tenant not found - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "409": - description: Deletion is already in progress, continue polling - content: - application/json: - schema: - $ref: "#/components/schemas/ConflictError" - "412": - description: Deletion may not proceed, tenant is not in Active state - content: - application/json: - schema: - $ref: "#/components/schemas/PreconditionFailedError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" + "200": + description: Tenant was successfully deleted, or was already not found. "503": - description: Temporarily unavailable, please retry. 
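A note on the eviction-order tests added above (`relative_equal_bounds` / `relative_spare_bounds`): they pin the rank of the least recently used candidate at 1.0 and the last candidate at either 0.0 or 1/len. A minimal, self-contained Rust sketch consistent with those bounds; `relative_rank` and `spare_bottom` are illustrative names, not the pageserver's actual implementation:

// Illustrative sketch, not the pageserver's code: a relative last-activity
// rank with the bounds the tests above assert. Index 0 is the least
// recently used candidate; `spare_bottom` plays the role of
// `highest_layer_count_loses_first`.
fn relative_rank(len: usize, index: usize, spare_bottom: bool) -> f32 {
    assert!(len > 1 && index < len);
    if spare_bottom {
        // range (0.0, 1.0]: even the most recently used layer keeps a non-zero rank
        (len - index) as f32 / len as f32
    } else {
        // range [0.0, 1.0]
        (len - 1 - index) as f32 / (len - 1) as f32
    }
}

fn main() {
    let v: Vec<f32> = (0..10).map(|i| relative_rank(10, i, true)).collect();
    assert_eq!(v.first(), Some(&1.0));
    assert_eq!(v.last(), Some(&0.1));
    assert!(v.windows(2).all(|w| w[0] > w[1]));
}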
+ description: Service is unavailable, or tenant is already being modified (perhaps concurrently deleted) + + + /v1/tenant/{tenant_id}/time_travel_remote_storage: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + - name: travel_to + in: query + required: true + schema: + type: string + format: date-time + - name: done_if_after + in: query + required: true + schema: + type: string + format: date-time + put: + description: Time travel the tenant's remote storage + responses: + "200": + description: OK content: application/json: schema: - $ref: "#/components/schemas/ServiceUnavailableError" - + type: string /v1/tenant/{tenant_id}/timeline: parameters: @@ -197,36 +134,6 @@ paths: type: array items: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Error when no tenant id found in path - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}: @@ -251,60 +158,12 @@ paths: application/json: schema: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" delete: description: "Attempts to delete specified timeline. 500 and 409 errors should be retried" responses: - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "404": - description: Timeline not found + description: Timeline not found. This is the success path. content: application/json: schema: @@ -321,18 +180,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PreconditionFailedError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
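The tenant deletion contract above (200 is terminal and covers the already-deleted case; 409, 500 and 503 mean back off and retry) can be driven by a small client loop. A hedged sketch assuming the reqwest, tokio and anyhow crates; the base URL handling is illustrative, not part of the pageserver:

use std::time::Duration;

// Client-side sketch: delete a tenant, retrying the statuses the spec
// marks as retryable and treating 200 as terminal.
async fn delete_tenant(base_url: &str, tenant_id: &str) -> anyhow::Result<()> {
    let client = reqwest::Client::new();
    let url = format!("{base_url}/v1/tenant/{tenant_id}");
    loop {
        let status = client.delete(&url).send().await?.status();
        match status.as_u16() {
            // Success; per the spec this also covers an already-deleted tenant.
            200 => return Ok(()),
            // Deletion in progress or transient failure: back off and retry.
            409 | 500 | 503 => tokio::time::sleep(Duration::from_secs(1)).await,
            other => anyhow::bail!("unexpected status {other}"),
        }
    }
}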
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"

  /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn:
    parameters:
@@ -365,36 +212,6 @@ paths:
              schema:
                type: string
                format: date-time
-        "400":
-          description: Error when no tenant id found in path, no timeline id or invalid timestamp
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "404":
-          description: Timeline not found, or there is no timestamp information for the given lsn
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/NotFoundError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"

  /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
    parameters:
@@ -419,12 +236,13 @@ paths:
          type: string
          format: date-time
        description: A timestamp to get the LSN
-      - name: version
+      - name: with_lease
        in: query
        required: false
        schema:
-          type: integer
-        description: The version of the endpoint to use
+          type: boolean
+        description: Whether to grant a lease to the corresponding LSN. Defaults to false.
+
    responses:
      "200":
        description: OK
@@ -432,36 +250,41 @@ paths:
          application/json:
            schema:
              $ref: "#/components/schemas/LsnByTimestampResponse"
-        "400":
-          description: Error when no tenant id found in path, no timeline id or invalid timestamp
+
+  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease:
+    parameters:
+      - name: tenant_shard_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+    post:
+      description: Obtains a lease for the given LSN.
+      requestBody:
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - lsn
+              properties:
+                lsn:
+                  description: An LSN to obtain the lease for.
+                  type: string
+                  format: hex
+      responses:
+        "200":
+          description: OK
          content:
            application/json:
              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
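The lsn_lease request and response bodies above map directly onto serde types. A sketch mirroring the spec's field names; the pageserver's real types live in pageserver_api::models (see the routes.rs imports later in this diff), so the derives and struct layout here are assumptions:

use serde::{Deserialize, Serialize};

// Request body for POST .../lsn_lease: a single LSN in its textual form.
#[derive(Serialize)]
struct LsnLeaseRequest {
    lsn: String, // e.g. "0/169AD58", hex format per the spec
}

// Response body: when the granted lease expires (RFC 3339 timestamp).
#[derive(Deserialize)]
struct LsnLease {
    valid_until: String,
}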
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" + $ref: "#/components/schemas/LsnLease" /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc: parameters: @@ -485,147 +308,48 @@ paths: application/json: schema: type: string - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/{tenant_id}/attach: + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/block_gc: parameters: - - name: tenant_id + - name: tenant_shard_id in: path required: true schema: type: string + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex post: - description: | - Schedules attach operation to happen in the background for the given tenant. - As soon as the caller sends this request, it must assume the pageserver - starts writing to the tenant's S3 state unless it receives one of the - distinguished errors below that state otherwise. - - If a client receives a not-distinguished response, e.g., a network timeout, - it MUST retry the /attach request and poll again for the tenant's - attachment status. - - After the client has received a 202, it MUST poll the tenant's - attachment status (field `attachment_status`) to reach state `attached`. - If the `attachment_status` is missing, the client MUST retry the `/attach` - request (goto previous paragraph). This is a robustness measure in case the tenant - status endpoint is buggy, but the attach operation is ongoing. - - There is no way to cancel an in-flight request. - - In any case, the client - * MUST NOT ASSUME that the /attach request has been lost in the network, - * MUST NOT ASSUME that the request has been lost, based on the observation - that a subsequent tenant status request returns 404. The request may - still be in flight. It must be retried. - - The client SHOULD supply a `TenantConfig` for the tenant in the request body. - Settings specified in the config override the pageserver's defaults. - It is guaranteed that the config settings are applied before the pageserver - starts operating on the tenant. E.g., if the config specifies a specific - PITR interval for a tenant, then that setting will be in effect before the - pageserver starts the garbage collection loop. This enables a client to - guarantee a specific PITR setting across detach/attach cycles. - The pageserver will reject the request if it cannot parse the config, or - if there are any unknown fields in it. - - If the client does not supply a config, the pageserver will use its defaults. 
- This behavior is deprecated: https://github.com/neondatabase/neon/issues/4282 - requestBody: - required: false - content: - application/json: - schema: - $ref: "#/components/schemas/TenantAttachRequest" + description: Persistently add a gc blocking at the tenant level because of this timeline responses: - "202": - description: Tenant attaching scheduled - "400": - description: Bad Request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Timeline not found - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "409": - description: | - The tenant is already known to Pageserver in some way, - and hence this `/attach` call has been rejected. + "200": + description: OK - Some examples of how this can happen: - - tenant was created on this pageserver - - tenant attachment was started by an earlier call to `/attach`. - - Callers should poll the tenant status's `attachment_status` field, - like for status 202. See the longer description for `POST /attach` - for details. - content: - application/json: - schema: - $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - - /v1/tenant/{tenant_id}/location_config: + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/unblock_gc: parameters: - - name: tenant_id + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Persistently remove a tenant level gc blocking for this timeline + responses: + "200": + description: OK + + /v1/tenant/{tenant_shard_id}/location_config: + parameters: + - name: tenant_shard_id in: path required: true schema: @@ -635,6 +359,12 @@ paths: required: false schema: type: integer + - name: lazy + in: query + required: false + schema: + type: boolean + description: Set to true for attaches to queue up until activated by compute. Eager (false) is the default. put: description: | Configures a _tenant location_, that is how a particular pageserver handles @@ -674,210 +404,19 @@ paths: responses: "200": description: Tenant is now in requested state - "503": - description: Tenant's state cannot be changed right now. Wait a few seconds and retry. content: application/json: schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" + $ref: "#/components/schemas/TenantLocationConfigResponse" "409": description: | - The tenant is already known to Pageserver in some way, - and hence this `/attach` call has been rejected. - - Some examples of how this can happen: - - tenant was created on this pageserver - - tenant attachment was started by an earlier call to `/attach`. 
- - Callers should poll the tenant status's `attachment_status` field, - like for status 202. See the longer description for `POST /attach` - for details. + The tenant is already being modified, perhaps by a concurrent call to this API content: application/json: schema: $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - /v1/tenant/{tenant_id}/detach: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - - name: detach_ignored - in: query - required: false - schema: - type: boolean - description: | - When true, allow to detach a tenant which state is ignored. - post: - description: | - Remove tenant data (including all corresponding timelines) from pageserver's memory and file system. - Files on the remote storage are not affected. - responses: - "200": - description: Tenant detached - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Tenant not found - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - - /v1/tenant/{tenant_id}/ignore: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - post: - description: | - Remove tenant data (including all corresponding timelines) from pageserver's memory. - Files on local disk and remote storage are not affected. - - Future pageserver restarts won't load the data back until `load` is called on such tenant. - responses: - "200": - description: Tenant ignored - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - - /v1/tenant/{tenant_id}/load: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - post: - description: | - Schedules an operation that attempts to load a tenant from the local disk and - synchronise it with the remote storage (if enabled), repeating pageserver's restart logic for tenant load. - If the tenant was ignored before, removes the ignore mark and continues with load scheduling. - - Errors if the tenant is absent on disk, already present in memory or fails to schedule its load. 
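For the location_config endpoint above: a hedged client-side sketch of a PUT that attaches a tenant shard eagerly. The `mode` values come from the request schema and `lazy` from the query parameter added above; the `generation` value and URL layout are illustrative assumptions, with the reqwest, serde_json and anyhow crates assumed:

use serde_json::json;

// Client-side sketch: attach a tenant shard via location_config. The
// generation value here is illustrative; in practice it is issued by the
// control plane / storage controller.
async fn attach_shard(base_url: &str, tenant_shard_id: &str) -> anyhow::Result<()> {
    let body = json!({
        "mode": "AttachedSingle",
        "generation": 1,
        "tenant_conf": {},
    });
    let url = format!("{base_url}/v1/tenant/{tenant_shard_id}/location_config?lazy=false");
    let resp = reqwest::Client::new().put(&url).json(&body).send().await?;
    anyhow::ensure!(resp.status().is_success(), "location_config failed: {}", resp.status());
    Ok(())
}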
- Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness. - requestBody: - required: false - content: - application/json: - schema: - $ref: "#/components/schemas/TenantLoadRequest" - responses: - "202": - description: Tenant scheduled to load successfully - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive: + /v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive: parameters: - name: tenant_id in: path @@ -896,67 +435,39 @@ paths: responses: "202": description: Tenant scheduled to load successfully - "404": - description: No tenant or timeline found for the specified ids - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - /v1/tenant/{tenant_id}/synthetic_size: + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config: parameters: - - name: tenant_id + - name: tenant_shard_id in: path required: true schema: type: string - get: + - name: timeline_id + in: path + required: true + schema: + type: string + put: description: | - Calculate tenant's synthetic size + Either archives or unarchives the given timeline. + An archived timeline may not have any non-archived children. 
+ requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/ArchivalConfigRequest" responses: "200": - description: Tenant's synthetic size + description: Timeline (un)archived successfully + "409": + description: | + The tenant/timeline is already being modified, perhaps by a concurrent call to this API content: application/json: schema: - $ref: "#/components/schemas/SyntheticSizeResponse" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" + $ref: "#/components/schemas/ConflictError" "500": description: Generic operation error content: @@ -970,7 +481,7 @@ paths: schema: $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/{tenant_id}/size: + /v1/tenant/{tenant_id}/synthetic_size: parameters: - name: tenant_id in: path @@ -1000,19 +511,11 @@ paths: content: application/json: schema: - type: object - required: - - id - - size - properties: - id: - type: string - format: hex - size: - type: integer - nullable: true - description: | - Size metric in bytes or null if inputs_only=true was given. + $ref: "#/components/schemas/SyntheticSizeResponse" + text/html: + schema: + type: string + description: SVG representation of the tenant and its timelines. "401": description: Unauthorized Error content: @@ -1038,6 +541,49 @@ paths: schema: $ref: "#/components/schemas/ServiceUnavailableError" + /v1/tenant/{tenant_shard_id}/heatmap_upload: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + post: + description: | + If the location is in an attached mode, upload the current state to the remote heatmap + responses: + "200": + description: Success + + /v1/tenant/{tenant_shard_id}/secondary/download: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: wait_ms + description: If set, we will wait this long for download to complete, and if it isn't complete then return 202 + in: query + required: false + schema: + type: integer + post: + description: | + If the location is in secondary mode, download latest heatmap and layers + responses: + "200": + description: Success + content: + application/json: + schema: + $ref: "#/components/schemas/SecondaryProgress" + "202": + description: Download has started but not yet finished + content: + application/json: + schema: + $ref: "#/components/schemas/SecondaryProgress" /v1/tenant/{tenant_id}/timeline/: parameters: @@ -1075,29 +621,11 @@ paths: format: hex responses: "201": - description: TimelineInfo + description: Timeline was created, or already existed with matching parameters content: application/json: schema: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Malformed timeline create request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "406": description: Permanently unsatisfiable request, don't retry. 
          content:
@@ -1105,24 +633,92 @@ paths:
            schema:
              $ref: "#/components/schemas/Error"
        "409":
-          description: Timeline already exists, creation skipped
+          description: Timeline already exists, with different parameters. Creation cannot proceed.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ConflictError"
-        "500":
-          description: Generic operation error
+        "429":
+          description: A creation request was sent for the same Timeline Id while a creation was already in progress. Back off and retry.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+
+  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
+    parameters:
+      - name: tenant_shard_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+
+    put:
+      description: |
+        Detach a timeline from its ancestor and reparent all of the ancestor's timelines that have a lower `ancestor_lsn`.
+        The current implementation might not be retryable across all failure cases, but it will be enhanced in the future.
+        Detaching should be expected to be an expensive operation. Timeouts should be retried.
+      responses:
+        "200":
+          description: |
+            The timeline has been detached from its ancestor (now or earlier), and at least the returned timelines have been reparented.
+            If any timelines were deleted after reparenting, they might not be on this list.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/AncestorDetached"
+
+        "400":
+          description: |
+            One of several early checks determined that the timeline cannot be detached right now:
+            - the timeline's ancestor itself has an ancestor: not supported, see the RFC
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
+        "404":
+          description: Tenant or timeline not found.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/NotFoundError"
+
+        "409":
+          description: |
+            The timeline can never be detached:
+            - the timeline has no ancestor, implying that it has never had an ancestor
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ConflictError"
+
+        "500":
+          description: |
+            Transient error, for example, a pageserver shutdown happened while
+            processing the request but we were unable to distinguish that. Must
+            be retried.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
        "503":
-          description: Temporarily unavailable, please retry.
+          description: |
+            Temporarily unavailable, please retry. Possible reasons:
+            - another timeline detach for the same tenant is underway, please retry later
+            - detected shutdown error
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ServiceUnavailableError"
+
  /v1/tenant/:
    get:
      description: Get tenants list
@@ -1135,30 +731,6 @@ paths:
              type: array
              items:
                $ref: "#/components/schemas/TenantInfo"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
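The detach_ancestor 200 response above carries an AncestorDetached body; its schema (later in this diff) lists reparented timeline ids as hex strings. A small sketch of consuming it, assuming serde and serde_json; the derive placement is an assumption:

use serde::Deserialize;

// Mirrors the AncestorDetached schema referenced above.
#[derive(Deserialize)]
struct AncestorDetached {
    reparented_timelines: Vec<String>, // hex TimelineIds
}

fn report_reparented(body: &str) -> anyhow::Result<()> {
    let detached: AncestorDetached = serde_json::from_str(body)?;
    for id in &detached.reparented_timelines {
        println!("reparented timeline {id}");
    }
    Ok(())
}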
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" post: description: | @@ -1179,43 +751,12 @@ paths: application/json: schema: type: string - "400": - description: Malformed tenant create request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "409": description: Tenant already exists, creation skipped content: application/json: schema: $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/config: put: @@ -1237,36 +778,6 @@ paths: type: array items: $ref: "#/components/schemas/TenantInfo" - "400": - description: Malformed tenant config request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/config/: parameters: @@ -1286,42 +797,19 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantConfigResponse" - "400": - description: Malformed get tenanant config request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Tenand or timeline were not found - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" + + /v1/utilization: + get: + description: | + Returns the pageservers current utilization and fitness score for new tenants. + + responses: + "200": + description: Pageserver utilization and fitness score + content: + application/json: + schema: + $ref: "#/components/schemas/PageserverUtilization" components: securitySchemes: @@ -1355,8 +843,6 @@ components: For example this can be caused by s3 being unreachable. The retry may be implemented with call to detach, though it would be better to not automate it and inspec failed state manually before proceeding with a retry. - - See the tenant `/attach` endpoint for more information. 
type: object required: - slug @@ -1374,31 +860,19 @@ components: TenantCreateRequest: allOf: - $ref: '#/components/schemas/TenantConfig' + - $ref: '#/components/schemas/TenantLoadRequest' - type: object required: - new_tenant_id properties: new_tenant_id: type: string - generation: - type: integer - description: Attachment generation number. TenantLoadRequest: type: object properties: generation: type: integer description: Attachment generation number. - TenantAttachRequest: - type: object - required: - - config - properties: - config: - $ref: '#/components/schemas/TenantConfig' - generation: - type: integer - description: Attachment generation number. TenantConfigRequest: allOf: - $ref: '#/components/schemas/TenantConfig' @@ -1411,10 +885,8 @@ components: TenantLocationConfigRequest: type: object required: - - tenant_id + - mode properties: - tenant_id: - type: string mode: type: string enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"] @@ -1426,12 +898,47 @@ components: $ref: '#/components/schemas/SecondaryConfig' tenant_conf: $ref: '#/components/schemas/TenantConfig' + TenantLocationConfigResponse: + type: object + required: + - shards + properties: + shards: + description: Pageservers where this tenant's shards are attached. Not populated for secondary locations. + type: array + items: + $ref: "#/components/schemas/TenantShardLocation" + stripe_size: + description: If multiple shards are present, this field contains the sharding stripe size, else it is null. + type: integer + nullable: true + TenantShardLocation: + type: object + required: + - node_id + - shard_id + properties: + node_id: + description: Pageserver node ID where this shard is attached + type: integer + shard_id: + description: Tenant shard ID of the shard + type: string SecondaryConfig: type: object properties: warm: type: boolean description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything. + ArchivalConfigRequest: + type: object + required: + - state + properties: + state: + description: The archival state of a timeline + type: string + enum: ["Archived", "Unarchived"] TenantConfig: type: object properties: @@ -1459,10 +966,8 @@ components: type: string max_lsn_wal_lag: type: integer - trace_read_requests: - type: boolean heatmap_period: - type: integer + type: string TenantConfigResponse: type: object properties: @@ -1536,6 +1041,9 @@ components: format: hex size: type: integer + nullable: true + description: | + Size metric in bytes or null if inputs_only=true was given. segment_sizes: type: array items: @@ -1612,6 +1120,90 @@ components: kind: type: string enum: [past, present, future, nodata] + valid_until: + type: string + format: date-time + description: The expiration time of the granted lease. + + LsnLease: + type: object + required: + - valid_until + properties: + valid_until: + type: string + format: date-time + + PageserverUtilization: + type: object + required: + - disk_usage_bytes + - free_space_bytes + - utilization_score + properties: + disk_usage_bytes: + type: integer + format: int64 + minimum: 0 + description: The amount of disk space currently used. + free_space_bytes: + type: integer + format: int64 + minimum: 0 + description: The amount of usable disk space left. + utilization_score: + type: integer + format: int64 + minimum: 0 + maximum: 9223372036854775807 + default: 9223372036854775807 + description: | + Lower is better score for how good this pageserver would be for the next tenant. 
+ The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated. + + SecondaryProgress: + type: object + required: + - heatmap_mtime + - layers_downloaded + - layers_total + - bytes_downloaded + - bytes_total + properties: + heatmap_mtime: + type: string + format: date-time + description: Modification time of the most recently downloaded layer heatmap (RFC 3339 format) + layers_downloaded: + type: integer + format: int64 + description: How many layers from the latest layer heatmap are present on disk + bytes_downloaded: + type: integer + format: int64 + description: How many bytes of layer content from the latest layer heatmap are present on disk + layers_total: + type: integer + format: int64 + description: How many layers were in the latest layer heatmap + bytes_total: + type: integer + format: int64 + description: How many bytes of layer content were in the latest layer heatmap + + AncestorDetached: + type: object + required: + - reparented_timelines + properties: + reparented_timelines: + type: array + description: Set of reparented timeline ids + items: + type: string + format: hex + description: TimelineId + Error: type: object diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index aa56806246..d645f3b7b6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,6 +1,8 @@ //! //! Management HTTP API //! +use std::cmp::Reverse; +use std::collections::BinaryHeap; use std::collections::HashMap; use std::str::FromStr; use std::sync::Arc; @@ -8,55 +10,81 @@ use std::time::Duration; use anyhow::{anyhow, Context, Result}; use enumset::EnumSet; +use futures::StreamExt; use futures::TryFutureExt; use humantime::format_rfc3339; use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; +use pageserver_api::models::IngestAuxFilesRequest; +use pageserver_api::models::ListAuxFilesRequest; +use pageserver_api::models::LocationConfig; use pageserver_api::models::LocationConfigListResponse; +use pageserver_api::models::LocationConfigMode; +use pageserver_api::models::LsnLease; +use pageserver_api::models::LsnLeaseRequest; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; -use pageserver_api::models::TenantState; -use pageserver_api::models::{ - DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest, - TenantLoadRequest, TenantLocationConfigRequest, -}; +use pageserver_api::models::TenantLocationConfigRequest; +use pageserver_api::models::TenantLocationConfigResponse; +use pageserver_api::models::TenantScanRemoteStorageResponse; +use pageserver_api::models::TenantScanRemoteStorageShard; +use pageserver_api::models::TenantShardLocation; +use pageserver_api::models::TenantShardSplitRequest; +use pageserver_api::models::TenantShardSplitResponse; +use pageserver_api::models::TenantSorting; +use pageserver_api::models::TimelineArchivalConfigRequest; +use pageserver_api::models::TopTenantShardItem; +use pageserver_api::models::TopTenantShardsRequest; +use pageserver_api::models::TopTenantShardsResponse; +use pageserver_api::shard::ShardCount; use pageserver_api::shard::TenantShardId; +use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; -use tenant_size_model::{SizeResult, StorageModel}; +use remote_storage::TimeTravelError; +use 
tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel}; +use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; +use utils::http::endpoint::prometheus_metrics_handler; use utils::http::endpoint::request_span; -use utils::http::json::json_request_or_empty_body; +use utils::http::request::must_parse_query_param; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; -use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::{LocationConf, TenantConfOpt}; use crate::tenant::mgr::GetActiveTenantError; use crate::tenant::mgr::{ - GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError, - TenantSlotError, TenantSlotUpsertError, TenantStateError, + GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError, + TenantSlotUpsertError, TenantStateError, }; use crate::tenant::mgr::{TenantSlot, UpsertLocationError}; +use crate::tenant::remote_timeline_client; +use crate::tenant::remote_timeline_client::download_index_part; +use crate::tenant::remote_timeline_client::list_remote_tenant_shards; +use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; +use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::CompactFlags; +use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; -use crate::tenant::SpawnMode; +use crate::tenant::GetTimelineError; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::{config::PageServerConf, tenant::mgr}; use crate::{disk_usage_eviction_task, tenant}; use pageserver_api::models::{ - StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, - TimelineCreateRequest, TimelineGcRequest, TimelineInfo, + StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, + TimelineInfo, }; use utils::{ auth::SwappableJwtAuth, @@ -75,18 +103,25 @@ use utils::{ // For APIs that require an Active tenant, how long should we block waiting for that state? // This is not functionally necessary (clients will retry), but avoids generating a lot of // failed API calls while tenants are activating. -const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); +#[cfg(not(feature = "testing"))] +pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); + +// Tests run on slow/oversubscribed nodes, and may need to wait much longer for tenants to +// finish attaching, if calls to remote storage are slow. 
+#[cfg(feature = "testing")] +pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); pub struct State { conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, allowlist_routes: Vec, - remote_storage: Option, + remote_storage: GenericRemoteStorage, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, secondary_controller: SecondaryController, + latest_utilization: tokio::sync::Mutex>, } impl State { @@ -95,7 +130,7 @@ impl State { conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, - remote_storage: Option, + remote_storage: GenericRemoteStorage, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, @@ -115,6 +150,7 @@ impl State { disk_usage_eviction_state, deletion_queue_client, secondary_controller, + latest_utilization: Default::default(), }) } } @@ -142,13 +178,9 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res impl From for ApiError { fn from(pre: PageReconstructError) -> ApiError { match pre { - PageReconstructError::Other(pre) => ApiError::InternalServerError(pre), - PageReconstructError::Cancelled => { - ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) - } - PageReconstructError::AncestorStopping(_) => { - ApiError::ResourceUnavailable(format!("{pre}").into()) - } + PageReconstructError::Other(other) => ApiError::InternalServerError(other), + PageReconstructError::MissingKey(e) => ApiError::InternalServerError(e.into()), + PageReconstructError::Cancelled => ApiError::Cancelled, PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()), PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre), } @@ -172,7 +204,6 @@ impl From for ApiError { NotFound(tenant_id) => { ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into()) } - e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")), InProgress => { ApiError::ResourceUnavailable("Tenant is being modified concurrently".into()) } @@ -199,7 +230,7 @@ impl From for ApiError { BadRequest(e) => ApiError::BadRequest(e), Unavailable(_) => ApiError::ShuttingDown, e @ InProgress => ApiError::Conflict(format!("{e}")), - Flush(e) | Other(e) => ApiError::InternalServerError(e), + Flush(e) | InternalError(e) => ApiError::InternalServerError(e), } } } @@ -232,16 +263,11 @@ impl From for ApiError { fn from(tse: GetTenantError) -> ApiError { match tse { GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()), - GetTenantError::Broken(reason) => { - ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason)) - } GetTenantError::NotActive(_) => { // Why is this not `ApiError::NotFound`? // Because we must be careful to never return 404 for a tenant if it does // in fact exist locally. If we did, the caller could draw the conclusion // that it can attach the tenant to another PS and we'd be in split-brain. - // - // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls). 
                ApiError::ResourceUnavailable("Tenant not yet active".into())
            }
            GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
@@ -249,27 +275,29 @@
        }
    }
}

+impl From<GetTimelineError> for ApiError {
+    fn from(gte: GetTimelineError) -> Self {
+        // Rationale: tenant is activated only after eligible timelines activate
+        ApiError::NotFound(gte.into())
+    }
+}
+
 impl From<GetActiveTenantError> for ApiError {
     fn from(e: GetActiveTenantError) -> ApiError {
         match e {
+            GetActiveTenantError::Broken(reason) => {
+                ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
+            }
             GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
             GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
             GetActiveTenantError::NotFound(gte) => gte.into(),
             GetActiveTenantError::WaitForActiveTimeout { .. } => {
                 ApiError::ResourceUnavailable(format!("{}", e).into())
             }
-        }
-    }
-}
-
-impl From<SetNewTenantConfigError> for ApiError {
-    fn from(e: SetNewTenantConfigError) -> ApiError {
-        match e {
-            SetNewTenantConfigError::GetTenant(tid) => {
-                ApiError::NotFound(anyhow!("tenant {}", tid).into())
-            }
-            e @ (SetNewTenantConfigError::Persist(_) | SetNewTenantConfigError::Other(_)) => {
-                ApiError::InternalServerError(anyhow::Error::new(e))
+            GetActiveTenantError::SwitchedTenant => {
+                // in our HTTP handlers, this error doesn't happen
+                // TODO: separate error types
+                ApiError::ResourceUnavailable("switched tenant".into())
             }
         }
     }
@@ -290,6 +318,27 @@ impl From for ApiError {
     }
 }
 
+impl From<crate::tenant::TimelineArchivalError> for ApiError {
+    fn from(value: crate::tenant::TimelineArchivalError) -> Self {
+        use crate::tenant::TimelineArchivalError::*;
+        match value {
+            NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
+            Timeout => ApiError::Timeout("hit pageserver internal timeout".into()),
+            e @ HasArchivedParent(_) => {
+                ApiError::PreconditionFailed(e.to_string().into_boxed_str())
+            }
+            HasUnarchivedChildren(children) => ApiError::PreconditionFailed(
+                format!(
+                    "Cannot archive timeline which has non-archived child timelines: {children:?}"
+                )
+                .into_boxed_str(),
+            ),
+            a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
+            Other(e) => ApiError::InternalServerError(e),
+        }
+    }
+}
+
 impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
     fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
         use crate::tenant::mgr::DeleteTimelineError::*;
@@ -305,18 +354,12 @@ impl From for ApiError {
     }
 }
 
-impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
-    fn from(value: crate::tenant::delete::DeleteTenantError) -> Self {
-        use crate::tenant::delete::DeleteTenantError::*;
+impl From<crate::tenant::mgr::DeleteTenantError> for ApiError {
+    fn from(value: crate::tenant::mgr::DeleteTenantError) -> Self {
+        use crate::tenant::mgr::DeleteTenantError::*;
         match value {
-            Get(g) => ApiError::from(g),
-            e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
-            Timeline(t) => ApiError::from(t),
-            NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
             SlotError(e) => e.into(),
-            SlotUpsertError(e) => e.into(),
             Other(o) => ApiError::InternalServerError(o),
-            e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
             Cancelled => ApiError::ShuttingDown,
         }
     }
@@ -366,7 +409,7 @@ async fn build_timeline_info_common(
     let guard = timeline.last_received_wal.lock().unwrap();
     if let Some(info) = guard.as_ref() {
         (
-            Some(format!("{:?}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
+            Some(format!("{}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
Some(info.last_received_msg_lsn), Some(info.last_received_msg_ts), ) @@ -383,6 +426,8 @@ async fn build_timeline_info_common( let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx); let current_physical_size = Some(timeline.layer_size_sum().await); let state = timeline.current_state(); + // Report is_archived = false if the timeline is still loading + let is_archived = timeline.is_archived().unwrap_or(false); let remote_consistent_lsn_projected = timeline .get_remote_consistent_lsn_projected() .unwrap_or(Lsn(0)); @@ -392,6 +437,8 @@ async fn build_timeline_info_common( let walreceiver_status = timeline.walreceiver_status(); + let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats(); + let info = TimelineInfo { tenant_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, @@ -409,8 +456,11 @@ async fn build_timeline_info_common( tenant::timeline::logical_size::Accuracy::Approximate => false, tenant::timeline::logical_size::Accuracy::Exact => true, }, + directory_entries_counts: timeline.get_directory_metrics().to_vec(), current_physical_size, current_logical_size_non_incremental: None, + pitr_history_size, + within_ancestor_pitr, timeline_dir_layer_file_size_sum: None, wal_source_connstr, last_received_msg_lsn, @@ -418,8 +468,11 @@ async fn build_timeline_info_common( pg_version: timeline.pg_version, state, + is_archived: Some(is_archived), walreceiver_status, + + last_aux_file_policy: timeline.last_aux_file_policy.load(), }; Ok(info) } @@ -454,8 +507,12 @@ async fn reload_auth_validation_keys_handler( json_response(StatusCode::OK, ()) } Err(e) => { + let err_msg = "Error reloading public keys"; warn!("Error reloading public keys from {key_path:?}: {e:}"); - json_response(StatusCode::INTERNAL_SERVER_ERROR, ()) + json_response( + StatusCode::INTERNAL_SERVER_ERROR, + HttpErrorBody::from_msg(err_msg.to_string()), + ) } } } @@ -475,52 +532,77 @@ async fn timeline_create_handler( let state = get_state(&request); async { - let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - match tenant.create_timeline( - new_timeline_id, - request_data.ancestor_timeline_id.map(TimelineId::from), - request_data.ancestor_start_lsn, - request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION), - request_data.existing_initdb_timeline_id, - state.broker_client.clone(), - &ctx, - ) - .await { + if let Some(ancestor_id) = request_data.ancestor_timeline_id.as_ref() { + tracing::info!(%ancestor_id, "starting to branch"); + } else { + tracing::info!("bootstrapping"); + } + + match tenant + .create_timeline( + new_timeline_id, + request_data.ancestor_timeline_id, + request_data.ancestor_start_lsn, + request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION), + request_data.existing_initdb_timeline_id, + state.broker_client.clone(), + &ctx, + ) + .await + { Ok(new_timeline) => { // Created. Construct a TimelineInfo for it. 
- let timeline_info = build_timeline_info_common(&new_timeline, &ctx, tenant::timeline::GetLogicalSizePriority::User) - .await - .map_err(ApiError::InternalServerError)?; + let timeline_info = build_timeline_info_common( + &new_timeline, + &ctx, + tenant::timeline::GetLogicalSizePriority::User, + ) + .await + .map_err(ApiError::InternalServerError)?; json_response(StatusCode::CREATED, timeline_info) } Err(_) if tenant.cancel.is_cancelled() => { // In case we get some ugly error type during shutdown, cast it into a clean 503. - json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("Tenant shutting down".to_string())) + json_response( + StatusCode::SERVICE_UNAVAILABLE, + HttpErrorBody::from_msg("Tenant shutting down".to_string()), + ) } - Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => { - json_response(StatusCode::CONFLICT, ()) - } - Err(tenant::CreateTimelineError::AncestorLsn(err)) => { - json_response(StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg( - format!("{err:#}") - )) - } - Err(e @ tenant::CreateTimelineError::AncestorNotActive) => { - json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string())) - } - Err(tenant::CreateTimelineError::ShuttingDown) => { - json_response(StatusCode::SERVICE_UNAVAILABLE,HttpErrorBody::from_msg("tenant shutting down".to_string())) + Err(e @ tenant::CreateTimelineError::Conflict) => { + json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())) } + Err(e @ tenant::CreateTimelineError::AlreadyCreating) => json_response( + StatusCode::TOO_MANY_REQUESTS, + HttpErrorBody::from_msg(e.to_string()), + ), + Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response( + StatusCode::NOT_ACCEPTABLE, + HttpErrorBody::from_msg(format!("{err:#}")), + ), + Err(e @ tenant::CreateTimelineError::AncestorNotActive) => json_response( + StatusCode::SERVICE_UNAVAILABLE, + HttpErrorBody::from_msg(e.to_string()), + ), + Err(tenant::CreateTimelineError::ShuttingDown) => json_response( + StatusCode::SERVICE_UNAVAILABLE, + HttpErrorBody::from_msg("tenant shutting down".to_string()), + ), Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)), } } .instrument(info_span!("timeline_create", tenant_id = %tenant_shard_id.tenant_id, - shard = %tenant_shard_id.shard_slug(), - timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) + shard_id = %tenant_shard_id.shard_slug(), + timeline_id = %new_timeline_id, + lsn=?request_data.ancestor_start_lsn, + pg_version=?request_data.pg_version + )) .await } @@ -535,10 +617,16 @@ async fn timeline_list_handler( parse_query_param(&request, "force-await-initial-logical-size")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let response_data = async { - let tenant = mgr::get_tenant(tenant_shard_id, true)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + let timelines = tenant.list_timelines(); let mut response_data = Vec::with_capacity(timelines.len()); @@ -573,6 +661,7 @@ async fn timeline_preserve_initdb_handler( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, 
Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); // Part of the process for disaster recovery from safekeeper-stored WAL: // If we don't recover into a new timeline but want to keep the timeline ID, @@ -580,11 +669,11 @@ async fn timeline_preserve_initdb_handler( // location where timeline recreation cand find it. async { - let tenant = mgr::get_tenant(tenant_shard_id, true)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; - let timeline = tenant - .get_timeline(timeline_id, false) - .map_err(|e| ApiError::NotFound(e.into()))?; + let timeline = tenant.get_timeline(timeline_id, false)?; timeline .preserve_initdb_archive() @@ -603,6 +692,37 @@ async fn timeline_preserve_initdb_handler( json_response(StatusCode::OK, ()) } +async fn timeline_archival_config_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + let request_data: TimelineArchivalConfigRequest = json_request(&mut request).await?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant + .apply_timeline_archival_config(timeline_id, request_data.state) + .await?; + Ok::<_, ApiError>(()) + } + .instrument(info_span!("timeline_archival_config", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + state = ?request_data.state, + %timeline_id)) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn timeline_detail_handler( request: Request, _cancel: CancellationToken, @@ -617,13 +737,16 @@ async fn timeline_detail_handler( // Logical size calculation needs downloading. 
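The timeline_archival_config handler above accepts a TimelineArchivalConfigRequest whose `state` is "Archived" or "Unarchived", per the ArchivalConfigRequest schema earlier in this diff. A hedged client-side sketch; the URL layout and the reqwest/serde_json/anyhow crates are illustrative choices:

use serde_json::json;

// Client-side sketch of the archival_config call handled above.
async fn set_archival(
    base_url: &str,
    tenant_shard_id: &str,
    timeline_id: &str,
    archived: bool,
) -> anyhow::Result<()> {
    let state = if archived { "Archived" } else { "Unarchived" };
    let url = format!(
        "{base_url}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config"
    );
    let resp = reqwest::Client::new()
        .put(&url)
        .json(&json!({ "state": state }))
        .send()
        .await?;
    anyhow::ensure!(resp.status().is_success(), "archival_config: {}", resp.status());
    Ok(())
}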
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let state = get_state(&request); let timeline_info = async { - let tenant = mgr::get_tenant(tenant_shard_id, true)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; - let timeline = tenant - .get_timeline(timeline_id, false) - .map_err(|e| ApiError::NotFound(e.into()))?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let timeline = tenant.get_timeline(timeline_id, false)?; let timeline_info = build_timeline_info( &timeline, @@ -652,8 +775,9 @@ async fn get_lsn_by_timestamp_handler( ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" @@ -667,15 +791,25 @@ async fn get_lsn_by_timestamp_handler( .map_err(ApiError::BadRequest)?; let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp); + let with_lease = parse_query_param(&request, "with_lease")?.unwrap_or(false); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let result = timeline .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx) .await?; - #[derive(serde::Serialize)] + + #[derive(serde::Serialize, Debug)] struct Result { lsn: Lsn, kind: &'static str, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(flatten)] + lease: Option, } let (lsn, kind) = match result { LsnForTimestamp::Present(lsn) => (lsn, "present"), @@ -683,7 +817,31 @@ async fn get_lsn_by_timestamp_handler( LsnForTimestamp::Past(lsn) => (lsn, "past"), LsnForTimestamp::NoData(lsn) => (lsn, "nodata"), }; - json_response(StatusCode::OK, Result { lsn, kind }) + + let lease = if with_lease { + timeline + .make_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx) + .inspect_err(|_| { + warn!("fail to grant a lease to {}", lsn); + }) + .ok() + } else { + None + }; + + let result = Result { lsn, kind, lease }; + let valid_until = result + .lease + .as_ref() + .map(|l| humantime::format_rfc3339_millis(l.valid_until).to_string()); + tracing::info!( + lsn=?result.lsn, + kind=%result.kind, + timestamp=%timestamp_raw, + valid_until=?valid_until, + "lsn_by_timestamp finished" + ); + json_response(StatusCode::OK, result) } async fn get_timestamp_of_lsn_handler( @@ -692,8 +850,9 @@ async fn get_timestamp_of_lsn_handler( ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" @@ -708,82 +867,25 @@ async fn get_timestamp_of_lsn_handler( .map_err(ApiError::BadRequest)?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, 
timeline_id).await?; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?; match result { Some(time) => { - let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string(); + let time = format_rfc3339( + postgres_ffi::try_from_pg_timestamp(time).map_err(ApiError::InternalServerError)?, + ) + .to_string(); json_response(StatusCode::OK, time) } - None => json_response(StatusCode::NOT_FOUND, ()), + None => Err(ApiError::NotFound( + anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(), + )), } } -async fn tenant_attach_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - - let maybe_body: Option = json_request_or_empty_body(&mut request).await?; - let tenant_conf = match &maybe_body { - Some(request) => TenantConfOpt::try_from(&*request.config).map_err(ApiError::BadRequest)?, - None => TenantConfOpt::default(), - }; - - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - - info!("Handling tenant attach {tenant_id}"); - - let state = get_state(&request); - - let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?; - - if state.remote_storage.is_none() { - return Err(ApiError::BadRequest(anyhow!( - "attach_tenant is not possible because pageserver was configured without remote storage" - ))); - } - - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - let shard_params = ShardParameters::default(); - let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params); - - let tenant = state - .tenant_manager - .upsert_location( - tenant_shard_id, - location_conf, - None, - SpawnMode::Normal, - &ctx, - ) - .await?; - - let Some(tenant) = tenant else { - // This should never happen: indicates a bug in upsert_location - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Upsert succeeded but didn't return tenant!" 
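The `with_lease` response built in `get_lsn_by_timestamp_handler` above leans on serde's `#[serde(flatten)]` plus `skip_serializing_if` combination to splice an optional lease into the top level of the JSON object. A minimal, self-contained sketch of that serialization behavior; the `Lease` type and its field are hypothetical stand-ins for the real lease struct:

```rust
use serde::Serialize;

// Hypothetical stand-in for the real lease type; only the shape of the
// serialized output matters for this sketch.
#[derive(Serialize)]
struct Lease {
    valid_until_millis: u64,
}

#[derive(Serialize)]
struct LsnByTimestampResult {
    lsn: u64,
    kind: &'static str,
    // None: the field is omitted entirely. Some(lease): the lease's fields
    // are inlined at the top level instead of nested under a "lease" key.
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(flatten)]
    lease: Option<Lease>,
}

fn main() {
    let without = LsnByTimestampResult { lsn: 42, kind: "present", lease: None };
    let with = LsnByTimestampResult {
        lsn: 42,
        kind: "present",
        lease: Some(Lease { valid_until_millis: 1_700_000_000_000 }),
    };
    println!("{}", serde_json::to_string(&without).unwrap()); // {"lsn":42,"kind":"present"}
    println!("{}", serde_json::to_string(&with).unwrap());
    // {"lsn":42,"kind":"present","valid_until_millis":1700000000000}
}
```

This keeps the response backward compatible: clients that never ask for a lease see exactly the old two-field object.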
- ))); - }; - - // We might have successfully constructed a Tenant, but it could still - // end up in a broken state: - if let TenantState::Broken { - reason, - backtrace: _, - } = tenant.current_state() - { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Tenant state is Broken: {reason}" - ))); - } - - json_response(StatusCode::ACCEPTED, ()) -} - async fn timeline_delete_handler( request: Request, _cancel: CancellationToken, @@ -796,7 +898,7 @@ async fn timeline_delete_handler( let tenant = state .tenant_manager - .get_attached_tenant_shard(tenant_shard_id, false) + .get_attached_tenant_shard(tenant_shard_id) .map_err(|e| { match e { // GetTenantError has a built-in conversion to ApiError, but in this context we don't @@ -808,37 +910,12 @@ async fn timeline_delete_handler( } })?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id)) + tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id)) .await?; json_response(StatusCode::ACCEPTED, ()) } -async fn tenant_detach_handler( - request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - let detach_ignored: Option = parse_query_param(&request, "detach_ignored")?; - - // This is a legacy API (`/location_conf` is the replacement). It only supports unsharded tenants - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - - let state = get_state(&request); - let conf = state.conf; - mgr::detach_tenant( - conf, - tenant_shard_id, - detach_ignored.unwrap_or(false), - &state.deletion_queue_client, - ) - .instrument(info_span!("tenant_detach", %tenant_id)) - .await?; - - json_response(StatusCode::OK, ()) -} - async fn tenant_reset_handler( request: Request, _cancel: CancellationToken, @@ -852,70 +929,23 @@ async fn tenant_reset_handler( let state = get_state(&request); state .tenant_manager - .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx) + .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), &ctx) .await .map_err(ApiError::InternalServerError)?; json_response(StatusCode::OK, ()) } -async fn tenant_load_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - - let maybe_body: Option = json_request_or_empty_body(&mut request).await?; - - let state = get_state(&request); - - // The /load request is only usable when control_plane_api is not set. Once it is set, callers - // should always use /attach instead. 
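The `timeline_delete` call above attaches an `info_span!` carrying `tenant_id`, `shard_id`, and `timeline_id` fields to the deletion future via `Instrument`. A standalone sketch of that pattern, assuming the `tracing`, `tracing-subscriber`, and `tokio` crates; the ids are made up:

```rust
use tracing::{info, info_span, Instrument};

async fn delete_timeline() {
    // Events emitted here inherit the fields of the surrounding span.
    info!("deleting timeline");
}

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt().init();
    // `%` records a field using its Display implementation.
    let span = info_span!(
        "timeline_delete",
        tenant_id = %"1f359dd625e519a1a4e8d7509690f6fc",
        shard_id = %"0001",
        timeline_id = %"de200bd42b49cc1814412c7e592dd6e9",
    );
    delete_timeline().instrument(span).await;
}
```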
- let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?; - - mgr::load_tenant( - state.conf, - tenant_id, - generation, - state.broker_client.clone(), - state.remote_storage.clone(), - state.deletion_queue_client.clone(), - &ctx, - ) - .instrument(info_span!("load", %tenant_id)) - .await?; - - json_response(StatusCode::ACCEPTED, ()) -} - -async fn tenant_ignore_handler( - request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - - let state = get_state(&request); - let conf = state.conf; - mgr::ignore_tenant(conf, tenant_id) - .instrument(info_span!("ignore_tenant", %tenant_id)) - .await?; - - json_response(StatusCode::OK, ()) -} - async fn tenant_list_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { check_permission(&request, None)?; + let state = get_state(&request); - let response_data = mgr::list_tenants() - .instrument(info_span!("tenant_list")) - .await + let response_data = state + .tenant_manager + .list_tenants() .map_err(|_| { ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into()) })? @@ -925,7 +955,10 @@ async fn tenant_list_handler( state: state.clone(), current_physical_size: None, attachment_status: state.attachment_status(), - generation: (*gen).into(), + generation: (*gen) + .into() + .expect("Tenants are always attached with a generation"), + gc_blocking: None, }) .collect::>(); @@ -938,9 +971,27 @@ async fn tenant_status( ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + + // In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting. + let activate = true; + #[cfg(feature = "testing")] + let activate = parse_query_param(&request, "activate")?.unwrap_or(activate); let tenant_info = async { - let tenant = mgr::get_tenant(tenant_shard_id, false)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + if activate { + // This is advisory: we prefer to let the tenant activate on-demand when this function is + // called, but it is still valid to return 200 and describe the current state of the tenant + // if it doesn't make it into an active state. 
+ tenant + .wait_to_become_active(ACTIVE_TENANT_TIMEOUT) + .await + .ok(); + } // Calculate total physical size of all timelines let mut current_physical_size = 0; @@ -955,8 +1006,13 @@ async fn tenant_status( state: state.clone(), current_physical_size: Some(current_physical_size), attachment_status: state.attachment_status(), - generation: tenant.generation().into(), + generation: tenant + .generation() + .into() + .expect("Tenants are always attached with a generation"), + gc_blocking: tenant.gc_block.summary().map(|x| format!("{x:?}")), }, + walredo: tenant.wal_redo_manager_status(), timelines: tenant.list_timeline_ids(), }) } @@ -980,14 +1036,14 @@ async fn tenant_delete_handler( state .tenant_manager - .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT) + .delete_tenant(tenant_shard_id) .instrument(info_span!("tenant_delete_handler", tenant_id = %tenant_shard_id.tenant_id, - shard = %tenant_shard_id.shard_slug() + shard_id = %tenant_shard_id.shard_slug() )) .await?; - json_response(StatusCode::ACCEPTED, ()) + json_response(StatusCode::OK, ()) } /// HTTP endpoint to query the current tenant_size of a tenant. @@ -1012,16 +1068,20 @@ async fn tenant_size_handler( let inputs_only: Option = parse_query_param(&request, "inputs_only")?; let retention_period: Option = parse_query_param(&request, "retention_period")?; let headers = request.headers(); + let state = get_state(&request); - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let tenant = mgr::get_tenant(tenant_shard_id, true)?; - - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" ))); } + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + // this can be long operation let inputs = tenant .gather_size_inputs( @@ -1031,7 +1091,10 @@ async fn tenant_size_handler( &ctx, ) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| match e { + crate::tenant::size::CalculateSyntheticSizeError::Cancelled => ApiError::ShuttingDown, + other => ApiError::InternalServerError(anyhow::anyhow!(other)), + })?; let mut sizes = None; let accepts_html = headers @@ -1039,9 +1102,7 @@ async fn tenant_size_handler( .map(|v| v == "text/html") .unwrap_or_default(); if !inputs_only.unwrap_or(false) { - let storage_model = inputs - .calculate_model() - .map_err(ApiError::InternalServerError)?; + let storage_model = inputs.calculate_model(); let size = storage_model.calculate(); // If request header expects html, return html @@ -1080,6 +1141,35 @@ async fn tenant_size_handler( ) } +async fn tenant_shard_split_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let req: TenantShardSplitRequest = json_request(&mut request).await?; + + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let state = get_state(&request); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let new_shards = state + .tenant_manager + .shard_split( + tenant, + ShardCount::new(req.new_shard_count), + req.new_stripe_size, + &ctx, + ) + .await + .map_err(ApiError::InternalServerError)?; + + 
json_response(StatusCode::OK, TenantShardSplitResponse { new_shards }) +} + async fn layer_map_info_handler( request: Request, _cancel: CancellationToken, @@ -1088,11 +1178,17 @@ async fn layer_map_info_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let reset: LayerAccessStatsReset = parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset); + let state = get_state(&request); check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; - let layer_map_info = timeline.layer_map_info(reset).await; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + let layer_map_info = timeline + .layer_map_info(reset) + .await + .map_err(|_shutdown| ApiError::ShuttingDown)?; json_response(StatusCode::OK, layer_map_info) } @@ -1105,10 +1201,15 @@ async fn layer_download_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let layer_name = LayerName::from_str(layer_file_name) + .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; + let state = get_state(&request); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let downloaded = timeline - .download_layer(layer_file_name) + .download_layer(&layer_name) .await .map_err(ApiError::InternalServerError)?; @@ -1130,10 +1231,16 @@ async fn evict_timeline_layer_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; + let state = get_state(&request); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let layer_name = LayerName::from_str(layer_file_name) + .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let evicted = timeline - .evict_layer(layer_file_name) + .evict_layer(&layer_name) .await .map_err(ApiError::InternalServerError)?; @@ -1147,6 +1254,72 @@ async fn evict_timeline_layer_handler( } } +async fn timeline_gc_blocking_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + block_or_unblock_gc(request, true).await +} + +async fn timeline_gc_unblocking_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + block_or_unblock_gc(request, false).await +} + +/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`. +/// +/// Both are technically unsafe because they might fire off index uploads, thus they are POST. 
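The function defined next maps failures to HTTP statuses by downcasting the `anyhow::Error`: shutdown-related error types become 503, anything else 500. A condensed, self-contained model of that technique, with a hypothetical `NotInitialized` error standing in for the real upload-queue error types:

```rust
use std::fmt;

// Hypothetical error type; the point is that it implements std::error::Error,
// so anyhow preserves it for later downcast checks.
#[derive(Debug)]
struct NotInitialized;

impl fmt::Display for NotInitialized {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "upload queue not initialized")
    }
}

impl std::error::Error for NotInitialized {}

// Shutdown-ish error types map to 503, everything else to 500.
fn to_status(e: &anyhow::Error) -> u16 {
    if e.is::<NotInitialized>() {
        503
    } else {
        500
    }
}

fn main() {
    assert_eq!(to_status(&anyhow::Error::new(NotInitialized)), 503);
    assert_eq!(to_status(&anyhow::anyhow!("boom")), 500);
}
```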
+async fn block_or_unblock_gc( + request: Request, + block: bool, +) -> Result, ApiError> { + use crate::tenant::{ + remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized, + }; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let state = get_state(&request); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let timeline = tenant.get_timeline(timeline_id, true)?; + + let fut = async { + if block { + timeline.block_gc(&tenant).await.map(|_| ()) + } else { + timeline.unblock_gc(&tenant).await + } + }; + + let span = tracing::info_span!( + "block_or_unblock_gc", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + timeline_id = %timeline_id, + block = block, + ); + + let res = fut.instrument(span).await; + + res.map_err(|e| { + if e.is::() || e.is::() { + ApiError::ShuttingDown + } else { + ApiError::InternalServerError(e) + } + })?; + + json_response(StatusCode::OK, ()) +} + /// Get tenant_size SVG graph along with the JSON data. fn synthetic_size_html_response( inputs: ModelInputs, @@ -1159,10 +1332,15 @@ fn synthetic_size_html_response( timeline_map.insert(ti.timeline_id, index); timeline_ids.push(ti.timeline_id.to_string()); } - let seg_to_branch: Vec = inputs + let seg_to_branch: Vec<(usize, SvgBranchKind)> = inputs .segments .iter() - .map(|seg| *timeline_map.get(&seg.timeline_id).unwrap()) + .map(|seg| { + ( + *timeline_map.get(&seg.timeline_id).unwrap(), + seg.kind.into(), + ) + }) .collect(); let svg = @@ -1203,83 +1381,17 @@ pub fn html_response(status: StatusCode, data: String) -> Result, Ok(response) } -/// Helper for requests that may take a generation, which is mandatory -/// when control_plane_api is set, but otherwise defaults to Generation::none() -fn get_request_generation(state: &State, req_gen: Option) -> Result { - if state.conf.control_plane_api.is_some() { - req_gen - .map(Generation::new) - .ok_or(ApiError::BadRequest(anyhow!( - "generation attribute missing" - ))) - } else { - // Legacy mode: all tenants operate with no generation - Ok(Generation::none()) - } -} - -async fn tenant_create_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - let request_data: TenantCreateRequest = json_request(&mut request).await?; - let target_tenant_id = request_data.new_tenant_id; - check_permission(&request, None)?; - - let _timer = STORAGE_TIME_GLOBAL - .get_metric_with_label_values(&[StorageTimeOperation::CreateTenant.into()]) - .expect("bug") - .start_timer(); - - let tenant_conf = - TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?; - - let state = get_state(&request); - - let generation = get_request_generation(state, request_data.generation)?; - - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - - let location_conf = - LocationConf::attached_single(tenant_conf, generation, &request_data.shard_parameters); - - let new_tenant = state - .tenant_manager - .upsert_location( - target_tenant_id, - location_conf, - None, - SpawnMode::Create, - &ctx, - ) - .await?; - - let Some(new_tenant) = new_tenant else { - // This should never happen: indicates a bug in upsert_location - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Upsert 
succeeded but didn't return tenant!" - ))); - }; - // We created the tenant. Existing API semantics are that the tenant - // is Active when this function returns. - new_tenant - .wait_to_become_active(ACTIVE_TENANT_TIMEOUT) - .await?; - - json_response( - StatusCode::CREATED, - TenantCreateResponse(new_tenant.tenant_shard_id().tenant_id), - ) -} - async fn get_tenant_config_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); - let tenant = mgr::get_tenant(tenant_shard_id, false)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; let response = HashMap::from([ ( @@ -1307,13 +1419,31 @@ async fn update_tenant_config_handler( let tenant_id = request_data.tenant_id; check_permission(&request, Some(tenant_id))?; - let tenant_conf = + let new_tenant_conf = TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?; let state = get_state(&request); - mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id) - .instrument(info_span!("tenant_config", %tenant_id)) - .await?; + + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + // This is a legacy API that only operates on attached tenants: the preferred + // API to use is the location_config/ endpoint, which lets the caller provide + // the full LocationConf. + let location_conf = LocationConf::attached_single( + new_tenant_conf.clone(), + tenant.get_generation(), + &ShardParameters::default(), + ); + + crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) + .await + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; + tenant.set_new_tenant_config(new_tenant_conf); json_response(StatusCode::OK, ()) } @@ -1326,6 +1456,7 @@ async fn put_tenant_location_config_handler( let request_data: TenantLocationConfigRequest = json_request(&mut request).await?; let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis); + let lazy = parse_query_param(&request, "lazy")?.unwrap_or(false); check_permission(&request, Some(tenant_shard_id.tenant_id))?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); @@ -1335,13 +1466,14 @@ async fn put_tenant_location_config_handler( // The `Detached` state is special, it doesn't upsert a tenant, it removes // its local disk content and drops it from memory. 
     if let LocationConfigMode::Detached = request_data.config.mode {
-        if let Err(e) =
-            mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
-                .instrument(info_span!("tenant_detach",
-                    tenant_id = %tenant_shard_id.tenant_id,
-                    shard = %tenant_shard_id.shard_slug()
-                ))
-                .await
+        if let Err(e) = state
+            .tenant_manager
+            .detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client)
+            .instrument(info_span!("tenant_detach",
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug()
+            ))
+            .await
         {
             match e {
                 TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
@@ -1356,16 +1488,20 @@ async fn put_tenant_location_config_handler(
     let location_conf =
         LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

-    state
+    // lazy==true queues the tenant up for activation (mirroring the ordering used
+    // at startup); a connecting compute can still make it jump the queue.
+    let spawn_mode = if lazy {
+        tenant::SpawnMode::Lazy
+    } else {
+        tenant::SpawnMode::Eager
+    };
+
+    let tenant = state
         .tenant_manager
-        .upsert_location(
-            tenant_shard_id,
-            location_conf,
-            flush,
-            tenant::SpawnMode::Normal,
-            &ctx,
-        )
+        .upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx)
         .await?;
+    let stripe_size = tenant.as_ref().map(|t| t.get_shard_stripe_size());
+    let attached = tenant.is_some();

     if let Some(_flush_ms) = flush {
         match state
@@ -1384,7 +1520,26 @@ async fn put_tenant_location_config_handler(
         tracing::info!("No flush requested when configuring");
     }

-    json_response(StatusCode::OK, ())
+
+    // This API returns a vector of pageservers where the tenant is attached: this is
+    // primarily for use in the sharding service. For compatibility, we also return this
+    // when called directly on a pageserver, but the payload is always zero or one shards.
+    let mut response = TenantLocationConfigResponse {
+        shards: Vec::new(),
+        stripe_size: None,
+    };
+    if attached {
+        response.shards.push(TenantShardLocation {
+            shard_id: tenant_shard_id,
+            node_id: state.conf.id,
+        });
+        if tenant_shard_id.shard_count.count() > 1 {
+            // Stripe size should be set if we are attached
+            debug_assert!(stripe_size.is_some());
+            response.stripe_size = stripe_size;
+        }
+    }
+
+    json_response(StatusCode::OK, response)
 }

 async fn list_location_config_handler(
@@ -1409,6 +1564,96 @@ async fn list_location_config_handler(
     json_response(StatusCode::OK, result)
 }

+async fn get_location_config_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&request);
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let slot = state.tenant_manager.get(tenant_shard_id);
+
+    let Some(slot) = slot else {
+        return Err(ApiError::NotFound(
+            anyhow::anyhow!("Tenant shard not found").into(),
+        ));
+    };
+
+    let result: Option<LocationConfig> = match slot {
+        TenantSlot::Attached(t) => Some(t.get_location_conf()),
+        TenantSlot::Secondary(s) => Some(s.get_location_conf()),
+        TenantSlot::InProgress(_) => None,
+    };
+
+    json_response(StatusCode::OK, result)
+}
+
+// Do a time travel recovery on the given tenant/tenant shard. Tenant needs to be detached
+// (from all pageservers) as it invalidates consistency assumptions.
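Before the handler itself, a sketch of the up-front validation it performs on the `travel_to` and `done_if_after` query parameters: both are RFC 3339 timestamps parsed with `humantime`, and an inverted range is rejected. The values below are made up:

```rust
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // humantime::parse_rfc3339 yields a std::time::SystemTime.
    let travel_to = humantime::parse_rfc3339("2024-01-01T00:00:00Z")?;
    let done_if_after = humantime::parse_rfc3339("2024-01-02T00:00:00Z")?;

    // Reject inverted ranges up front, like the handler's sanity check.
    if travel_to > done_if_after {
        return Err("done_if_after comes before the timestamp to recover to".into());
    }
    println!("valid request: travel_to <= done_if_after");
    Ok(())
}
```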
+async fn tenant_time_travel_remote_storage_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let timestamp_raw = must_get_query_param(&request, "travel_to")?; + let timestamp = humantime::parse_rfc3339(×tamp_raw) + .with_context(|| format!("Invalid time for travel_to: {timestamp_raw:?}")) + .map_err(ApiError::BadRequest)?; + + let done_if_after_raw = must_get_query_param(&request, "done_if_after")?; + let done_if_after = humantime::parse_rfc3339(&done_if_after_raw) + .with_context(|| format!("Invalid time for done_if_after: {done_if_after_raw:?}")) + .map_err(ApiError::BadRequest)?; + + // This is just a sanity check to fend off naive wrong usages of the API: + // the tenant needs to be detached *everywhere* + let state = get_state(&request); + let we_manage_tenant = state.tenant_manager.manages_tenant_shard(tenant_shard_id); + if we_manage_tenant { + return Err(ApiError::BadRequest(anyhow!( + "Tenant {tenant_shard_id} is already attached at this pageserver" + ))); + } + + if timestamp > done_if_after { + return Err(ApiError::BadRequest(anyhow!( + "The done_if_after timestamp comes before the timestamp to recover to" + ))); + } + + tracing::info!("Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}"); + + remote_timeline_client::upload::time_travel_recover_tenant( + &state.remote_storage, + &tenant_shard_id, + timestamp, + done_if_after, + &cancel, + ) + .await + .map_err(|e| match e { + TimeTravelError::BadInput(e) => { + warn!("bad input error: {e}"); + ApiError::BadRequest(anyhow!("bad input error")) + } + TimeTravelError::Unimplemented => { + ApiError::BadRequest(anyhow!("unimplemented for the configured remote storage")) + } + TimeTravelError::Cancelled => ApiError::InternalServerError(anyhow!("cancelled")), + TimeTravelError::TooManyVersions => { + ApiError::InternalServerError(anyhow!("too many versions in remote storage")) + } + TimeTravelError::Other(e) => { + warn!("internal error: {e}"); + ApiError::InternalServerError(anyhow!("internal error")) + } + })?; + + json_response(StatusCode::OK, ()) +} + /// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`]. async fn handle_tenant_break( r: Request, @@ -1416,14 +1661,40 @@ async fn handle_tenant_break( ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?; - let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true) - .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?; - - tenant.set_broken("broken from test".to_owned()).await; + let state = get_state(&r); + state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)? + .set_broken("broken from test".to_owned()) + .await; json_response(StatusCode::OK, ()) } +// Obtains an lsn lease on the given timeline. 
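A simplified model of what granting a lease means, consistent with the handlers here but not the pageserver's actual implementation: a lease pins an LSN until some expiry, renewals only ever extend it, and GC must not advance past the oldest un-expired leased LSN. The `Leases` type below is hypothetical:

```rust
use std::collections::BTreeMap;
use std::time::{Duration, SystemTime};

// lsn -> valid_until
struct Leases(BTreeMap<u64, SystemTime>);

impl Leases {
    fn make_lease(&mut self, lsn: u64, length: Duration) -> SystemTime {
        let valid_until = SystemTime::now() + length;
        let entry = self.0.entry(lsn).or_insert(valid_until);
        if *entry < valid_until {
            *entry = valid_until; // renewals only extend, never shorten
        }
        *entry
    }

    /// The GC cutoff may not pass the oldest un-expired leased LSN.
    fn min_leased_lsn(&mut self) -> Option<u64> {
        let now = SystemTime::now();
        self.0.retain(|_, valid_until| *valid_until > now);
        self.0.keys().next().copied()
    }
}

fn main() {
    let mut leases = Leases(BTreeMap::new());
    leases.make_lease(0x40, Duration::from_secs(600));
    leases.make_lease(0x10, Duration::from_secs(600));
    assert_eq!(leases.min_leased_lsn(), Some(0x10));
}
```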
+async fn lsn_lease_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let lsn = json_request::(&mut request).await?.lsn; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + let result = timeline + .make_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx) + .map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?; + + json_response(StatusCode::OK, result) +} + // Run GC immediately on given timeline. async fn timeline_gc_handler( mut request: Request, @@ -1436,13 +1707,7 @@ async fn timeline_gc_handler( let gc_req: TimelineGcRequest = json_request(&mut request).await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let wait_task_done = - mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?; - let gc_result = wait_task_done - .await - .context("wait for gc task") - .map_err(ApiError::InternalServerError)? - .map_err(ApiError::InternalServerError)?; + let gc_result = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?; json_response(StatusCode::OK, gc_result) } @@ -1456,17 +1721,37 @@ async fn timeline_compact_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + let mut flags = EnumSet::empty(); if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; } + if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? { + flags |= CompactFlags::ForceImageLayerCreation; + } + if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? { + flags |= CompactFlags::EnhancedGcBottomMostCompaction; + } + if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? 
{ + flags |= CompactFlags::DryRun; + } + + let wait_until_uploaded = + parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); + async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; timeline .compact(&cancel, flags, &ctx) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; + if wait_until_uploaded { + timeline.remote_client.wait_completion().await + // XXX map to correct ApiError for the cases where it's due to shutdown + .context("wait completion").map_err(ApiError::InternalServerError)?; + } json_response(StatusCode::OK, ()) } .instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) @@ -1482,21 +1767,54 @@ async fn timeline_checkpoint_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + let mut flags = EnumSet::empty(); if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; } + if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? { + flags |= CompactFlags::ForceImageLayerCreation; + } + + // By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload. + let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true); + + let wait_until_uploaded = + parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); + async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; timeline .freeze_and_flush() .await - .map_err(ApiError::InternalServerError)?; - timeline - .compact(&cancel, flags, &ctx) - .await - .map_err(|e| ApiError::InternalServerError(e.into()))?; + .map_err(|e| { + match e { + tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown, + other => ApiError::InternalServerError(other.into()), + + } + })?; + if compact { + timeline + .compact(&cancel, flags, &ctx) + .await + .map_err(|e| + match e { + CompactionError::ShuttingDown => ApiError::ShuttingDown, + CompactionError::Other(e) => ApiError::InternalServerError(e) + } + )?; + } + + if wait_until_uploaded { + tracing::info!("Waiting for uploads to complete..."); + timeline.remote_client.wait_completion().await + // XXX map to correct ApiError for the cases where it's due to shutdown + .context("wait completion").map_err(ApiError::InternalServerError)?; + tracing::info!("Uploads completed up to {}", timeline.get_remote_consistent_lsn_projected().unwrap_or(Lsn(0))); + } json_response(StatusCode::OK, ()) } @@ -1513,7 +1831,11 @@ async fn timeline_download_remote_layers_handler_post( let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let state = get_state(&request); + + let timeline = + 
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
     match timeline.spawn_download_all_remote_layers(body).await {
         Ok(st) => json_response(StatusCode::ACCEPTED, st),
         Err(st) => json_response(StatusCode::CONFLICT, st),
@@ -1527,8 +1849,11 @@ async fn timeline_download_remote_layers_handler_get(
     let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    let state = get_state(&request);

-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
     let info = timeline
         .get_download_all_remote_layers_task_info()
         .context("task never started since last pageserver process start")
@@ -1536,17 +1861,88 @@ async fn timeline_download_remote_layers_handler_get(
     json_response(StatusCode::OK, info)
 }

+async fn timeline_detach_ancestor_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    use crate::tenant::timeline::detach_ancestor;
+    use pageserver_api::models::detach_ancestor::AncestorDetached;
+
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+
+    let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
+
+    async move {
+        let mut options = detach_ancestor::Options::default();
+
+        let rewrite_concurrency =
+            parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?;
+        let copy_concurrency =
+            parse_query_param::<_, std::num::NonZeroUsize>(&request, "copy_concurrency")?;
+
+        [
+            (&mut options.rewrite_concurrency, rewrite_concurrency),
+            (&mut options.copy_concurrency, copy_concurrency),
+        ]
+        .into_iter()
+        .filter_map(|(target, val)| val.map(|val| (target, val)))
+        .for_each(|(target, val)| *target = val);

+        let state = get_state(&request);
+
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id)?;
+
+        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
+        let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
+        let ctx = &ctx;
+
+        let timeline = tenant.get_timeline(timeline_id, true)?;
+
+        let progress = timeline
+            .prepare_to_detach_from_ancestor(&tenant, options, ctx)
+            .await?;
+
+        // uncomment to allow the Tenant to be dropped as early as possible
+        // drop(tenant);
+
+        let resp = match progress {
+            detach_ancestor::Progress::Prepared(attempt, prepared) => {
+                // it would be great to tag the guard on to the tenant activation future
+                let reparented_timelines = state
+                    .tenant_manager
+                    .complete_detaching_timeline_ancestor(
+                        tenant_shard_id,
+                        timeline_id,
+                        prepared,
+                        attempt,
+                        ctx,
+                    )
+                    .await?;
+
+                AncestorDetached {
+                    reparented_timelines,
+                }
+            }
+            detach_ancestor::Progress::Done(resp) => resp,
+        };
+
+        json_response(StatusCode::OK, resp)
+    }
+    .instrument(span)
+    .await
+}
+
 async fn deletion_queue_flush(
     r: Request<Body>,
     cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
     let state = get_state(&r);

-    if state.remote_storage.is_none() {
-        // Nothing to do if remote storage is disabled.
- return json_response(StatusCode::OK, ()); - } - let execute = parse_query_param(&r, "execute")?.unwrap_or(false); let flush = async { @@ -1577,6 +1973,7 @@ async fn getpage_at_lsn_handler( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); struct Key(crate::repository::Key); @@ -1595,7 +1992,7 @@ async fn getpage_at_lsn_handler( async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; let page = timeline.get(key.0, lsn, &ctx).await?; @@ -1618,19 +2015,22 @@ async fn timeline_collect_keyspace( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); let at_lsn: Option = parse_query_param(&request, "at_lsn")?; async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); - let keys = timeline + let (dense_ks, sparse_ks) = timeline .collect_keyspace(at_lsn, &ctx) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; - let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn }; + // This API is currently used by pagebench. Pagebench will iterate all keys within the keyspace. + // Therefore, we split dense/sparse keys in this API. + let res = pageserver_api::models::partitioning::Partitioning { keys: dense_ks, sparse_keys: sparse_ks, at_lsn }; json_response(StatusCode::OK, res) } @@ -1639,13 +2039,15 @@ async fn timeline_collect_keyspace( } async fn active_timeline_of_active_tenant( + tenant_manager: &TenantManager, tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result, ApiError> { - let tenant = mgr::get_tenant(tenant_shard_id, true)?; - tenant - .get_timeline(timeline_id, true) - .map_err(|e| ApiError::NotFound(e.into())) + let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + Ok(tenant.get_timeline(timeline_id, true)?) 
} async fn always_panic_handler( @@ -1674,7 +2076,7 @@ async fn disk_usage_eviction_run( evict_bytes: u64, #[serde(default)] - eviction_order: crate::disk_usage_eviction_task::EvictionOrder, + eviction_order: pageserver_api::config::EvictionOrder, } #[derive(Debug, Clone, Copy, serde::Serialize)] @@ -1703,21 +2105,14 @@ async fn disk_usage_eviction_run( }; let state = get_state(&r); - - let Some(storage) = state.remote_storage.as_ref() else { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "remote storage not configured, cannot run eviction iteration" - ))); - }; - let eviction_state = state.disk_usage_eviction_state.clone(); let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( &eviction_state, - storage, + &state.remote_storage, usage, &state.tenant_manager, - config.eviction_order, + config.eviction_order.into(), &cancel, ) .await; @@ -1744,19 +2139,144 @@ async fn secondary_upload_handler( json_response(StatusCode::OK, ()) } +async fn tenant_scan_remote_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + + let mut response = TenantScanRemoteStorageResponse::default(); + + let (shards, _other_keys) = + list_remote_tenant_shards(&state.remote_storage, tenant_id, cancel.clone()) + .await + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; + + for tenant_shard_id in shards { + let (timeline_ids, _other_keys) = + list_remote_timelines(&state.remote_storage, tenant_shard_id, cancel.clone()) + .await + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; + + let mut generation = Generation::none(); + for timeline_id in timeline_ids { + match download_index_part( + &state.remote_storage, + &tenant_shard_id, + &timeline_id, + Generation::MAX, + &cancel, + ) + .instrument(info_span!("download_index_part", + tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug(), + %timeline_id)) + .await + { + Ok((index_part, index_generation)) => { + tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", + index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn()); + generation = std::cmp::max(generation, index_generation); + } + Err(DownloadError::NotFound) => { + // This is normal for tenants that were created with multiple shards: they have an unsharded path + // containing the timeline's initdb tarball but no index. Otherwise it is a bit strange. 
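While missing indices are tolerated as the comment above explains, the scan still reduces the index generations it does find into one per-shard value by taking the maximum. A standalone sketch of that reduction, with a simplified `Generation`:

```rust
// Simplified stand-in for the generation numbers attached to remote indices.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
struct Generation(u32);

fn latest_generation(index_generations: impl IntoIterator<Item = Generation>) -> Generation {
    // The shard reports the newest generation any of its timelines'
    // index files were written in.
    index_generations
        .into_iter()
        .fold(Generation(0), std::cmp::max)
}

fn main() {
    let g = latest_generation([Generation(3), Generation(7), Generation(5)]);
    assert_eq!(g, Generation(7));
}
```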
+ tracing::info!("Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping"); + continue; + } + Err(e) => { + return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); + } + }; + } + + response.shards.push(TenantScanRemoteStorageShard { + tenant_shard_id, + generation: generation.into(), + }); + } + + if response.shards.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("No shards found for tenant ID {tenant_id}").into(), + )); + } + + json_response(StatusCode::OK, response) +} + async fn secondary_download_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let state = get_state(&request); let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; - state - .secondary_controller - .download_tenant(tenant_shard_id) - .await - .map_err(ApiError::InternalServerError)?; + let wait = parse_query_param(&request, "wait_ms")?.map(Duration::from_millis); - json_response(StatusCode::OK, ()) + // We don't need this to issue the download request, but: + // - it enables us to cleanly return 404 if we get a request for an absent shard + // - we will use this to provide status feedback in the response + let Some(secondary_tenant) = state + .tenant_manager + .get_secondary_tenant_shard(tenant_shard_id) + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(), + )); + }; + + let timeout = wait.unwrap_or(Duration::MAX); + + let result = tokio::time::timeout( + timeout, + state.secondary_controller.download_tenant(tenant_shard_id), + ) + .await; + + let progress = secondary_tenant.progress.lock().unwrap().clone(); + + let status = match result { + Ok(Ok(())) => { + if progress.layers_downloaded >= progress.layers_total { + // Download job ran to completion + StatusCode::OK + } else { + // Download dropped out without errors because it ran out of time budget + StatusCode::ACCEPTED + } + } + // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered + // okay. We could get an error here in the unlikely edge case that the tenant + // was detached between our check above and executing the download job. + Ok(Err(e)) => return Err(ApiError::InternalServerError(e)), + // A timeout is not an error: we have started the download, we're just not done + // yet. The caller will get a response body indicating status. 
+        Err(_) => StatusCode::ACCEPTED,
+    };
+
+    json_response(status, progress)
+}
+
+async fn secondary_status_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&request);
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+
+    let Some(secondary_tenant) = state
+        .tenant_manager
+        .get_secondary_tenant_shard(tenant_shard_id)
+    else {
+        return Err(ApiError::NotFound(
+            anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(),
+        ));
+    };
+
+    let progress = secondary_tenant.progress.lock().unwrap().clone();
+
+    json_response(StatusCode::OK, progress)
 }

 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -1799,6 +2319,454 @@ async fn post_tracing_event_handler(
     json_response(StatusCode::OK, ())
 }

+async fn force_aux_policy_switch_handler(
+    mut r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&r, None)?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?;
+    let policy: AuxFilePolicy = json_request(&mut r).await?;
+
+    let state = get_state(&r);
+
+    let tenant = state
+        .tenant_manager
+        .get_attached_tenant_shard(tenant_shard_id)?;
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
+    timeline
+        .do_switch_aux_policy(policy)
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
+async fn put_io_engine_handler(
+    mut r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&r, None)?;
+    let kind: crate::virtual_file::IoEngineKind = json_request(&mut r).await?;
+    crate::virtual_file::io_engine::set(kind);
+    json_response(StatusCode::OK, ())
+}
+
+async fn put_io_alignment_handler(
+    mut r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&r, None)?;
+    let align: usize = json_request(&mut r).await?;
+    crate::virtual_file::set_io_buffer_alignment(align).map_err(|align| {
+        ApiError::PreconditionFailed(
+            format!("Requested io alignment ({align}) is not a power of two").into(),
+        )
+    })?;
+    json_response(StatusCode::OK, ())
+}
+
+/// Polled by control plane.
+///
+/// See [`crate::utilization`].
+async fn get_utilization(
+    r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    fail::fail_point!("get-utilization-http-handler", |_| {
+        Err(ApiError::ResourceUnavailable("failpoint".into()))
+    });
+
+    // this probably could be completely public, but let's make that change later.
+    check_permission(&r, None)?;
+
+    let state = get_state(&r);
+    let mut g = state.latest_utilization.lock().await;
+
+    let regenerate_every = Duration::from_secs(1);
+    let still_valid = g
+        .as_ref()
+        .is_some_and(|(captured_at, _)| captured_at.elapsed() < regenerate_every);
+
+    // avoid needless statvfs calls even though those should be non-blocking and fast.
+    // regenerate at most 1Hz to allow polling at any rate.
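A standalone sketch of this regenerate-at-most-1Hz cache, with a stand-in byte payload where the real handler serializes the statvfs-backed utilization document:

```rust
use std::time::{Duration, Instant};

struct Cached {
    at: Instant,
    body: Vec<u8>,
}

fn get_or_regen(slot: &mut Option<Cached>, regenerate_every: Duration) -> Vec<u8> {
    let still_valid = slot
        .as_ref()
        .is_some_and(|c| c.at.elapsed() < regenerate_every);
    if !still_valid {
        // Stand-in for the real, freshly serialized document.
        let body = br#"{"doc":"fresh"}"#.to_vec();
        *slot = Some(Cached { at: Instant::now(), body });
    }
    slot.as_ref().expect("just set").body.clone()
}

fn main() {
    let mut slot = None;
    let a = get_or_regen(&mut slot, Duration::from_secs(1));
    let b = get_or_regen(&mut slot, Duration::from_secs(1));
    assert_eq!(a, b); // a second call within 1s serves the cached body
}
```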
+ if !still_valid { + let path = state.conf.tenants_path(); + let doc = + crate::utilization::regenerate(state.conf, path.as_std_path(), &state.tenant_manager) + .map_err(ApiError::InternalServerError)?; + + let mut buf = Vec::new(); + serde_json::to_writer(&mut buf, &doc) + .context("serialize") + .map_err(ApiError::InternalServerError)?; + + let body = bytes::Bytes::from(buf); + + *g = Some((std::time::Instant::now(), body)); + } + + // hyper 0.14 doesn't yet have Response::clone so this is a bit of extra legwork + let cached = g.as_ref().expect("just set").1.clone(); + + Response::builder() + .header(hyper::http::header::CONTENT_TYPE, "application/json") + // thought of using http date header, but that is second precision which does not give any + // debugging aid + .status(StatusCode::OK) + .body(hyper::Body::from(cached)) + .context("build response") + .map_err(ApiError::InternalServerError) +} + +async fn list_aux_files( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let body: ListAuxFilesRequest = json_request(&mut request).await?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let files = timeline.list_aux_files(body.lsn, &ctx).await?; + json_response(StatusCode::OK, files) +} + +async fn perf_info( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + let result = timeline.perf_info().await; + + json_response(StatusCode::OK, result) +} + +async fn ingest_aux_files( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let body: IngestAuxFilesRequest = json_request(&mut request).await?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + let mut modification = timeline.begin_modification( + Lsn(timeline.get_last_record_lsn().0 + 8), /* advance LSN by 8 */ + ); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + for (fname, content) in body.aux_files { + modification + .put_file(&fname, content.as_bytes(), &ctx) + .await + .map_err(ApiError::InternalServerError)?; + } + modification + .commit(&ctx) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + +/// Report on the largest tenants on this pageserver, for the storage controller to identify +/// candidates for splitting +async fn post_top_tenants( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + 
check_permission(&r, None)?; + let request: TopTenantShardsRequest = json_request(&mut r).await?; + let state = get_state(&r); + + fn get_size_metric(sizes: &TopTenantShardItem, order_by: &TenantSorting) -> u64 { + match order_by { + TenantSorting::ResidentSize => sizes.resident_size, + TenantSorting::MaxLogicalSize => sizes.max_logical_size, + } + } + + #[derive(Eq, PartialEq)] + struct HeapItem { + metric: u64, + sizes: TopTenantShardItem, + } + + impl PartialOrd for HeapItem { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + + /// Heap items have reverse ordering on their metric: this enables using BinaryHeap, which + /// supports popping the greatest item but not the smallest. + impl Ord for HeapItem { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + Reverse(self.metric).cmp(&Reverse(other.metric)) + } + } + + let mut top_n: BinaryHeap = BinaryHeap::with_capacity(request.limit); + + // FIXME: this is a lot of clones to take this tenant list + for (tenant_shard_id, tenant_slot) in state.tenant_manager.list() { + if let Some(shards_lt) = request.where_shards_lt { + // Ignore tenants which already have >= this many shards + if tenant_shard_id.shard_count >= shards_lt { + continue; + } + } + + let sizes = match tenant_slot { + TenantSlot::Attached(tenant) => tenant.get_sizes(), + TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => { + continue; + } + }; + let metric = get_size_metric(&sizes, &request.order_by); + + if let Some(gt) = request.where_gt { + // Ignore tenants whose metric is <= the lower size threshold, to do less sorting work + if metric <= gt { + continue; + } + }; + + match top_n.peek() { + None => { + // Top N list is empty: candidate becomes first member + top_n.push(HeapItem { metric, sizes }); + } + Some(i) if i.metric > metric && top_n.len() < request.limit => { + // Lowest item in list is greater than our candidate, but we aren't at limit yet: push to end + top_n.push(HeapItem { metric, sizes }); + } + Some(i) if i.metric > metric => { + // List is at limit and lowest value is greater than our candidate, drop it. 
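The surrounding loop maintains a bounded top-N with a `BinaryHeap` whose `Ord` is defined on `Reverse(metric)`: std's heap is a max-heap, so reversing the ordering makes `peek`/`pop` expose the smallest element, which is exactly the one to evict once the heap is at capacity. A minimal sketch of the same technique:

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

fn top_n(metrics: impl IntoIterator<Item = u64>, limit: usize) -> Vec<u64> {
    // Reverse(m) turns the max-heap into a min-heap over the metric.
    let mut heap: BinaryHeap<Reverse<u64>> = BinaryHeap::with_capacity(limit + 1);
    for m in metrics {
        heap.push(Reverse(m));
        if heap.len() > limit {
            heap.pop(); // evicts the smallest metric
        }
    }
    heap.into_iter().map(|Reverse(m)| m).collect()
}

fn main() {
    let mut top = top_n([5, 1, 9, 3, 7], 3);
    top.sort_unstable();
    assert_eq!(top, vec![5, 7, 9]);
}
```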
+ } + Some(_) => top_n.push(HeapItem { metric, sizes }), + } + + while top_n.len() > request.limit { + top_n.pop(); + } + } + + json_response( + StatusCode::OK, + TopTenantShardsResponse { + shards: top_n.into_iter().map(|i| i.sizes).collect(), + }, + ) +} + +async fn put_tenant_timeline_import_basebackup( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?; + let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?; + let pg_version: u32 = must_parse_query_param(&request, "pg_version")?; + + check_permission(&request, Some(tenant_id))?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + + let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version); + async move { + let state = get_state(&request); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?; + + let broker_client = state.broker_client.clone(); + + let mut body = StreamReader::new(request.into_body().map(|res| { + res.map_err(|error| { + std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error)) + }) + })); + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let timeline = tenant + .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) + .map_err(ApiError::InternalServerError) + .await?; + + // TODO mark timeline as not ready until it reaches end_lsn. + // We might have some wal to import as well, and we should prevent compute + // from connecting before that and writing conflicting wal. + // + // This is not relevant for pageserver->pageserver migrations, since there's + // no wal to import. But should be fixed if we want to import from postgres. + + // TODO leave clean state on error. For now you can use detach to clean + // up broken state from a failed import. + + // Import basebackup provided via CopyData + info!("importing basebackup"); + + timeline + .import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx) + .await + .map_err(ApiError::InternalServerError)?; + + // Read the end of the tar archive. + read_tar_eof(body) + .await + .map_err(ApiError::InternalServerError)?; + + // TODO check checksum + // Meanwhile you can verify client-side by taking fullbackup + // and checking that it matches in size with what was imported. + // It wouldn't work if base came from vanilla postgres though, + // since we discard some log files. 
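Both import handlers wrap the incoming request body in `tokio_util::io::StreamReader`, which adapts a stream of byte chunks into an `AsyncRead` that the tar code can consume directly. A self-contained sketch of that adaptation, assuming the `tokio`, `tokio-util`, `futures`, and `bytes` crates:

```rust
use futures::stream;
use tokio::io::AsyncReadExt;
use tokio_util::io::StreamReader;

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // A stream of io::Result<Bytes> chunks, like an HTTP body.
    let chunks = stream::iter(vec![
        Ok::<_, std::io::Error>(bytes::Bytes::from_static(b"hello ")),
        Ok(bytes::Bytes::from_static(b"world")),
    ]);
    let mut reader = StreamReader::new(chunks);
    let mut buf = String::new();
    reader.read_to_string(&mut buf).await?;
    assert_eq!(buf, "hello world");
    Ok(())
}
```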
+
+        info!("done");
+        json_response(StatusCode::OK, ())
+    }
+    .instrument(span)
+    .await
+}
+
+async fn put_tenant_timeline_import_wal(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    let start_lsn: Lsn = must_parse_query_param(&request, "start_lsn")?;
+    let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
+
+    check_permission(&request, Some(tenant_id))?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+
+    let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn);
+    async move {
+        let state = get_state(&request);
+
+        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?;
+
+        let mut body = StreamReader::new(request.into_body().map(|res| {
+            res.map_err(|error| {
+                std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
+            })
+        }));
+
+        let last_record_lsn = timeline.get_last_record_lsn();
+        if last_record_lsn != start_lsn {
+            return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")));
+        }
+
+        // TODO leave clean state on error. For now you can use detach to clean
+        // up broken state from a failed import.
+
+        // Import wal provided via CopyData
+        info!("importing wal");
+        crate::import_datadir::import_wal_from_tar(&timeline, &mut body, start_lsn, end_lsn, &ctx).await.map_err(ApiError::InternalServerError)?;
+        info!("wal import complete");
+
+        // Read the end of the tar archive.
+        read_tar_eof(body).await.map_err(ApiError::InternalServerError)?;
+
+        // TODO Does it make sense to overshoot?
+        if timeline.get_last_record_lsn() < end_lsn {
+            return Err(ApiError::InternalServerError(anyhow::anyhow!("WAL import only reached {}, expected to reach end_lsn {end_lsn}", timeline.get_last_record_lsn())));
+        }
+
+        // Flush data to disk, then upload to s3. No need for a forced checkpoint.
+        // We only want to persist the data, and it doesn't matter if it's in the
+        // shape of deltas or images.
+        info!("flushing layers");
+        timeline.freeze_and_flush().await.map_err(|e| match e {
+            tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
+            other => ApiError::InternalServerError(anyhow::anyhow!(other)),
+        })?;
+
+        info!("done");
+
+        json_response(StatusCode::OK, ())
+    }.instrument(span).await
+}
+
+/// Read the end of a tar archive.
+///
+/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
+/// `tokio_tar` already read the first such block. Read the second all-zeros block,
+/// and check that there is no more data after the EOF marker.
+///
+/// 'tar' command can also write extra blocks of zeros, up to a record
+/// size, controlled by the --record-size argument. Ignore them too.
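Before the helper itself, a condensed model of the check it performs, keeping only the arithmetic: a valid tail is at least two all-zero 512-byte blocks, and any extra zero padding must keep the total block-aligned:

```rust
fn is_valid_tar_tail(tail: &[u8]) -> bool {
    tail.len() >= 1024               // two EOF blocks at minimum
        && tail.len() % 512 == 0     // padding stays block-aligned
        && tail.iter().all(|&b| b == 0)
}

fn main() {
    assert!(is_valid_tar_tail(&[0u8; 1024]));  // minimal EOF marker
    assert!(is_valid_tar_tail(&[0u8; 10240])); // padded to a full record
    assert!(!is_valid_tar_tail(&[0u8; 700]));  // not block-aligned
    assert!(!is_valid_tar_tail(&[1u8; 1024])); // not all zeros
}
```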
+async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> {
+    use tokio::io::AsyncReadExt;
+    let mut buf = [0u8; 512];
+
+    // Read the all-zeros block, and verify it
+    let mut total_bytes = 0;
+    while total_bytes < 512 {
+        let nbytes = reader.read(&mut buf[total_bytes..]).await?;
+        total_bytes += nbytes;
+        if nbytes == 0 {
+            break;
+        }
+    }
+    if total_bytes < 512 {
+        anyhow::bail!("incomplete or invalid tar EOF marker");
+    }
+    if !buf.iter().all(|&x| x == 0) {
+        anyhow::bail!("invalid tar EOF marker");
+    }
+
+    // Drain any extra zero-blocks after the EOF marker
+    let mut trailing_bytes = 0;
+    let mut seen_nonzero_bytes = false;
+    loop {
+        let nbytes = reader.read(&mut buf).await?;
+        trailing_bytes += nbytes;
+        if !buf.iter().all(|&x| x == 0) {
+            seen_nonzero_bytes = true;
+        }
+        if nbytes == 0 {
+            break;
+        }
+    }
+    if seen_nonzero_bytes {
+        anyhow::bail!("unexpected non-zero bytes after the tar archive");
+    }
+    if trailing_bytes % 512 != 0 {
+        anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
+    }
+    Ok(())
+}
+
 /// Common functionality of all the HTTP API handlers.
 ///
 /// - Adds a tracing span to each request (by `request_span`)
@@ -1813,6 +2781,16 @@
 where
     R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
     H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
 {
+    if request.uri() != &"/v1/failpoints".parse::<Uri>().unwrap() {
+        fail::fail_point!("api-503", |_| Err(ApiError::ResourceUnavailable(
+            "failpoint".into()
+        )));
+
+        fail::fail_point!("api-500", |_| Err(ApiError::InternalServerError(
+            anyhow::anyhow!("failpoint")
+        )));
+    }
+
     // Spawn a new task to handle the request, to protect the handler from unexpected
     // async cancellations. Most pageserver functions are not async cancellation safe.
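     // In other words, the handler keeps running even if the client disconnects
     // mid-request.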
// We arm a drop-guard, so that if Hyper drops the Future, we signal the task @@ -1924,6 +2902,7 @@ pub fn make_router( Ok(router .data(state) + .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .get("/v1/status", |r| api_handler(r, status_handler)) .put("/v1/failpoints", |r| { testing_api_handler("manage failpoints", r, failpoints_handler) @@ -1932,7 +2911,6 @@ pub fn make_router( api_handler(r, reload_auth_validation_keys_handler) }) .get("/v1/tenant", |r| api_handler(r, tenant_list_handler)) - .post("/v1/tenant", |r| api_handler(r, tenant_create_handler)) .get("/v1/tenant/:tenant_shard_id", |r| { api_handler(r, tenant_status) }) @@ -1945,6 +2923,9 @@ pub fn make_router( .put("/v1/tenant/config", |r| { api_handler(r, update_tenant_config_handler) }) + .put("/v1/tenant/:tenant_shard_id/shard_split", |r| { + api_handler(r, tenant_shard_split_handler) + }) .get("/v1/tenant/:tenant_shard_id/config", |r| { api_handler(r, get_tenant_config_handler) }) @@ -1954,31 +2935,30 @@ pub fn make_router( .get("/v1/location_config", |r| { api_handler(r, list_location_config_handler) }) + .get("/v1/location_config/:tenant_shard_id", |r| { + api_handler(r, get_location_config_handler) + }) + .put( + "/v1/tenant/:tenant_shard_id/time_travel_remote_storage", + |r| api_handler(r, tenant_time_travel_remote_storage_handler), + ) .get("/v1/tenant/:tenant_shard_id/timeline", |r| { api_handler(r, timeline_list_handler) }) .post("/v1/tenant/:tenant_shard_id/timeline", |r| { api_handler(r, timeline_create_handler) }) - .post("/v1/tenant/:tenant_id/attach", |r| { - api_handler(r, tenant_attach_handler) - }) - .post("/v1/tenant/:tenant_id/detach", |r| { - api_handler(r, tenant_detach_handler) - }) .post("/v1/tenant/:tenant_shard_id/reset", |r| { api_handler(r, tenant_reset_handler) }) - .post("/v1/tenant/:tenant_id/load", |r| { - api_handler(r, tenant_load_handler) - }) - .post("/v1/tenant/:tenant_id/ignore", |r| { - api_handler(r, tenant_ignore_handler) - }) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive", |r| api_handler(r, timeline_preserve_initdb_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config", + |r| api_handler(r, timeline_archival_config_handler), + ) .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { api_handler(r, timeline_detail_handler) }) @@ -1990,13 +2970,17 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn", |r| api_handler(r, get_timestamp_of_lsn_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease", + |r| api_handler(r, lsn_lease_handler), + ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc", |r| api_handler(r, timeline_gc_handler), ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", - |r| testing_api_handler("run timeline compaction", r, timeline_compact_handler), + |r| api_handler(r, timeline_compact_handler), ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint", @@ -2010,6 +2994,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers", |r| api_handler(r, timeline_download_remote_layers_handler_get), ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/detach_ancestor", + |r| api_handler(r, timeline_detach_ancestor_handler), + ) .delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { api_handler(r, timeline_delete_handler) }) @@ -2025,15 +3013,29 @@ pub fn make_router( 
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, evict_timeline_layer_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc", + |r| api_handler(r, timeline_gc_blocking_handler), + ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc", + |r| api_handler(r, timeline_gc_unblocking_handler), + ) .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { api_handler(r, secondary_upload_handler) }) + .get("/v1/tenant/:tenant_id/scan_remote_storage", |r| { + api_handler(r, tenant_scan_remote_handler) + }) .put("/v1/disk_usage_eviction/run", |r| { api_handler(r, disk_usage_eviction_run) }) .put("/v1/deletion_queue/flush", |r| { api_handler(r, deletion_queue_flush) }) + .get("/v1/tenant/:tenant_shard_id/secondary/status", |r| { + api_handler(r, secondary_status_handler) + }) .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| { api_handler(r, secondary_download_handler) }) @@ -2050,7 +3052,37 @@ pub fn make_router( ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace", - |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace), + |r| api_handler(r, timeline_collect_keyspace), + ) + .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) + .put("/v1/io_alignment", |r| { + api_handler(r, put_io_alignment_handler) + }) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch", + |r| api_handler(r, force_aux_policy_switch_handler), + ) + .get("/v1/utilization", |r| api_handler(r, get_utilization)) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files", + |r| testing_api_handler("ingest_aux_files", r, ingest_aux_files), + ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/list_aux_files", + |r| testing_api_handler("list_aux_files", r, list_aux_files), + ) + .post("/v1/top_tenants", |r| api_handler(r, post_top_tenants)) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info", + |r| testing_api_handler("perf_info", r, perf_info), + ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/import_basebackup", + |r| api_handler(r, put_tenant_timeline_import_basebackup), + ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal", + |r| api_handler(r, put_tenant_timeline_import_wal), ) .any(handler_404)) } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index d66df36b3a..5a0894cd1b 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -2,30 +2,24 @@ //! Import data and WAL from a PostgreSQL data directory and WAL segments into //! a neon Timeline. //! 
-use std::io::SeekFrom; use std::path::{Path, PathBuf}; use anyhow::{bail, ensure, Context, Result}; -use async_compression::tokio::bufread::ZstdDecoder; -use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level}; use bytes::Bytes; use camino::Utf8Path; use futures::StreamExt; -use nix::NixPath; -use tokio::fs::{File, OpenOptions}; -use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; +use pageserver_api::key::rel_block_to_key; +use tokio::io::{AsyncRead, AsyncReadExt}; use tokio_tar::Archive; -use tokio_tar::Builder; -use tokio_tar::HeaderMode; use tracing::*; use walkdir::WalkDir; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; -use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::Timeline; use crate::walingest::WalIngest; +use crate::walrecord::decode_wal_record; use crate::walrecord::DecodedWALRecord; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; @@ -178,7 +172,10 @@ async fn import_rel( let r = reader.read_exact(&mut buf).await; match r { Ok(_) => { - modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; + let key = rel_block_to_key(rel, blknum); + if modification.tline.get_shard_identity().is_key_local(&key) { + modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; + } } // TODO: UnexpectedEof is expected @@ -314,11 +311,13 @@ async fn import_wal( let mut nrecords = 0; let mut modification = tline.begin_modification(last_lsn); - let mut decoded = DecodedWALRecord::default(); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, tline.pg_version)?; + walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) + .ingest_record(decoded, lsn, &mut modification, ctx) .await?; WAL_INGEST.records_committed.inc(); @@ -453,11 +452,12 @@ pub async fn import_wal_from_tar( waldecoder.feed_bytes(&bytes[offset..]); let mut modification = tline.begin_modification(last_lsn); - let mut decoded = DecodedWALRecord::default(); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, tline.pg_version)?; walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) + .ingest_record(decoded, lsn, &mut modification, ctx) .await?; modification.commit(ctx).await?; last_lsn = lsn; @@ -633,65 +633,3 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result reader.read_to_end(&mut buf).await?; Ok(Bytes::from(buf)) } - -pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> { - let file = OpenOptions::new() - .create(true) - .truncate(true) - .read(true) - .write(true) - .open(&tmp_path) - .await - .with_context(|| format!("tempfile creation {tmp_path}"))?; - - let mut paths = Vec::new(); - for entry in WalkDir::new(pgdata_path) { - let entry = entry?; - let metadata = entry.metadata().expect("error getting dir entry metadata"); - // Also allow directories so that we also get empty directories - if !(metadata.is_file() || metadata.is_dir()) { - continue; - } - let path = entry.into_path(); - paths.push(path); - } - // Do a sort to get a more consistent listing - paths.sort_unstable(); - let zstd = ZstdEncoder::with_quality_and_params( - file, - Level::Default, - &[CParameter::enable_long_distance_matching(true)], - ); - let mut builder = Builder::new(zstd); - // Use reproducible header mode - builder.mode(HeaderMode::Deterministic); - for path in paths { - let rel_path = path.strip_prefix(pgdata_path)?; - if rel_path.is_empty() { - // The top directory should not be compressed, - // the tar crate doesn't like that - continue; - } - builder.append_path_with_name(&path, rel_path).await?; - } - let mut zstd = builder.into_inner().await?; - zstd.shutdown().await?; - let mut compressed = zstd.into_inner(); - let compressed_len = compressed.metadata().await?.len(); - const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024; - if compressed_len > INITDB_TAR_ZST_WARN_LIMIT { - warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."); - } - compressed.seek(SeekFrom::Start(0)).await?; - Ok((compressed, compressed_len)) -} - -pub async fn extract_tar_zst( - pgdata_path: &Utf8Path, - tar_zst: impl AsyncBufRead + Unpin, -) -> Result<()> { - let tar = Box::pin(ZstdDecoder::new(tar_zst)); - let mut archive = Archive::new(tar); - archive.unpack(pgdata_path).await?; - Ok(()) -} diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs new file mode 100644 index 0000000000..491c9fb96c --- /dev/null +++ b/pageserver/src/l0_flush.rs @@ -0,0 +1,47 @@ +use std::{num::NonZeroUsize, sync::Arc}; + +#[derive(Debug, PartialEq, Eq, Clone)] +pub enum L0FlushConfig { + Direct { max_concurrency: NonZeroUsize }, +} + +impl Default for L0FlushConfig { + fn default() -> Self { + Self::Direct { + // TODO: using num_cpus results in different peak memory usage on different instance types. 
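+            // A fixed ceiling can instead be supplied through the config variant,
+            // e.g. (hypothetical value):
+            //   L0FlushConfig::Direct { max_concurrency: NonZeroUsize::new(4).unwrap() }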
+            max_concurrency: NonZeroUsize::new(usize::max(1, num_cpus::get())).unwrap(),
+        }
+    }
+}
+
+impl From<pageserver_api::models::L0FlushConfig> for L0FlushConfig {
+    fn from(config: pageserver_api::models::L0FlushConfig) -> Self {
+        match config {
+            pageserver_api::models::L0FlushConfig::Direct { max_concurrency } => {
+                Self::Direct { max_concurrency }
+            }
+        }
+    }
+}
+
+#[derive(Clone)]
+pub struct L0FlushGlobalState(Arc<Inner>);
+
+pub enum Inner {
+    Direct { semaphore: tokio::sync::Semaphore },
+}
+
+impl L0FlushGlobalState {
+    pub fn new(config: L0FlushConfig) -> Self {
+        match config {
+            L0FlushConfig::Direct { max_concurrency } => {
+                let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
+                Self(Arc::new(Inner::Direct { semaphore }))
+            }
+        }
+    }
+
+    pub fn inner(&self) -> &Arc<Inner> {
+        &self.0
+    }
+}
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 26070e0cc1..7a9cf495c7 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,3 +1,4 @@
+#![recursion_limit = "300"]
 #![deny(clippy::undocumented_unsafe_blocks)]
 
 mod auth;
@@ -10,25 +11,35 @@ pub mod deletion_queue;
 pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
+pub mod l0_flush;
+
+use futures::{stream::FuturesUnordered, StreamExt};
 pub use pageserver_api::keyspace;
+use tokio_util::sync::CancellationToken;
+mod assert_u64_eq_usize;
+pub mod aux_file;
 pub mod metrics;
 pub mod page_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
 pub mod repository;
+pub mod span;
 pub(crate) mod statvfs;
 pub mod task_mgr;
 pub mod tenant;
-pub mod trace;
+pub mod utilization;
 pub mod virtual_file;
 pub mod walingest;
 pub mod walrecord;
 pub mod walredo;
 
-use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
-use tracing::info;
+use tenant::{
+    mgr::{BackgroundPurges, TenantManager},
+    secondary,
+};
+use tracing::{info, info_span};
 
 /// Current storage format version
 ///
@@ -39,7 +50,7 @@ use tracing::info;
 /// backwards-compatible changes to the metadata format.
 pub const STORAGE_FORMAT_VERSION: u16 = 3;
 
-pub const DEFAULT_PG_VERSION: u32 = 15;
+pub const DEFAULT_PG_VERSION: u32 = 16;
 
 // Magic constants used to identify different kinds of files
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
@@ -49,13 +60,113 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
 
 pub use crate::metrics::preinitialize_metrics;
 
+pub struct CancellableTask {
+    pub task: tokio::task::JoinHandle<()>,
+    pub cancel: CancellationToken,
+}
+pub struct HttpEndpointListener(pub CancellableTask);
+pub struct ConsumptionMetricsTasks(pub CancellableTask);
+pub struct DiskUsageEvictionTask(pub CancellableTask);
+impl CancellableTask {
+    pub async fn shutdown(self) {
+        self.cancel.cancel();
+        self.task.await.unwrap();
+    }
+}
+
 #[tracing::instrument(skip_all, fields(%exit_code))]
-pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
+#[allow(clippy::too_many_arguments)]
+pub async fn shutdown_pageserver(
+    http_listener: HttpEndpointListener,
+    page_service: page_service::Listener,
+    consumption_metrics_worker: ConsumptionMetricsTasks,
+    disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
+    tenant_manager: &TenantManager,
+    background_purges: BackgroundPurges,
+    mut deletion_queue: DeletionQueue,
+    secondary_controller_tasks: secondary::GlobalTasks,
+    exit_code: i32,
+) {
    use std::time::Duration;
+
+    let started_at = std::time::Instant::now();
+
+    // If the orderly shutdown below takes too long, we still want to make
+    // sure that all walredo processes are killed and wait()ed on by us, not systemd.
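+    // A dedicated thread spawned below enforces this even when the tokio
+    // runtimes are saturated with shutdown work.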
+    //
+    // (Leftover walredo processes are the hypothesized trigger for the systemd freezes
+    // that we keep seeing in prod => https://github.com/neondatabase/cloud/issues/11387.)
+    //
+    // We use a thread instead of a tokio task because the background runtime is likely busy
+    // with the final flushing / uploads. This activity here has priority, and due to lack
+    // of scheduling priority features in the tokio scheduler, using a separate thread is
+    // an effective priority booster.
+    let walredo_extraordinary_shutdown_thread_span = {
+        let span = info_span!(parent: None, "walredo_extraordinary_shutdown_thread");
+        span.follows_from(tracing::Span::current());
+        span
+    };
+    let walredo_extraordinary_shutdown_thread_cancel = CancellationToken::new();
+    let walredo_extraordinary_shutdown_thread = std::thread::spawn({
+        let walredo_extraordinary_shutdown_thread_cancel =
+            walredo_extraordinary_shutdown_thread_cancel.clone();
+        move || {
+            let rt = tokio::runtime::Builder::new_current_thread()
+                .enable_all()
+                .build()
+                .unwrap();
+            let _entered = rt.enter();
+            let _entered = walredo_extraordinary_shutdown_thread_span.enter();
+            if let Ok(()) = rt.block_on(tokio::time::timeout(
+                Duration::from_secs(8),
+                walredo_extraordinary_shutdown_thread_cancel.cancelled(),
+            )) {
+                info!("cancellation requested");
+                return;
+            }
+            let managers = tenant::WALREDO_MANAGERS
+                .lock()
+                .unwrap()
+                // prevents new walredo managers from being inserted
+                .take()
+                .expect("only we take()");
+            // Use FuturesUnordered to get in queue early for each manager's
+            // heavier_once_cell semaphore wait list.
+            // Also, for idle tenants that for some reason haven't
+            // shut down yet, it's quite likely that we're not going
+            // to get Poll::Pending once.
+            let mut futs: FuturesUnordered<_> = managers
+                .into_iter()
+                .filter_map(|(_, mgr)| mgr.upgrade())
+                .map(|mgr| async move { tokio::task::unconstrained(mgr.shutdown()).await })
+                .collect();
+            info!(count=%futs.len(), "built FuturesUnordered");
+            let mut last_log_at = std::time::Instant::now();
+            #[derive(Debug, Default)]
+            struct Results {
+                initiated: u64,
+                already: u64,
+            }
+            let mut results = Results::default();
+            while let Some(we_initiated) = rt.block_on(futs.next()) {
+                if we_initiated {
+                    results.initiated += 1;
+                } else {
+                    results.already += 1;
+                }
+                if last_log_at.elapsed() > Duration::from_millis(100) {
+                    info!(remaining=%futs.len(), ?results, "progress");
+                    last_log_at = std::time::Instant::now();
+                }
+            }
+            info!(?results, "done");
+        }
+    });
+
     // Shut down the libpq endpoint task. This prevents new connections from
     // being accepted.
-    timed(
-        task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None),
+    let remaining_connections = timed(
+        page_service.stop_accepting(),
         "shutdown LibpqEndpointListener",
         Duration::from_secs(1),
     )
@@ -64,7 +175,7 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
     // Shut down all the tenants. This flushes everything to disk and kills
     // the checkpoint and GC tasks.
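     // The 5-second budget below is deliberately the largest of the shutdown
     // phases: persisting tenant state is the step we least want to cut short.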
     timed(
-        tenant::mgr::shutdown_all_tenants(),
+        tenant_manager.shutdown(),
         "shutdown all tenants",
         Duration::from_secs(5),
     )
     .await;
 
     // Shut down any page service tasks: any in-progress work for particular timelines or tenants
     // should already have been cancelled via mgr::shutdown_all_tenants
     timed(
-        task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
+        remaining_connections.shutdown(),
         "shutdown PageRequestHandlers",
         Duration::from_secs(1),
     )
     .await;
 
     // Best effort to persist any outstanding deletions, to avoid leaking objects
-    if let Some(mut deletion_queue) = deletion_queue {
-        deletion_queue.shutdown(Duration::from_secs(5)).await;
-    }
+    deletion_queue.shutdown(Duration::from_secs(5)).await;
+
+    timed(
+        consumption_metrics_worker.0.shutdown(),
+        "shutdown consumption metrics",
+        Duration::from_secs(1),
+    )
+    .await;
+
+    timed(
+        futures::future::OptionFuture::from(disk_usage_eviction_task.map(|t| t.0.shutdown())),
+        "shutdown disk usage eviction",
+        Duration::from_secs(1),
+    )
+    .await;
+
+    timed(
+        background_purges.shutdown(),
+        "shutdown background purges",
+        Duration::from_secs(1),
+    )
+    .await;
 
     // Shut down the HTTP endpoint last, so that you can still check the server's
     // status while it's shutting down.
     // FIXME: We should probably stop accepting commands like attach/detach earlier.
     timed(
-        task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None),
+        http_listener.0.shutdown(),
         "shutdown http",
         Duration::from_secs(1),
     )
     .await;
 
+    timed(
+        secondary_controller_tasks.wait(), // cancellation happened in caller
+        "secondary controller wait",
+        Duration::from_secs(1),
+    )
+    .await;
+
     // There should be nothing left, but let's be sure
     timed(
         task_mgr::shutdown_tasks(None, None, None),
@@ -101,44 +238,38 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
         Duration::from_secs(1),
     )
     .await;
-    info!("Shut down successfully completed");
+
+    info!("cancel & join walredo_extraordinary_shutdown_thread");
+    walredo_extraordinary_shutdown_thread_cancel.cancel();
+    walredo_extraordinary_shutdown_thread.join().unwrap();
+    info!("walredo_extraordinary_shutdown_thread done");
+
+    info!(
+        elapsed_ms = started_at.elapsed().as_millis(),
+        "Shut down successfully completed"
+    );
     std::process::exit(exit_code);
 }
 
-/// The name of the metadata file pageserver creates per timeline.
-/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
-pub const METADATA_FILE_NAME: &str = "metadata";
-
 /// Per-tenant configuration file.
-/// Full path: `tenants/<tenant_id>/config`.
-pub const TENANT_CONFIG_NAME: &str = "config";
-
-/// Per-tenant configuration file.
-/// Full path: `tenants/<tenant_id>/config`.
-pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
+/// Full path: `tenants/<tenant_id>/config-v1`.
+pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
 
 /// Per-tenant copy of their remote heatmap, downloaded into the local
 /// tenant path while in secondary mode.
-pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
+pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
 
 /// A suffix used for various temporary files. Any temporary files found in the
 /// data directory at pageserver startup can be automatically removed.
-pub const TEMP_FILE_SUFFIX: &str = "___temp";
+pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp";
 
 /// A marker file to mark that a timeline directory was not fully initialized.
/// If a timeline directory with this marker is encountered at pageserver startup, /// the timeline directory and the marker file are both removed. /// Full path: `tenants//timelines/___uninit`. -pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; +pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; -pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; - -/// A marker file to prevent pageserver from loading a certain tenant on restart. -/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding -/// `ignore` management API command, that expects the ignored tenant to be properly loaded -/// into pageserver's memory before being ignored. -/// Full path: `tenants//___ignored_tenant`. -pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant"; +pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; pub fn is_temporary(path: &Utf8Path) -> bool { match path.file_name() { @@ -158,23 +289,14 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool { // from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once // from the name. -pub fn is_uninit_mark(path: &Utf8Path) -> bool { +pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool { ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX) } -pub fn is_delete_mark(path: &Utf8Path) -> bool { +pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool { ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX) } -fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool { - if let Some(e) = e.io_error() { - if e.kind() == std::io::ErrorKind::NotFound { - return true; - } - } - false -} - /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by /// blocking. /// diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 993685db6e..c4011d593c 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,16 +1,16 @@ use enum_map::EnumMap; -use metrics::metric_vec_duration::DurationResultObserver; use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, - Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPairVec, - IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, + Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; -use strum::{EnumCount, IntoEnumIterator, VariantNames}; +use strum::{EnumCount, VariantNames}; use strum_macros::{EnumVariantNames, IntoStaticStr}; +use tracing::warn; use utils::id::TimelineId; /// Prometheus histogram buckets (in seconds) for operations in the critical @@ -51,8 +51,8 @@ pub(crate) enum StorageTimeOperation { #[strum(serialize = "gc")] Gc, - #[strum(serialize = "create tenant")] - CreateTenant, + #[strum(serialize = "find gc cutoffs")] + FindGcCutoffs, } pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { @@ -86,77 +86,201 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static READ_NUM_FS_LAYERS: Lazy = Lazy::new(|| { +pub(crate) static READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { register_histogram!( - "pageserver_read_num_fs_layers", - 
"Number of persistent layers accessed for processing a read request, including those in the cache", - vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0], + "pageserver_layers_visited_per_read_global", + "Number of layers visited to reconstruct one key", + vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + ) + .expect("failed to define a metric") +}); + +pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_layers_visited_per_vectored_read_global", + "Average number of layers visited to reconstruct one key", + vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], ) .expect("failed to define a metric") }); // Metrics collected on operations on the storage repository. +#[derive( + Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr, +)] +pub(crate) enum GetKind { + Singular, + Vectored, +} pub(crate) struct ReconstructTimeMetrics { - ok: Histogram, - err: Histogram, + singular: Histogram, + vectored: Histogram, } pub(crate) static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_getpage_reconstruct_seconds", "Time spent in reconstruct_value (reconstruct a page from deltas)", - &["result"], + &["get_kind"], CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric"); + ReconstructTimeMetrics { - ok: inner.get_metric_with_label_values(&["ok"]).unwrap(), - err: inner.get_metric_with_label_values(&["err"]).unwrap(), + singular: inner.with_label_values(&[GetKind::Singular.into()]), + vectored: inner.with_label_values(&[GetKind::Vectored.into()]), } }); impl ReconstructTimeMetrics { - pub(crate) fn for_result(&self, result: &Result) -> &Histogram { - match result { - Ok(_) => &self.ok, - Err(_) => &self.err, + pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { + match get_kind { + GetKind::Singular => &self.singular, + GetKind::Vectored => &self.vectored, } } } -pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::new(|| { - register_int_counter!( - "pageserver_materialized_cache_hits_direct_total", - "Number of cache hits from materialized page cache without redo", - ) - .expect("failed to define a metric") -}); +pub(crate) struct ReconstructDataTimeMetrics { + singular: Histogram, + vectored: Histogram, +} -pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { - register_histogram!( +impl ReconstructDataTimeMetrics { + pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { + match get_kind { + GetKind::Singular => &self.singular, + GetKind::Vectored => &self.vectored, + } + } +} + +pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( "pageserver_getpage_get_reconstruct_data_seconds", "Time spent in get_reconstruct_value_data", + &["get_kind"], CRITICAL_OP_BUCKETS.into(), ) - .expect("failed to define a metric") + .expect("failed to define a metric"); + + ReconstructDataTimeMetrics { + singular: inner.with_label_values(&[GetKind::Singular.into()]), + vectored: inner.with_label_values(&[GetKind::Vectored.into()]), + } }); -pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { - register_int_counter!( - "pageserver_materialized_cache_hits_total", - "Number of cache hits from materialized page cache", +pub(crate) struct GetVectoredLatency { + map: EnumMap>, +} + +#[allow(dead_code)] +pub(crate) struct ScanLatency { + map: EnumMap>, +} + +impl GetVectoredLatency { + // Only these task types perform 
vectored gets. Filter all other tasks out to reduce total + // cardinality of the metric. + const TRACKED_TASK_KINDS: [TaskKind; 2] = [TaskKind::Compaction, TaskKind::PageRequestHandler]; + + pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> { + self.map[task_kind].as_ref() + } +} + +impl ScanLatency { + // Only these task types perform vectored gets. Filter all other tasks out to reduce total + // cardinality of the metric. + const TRACKED_TASK_KINDS: [TaskKind; 1] = [TaskKind::PageRequestHandler]; + + pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> { + self.map[task_kind].as_ref() + } +} + +pub(crate) struct ScanLatencyOngoingRecording<'a> { + parent: &'a Histogram, + start: std::time::Instant, +} + +impl<'a> ScanLatencyOngoingRecording<'a> { + pub(crate) fn start_recording(parent: &'a Histogram) -> ScanLatencyOngoingRecording<'a> { + let start = Instant::now(); + ScanLatencyOngoingRecording { parent, start } + } + + pub(crate) fn observe(self, throttled: Option) { + let elapsed = self.start.elapsed(); + let ex_throttled = if let Some(throttled) = throttled { + elapsed.checked_sub(throttled) + } else { + Some(elapsed) + }; + if let Some(ex_throttled) = ex_throttled { + self.parent.observe(ex_throttled.as_secs_f64()); + } else { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!("error deducting time spent throttled; this message is logged at a global rate limit"); + }); + } + } +} + +pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( + "pageserver_get_vectored_seconds", + "Time spent in get_vectored, excluding time spent in timeline_get_throttle.", + &["task_kind"], + CRITICAL_OP_BUCKETS.into(), ) - .expect("failed to define a metric") + .expect("failed to define a metric"); + + GetVectoredLatency { + map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { + let task_kind = ::from_usize(task_kind_idx); + + if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) { + let task_kind = task_kind.into(); + Some(inner.with_label_values(&[task_kind])) + } else { + None + } + })), + } +}); + +pub(crate) static SCAN_LATENCY: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( + "pageserver_scan_seconds", + "Time spent in scan, excluding time spent in timeline_get_throttle.", + &["task_kind"], + CRITICAL_OP_BUCKETS.into(), + ) + .expect("failed to define a metric"); + + ScanLatency { + map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { + let task_kind = ::from_usize(task_kind_idx); + + if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) { + let task_kind = task_kind.into(); + Some(inner.with_label_values(&[task_kind])) + } else { + None + } + })), + } }); pub(crate) struct PageCacheMetricsForTaskKind { - pub read_accesses_materialized_page: IntCounter, pub read_accesses_immutable: IntCounter, - pub read_hits_immutable: IntCounter, - pub read_hits_materialized_page_exact: IntCounter, - pub read_hits_materialized_page_older_lsn: IntCounter, } pub(crate) struct PageCacheMetrics { @@ -189,16 +313,6 @@ pub(crate) static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMet let content_kind = ::from_usize(content_kind); let content_kind: &'static str = content_kind.into(); PageCacheMetricsForTaskKind { - read_accesses_materialized_page: { - PAGE_CACHE_READ_ACCESSES - .get_metric_with_label_values(&[ - task_kind, - 
"materialized_page", - content_kind, - ]) - .unwrap() - }, - read_accesses_immutable: { PAGE_CACHE_READ_ACCESSES .get_metric_with_label_values(&[task_kind, "immutable", content_kind]) @@ -210,28 +324,6 @@ pub(crate) static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMet .get_metric_with_label_values(&[task_kind, "immutable", content_kind, "-"]) .unwrap() }, - - read_hits_materialized_page_exact: { - PAGE_CACHE_READ_HITS - .get_metric_with_label_values(&[ - task_kind, - "materialized_page", - content_kind, - "exact", - ]) - .unwrap() - }, - - read_hits_materialized_page_older_lsn: { - PAGE_CACHE_READ_HITS - .get_metric_with_label_values(&[ - task_kind, - "materialized_page", - content_kind, - "older_lsn", - ]) - .unwrap() - }, } })) })), @@ -247,7 +339,6 @@ pub(crate) struct PageCacheSizeMetrics { pub max_bytes: UIntGauge, pub current_bytes_immutable: UIntGauge, - pub current_bytes_materialized_page: UIntGauge, } static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy = Lazy::new(|| { @@ -273,11 +364,6 @@ pub(crate) static PAGE_CACHE_SIZE: Lazy = .get_metric_with_label_values(&["immutable"]) .unwrap() }, - current_bytes_materialized_page: { - PAGE_CACHE_SIZE_CURRENT_BYTES - .get_metric_with_label_values(&["materialized_page"]) - .unwrap() - }, }); pub(crate) mod page_cache_eviction_metrics { @@ -378,9 +464,70 @@ static LAST_RECORD_LSN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_pitr_history_size", + "Data written since PITR cutoff on this timeline", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum MetricLayerKind { + Delta, + Image, +} + +static TIMELINE_LAYER_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_layer_bytes", + "Sum of layer physical sizes in bytes", + &["tenant_id", "shard_id", "timeline_id", "kind"] + ) + .expect("failed to define a metric") +}); + +static TIMELINE_LAYER_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_layer_count", + "Number of layers that exist", + &["tenant_id", "shard_id", "timeline_id", "kind"] + ) + .expect("failed to define a metric") +}); + +static TIMELINE_ARCHIVE_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_archive_size", + "Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static STANDBY_HORIZON: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_standby_horizon", + "Standby apply LSN for which GC is hold off, by timeline.", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_resident_physical_size", + "The size of the layer files present in the pageserver's filesystem, for attached locations.", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static VISIBLE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_visible_physical_size", "The size of the layer files present in the pageserver's filesystem.", &["tenant_id", "shard_id", "timeline_id"] ) @@ -398,7 +545,7 @@ pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy = Lazy::new(|| static 
REMOTE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_remote_physical_size", - "The size of the layer files present in the remote storage that are listed in the the remote index_part.json.", + "The size of the layer files present in the remote storage that are listed in the remote index_part.json.", // Corollary: If any files are missing from the index part, they won't be included here. &["tenant_id", "shard_id", "timeline_id"] ) @@ -438,6 +585,72 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define current logical size metric") }); +static AUX_FILE_SIZE: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_aux_file_estimated_size", + "The size of all aux files for a timeline in aux file v2 store.", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static VALID_LSN_LEASE_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_valid_lsn_lease_count", + "The number of valid leases after refreshing gc info.", + &["tenant_id", "shard_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +pub(crate) static CIRCUIT_BREAKERS_BROKEN: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_circuit_breaker_broken", + "How many times a circuit breaker has broken" + ) + .expect("failed to define a metric") +}); + +pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_circuit_breaker_unbroken", + "How many times a circuit breaker has been un-broken (recovered)" + ) + .expect("failed to define a metric") +}); + +pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_in_bytes_total", + "Size of data written into image layers before compression" + ) + .expect("failed to define a metric") +}); + +pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_in_bytes_considered", + "Size of potentially compressible data written into image layers before compression" + ) + .expect("failed to define a metric") +}); + +pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_in_bytes_chosen", + "Size of data whose compressed form was written into image layers" + ) + .expect("failed to define a metric") +}); + +pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_compression_image_out_bytes_total", + "Size of compressed image layer written" + ) + .expect("failed to define a metric") +}); + pub(crate) mod initial_logical_size { use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; @@ -565,6 +778,15 @@ pub(crate) mod initial_logical_size { }); } +static DIRECTORY_ENTRIES_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_directory_entries_count", + "Sum of the entries in pageserver-stored directory listings", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + pub(crate) static TENANT_STATE_METRIC: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_tenant_states_count", @@ -596,26 +818,6 @@ pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy = Lazy::new(| .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric") }); -// Metrics for cloud upload. 
These metrics reflect data uploaded to cloud storage, -// or in testing they estimate how much we would upload if we did. -static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_created_persistent_files_total", - "Number of files created that are meant to be uploaded to cloud storage", - &["tenant_id", "shard_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - -static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_written_persistent_bytes_total", - "Total bytes written that are meant to be uploaded to cloud storage", - &["tenant_id", "shard_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - pub(crate) static EVICTION_ITERATION_DURATION: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_eviction_iteration_duration_seconds_global", @@ -673,6 +875,14 @@ pub static STARTUP_IS_LOADING: Lazy = Lazy::new(|| { .expect("Failed to register pageserver_startup_is_loading") }); +pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_timeline_ephemeral_bytes", + "Total number of bytes in ephemeral layers, summed for all timelines. Approximate, lazily updated." + ) + .expect("Failed to register metric") +}); + /// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things /// like how long it took to load. /// @@ -932,6 +1142,7 @@ pub(crate) static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +#[cfg(not(test))] pub(crate) mod virtual_file_descriptor_cache { use super::*; @@ -951,28 +1162,60 @@ pub(crate) mod virtual_file_descriptor_cache { // ``` } -#[derive(Debug)] -struct GlobalAndPerTimelineHistogram { - global: Histogram, - per_tenant_timeline: Histogram, +#[cfg(not(test))] +pub(crate) mod virtual_file_io_engine { + use super::*; + + pub(crate) static KIND: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_virtual_file_io_engine_kind", + "The configured io engine for VirtualFile", + &["kind"], + ) + .unwrap() + }); } -impl GlobalAndPerTimelineHistogram { - fn observe(&self, value: f64) { - self.global.observe(value); - self.per_tenant_timeline.observe(value); - } -} +struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { + global_metric: &'a Histogram, -struct GlobalAndPerTimelineHistogramTimer<'a> { - h: &'a GlobalAndPerTimelineHistogram, + // Optional because not all op types are tracked per-timeline + timeline_metric: Option<&'a Histogram>, + + ctx: &'c RequestContext, start: std::time::Instant, + op: SmgrQueryType, } -impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> { +impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> { fn drop(&mut self) { let elapsed = self.start.elapsed(); - self.h.observe(elapsed.as_secs_f64()); + let ex_throttled = self + .ctx + .micros_spent_throttled + .close_and_checked_sub_from(elapsed); + let ex_throttled = match ex_throttled { + Ok(res) => res, + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy>> = + Lazy::new(|| { + Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { + RateLimit::new(Duration::from_secs(10)) + }))) + }); + let mut guard = LOGGED.lock().unwrap(); + let rate_limit = &mut guard[self.op]; + rate_limit.call(|| { + warn!(op=?self.op, error, "error deducting time spent throttled; this message is logged at a global rate limit"); + }); + elapsed + } + }; + self.global_metric.observe(ex_throttled.as_secs_f64()); + if let Some(timeline_metric) = 
self.timeline_metric { + timeline_metric.observe(ex_throttled.as_secs_f64()); + } } } @@ -984,6 +1227,7 @@ impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> { strum_macros::EnumCount, strum_macros::EnumIter, strum_macros::FromRepr, + enum_map::Enum, )] #[strum(serialize_all = "snake_case")] pub enum SmgrQueryType { @@ -991,11 +1235,13 @@ pub enum SmgrQueryType { GetRelSize, GetPageAtLsn, GetDbSize, + GetSlruSegment, } #[derive(Debug)] pub(crate) struct SmgrQueryTimePerTimeline { - metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT], + global_metrics: [Histogram; SmgrQueryType::COUNT], + per_timeline_getpage: Histogram, } static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { @@ -1073,28 +1319,65 @@ impl SmgrQueryTimePerTimeline { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_slug = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id.to_string(); - let metrics = std::array::from_fn(|i| { + let global_metrics = std::array::from_fn(|i| { let op = SmgrQueryType::from_repr(i).unwrap(); - let global = SMGR_QUERY_TIME_GLOBAL + SMGR_QUERY_TIME_GLOBAL .get_metric_with_label_values(&[op.into()]) - .unwrap(); - let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE - .get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id]) - .unwrap(); - GlobalAndPerTimelineHistogram { - global, - per_tenant_timeline, - } + .unwrap() }); - Self { metrics } - } - pub(crate) fn start_timer(&self, op: SmgrQueryType) -> impl Drop + '_ { - let metric = &self.metrics[op as usize]; - GlobalAndPerTimelineHistogramTimer { - h: metric, - start: std::time::Instant::now(), + + let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE + .get_metric_with_label_values(&[ + SmgrQueryType::GetPageAtLsn.into(), + &tenant_id, + &shard_slug, + &timeline_id, + ]) + .unwrap(); + Self { + global_metrics, + per_timeline_getpage, } } + pub(crate) fn start_timer<'c: 'a, 'a>( + &'a self, + op: SmgrQueryType, + ctx: &'c RequestContext, + ) -> Option { + let global_metric = &self.global_metrics[op as usize]; + let start = Instant::now(); + match ctx.micros_spent_throttled.open() { + Ok(()) => (), + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy>> = + Lazy::new(|| { + Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { + RateLimit::new(Duration::from_secs(10)) + }))) + }); + let mut guard = LOGGED.lock().unwrap(); + let rate_limit = &mut guard[op]; + rate_limit.call(|| { + warn!(?op, error, "error opening micros_spent_throttled; this message is logged at a global rate limit"); + }); + } + } + + let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) { + Some(&self.per_timeline_getpage) + } else { + None + }; + + Some(GlobalAndPerTimelineHistogramTimer { + global_metric, + timeline_metric, + ctx, + start, + op, + }) + } } #[cfg(test)] @@ -1103,15 +1386,21 @@ mod smgr_query_time_tests { use strum::IntoEnumIterator; use utils::id::{TenantId, TimelineId}; + use crate::{ + context::{DownloadBehavior, RequestContext}, + task_mgr::TaskKind, + }; + // Regression test, we used hard-coded string constants before using an enum. 
#[test] fn op_label_name() { use super::SmgrQueryType::*; - let expect: [(super::SmgrQueryType, &'static str); 4] = [ + let expect: [(super::SmgrQueryType, &'static str); 5] = [ (GetRelExists, "get_rel_exists"), (GetRelSize, "get_rel_size"), (GetPageAtLsn, "get_page_at_lsn"), (GetDbSize, "get_db_size"), + (GetSlruSegment, "get_slru_segment"), ]; for (op, expect) in expect { let actual: &'static str = op.into(); @@ -1134,27 +1423,25 @@ mod smgr_query_time_tests { let get_counts = || { let global: u64 = ops .iter() - .map(|op| metrics.metrics[*op as usize].global.get_sample_count()) + .map(|op| metrics.global_metrics[*op as usize].get_sample_count()) .sum(); - let per_tenant_timeline: u64 = ops - .iter() - .map(|op| { - metrics.metrics[*op as usize] - .per_tenant_timeline - .get_sample_count() - }) - .sum(); - (global, per_tenant_timeline) + (global, metrics.per_timeline_getpage.get_sample_count()) }; let (pre_global, pre_per_tenant_timeline) = get_counts(); assert_eq!(pre_per_tenant_timeline, 0); - let timer = metrics.start_timer(*op); + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); + let timer = metrics.start_timer(*op, &ctx); drop(timer); let (post_global, post_per_tenant_timeline) = get_counts(); - assert_eq!(post_per_tenant_timeline, 1); + if matches!(op, super::SmgrQueryType::GetPageAtLsn) { + // getpage ops are tracked per-timeline, others aren't + assert_eq!(post_per_tenant_timeline, 1); + } else { + assert_eq!(post_per_tenant_timeline, 0); + } assert!(post_global > pre_global); } } @@ -1171,45 +1458,140 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| { .map(|ms| (ms as f64) / 1000.0) }); -pub(crate) struct BasebackupQueryTime(HistogramVec); +pub(crate) struct BasebackupQueryTime { + ok: Histogram, + error: Histogram, +} + pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(|| { - BasebackupQueryTime({ - register_histogram_vec!( - "pageserver_basebackup_query_seconds", - "Histogram of basebackup queries durations, by result type", - &["result"], - COMPUTE_STARTUP_BUCKETS.to_vec(), - ) - .expect("failed to define a metric") - }) + let vec = register_histogram_vec!( + "pageserver_basebackup_query_seconds", + "Histogram of basebackup queries durations, by result type", + &["result"], + COMPUTE_STARTUP_BUCKETS.to_vec(), + ) + .expect("failed to define a metric"); + BasebackupQueryTime { + ok: vec.get_metric_with_label_values(&["ok"]).unwrap(), + error: vec.get_metric_with_label_values(&["error"]).unwrap(), + } }); -impl DurationResultObserver for BasebackupQueryTime { - fn observe_result(&self, res: &Result, duration: std::time::Duration) { - let label_value = if res.is_ok() { "ok" } else { "error" }; - let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap(); - metric.observe(duration.as_secs_f64()); +pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> { + parent: &'a BasebackupQueryTime, + ctx: &'c RequestContext, + start: std::time::Instant, +} + +impl BasebackupQueryTime { + pub(crate) fn start_recording<'c: 'a, 'a>( + &'a self, + ctx: &'c RequestContext, + ) -> BasebackupQueryTimeOngoingRecording<'_, '_> { + let start = Instant::now(); + match ctx.micros_spent_throttled.open() { + Ok(()) => (), + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate 
limit"); + }); + } + } + BasebackupQueryTimeOngoingRecording { + parent: self, + ctx, + start, + } } } -pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "pageserver_live_connections", - "Number of live network connections", +impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { + pub(crate) fn observe(self, res: &Result) { + let elapsed = self.start.elapsed(); + let ex_throttled = self + .ctx + .micros_spent_throttled + .close_and_checked_sub_from(elapsed); + let ex_throttled = match ex_throttled { + Ok(ex_throttled) => ex_throttled, + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit"); + }); + elapsed + } + }; + let metric = if res.is_ok() { + &self.parent.ok + } else { + &self.parent.error + }; + metric.observe(ex_throttled.as_secs_f64()); + } +} + +pub(crate) static LIVE_CONNECTIONS: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "pageserver_live_connections_started", + "Number of network connections that we started handling", + "pageserver_live_connections_finished", + "Number of network connections that we finished handling", &["pageserver_connection_kind"] ) .expect("failed to define a metric") }); +#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)] +pub(crate) enum ComputeCommandKind { + PageStreamV2, + Basebackup, + Fullbackup, + LeaseLsn, +} + +pub(crate) struct ComputeCommandCounters { + map: EnumMap, +} + +pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy = Lazy::new(|| { + let inner = register_int_counter_vec!( + "pageserver_compute_commands", + "Number of compute -> pageserver commands processed", + &["command"] + ) + .expect("failed to define a metric"); + + ComputeCommandCounters { + map: EnumMap::from_array(std::array::from_fn(|i| { + let command = ::from_usize(i); + let command_str: &'static str = command.into(); + inner.with_label_values(&[command_str]) + })), + } +}); + +impl ComputeCommandCounters { + pub(crate) fn for_command(&self, command: ComputeCommandKind) -> &IntCounter { + &self.map[command] + } +} + // remote storage metrics -/// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`]. -static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "pageserver_remote_timeline_client_calls_unfinished", - "Number of ongoing calls to remote timeline client. \ - Used to populate pageserver_remote_timeline_client_calls_started. 
\
-         This metric is not useful for sampling from Prometheus, but useful in tests.",
+static REMOTE_TIMELINE_CLIENT_CALLS: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
+        "pageserver_remote_timeline_client_calls_started",
+        "Number of started calls to remote timeline client.",
+        "pageserver_remote_timeline_client_calls_finished",
+        "Number of finished calls to remote timeline client.",
         &[
             "tenant_id",
             "shard_id",
@@ -1218,23 +1600,7 @@
             "op_kind"
         ],
     )
-    .expect("failed to define a metric")
-});
-
-static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
-        "pageserver_remote_timeline_client_calls_started",
-        "When calling a remote timeline client method, we record the current value \
-         of the calls_unfinished gauge in this histogram. Plot the histogram \
-         over time in a heatmap to visualize how many operations were ongoing \
-         at a given instant. It gives you a better idea of the queue depth \
-         than plotting the gauge directly, since operations may complete faster \
-         than the sampling interval.",
-        &["file_kind", "op_kind"],
-        // The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
-        vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
-    )
-    .expect("failed to define a metric")
+    .unwrap()
 });
 
 static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> =
@@ -1259,29 +1625,80 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
 });
 
 pub(crate) struct TenantManagerMetrics {
-    pub(crate) tenant_slots: UIntGauge,
+    tenant_slots_attached: UIntGauge,
+    tenant_slots_secondary: UIntGauge,
+    tenant_slots_inprogress: UIntGauge,
     pub(crate) tenant_slot_writes: IntCounter,
     pub(crate) unexpected_errors: IntCounter,
 }
 
+impl TenantManagerMetrics {
+    /// Helpers for tracking slots. Note that these do not track the lifetime of TenantSlot objects
+    /// exactly: they track the lifetime of the slots _in the tenant map_.
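+    ///
+    /// Intended usage (a sketch, assuming the caller synchronizes on the map):
+    /// call `slot_inserted(&slot)` right after inserting a `TenantSlot` into the
+    /// tenant map and `slot_removed(&slot)` right after removing it, so that the
+    /// three per-mode gauges always sum to the map's current size.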
+ pub(crate) fn slot_inserted(&self, slot: &TenantSlot) { + match slot { + TenantSlot::Attached(_) => { + self.tenant_slots_attached.inc(); + } + TenantSlot::Secondary(_) => { + self.tenant_slots_secondary.inc(); + } + TenantSlot::InProgress(_) => { + self.tenant_slots_inprogress.inc(); + } + } + } + + pub(crate) fn slot_removed(&self, slot: &TenantSlot) { + match slot { + TenantSlot::Attached(_) => { + self.tenant_slots_attached.dec(); + } + TenantSlot::Secondary(_) => { + self.tenant_slots_secondary.dec(); + } + TenantSlot::InProgress(_) => { + self.tenant_slots_inprogress.dec(); + } + } + } + + #[cfg(all(debug_assertions, not(test)))] + pub(crate) fn slots_total(&self) -> u64 { + self.tenant_slots_attached.get() + + self.tenant_slots_secondary.get() + + self.tenant_slots_inprogress.get() + } +} + pub(crate) static TENANT_MANAGER: Lazy = Lazy::new(|| { - TenantManagerMetrics { - tenant_slots: register_uint_gauge!( + let tenant_slots = register_uint_gauge_vec!( "pageserver_tenant_manager_slots", "How many slots currently exist, including all attached, secondary and in-progress operations", + &["mode"] ) - .expect("failed to define a metric"), - tenant_slot_writes: register_int_counter!( - "pageserver_tenant_manager_slot_writes", - "Writes to a tenant slot, including all of create/attach/detach/delete" - ) - .expect("failed to define a metric"), - unexpected_errors: register_int_counter!( - "pageserver_tenant_manager_unexpected_errors_total", - "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug." - ) - .expect("failed to define a metric"), -} + .expect("failed to define a metric"); + TenantManagerMetrics { + tenant_slots_attached: tenant_slots + .get_metric_with_label_values(&["attached"]) + .unwrap(), + tenant_slots_secondary: tenant_slots + .get_metric_with_label_values(&["secondary"]) + .unwrap(), + tenant_slots_inprogress: tenant_slots + .get_metric_with_label_values(&["inprogress"]) + .unwrap(), + tenant_slot_writes: register_int_counter!( + "pageserver_tenant_manager_slot_writes", + "Writes to a tenant slot, including all of create/attach/detach/delete" + ) + .expect("failed to define a metric"), + unexpected_errors: register_int_counter!( + "pageserver_tenant_manager_unexpected_errors_total", + "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug." 
+ ) + .expect("failed to define a metric"), + } }); pub(crate) struct DeletionQueueMetrics { @@ -1339,29 +1756,6 @@ pub(crate) static DELETION_QUEUE: Lazy = Lazy::new(|| { } }); -pub(crate) struct WalIngestMetrics { - pub(crate) records_received: IntCounter, - pub(crate) records_committed: IntCounter, - pub(crate) records_filtered: IntCounter, -} - -pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { - records_received: register_int_counter!( - "pageserver_wal_ingest_records_received", - "Number of WAL records received from safekeepers" - ) - .expect("failed to define a metric"), - records_committed: register_int_counter!( - "pageserver_wal_ingest_records_committed", - "Number of WAL records which resulted in writes to pageserver storage" - ) - .expect("failed to define a metric"), - records_filtered: register_int_counter!( - "pageserver_wal_ingest_records_filtered", - "Number of WAL records filtered out due to sharding" - ) - .expect("failed to define a metric"), -}); pub(crate) struct SecondaryModeMetrics { pub(crate) upload_heatmap: IntCounter, pub(crate) upload_heatmap_errors: IntCounter, @@ -1369,7 +1763,8 @@ pub(crate) struct SecondaryModeMetrics { pub(crate) download_heatmap: IntCounter, pub(crate) download_layer: IntCounter, } -pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| SecondaryModeMetrics { +pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| { + SecondaryModeMetrics { upload_heatmap: register_int_counter!( "pageserver_secondary_upload_heatmap", "Number of heatmaps written to remote storage by attached tenants" @@ -1387,7 +1782,7 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco .expect("failed to define a metric"), download_heatmap: register_int_counter!( "pageserver_secondary_download_heatmap", - "Number of downloads of heatmaps by secondary mode locations" + "Number of downloads of heatmaps by secondary mode locations, including when it hasn't changed" ) .expect("failed to define a metric"), download_layer: register_int_counter!( @@ -1395,6 +1790,33 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco "Number of downloads of layers by secondary mode locations" ) .expect("failed to define a metric"), +} +}); + +pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_secondary_resident_physical_size", + "The size of the layer files present in the pageserver's filesystem, for secondary locations.", + &["tenant_id", "shard_id"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static NODE_UTILIZATION_SCORE: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_utilization_score", + "The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded", + ) + .expect("failed to define a metric") +}); + +pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_secondary_heatmap_total_size", + "The total size in bytes of all layers in the most recently downloaded heatmap.", + &["tenant_id", "shard_id"] + ) + .expect("failed to define a metric") }); #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -1447,16 +1869,64 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { .expect("Failed to register tenant_task_events metric") }); -pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "pageserver_background_loop_semaphore_wait_start_count", - "Counter 
for background loop concurrency-limiting semaphore acquire calls started", - "pageserver_background_loop_semaphore_wait_finish_count", - "Counter for background loop concurrency-limiting semaphore acquire calls finished", - &["task"], - ) - .unwrap() -}); +pub struct BackgroundLoopSemaphoreMetrics { + counters: EnumMap, + durations: EnumMap, +} + +pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy = Lazy::new( + || { + let counters = register_int_counter_pair_vec!( + "pageserver_background_loop_semaphore_wait_start_count", + "Counter for background loop concurrency-limiting semaphore acquire calls started", + "pageserver_background_loop_semaphore_wait_finish_count", + "Counter for background loop concurrency-limiting semaphore acquire calls finished", + &["task"], + ) + .unwrap(); + + let durations = register_counter_vec!( + "pageserver_background_loop_semaphore_wait_duration_seconds", + "Sum of wall clock time spent waiting on the background loop concurrency-limiting semaphore acquire calls", + &["task"], + ) + .unwrap(); + + BackgroundLoopSemaphoreMetrics { + counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| { + let kind = ::from_usize(i); + counters.with_label_values(&[kind.into()]) + })), + durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| { + let kind = ::from_usize(i); + durations.with_label_values(&[kind.into()]) + })), + } + }, +); + +impl BackgroundLoopSemaphoreMetrics { + pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ { + struct Record<'a> { + metrics: &'a BackgroundLoopSemaphoreMetrics, + task: BackgroundLoopKind, + _counter_guard: metrics::IntCounterPairGuard, + start: Instant, + } + impl Drop for Record<'_> { + fn drop(&mut self) { + let elapsed = self.start.elapsed().as_secs_f64(); + self.metrics.durations[self.task].inc_by(elapsed); + } + } + Record { + metrics: self, + task, + _counter_guard: self.counters[task].guard(), + start: Instant::now(), + } + } +} pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy = Lazy::new(|| { register_int_counter_vec!( @@ -1561,6 +2031,36 @@ macro_rules! 
redo_bytes_histogram_count_buckets {
     };
 }
 
+pub(crate) struct WalIngestMetrics {
+    pub(crate) bytes_received: IntCounter,
+    pub(crate) records_received: IntCounter,
+    pub(crate) records_committed: IntCounter,
+    pub(crate) records_filtered: IntCounter,
+}
+
+pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
+    bytes_received: register_int_counter!(
+        "pageserver_wal_ingest_bytes_received",
+        "Bytes of WAL ingested from safekeepers",
+    )
+    .unwrap(),
+    records_received: register_int_counter!(
+        "pageserver_wal_ingest_records_received",
+        "Number of WAL records received from safekeepers"
+    )
+    .expect("failed to define a metric"),
+    records_committed: register_int_counter!(
+        "pageserver_wal_ingest_records_committed",
+        "Number of WAL records which resulted in writes to pageserver storage"
+    )
+    .expect("failed to define a metric"),
+    records_filtered: register_int_counter!(
+        "pageserver_wal_ingest_records_filtered",
+        "Number of WAL records filtered out due to sharding"
+    )
+    .expect("failed to define a metric"),
+});
+
 pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
         "pageserver_wal_redo_seconds",
@@ -1597,11 +2097,18 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
     .unwrap()
 });
 
+#[rustfmt::skip]
 pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
         "pageserver_wal_redo_process_launch_duration",
         "Histogram of the duration of successful WalRedoProcess::launch calls",
-        redo_histogram_time_buckets!(),
+        vec![
+            0.0002, 0.0004, 0.0006, 0.0008, 0.0010,
+            0.0020, 0.0040, 0.0060, 0.0080, 0.0100,
+            0.0200, 0.0400, 0.0600, 0.0800, 0.1000,
+            0.2000, 0.4000, 0.6000, 0.8000, 1.0000,
+            1.5000, 2.0000, 2.5000, 3.0000, 4.0000, 10.0000
+        ],
     )
     .expect("failed to define a metric")
 });
@@ -1684,6 +2191,22 @@ impl StorageTimeMetricsTimer {
         self.metrics.timeline_count.inc();
         self.metrics.global_histogram.observe(duration);
     }
+
+    /// Turns this timer into one that always records -- usually this means recording
+    /// regardless of whether an early `?` return path was taken in a function.
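A minimal, self-contained illustration of the drop-guard idea behind the `record_on_drop` method that follows, with simplified stand-ins for the real metrics types:

```rust
use std::time::Instant;

// Simplified stand-in for StorageTimeMetricsTimer.
struct Timer {
    start: Instant,
}

impl Timer {
    fn stop_and_record(self) {
        println!("recorded {:?}", self.start.elapsed());
    }

    // Wrap the timer so that dropping it records, even on early return.
    fn record_on_drop(self) -> AlwaysRecording {
        AlwaysRecording(Some(self))
    }
}

struct AlwaysRecording(Option<Timer>);

impl Drop for AlwaysRecording {
    fn drop(&mut self) {
        if let Some(timer) = self.0.take() {
            timer.stop_and_record();
        }
    }
}

fn fallible_work(fail: bool) -> Result<(), &'static str> {
    let _timer = Timer { start: Instant::now() }.record_on_drop();
    if fail {
        // Early `?`-style return: the guard still records on drop.
        return Err("boom");
    }
    Ok(())
}

fn main() {
    let _ = fallible_work(true);
    let _ = fallible_work(false);
}
```

The `Option` inside the guard lets `Drop` move the timer out and call its by-value `stop_and_record`, which a plain field could not do.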
+ pub(crate) fn record_on_drop(self) -> AlwaysRecordingStorageTimeMetricsTimer { + AlwaysRecordingStorageTimeMetricsTimer(Some(self)) + } +} + +pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option); + +impl Drop for AlwaysRecordingStorageTimeMetricsTimer { + fn drop(&mut self) { + if let Some(inner) = self.0.take() { + inner.stop_and_record(); + } + } } /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and @@ -1744,25 +2267,37 @@ pub(crate) struct TimelineMetrics { pub imitate_logical_size_histo: StorageTimeMetrics, pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, + pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, - resident_physical_size_gauge: UIntGauge, + pub pitr_history_size: UIntGauge, + pub archival_size: UIntGauge, + pub(crate) layer_size_image: UIntGauge, + pub(crate) layer_count_image: UIntGauge, + pub(crate) layer_size_delta: UIntGauge, + pub(crate) layer_count_delta: UIntGauge, + pub standby_horizon_gauge: IntGauge, + pub resident_physical_size_gauge: UIntGauge, + pub visible_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, - pub num_persistent_files_created: IntCounter, - pub persistent_bytes_written: IntCounter, + pub aux_file_size_gauge: IntGauge, + pub directory_entries_count_gauge: Lazy UIntGauge>>, pub evictions: IntCounter, pub evictions_with_low_residence_duration: std::sync::RwLock, + /// Number of valid LSN leases. + pub valid_lsn_lease_count_gauge: UIntGauge, + shutdown: std::sync::atomic::AtomicBool, } impl TimelineMetrics { pub fn new( tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, + timeline_id_raw: &TimelineId, evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder, ) -> Self { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_id = format!("{}", tenant_shard_id.shard_slug()); - let timeline_id = timeline_id.to_string(); + let timeline_id = timeline_id_raw.to_string(); let flush_time_histo = StorageTimeMetrics::new( StorageTimeOperation::LayerFlush, &tenant_id, @@ -1805,28 +2340,102 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); + let find_gc_cutoffs_histo = StorageTimeMetrics::new( + StorageTimeOperation::FindGcCutoffs, + &tenant_id, + &shard_id, + &timeline_id, + ); let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + + let pitr_history_size = PITR_HISTORY_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + + let archival_size = TIMELINE_ARCHIVE_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + + let layer_size_image = TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Image.into(), + ]) + .unwrap(); + + let layer_count_image = TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Image.into(), + ]) + .unwrap(); + + let layer_size_delta = TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Delta.into(), + ]) + .unwrap(); + + let layer_count_delta = TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&[ + &tenant_id, + &shard_id, + &timeline_id, + MetricLayerKind::Delta.into(), + ]) + .unwrap(); + + let standby_horizon_gauge = STANDBY_HORIZON + 
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); // TODO: we shouldn't expose this metric let current_logical_size_gauge = CURRENT_LOGICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); - let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED - .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) - .unwrap(); - let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN + let aux_file_size_gauge = AUX_FILE_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + // TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065 + let directory_entries_count_gauge_closure = { + let tenant_shard_id = *tenant_shard_id; + let timeline_id_raw = *timeline_id_raw; + move || { + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", tenant_shard_id.shard_slug()); + let timeline_id = timeline_id_raw.to_string(); + let gauge: UIntGauge = DIRECTORY_ENTRIES_COUNT + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + gauge + } + }; + let directory_entries_count_gauge: Lazy UIntGauge>> = + Lazy::new(Box::new(directory_entries_count_gauge_closure)); let evictions = EVICTIONS .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder .build(&tenant_id, &shard_id, &timeline_id); + let valid_lsn_lease_count_gauge = VALID_LSN_LEASE_COUNT + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + TimelineMetrics { tenant_id, shard_id, @@ -1837,23 +2446,32 @@ impl TimelineMetrics { logical_size_histo, imitate_logical_size_histo, garbage_collect_histo, + find_gc_cutoffs_histo, load_layer_map_histo, last_record_gauge, + pitr_history_size, + archival_size, + layer_size_image, + layer_count_image, + layer_size_delta, + layer_count_delta, + standby_horizon_gauge, resident_physical_size_gauge, + visible_physical_size_gauge, current_logical_size_gauge, - num_persistent_files_created, - persistent_bytes_written, + aux_file_size_gauge, + directory_entries_count_gauge, evictions, evictions_with_low_residence_duration: std::sync::RwLock::new( evictions_with_low_residence_duration, ), + valid_lsn_lease_count_gauge, + shutdown: std::sync::atomic::AtomicBool::default(), } } pub(crate) fn record_new_file_metrics(&self, sz: u64) { self.resident_physical_size_add(sz); - self.num_persistent_files_created.inc_by(1); - self.persistent_bytes_written.inc_by(sz); } pub(crate) fn resident_physical_size_sub(&self, sz: u64) { @@ -1869,24 +2487,65 @@ impl TimelineMetrics { pub(crate) fn resident_physical_size_get(&self) -> u64 { self.resident_physical_size_gauge.get() } -} -impl Drop for TimelineMetrics { - fn drop(&mut self) { + pub(crate) fn shutdown(&self) { + let was_shutdown = self + .shutdown + .swap(true, std::sync::atomic::Ordering::Relaxed); + + if was_shutdown { + // this happens on tenant deletion because tenant first shuts down timelines, then + // invokes timeline deletion which first shuts down the timeline again. 
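The `swap`-based guard above is a standard idempotent-teardown pattern: only the first caller observes `false` and performs the teardown. A minimal sketch, with simplified types rather than the pageserver's:

```rust
use std::sync::atomic::{AtomicBool, Ordering};

struct Metrics {
    shutdown: AtomicBool,
}

impl Metrics {
    fn shutdown(&self) {
        // `swap` returns the previous value, so only the first caller proceeds.
        let was_shutdown = self.shutdown.swap(true, Ordering::Relaxed);
        if was_shutdown {
            return; // second shutdown (e.g. via tenant deletion): nothing to do
        }
        // ... remove the per-timeline label values from the registry here ...
        println!("metrics removed exactly once");
    }
}

fn main() {
    let m = Metrics {
        shutdown: AtomicBool::new(false),
    };
    m.shutdown();
    m.shutdown(); // no-op
}
```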
+ // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 + return; + } + let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; let shard_id = &self.shard_id; - let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, &shard_id, timeline_id]); + let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); - let _ = - RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]); + let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); } - let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]); - let _ = - NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]); - let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]); - let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]); + let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) { + let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); + } + + let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Image.into(), + ]); + let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Image.into(), + ]); + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Delta.into(), + ]); + let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + MetricLayerKind::Delta.into(), + ]); + + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); self.evictions_with_low_residence_duration .write() @@ -1917,20 +2576,18 @@ impl Drop for TimelineMetrics { let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]); } - for op in SmgrQueryType::iter() { - let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[ - op.into(), - tenant_id, - shard_id, - timeline_id, - ]); - } + let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[ + SmgrQueryType::GetPageAtLsn.into(), + tenant_id, + shard_id, + timeline_id, + ]); } } pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { // Only shard zero deals in synthetic sizes - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { let tid = tenant_shard_id.tenant_id.to_string(); let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); } @@ -1941,44 +2598,48 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { use futures::Future; use pin_project_lite::pin_project; use std::collections::HashMap; +use std::num::NonZeroUsize; use std::pin::Pin; +use std::sync::atomic::AtomicU64; use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; use std::time::{Duration, Instant}; use 
crate::context::{PageContentKind, RequestContext}; use crate::task_mgr::TaskKind; +use crate::tenant::mgr::TenantSlot; +use crate::tenant::tasks::BackgroundLoopKind; /// Maintain a per timeline gauge in addition to the global gauge. -struct PerTimelineRemotePhysicalSizeGauge { - last_set: u64, +pub(crate) struct PerTimelineRemotePhysicalSizeGauge { + last_set: AtomicU64, gauge: UIntGauge, } impl PerTimelineRemotePhysicalSizeGauge { fn new(per_timeline_gauge: UIntGauge) -> Self { Self { - last_set: per_timeline_gauge.get(), + last_set: AtomicU64::new(0), gauge: per_timeline_gauge, } } - fn set(&mut self, sz: u64) { + pub(crate) fn set(&self, sz: u64) { self.gauge.set(sz); - if sz < self.last_set { - REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz); + let prev = self.last_set.swap(sz, std::sync::atomic::Ordering::Relaxed); + if sz < prev { + REMOTE_PHYSICAL_SIZE_GLOBAL.sub(prev - sz); } else { - REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set); + REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - prev); }; - self.last_set = sz; } - fn get(&self) -> u64 { + pub(crate) fn get(&self) -> u64 { self.gauge.get() } } impl Drop for PerTimelineRemotePhysicalSizeGauge { fn drop(&mut self) { - REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set); + REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set.load(std::sync::atomic::Ordering::Relaxed)); } } @@ -1986,46 +2647,35 @@ pub(crate) struct RemoteTimelineClientMetrics { tenant_id: String, shard_id: String, timeline_id: String, - remote_physical_size_gauge: Mutex>, - calls_unfinished_gauge: Mutex>, + pub(crate) remote_physical_size_gauge: PerTimelineRemotePhysicalSizeGauge, + calls: Mutex>, bytes_started_counter: Mutex>, bytes_finished_counter: Mutex>, } impl RemoteTimelineClientMetrics { pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { + let tenant_id_str = tenant_shard_id.tenant_id.to_string(); + let shard_id_str = format!("{}", tenant_shard_id.shard_slug()); + let timeline_id_str = timeline_id.to_string(); + + let remote_physical_size_gauge = PerTimelineRemotePhysicalSizeGauge::new( + REMOTE_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str]) + .unwrap(), + ); + RemoteTimelineClientMetrics { - tenant_id: tenant_shard_id.tenant_id.to_string(), - shard_id: format!("{}", tenant_shard_id.shard_slug()), - timeline_id: timeline_id.to_string(), - calls_unfinished_gauge: Mutex::new(HashMap::default()), + tenant_id: tenant_id_str, + shard_id: shard_id_str, + timeline_id: timeline_id_str, + calls: Mutex::new(HashMap::default()), bytes_started_counter: Mutex::new(HashMap::default()), bytes_finished_counter: Mutex::new(HashMap::default()), - remote_physical_size_gauge: Mutex::new(None), + remote_physical_size_gauge, } } - pub(crate) fn remote_physical_size_set(&self, sz: u64) { - let mut guard = self.remote_physical_size_gauge.lock().unwrap(); - let gauge = guard.get_or_insert_with(|| { - PerTimelineRemotePhysicalSizeGauge::new( - REMOTE_PHYSICAL_SIZE - .get_metric_with_label_values(&[ - &self.tenant_id, - &self.shard_id, - &self.timeline_id, - ]) - .unwrap(), - ) - }); - gauge.set(sz); - } - - pub(crate) fn remote_physical_size_get(&self) -> u64 { - let guard = self.remote_physical_size_gauge.lock().unwrap(); - guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0) - } - pub fn remote_operation_time( &self, file_kind: &RemoteOpFileKind, @@ -2038,15 +2688,15 @@ impl RemoteTimelineClientMetrics { .unwrap() } - fn calls_unfinished_gauge( + fn calls_counter_pair( &self, file_kind: &RemoteOpFileKind, op_kind: 
&RemoteOpKind, - ) -> IntGauge { - let mut guard = self.calls_unfinished_gauge.lock().unwrap(); + ) -> IntCounterPair { + let mut guard = self.calls.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); let metric = guard.entry(key).or_insert_with(move || { - REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE + REMOTE_TIMELINE_CLIENT_CALLS .get_metric_with_label_values(&[ &self.tenant_id, &self.shard_id, @@ -2059,17 +2709,6 @@ impl RemoteTimelineClientMetrics { metric.clone() } - fn calls_started_hist( - &self, - file_kind: &RemoteOpFileKind, - op_kind: &RemoteOpKind, - ) -> Histogram { - let key = (file_kind.as_str(), op_kind.as_str()); - REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST - .get_metric_with_label_values(&[key.0, key.1]) - .unwrap() - } - fn bytes_started_counter( &self, file_kind: &RemoteOpFileKind, @@ -2140,7 +2779,7 @@ impl RemoteTimelineClientMetrics { #[must_use] pub(crate) struct RemoteTimelineClientCallMetricGuard { /// Decremented on drop. - calls_unfinished_metric: Option, + calls_counter_pair: Option, /// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop. bytes_finished: Option<(IntCounter, u64)>, } @@ -2150,10 +2789,10 @@ impl RemoteTimelineClientCallMetricGuard { /// The caller vouches to do the metric updates manually. pub fn will_decrement_manually(mut self) { let RemoteTimelineClientCallMetricGuard { - calls_unfinished_metric, + calls_counter_pair, bytes_finished, } = &mut self; - calls_unfinished_metric.take(); + calls_counter_pair.take(); bytes_finished.take(); } } @@ -2161,10 +2800,10 @@ impl RemoteTimelineClientCallMetricGuard { impl Drop for RemoteTimelineClientCallMetricGuard { fn drop(&mut self) { let RemoteTimelineClientCallMetricGuard { - calls_unfinished_metric, + calls_counter_pair, bytes_finished, } = self; - if let Some(guard) = calls_unfinished_metric.take() { + if let Some(guard) = calls_counter_pair.take() { guard.dec(); } if let Some((bytes_finished_metric, value)) = bytes_finished { @@ -2197,10 +2836,8 @@ impl RemoteTimelineClientMetrics { op_kind: &RemoteOpKind, size: RemoteTimelineClientMetricsCallTrackSize, ) -> RemoteTimelineClientCallMetricGuard { - let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); - self.calls_started_hist(file_kind, op_kind) - .observe(calls_unfinished_metric.get() as f64); - calls_unfinished_metric.inc(); // NB: inc after the histogram, see comment on underlying metric + let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind); + calls_counter_pair.inc(); let bytes_finished = match size { RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => { @@ -2214,7 +2851,7 @@ impl RemoteTimelineClientMetrics { } }; RemoteTimelineClientCallMetricGuard { - calls_unfinished_metric: Some(calls_unfinished_metric), + calls_counter_pair: Some(calls_counter_pair), bytes_finished, } } @@ -2228,12 +2865,8 @@ impl RemoteTimelineClientMetrics { op_kind: &RemoteOpKind, size: RemoteTimelineClientMetricsCallTrackSize, ) { - let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); - debug_assert!( - calls_unfinished_metric.get() > 0, - "begin and end should cancel out" - ); - calls_unfinished_metric.dec(); + let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind); + calls_counter_pair.dec(); match size { RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {} RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => { @@ -2250,18 +2883,15 @@ impl Drop for 
RemoteTimelineClientMetrics { shard_id, timeline_id, remote_physical_size_gauge, - calls_unfinished_gauge, + calls, bytes_started_counter, bytes_finished_counter, } = self; - for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() { - let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - a, - b, - ]); + for ((a, b), _) in calls.get_mut().unwrap().drain() { + let mut res = [Ok(()), Ok(())]; + REMOTE_TIMELINE_CLIENT_CALLS + .remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id, a, b]); + // don't care about results } for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() { let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[ @@ -2339,24 +2969,239 @@ impl>, O, E> Future for MeasuredRemoteOp { } } +pub mod tokio_epoll_uring { + use metrics::{register_int_counter, UIntGauge}; + use once_cell::sync::Lazy; + + pub struct Collector { + descs: Vec, + systems_created: UIntGauge, + systems_destroyed: UIntGauge, + } + + impl metrics::core::Collector for Collector { + fn desc(&self) -> Vec<&metrics::core::Desc> { + self.descs.iter().collect() + } + + fn collect(&self) -> Vec { + let mut mfs = Vec::with_capacity(Self::NMETRICS); + let tokio_epoll_uring::metrics::Metrics { + systems_created, + systems_destroyed, + } = tokio_epoll_uring::metrics::global(); + self.systems_created.set(systems_created); + mfs.extend(self.systems_created.collect()); + self.systems_destroyed.set(systems_destroyed); + mfs.extend(self.systems_destroyed.collect()); + mfs + } + } + + impl Collector { + const NMETRICS: usize = 2; + + #[allow(clippy::new_without_default)] + pub fn new() -> Self { + let mut descs = Vec::new(); + + let systems_created = UIntGauge::new( + "pageserver_tokio_epoll_uring_systems_created", + "counter of tokio-epoll-uring systems that were created", + ) + .unwrap(); + descs.extend( + metrics::core::Collector::desc(&systems_created) + .into_iter() + .cloned(), + ); + + let systems_destroyed = UIntGauge::new( + "pageserver_tokio_epoll_uring_systems_destroyed", + "counter of tokio-epoll-uring systems that were destroyed", + ) + .unwrap(); + descs.extend( + metrics::core::Collector::desc(&systems_destroyed) + .into_iter() + .cloned(), + ); + + Self { + descs, + systems_created, + systems_destroyed, + } + } + } + + pub(crate) static THREAD_LOCAL_LAUNCH_SUCCESSES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_success_count", + "Number of times where thread_local_system creation spanned multiple executor threads", + ) + .unwrap() + }); + + pub(crate) static THREAD_LOCAL_LAUNCH_FAILURES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_failures_count", + "Number of times thread_local_system creation failed and was retried after back-off.", + ) + .unwrap() + }); +} + +pub(crate) mod tenant_throttling { + use metrics::{register_int_counter_vec, IntCounter}; + use once_cell::sync::Lazy; + + use crate::tenant::{self, throttle::Metric}; + + pub(crate) struct TimelineGet { + wait_time: IntCounter, + count: IntCounter, + } + + pub(crate) static TIMELINE_GET: Lazy = Lazy::new(|| { + static WAIT_USECS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_wait_usecs_sum_global", + "Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.", + &["kind"] + ) + .unwrap() + }); + + static WAIT_COUNT: Lazy = Lazy::new(|| { + 
register_int_counter_vec!(
+                "pageserver_tenant_throttling_count_global",
+                "Count of tenant throttlings, by kind of throttle.",
+                &["kind"]
+            )
+            .unwrap()
+        });
+
+        let kind = "timeline_get";
+        TimelineGet {
+            wait_time: WAIT_USECS.with_label_values(&[kind]),
+            count: WAIT_COUNT.with_label_values(&[kind]),
+        }
+    });
+
+    impl Metric for &'static TimelineGet {
+        #[inline(always)]
+        fn observe_throttling(
+            &self,
+            tenant::throttle::Observation { wait_time }: &tenant::throttle::Observation,
+        ) {
+            let val = u64::try_from(wait_time.as_micros()).unwrap();
+            self.wait_time.inc_by(val);
+            self.count.inc();
+        }
+    }
+}
+
+pub(crate) mod disk_usage_based_eviction {
+    use super::*;
+
+    pub(crate) struct Metrics {
+        pub(crate) tenant_collection_time: Histogram,
+        pub(crate) tenant_layer_count: Histogram,
+        pub(crate) layers_collected: IntCounter,
+        pub(crate) layers_selected: IntCounter,
+        pub(crate) layers_evicted: IntCounter,
+    }
+
+    impl Default for Metrics {
+        fn default() -> Self {
+            let tenant_collection_time = register_histogram!(
+                "pageserver_disk_usage_based_eviction_tenant_collection_seconds",
+                "Time spent collecting layers from a tenant -- not normalized by collected layer amount",
+                vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
+            )
+            .unwrap();
+
+            let tenant_layer_count = register_histogram!(
+                "pageserver_disk_usage_based_eviction_tenant_collected_layers",
+                "Amount of layers gathered from a tenant",
+                vec![5.0, 50.0, 500.0, 5000.0, 50000.0]
+            )
+            .unwrap();
+
+            let layers_collected = register_int_counter!(
+                "pageserver_disk_usage_based_eviction_collected_layers_total",
+                "Amount of layers collected"
+            )
+            .unwrap();
+
+            let layers_selected = register_int_counter!(
+                "pageserver_disk_usage_based_eviction_select_layers_total",
+                "Amount of layers selected"
+            )
+            .unwrap();
+
+            let layers_evicted = register_int_counter!(
+                "pageserver_disk_usage_based_eviction_evicted_layers_total",
+                "Amount of layers successfully evicted"
+            )
+            .unwrap();
+
+            Self {
+                tenant_collection_time,
+                tenant_layer_count,
+                layers_collected,
+                layers_selected,
+                layers_evicted,
+            }
+        }
+    }
+
+    pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
+}
+
+static TOKIO_EXECUTOR_THREAD_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_tokio_executor_thread_configured_count",
+        "Total number of configured tokio executor threads in the process.
+         The `setup` label denotes whether we're running with multiple runtimes or a single runtime.",
+        &["setup"],
+    )
+    .unwrap()
+});
+
+pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
+    static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(());
+    let _guard = SERIALIZE.lock().unwrap();
+    TOKIO_EXECUTOR_THREAD_COUNT.reset();
+    TOKIO_EXECUTOR_THREAD_COUNT
+        .get_metric_with_label_values(&[setup])
+        .unwrap()
+        .set(u64::try_from(num_threads.get()).unwrap());
+}
+
 pub fn preinitialize_metrics() {
     // Python tests need these and on some we do alerting.
     //
     // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
     // order:
     // - global metrics reside in a Lazy
-    //   - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
+    //   - access via crate::metrics::PS_METRICS.some_metric.inc()
     // - could move the statics into TimelineMetrics::new()?
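The reason for forcing these `Lazy` statics at startup: a lazily registered metric is invisible to scrapes until its first use, so dashboards and alerts see a missing series rather than a zero. A minimal sketch of the same idea, with a hypothetical metric name and a simplified `Counter` standing in for the real prometheus types (assumes the `once_cell` crate):

```rust
use once_cell::sync::Lazy;

// Hypothetical stand-in for a prometheus counter: construction has a side
// effect (registration), so *when* the Lazy is first touched matters.
struct Counter {
    name: &'static str,
}

impl Counter {
    fn register(name: &'static str) -> Self {
        println!("registered {name}"); // would insert into the global registry
        Counter { name }
    }

    fn inc(&self) {
        println!("inc {}", self.name);
    }
}

static SLOW_PATH_COUNTER: Lazy<Counter> =
    Lazy::new(|| Counter::register("pageserver_example_slow_path_total"));

fn preinitialize_metrics() {
    // Force creation at startup so the series is exported as 0 right away,
    // instead of appearing only after the first increment.
    Lazy::force(&SLOW_PATH_COUNTER);
}

fn main() {
    preinitialize_metrics();
    SLOW_PATH_COUNTER.inc();
}
```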
// counters [ - &MATERIALIZED_PAGE_CACHE_HIT, - &MATERIALIZED_PAGE_CACHE_HIT_DIRECT, &UNEXPECTED_ONDEMAND_DOWNLOADS, &WALRECEIVER_STARTED_CONNECTIONS, &WALRECEIVER_BROKER_UPDATES, &WALRECEIVER_CANDIDATES_ADDED, &WALRECEIVER_CANDIDATES_REMOVED, + &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES, + &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES, + &REMOTE_ONDEMAND_DOWNLOADED_LAYERS, + &REMOTE_ONDEMAND_DOWNLOADED_BYTES, + &CIRCUIT_BREAKERS_BROKEN, + &CIRCUIT_BREAKERS_UNBROKEN, ] .into_iter() .for_each(|c| { @@ -2373,6 +3218,13 @@ pub fn preinitialize_metrics() { Lazy::force(&TENANT_MANAGER); Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS); + Lazy::force(&disk_usage_based_eviction::METRICS); + + for state_name in pageserver_api::models::TenantState::VARIANTS { + // initialize the metric for all gauges, otherwise the time series might seemingly show + // values from last restart. + TENANT_STATE_METRIC.with_label_values(&[state_name]).set(0); + } // countervecs [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT] @@ -2386,7 +3238,8 @@ pub fn preinitialize_metrics() { // histograms [ - &READ_NUM_FS_LAYERS, + &READ_NUM_LAYERS_VISITED, + &VEC_READ_NUM_LAYERS_VISITED, &WAIT_LSN_TIME, &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, @@ -2400,4 +3253,7 @@ pub fn preinitialize_metrics() { // Custom Lazy::force(&RECONSTRUCT_TIME); + Lazy::force(&tenant_throttling::TIMELINE_GET); + Lazy::force(&BASEBACKUP_QUERY_TIME); + Lazy::force(&COMPUTE_COMMANDS_COUNTERS); } diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 28d2584bf4..f386c825b8 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -17,7 +17,6 @@ //! //! Two types of pages are supported: //! -//! * **Materialized pages**, filled & used by page reconstruction //! * **Immutable File pages**, filled & used by [`crate::tenant::block_io`] and [`crate::tenant::ephemeral_file`]. //! //! Note that [`crate::tenant::ephemeral_file::EphemeralFile`] is generally mutable, but, it's append-only. @@ -28,9 +27,6 @@ //! Page cache maps from a cache key to a buffer slot. //! The cache key uniquely identifies the piece of data that is being cached. //! -//! The cache key for **materialized pages** is [`TenantShardId`], [`TimelineId`], [`Key`], and [`Lsn`]. -//! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access. -//! //! The cache key for **immutable file** pages is [`FileId`] and a block number. //! Users of page cache that wish to page-cache an arbitrary (immutable!) on-disk file do the following: //! * Have a mechanism to deterministically associate the on-disk file with a [`FileId`]. 
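A self-contained sketch of the file-to-`FileId` association mechanism the page cache documentation describes, mirroring the `next_file_id()` scheme with simplified `FileId`/`CacheKey` types:

```rust
use std::sync::atomic::{AtomicU64, Ordering};

// Every (immutable) file gets a process-unique id, and cache keys are
// (FileId, block number) -- no tenant/timeline/LSN in the key anymore.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct FileId(u64);

static NEXT_FILE_ID: AtomicU64 = AtomicU64::new(1);

fn next_file_id() -> FileId {
    FileId(NEXT_FILE_ID.fetch_add(1, Ordering::Relaxed))
}

#[derive(Debug, PartialEq, Eq, Hash)]
struct CacheKey {
    file_id: FileId,
    blkno: u32,
}

fn main() {
    let a = next_file_id();
    let b = next_file_id();
    assert_ne!(a, b);
    let key = CacheKey { file_id: a, blkno: 0 };
    println!("{key:?}");
}
```

Because the files are immutable, a `FileId` never needs invalidation: a new version of the data gets a new id, and stale cache entries simply age out.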
@@ -73,7 +69,6 @@ use std::{ collections::{hash_map::Entry, HashMap}, - convert::TryInto, sync::{ atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering}, Arc, Weak, @@ -83,13 +78,10 @@ use std::{ use anyhow::Context; use once_cell::sync::OnceCell; -use pageserver_api::shard::TenantShardId; -use utils::{id::TimelineId, lsn::Lsn}; use crate::{ context::RequestContext, metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics}, - repository::Key, }; static PAGE_CACHE: OnceCell = OnceCell::new(); @@ -140,33 +132,7 @@ pub fn next_file_id() -> FileId { #[derive(Debug, PartialEq, Eq, Clone)] #[allow(clippy::enum_variant_names)] enum CacheKey { - MaterializedPage { - hash_key: MaterializedPageHashKey, - lsn: Lsn, - }, - ImmutableFilePage { - file_id: FileId, - blkno: u32, - }, -} - -#[derive(Debug, PartialEq, Eq, Hash, Clone)] -struct MaterializedPageHashKey { - /// Why is this TenantShardId rather than TenantId? - /// - /// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant. However, this - /// this not the case for certain internally-generated pages (e.g. relation sizes). In future, we may make this - /// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are - /// special-cased in some other way. - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - key: Key, -} - -#[derive(Clone)] -struct Version { - lsn: Lsn, - slot_idx: usize, + ImmutableFilePage { file_id: FileId, blkno: u32 }, } struct Slot { @@ -237,17 +203,6 @@ impl SlotInner { } pub struct PageCache { - /// This contains the mapping from the cache key to buffer slot that currently - /// contains the page, if any. - /// - /// TODO: This is protected by a single lock. If that becomes a bottleneck, - /// this HashMap can be replaced with a more concurrent version, there are - /// plenty of such crates around. - /// - /// If you add support for caching different kinds of objects, each object kind - /// can have a separate mapping map, next to this field. - materialized_page_map: std::sync::RwLock>>, - immutable_page_map: std::sync::RwLock>, /// The actual buffers with their metadata. @@ -262,7 +217,9 @@ pub struct PageCache { size_metrics: &'static PageCacheSizeMetrics, } -struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit); +struct PinnedSlotsPermit { + _permit: tokio::sync::OwnedSemaphorePermit, +} /// /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked @@ -370,175 +327,14 @@ pub enum ReadBufResult<'a> { } impl PageCache { - // - // Section 1.1: Public interface functions for looking up and memorizing materialized page - // versions in the page cache - // - - /// Look up a materialized page version. - /// - /// The 'lsn' is an upper bound, this will return the latest version of - /// the given block, but not newer than 'lsn'. Returns the actual LSN of the - /// returned page. 
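The removed lookup implements these upper-bound semantics with a binary search over LSN-sorted versions. A minimal standalone sketch of that logic, using plain tuples instead of the real `Version` entries:

```rust
/// Given versions sorted by LSN, return the latest version at or below `lsn`,
/// mirroring the binary-search logic of the removed materialized-page lookup.
fn lookup_version(versions: &[(u64, &'static str)], lsn: u64) -> Option<(u64, &'static str)> {
    match versions.binary_search_by_key(&lsn, |v| v.0) {
        Ok(idx) => Some(versions[idx]),      // exact LSN match
        Err(0) => None,                      // every version is newer than `lsn`
        Err(idx) => Some(versions[idx - 1]), // latest version older than `lsn`
    }
}

fn main() {
    let versions = [(10, "v1"), (20, "v2"), (40, "v3")];
    assert_eq!(lookup_version(&versions, 20), Some((20, "v2")));
    assert_eq!(lookup_version(&versions, 25), Some((20, "v2")));
    assert_eq!(lookup_version(&versions, 5), None);
}
```

Returning the actual LSN alongside the page is what let callers distinguish an exact hit from an older-version hit in the removed metrics.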
- pub async fn lookup_materialized_page( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - key: &Key, - lsn: Lsn, - ctx: &RequestContext, - ) -> Option<(Lsn, PageReadGuard)> { - let Ok(permit) = self.try_get_pinned_slot_permit().await else { - return None; - }; - - crate::metrics::PAGE_CACHE - .for_ctx(ctx) - .read_accesses_materialized_page - .inc(); - - let mut cache_key = CacheKey::MaterializedPage { - hash_key: MaterializedPageHashKey { - tenant_shard_id, - timeline_id, - key: *key, - }, - lsn, - }; - - if let Some(guard) = self - .try_lock_for_read(&mut cache_key, &mut Some(permit)) - .await - { - if let CacheKey::MaterializedPage { - hash_key: _, - lsn: available_lsn, - } = cache_key - { - if available_lsn == lsn { - crate::metrics::PAGE_CACHE - .for_ctx(ctx) - .read_hits_materialized_page_exact - .inc(); - } else { - crate::metrics::PAGE_CACHE - .for_ctx(ctx) - .read_hits_materialized_page_older_lsn - .inc(); - } - Some((available_lsn, guard)) - } else { - panic!("unexpected key type in slot"); - } - } else { - None - } - } - - /// - /// Store an image of the given page in the cache. - /// - pub async fn memorize_materialized_page( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - key: Key, - lsn: Lsn, - img: &[u8], - ) -> anyhow::Result<()> { - let cache_key = CacheKey::MaterializedPage { - hash_key: MaterializedPageHashKey { - tenant_shard_id, - timeline_id, - key, - }, - lsn, - }; - - let mut permit = Some(self.try_get_pinned_slot_permit().await?); - loop { - // First check if the key already exists in the cache. - if let Some(slot_idx) = self.search_mapping_exact(&cache_key) { - // The page was found in the mapping. Lock the slot, and re-check - // that it's still what we expected (because we don't released the mapping - // lock already, another thread could have evicted the page) - let slot = &self.slots[slot_idx]; - let inner = slot.inner.write().await; - if inner.key.as_ref() == Some(&cache_key) { - slot.inc_usage_count(); - debug_assert!( - { - let guard = inner.permit.lock().unwrap(); - guard.upgrade().is_none() - }, - "we hold a write lock, so, no one else should have a permit" - ); - debug_assert_eq!(inner.buf.len(), img.len()); - // We already had it in cache. Another thread must've put it there - // concurrently. Check that it had the same contents that we - // replayed. - assert!(inner.buf == img); - return Ok(()); - } - } - debug_assert!(permit.is_some()); - - // Not found. Find a victim buffer - let (slot_idx, mut inner) = self - .find_victim(permit.as_ref().unwrap()) - .await - .context("Failed to find evict victim")?; - - // Insert mapping for this. At this point, we may find that another - // thread did the same thing concurrently. In that case, we evicted - // our victim buffer unnecessarily. Put it into the free list and - // continue with the slot that the other thread chose. - if let Some(_existing_slot_idx) = self.try_insert_mapping(&cache_key, slot_idx) { - // TODO: put to free list - - // We now just loop back to start from beginning. This is not - // optimal, we'll perform the lookup in the mapping again, which - // is not really necessary because we already got - // 'existing_slot_idx'. But this shouldn't happen often enough - // to matter much. - continue; - } - - // Make the slot ready - let slot = &self.slots[slot_idx]; - inner.key = Some(cache_key.clone()); - slot.set_usage_count(1); - // Create a write guard for the slot so we go through the expected motions. 
- debug_assert!( - { - let guard = inner.permit.lock().unwrap(); - guard.upgrade().is_none() - }, - "we hold a write lock, so, no one else should have a permit" - ); - let mut write_guard = PageWriteGuard { - state: PageWriteGuardState::Invalid { - _permit: permit.take().unwrap(), - inner, - }, - }; - write_guard.copy_from_slice(img); - let _ = write_guard.mark_valid(); - return Ok(()); - } - } - - // Section 1.2: Public interface functions for working with immutable file pages. - pub async fn read_immutable_buf( &self, file_id: FileId, blkno: u32, ctx: &RequestContext, ) -> anyhow::Result { - let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno }; - - self.lock_for_read(&mut cache_key, ctx).await + self.lock_for_read(&(CacheKey::ImmutableFilePage { file_id, blkno }), ctx) + .await } // @@ -558,9 +354,9 @@ impl PageCache { ) .await { - Ok(res) => Ok(PinnedSlotsPermit( - res.expect("this semaphore is never closed"), - )), + Ok(res) => Ok(PinnedSlotsPermit { + _permit: res.expect("this semaphore is never closed"), + }), Err(_timeout) => { crate::metrics::page_cache_errors_inc( crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout, @@ -572,19 +368,11 @@ impl PageCache { /// Look up a page in the cache. /// - /// If the search criteria is not exact, *cache_key is updated with the key - /// for exact key of the returned page. (For materialized pages, that means - /// that the LSN in 'cache_key' is updated with the LSN of the returned page - /// version.) - /// - /// If no page is found, returns None and *cache_key is left unmodified. - /// async fn try_lock_for_read( &self, - cache_key: &mut CacheKey, + cache_key: &CacheKey, permit: &mut Option, ) -> Option { - let cache_key_orig = cache_key.clone(); if let Some(slot_idx) = self.search_mapping(cache_key) { // The page was found in the mapping. Lock the slot, and re-check // that it's still what we expected (because we released the mapping @@ -597,9 +385,6 @@ impl PageCache { _permit: inner.coalesce_readers_permit(permit.take().unwrap()), slot_guard: inner, }); - } else { - // search_mapping might have modified the search key; restore it. - *cache_key = cache_key_orig; } } None @@ -636,15 +421,12 @@ impl PageCache { /// async fn lock_for_read( &self, - cache_key: &mut CacheKey, + cache_key: &CacheKey, ctx: &RequestContext, ) -> anyhow::Result { let mut permit = Some(self.try_get_pinned_slot_permit().await?); let (read_access, hit) = match cache_key { - CacheKey::MaterializedPage { .. } => { - unreachable!("Materialized pages use lookup_materialized_page") - } CacheKey::ImmutableFilePage { .. } => ( &crate::metrics::PAGE_CACHE .for_ctx(ctx) @@ -716,52 +498,15 @@ impl PageCache { /// Search for a page in the cache using the given search key. /// - /// Returns the slot index, if any. If the search criteria is not exact, - /// *cache_key is updated with the actual key of the found page. + /// Returns the slot index, if any. /// /// NOTE: We don't hold any lock on the mapping on return, so the slot might /// get recycled for an unrelated page immediately after this function /// returns. The caller is responsible for re-checking that the slot still /// contains the page with the same key before using it. 
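A minimal sketch of this lookup-then-revalidate protocol, with simplified single-key slots rather than the real buffer pool:

```rust
use std::collections::HashMap;
use std::sync::{Mutex, MutexGuard, RwLock};

// "Look up without holding the mapping lock, then re-check under the slot
// lock": the mapping may change between the lookup and locking the slot.
struct Slot {
    key: Mutex<Option<&'static str>>,
}

struct Cache {
    mapping: RwLock<HashMap<&'static str, usize>>,
    slots: Vec<Slot>,
}

impl Cache {
    fn try_lock_for_read(&self, key: &'static str) -> Option<MutexGuard<'_, Option<&'static str>>> {
        // The mapping lock is released as soon as we have the slot index...
        let slot_idx = *self.mapping.read().unwrap().get(key)?;
        let guard = self.slots[slot_idx].key.lock().unwrap();
        // ...so the slot may have been recycled in between: re-check the key.
        if *guard == Some(key) {
            Some(guard)
        } else {
            None
        }
    }
}

fn main() {
    let cache = Cache {
        mapping: RwLock::new(HashMap::from([("page-1", 0)])),
        slots: vec![Slot { key: Mutex::new(Some("page-1")) }],
    };
    assert!(cache.try_lock_for_read("page-1").is_some());
    assert!(cache.try_lock_for_read("page-2").is_none());
}
```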
/// - fn search_mapping(&self, cache_key: &mut CacheKey) -> Option { + fn search_mapping(&self, cache_key: &CacheKey) -> Option { match cache_key { - CacheKey::MaterializedPage { hash_key, lsn } => { - let map = self.materialized_page_map.read().unwrap(); - let versions = map.get(hash_key)?; - - let version_idx = match versions.binary_search_by_key(lsn, |v| v.lsn) { - Ok(version_idx) => version_idx, - Err(0) => return None, - Err(version_idx) => version_idx - 1, - }; - let version = &versions[version_idx]; - *lsn = version.lsn; - Some(version.slot_idx) - } - CacheKey::ImmutableFilePage { file_id, blkno } => { - let map = self.immutable_page_map.read().unwrap(); - Some(*map.get(&(*file_id, *blkno))?) - } - } - } - - /// Search for a page in the cache using the given search key. - /// - /// Like 'search_mapping, but performs an "exact" search. Used for - /// allocating a new buffer. - fn search_mapping_exact(&self, key: &CacheKey) -> Option { - match key { - CacheKey::MaterializedPage { hash_key, lsn } => { - let map = self.materialized_page_map.read().unwrap(); - let versions = map.get(hash_key)?; - - if let Ok(version_idx) = versions.binary_search_by_key(lsn, |v| v.lsn) { - Some(versions[version_idx].slot_idx) - } else { - None - } - } CacheKey::ImmutableFilePage { file_id, blkno } => { let map = self.immutable_page_map.read().unwrap(); Some(*map.get(&(*file_id, *blkno))?) @@ -774,27 +519,6 @@ impl PageCache { /// fn remove_mapping(&self, old_key: &CacheKey) { match old_key { - CacheKey::MaterializedPage { - hash_key: old_hash_key, - lsn: old_lsn, - } => { - let mut map = self.materialized_page_map.write().unwrap(); - if let Entry::Occupied(mut old_entry) = map.entry(old_hash_key.clone()) { - let versions = old_entry.get_mut(); - - if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) { - versions.remove(version_idx); - self.size_metrics - .current_bytes_materialized_page - .sub_page_sz(1); - if versions.is_empty() { - old_entry.remove_entry(); - } - } - } else { - panic!("could not find old key in mapping") - } - } CacheKey::ImmutableFilePage { file_id, blkno } => { let mut map = self.immutable_page_map.write().unwrap(); map.remove(&(*file_id, *blkno)) @@ -811,30 +535,6 @@ impl PageCache { /// of the existing mapping and leaves it untouched. 
fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option { match new_key { - CacheKey::MaterializedPage { - hash_key: new_key, - lsn: new_lsn, - } => { - let mut map = self.materialized_page_map.write().unwrap(); - let versions = map.entry(new_key.clone()).or_default(); - match versions.binary_search_by_key(new_lsn, |v| v.lsn) { - Ok(version_idx) => Some(versions[version_idx].slot_idx), - Err(version_idx) => { - versions.insert( - version_idx, - Version { - lsn: *new_lsn, - slot_idx, - }, - ); - self.size_metrics - .current_bytes_materialized_page - .add_page_sz(1); - None - } - } - } - CacheKey::ImmutableFilePage { file_id, blkno } => { let mut map = self.immutable_page_map.write().unwrap(); match map.entry((*file_id, *blkno)) { @@ -948,7 +648,6 @@ impl PageCache { let size_metrics = &crate::metrics::PAGE_CACHE_SIZE; size_metrics.max_bytes.set_page_sz(num_pages); size_metrics.current_bytes_immutable.set_page_sz(0); - size_metrics.current_bytes_materialized_page.set_page_sz(0); let slots = page_buffer .chunks_exact_mut(PAGE_SZ) @@ -967,7 +666,6 @@ impl PageCache { .collect(); Self { - materialized_page_map: Default::default(), immutable_page_map: Default::default(), slots, next_evict_slot: AtomicUsize::new(0), diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index a8a3487b4e..39c6a6fb74 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1,52 +1,36 @@ -// //! The Page Service listens for client connections and serves their GetPage@LSN //! requests. -// -// It is possible to connect here using usual psql/pgbench/libpq. Following -// commands are supported now: -// *status* -- show actual info about this pageserver, -// *pagestream* -- enter mode where smgr and pageserver talk with their -// custom protocol. 
-// use anyhow::Context; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; -use bytes::Bytes; -use futures::stream::FuturesUnordered; -use futures::Stream; -use futures::StreamExt; -use pageserver_api::key::Key; +use futures::FutureExt; +use once_cell::sync::OnceCell; use pageserver_api::models::TenantState; use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, - PagestreamNblocksRequest, PagestreamNblocksResponse, + PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, + PagestreamNblocksResponse, PagestreamProtocolVersion, }; -use pageserver_api::shard::ShardIndex; -use pageserver_api::shard::{ShardCount, ShardNumber}; -use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError}; +use pageserver_api::shard::TenantShardId; +use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError}; use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; use std::borrow::Cow; -use std::collections::HashMap; use std::io; -use std::net::TcpListener; -use std::pin::pin; use std::str; use std::str::FromStr; use std::sync::Arc; -use std::time::Duration; +use std::time::SystemTime; +use std::time::{Duration, Instant}; use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_util::io::StreamReader; +use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; -use tracing::field; use tracing::*; -use utils::id::ConnectionId; -use utils::sync::gate::GateGuard; use utils::{ auth::{Claims, Scope, SwappableJwtAuth}, id::{TenantId, TimelineId}, @@ -56,137 +40,151 @@ use utils::{ use crate::auth::check_permission; use crate::basebackup; +use crate::basebackup::BasebackupError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::import_datadir::import_wal_from_tar; use crate::metrics; -use crate::metrics::LIVE_CONNECTIONS_COUNT; +use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; use crate::pgdatadir_mapping::Version; -use crate::task_mgr; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::task_mgr::TaskKind; -use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::mgr; -use crate::tenant::mgr::get_active_tenant_with_timeout; -use crate::tenant::mgr::GetActiveTenantError; +use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME}; use crate::tenant::mgr::ShardSelector; -use crate::tenant::timeline::WaitLsnError; +use crate::tenant::mgr::TenantManager; +use crate::tenant::mgr::{GetActiveTenantError, GetTenantError, ShardResolveResult}; +use crate::tenant::timeline::{self, WaitLsnError}; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; -use crate::trace::Tracer; - use pageserver_api::key::rel_block_to_key; +use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; -// How long we may wait for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which -// is not yet in state [`TenantState::Active`]. 
+/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which +/// is not yet in state [`TenantState::Active`]. +/// +/// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); -/// Read the end of a tar archive. -/// -/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. -/// `tokio_tar` already read the first such block. Read the second all-zeros block, -/// and check that there is no more data after the EOF marker. -/// -/// XXX: Currently, any trailing data after the EOF marker prints a warning. -/// Perhaps it should be a hard error? -async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> { - use tokio::io::AsyncReadExt; - let mut buf = [0u8; 512]; +/////////////////////////////////////////////////////////////////////////////// - // Read the all-zeros block, and verify it - let mut total_bytes = 0; - while total_bytes < 512 { - let nbytes = reader.read(&mut buf[total_bytes..]).await?; - total_bytes += nbytes; - if nbytes == 0 { - break; - } - } - if total_bytes < 512 { - anyhow::bail!("incomplete or invalid tar EOF marker"); - } - if !buf.iter().all(|&x| x == 0) { - anyhow::bail!("invalid tar EOF marker"); - } - - // Drain any data after the EOF marker - let mut trailing_bytes = 0; - loop { - let nbytes = reader.read(&mut buf).await?; - trailing_bytes += nbytes; - if nbytes == 0 { - break; - } - } - if trailing_bytes > 0 { - warn!("ignored {trailing_bytes} unexpected bytes after the tar archive"); - } - Ok(()) +pub struct Listener { + cancel: CancellationToken, + /// Cancel the listener task through `listen_cancel` to shut down the listener + /// and get a handle on the existing connections. + task: JoinHandle, } -/////////////////////////////////////////////////////////////////////////////// +pub struct Connections { + cancel: CancellationToken, + tasks: tokio::task::JoinSet, +} + +pub fn spawn( + conf: &'static PageServerConf, + tenant_manager: Arc, + pg_auth: Option>, + tcp_listener: tokio::net::TcpListener, +) -> Listener { + let cancel = CancellationToken::new(); + let libpq_ctx = RequestContext::todo_child( + TaskKind::LibpqEndpointListener, + // listener task shouldn't need to download anything. (We will + // create a separate sub-contexts for each connection, with their + // own download behavior. This context is used only to listen and + // accept connections.) 
+ DownloadBehavior::Error, + ); + let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "libpq listener", + libpq_listener_main( + tenant_manager, + pg_auth, + tcp_listener, + conf.pg_auth_type, + libpq_ctx, + cancel.clone(), + ) + .map(anyhow::Ok), + )); + + Listener { cancel, task } +} + +impl Listener { + pub async fn stop_accepting(self) -> Connections { + self.cancel.cancel(); + self.task + .await + .expect("unreachable: we wrap the listener task in task_mgr::exit_on_panic_or_error") + } +} +impl Connections { + pub(crate) async fn shutdown(self) { + let Self { cancel, mut tasks } = self; + cancel.cancel(); + while let Some(res) = tasks.join_next().await { + Self::handle_connection_completion(res); + } + } + + fn handle_connection_completion(res: Result, tokio::task::JoinError>) { + match res { + Ok(Ok(())) => {} + Ok(Err(e)) => error!("error in page_service connection task: {:?}", e), + Err(e) => error!("page_service connection task panicked: {:?}", e), + } + } +} /// /// Main loop of the page service. /// /// Listens for connections, and launches a new handler task for each. /// +/// Returns Ok(()) upon cancellation via `cancel`, returning the set of +/// open connections. +/// pub async fn libpq_listener_main( - conf: &'static PageServerConf, - broker_client: storage_broker::BrokerClientChannel, + tenant_manager: Arc, auth: Option>, - listener: TcpListener, + listener: tokio::net::TcpListener, auth_type: AuthType, listener_ctx: RequestContext, - cancel: CancellationToken, -) -> anyhow::Result<()> { - listener.set_nonblocking(true)?; - let tokio_listener = tokio::net::TcpListener::from_std(listener)?; + listener_cancel: CancellationToken, +) -> Connections { + let connections_cancel = CancellationToken::new(); + let mut connection_handler_tasks = tokio::task::JoinSet::default(); - // Wait for a new connection to arrive, or for server shutdown. - while let Some(res) = tokio::select! { - biased; + loop { + let accepted = tokio::select! { + biased; + _ = listener_cancel.cancelled() => break, + next = connection_handler_tasks.join_next(), if !connection_handler_tasks.is_empty() => { + let res = next.expect("we dont poll while empty"); + Connections::handle_connection_completion(res); + continue; + } + accepted = listener.accept() => accepted, + }; - _ = cancel.cancelled() => { - // We were requested to shut down. - None - } - - res = tokio_listener.accept() => { - Some(res) - } - } { - match res { + match accepted { Ok((socket, peer_addr)) => { // Connection established. Spawn a new task to handle it. debug!("accepted connection from {}", peer_addr); let local_auth = auth.clone(); - let connection_ctx = listener_ctx .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download); - - // PageRequestHandler tasks are not associated with any particular - // timeline in the task manager. In practice most connections will - // only deal with a particular timeline, but we don't know which one - // yet. - task_mgr::spawn( - &tokio::runtime::Handle::current(), - TaskKind::PageRequestHandler, - None, - None, - "serving compute connection task", - false, - page_service_conn_main( - conf, - broker_client.clone(), - local_auth, - socket, - auth_type, - connection_ctx, - ), - ); + connection_handler_tasks.spawn(page_service_conn_main( + tenant_manager.clone(), + local_auth, + socket, + auth_type, + connection_ctx, + connections_cancel.child_token(), + )); } Err(err) => { // accept() failed. Log the error, and loop back to retry on next connection. 
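The listener/connection split above replaces task_mgr-spawned handlers with a plain `JoinSet`. A minimal sketch of the same accept-loop shape -- a hypothetical `accept_loop` helper with a trivial handler, assuming the `tokio` and `tokio-util` crates:

```rust
use tokio::net::TcpListener;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;

// Cancel-aware accept loop: reap finished connection tasks eagerly so the
// JoinSet does not accumulate results, and stop accepting on cancellation.
async fn accept_loop(listener: TcpListener, cancel: CancellationToken) -> JoinSet<()> {
    let mut tasks = JoinSet::new();
    loop {
        tokio::select! {
            biased;
            _ = cancel.cancelled() => break,
            Some(res) = tasks.join_next(), if !tasks.is_empty() => {
                if let Err(e) = res {
                    eprintln!("connection task panicked: {e:?}");
                }
            }
            accepted = listener.accept() => match accepted {
                Ok((socket, peer_addr)) => {
                    tasks.spawn(async move {
                        // ... serve the connection here ...
                        let _ = (socket, peer_addr);
                    });
                }
                // accept() failure: log and keep listening.
                Err(e) => eprintln!("accept failed: {e}"),
            },
        }
    }
    // The caller cancels and drains these to shut down open connections.
    tasks
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let listener = TcpListener::bind("127.0.0.1:0").await?;
    let cancel = CancellationToken::new();
    cancel.cancel(); // stop immediately; this just exercises the loop shape
    let mut tasks = accept_loop(listener, cancel).await;
    while tasks.join_next().await.is_some() {}
    Ok(())
}
```

Returning the `JoinSet` instead of detaching tasks is what lets shutdown first stop the accept loop and then drain each connection, mirroring `Listener::stop_accepting` and `Connections::shutdown` above.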
@@ -195,28 +193,28 @@ pub async fn libpq_listener_main( } } - debug!("page_service loop terminated"); + debug!("page_service listener loop terminated"); - Ok(()) + Connections { + cancel: connections_cancel, + tasks: connection_handler_tasks, + } } +type ConnectionHandlerResult = anyhow::Result<()>; + #[instrument(skip_all, fields(peer_addr))] async fn page_service_conn_main( - conf: &'static PageServerConf, - broker_client: storage_broker::BrokerClientChannel, + tenant_manager: Arc, auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, connection_ctx: RequestContext, -) -> anyhow::Result<()> { - // Immediately increment the gauge, then create a job to decrement it on task exit. - // One of the pros of `defer!` is that this will *most probably* - // get called, even in presence of panics. - let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]); - gauge.inc(); - scopeguard::defer! { - gauge.dec(); - } + cancel: CancellationToken, +) -> ConnectionHandlerResult { + let _guard = LIVE_CONNECTIONS + .with_label_values(&["page_service"]) + .guard(); socket .set_nodelay(true) @@ -255,17 +253,17 @@ async fn page_service_conn_main( socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms))); let socket = std::pin::pin!(socket); + fail::fail_point!("ps::connection-start::pre-login"); + // XXX: pgbackend.run() should take the connection_ctx, // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. - let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx); + let mut conn_handler = + PageServerHandler::new(tenant_manager, auth, connection_ctx, cancel.clone()); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; - match pgbackend - .run(&mut conn_handler, task_mgr::shutdown_watcher) - .await - { + match pgbackend.run(&mut conn_handler, &cancel).await { Ok(()) => { // we've been requested to shut down Ok(()) @@ -282,16 +280,7 @@ async fn page_service_conn_main( } } -/// While a handler holds a reference to a Timeline, it also holds a the -/// timeline's Gate open. -struct HandlerTimeline { - timeline: Arc, - _guard: GateGuard, -} - struct PageServerHandler { - _conf: &'static PageServerConf, - broker_client: storage_broker::BrokerClientChannel, auth: Option>, claims: Option, @@ -301,13 +290,144 @@ struct PageServerHandler { /// `process_query` creates a child context from this one. connection_ctx: RequestContext, - /// See [`Self::cache_timeline`] for usage. - /// + cancel: CancellationToken, + + timeline_handles: TimelineHandles, +} + +struct TimelineHandles { + wrapper: TenantManagerWrapper, /// Note on size: the typical size of this map is 1. The largest size we expect /// to see is the number of shards divided by the number of pageservers (typically < 2), /// or the ratio used when splitting shards (i.e. how many children created from one) /// parent shard, where a "large" number might be ~8. 
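The new `TimelineHandles` replaces the ad-hoc `shard_timelines` map. A sketch of how a request handler resolves a timeline through it (types as introduced in this diff; error handling abbreviated, function name illustrative):

```rust
// Sketch, not the diff's actual handler code.
async fn route_getpage_request(
    handles: &mut TimelineHandles,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    rel: RelTag,
    blkno: u32,
) -> Result<(), GetActiveTimelineError> {
    // Select the shard owning this block: a cheap map lookup on a cache hit,
    // a TenantManager resolution on a miss.
    let key = rel_block_to_key(rel, blkno);
    let timeline = handles
        .get(tenant_id, timeline_id, ShardSelector::Page(key))
        .await?;
    // The returned handle keeps the timeline's gate open until it is dropped,
    // so the timeline cannot finish shutting down mid-request.
    let _shard = timeline.get_shard_identity();
    Ok(())
}
```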
-    shard_timelines: HashMap<ShardIndex, HandlerTimeline>,
+    handles: timeline::handle::Cache<TenantManagerTypes>,
+}
+
+impl TimelineHandles {
+    fn new(tenant_manager: Arc<TenantManager>) -> Self {
+        Self {
+            wrapper: TenantManagerWrapper {
+                tenant_manager,
+                tenant_id: OnceCell::new(),
+            },
+            handles: Default::default(),
+        }
+    }
+    async fn get(
+        &mut self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_selector: ShardSelector,
+    ) -> Result<timeline::handle::Handle<TenantManagerTypes>, GetActiveTimelineError> {
+        if *self.wrapper.tenant_id.get_or_init(|| tenant_id) != tenant_id {
+            return Err(GetActiveTimelineError::Tenant(
+                GetActiveTenantError::SwitchedTenant,
+            ));
+        }
+        self.handles
+            .get(timeline_id, shard_selector, &self.wrapper)
+            .await
+            .map_err(|e| match e {
+                timeline::handle::GetError::TenantManager(e) => e,
+                timeline::handle::GetError::TimelineGateClosed => {
+                    trace!("timeline gate closed");
+                    GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
+                }
+                timeline::handle::GetError::PerTimelineStateShutDown => {
+                    trace!("per-timeline state shut down");
+                    GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
+                }
+            })
+    }
+}
+
+pub(crate) struct TenantManagerWrapper {
+    tenant_manager: Arc<TenantManager>,
+    // We do not support switching tenant_id on a connection at this point.
+    // We can add support for this later if needed without changing
+    // the protocol.
+    tenant_id: once_cell::sync::OnceCell<TenantId>,
+}
+
+#[derive(Debug)]
+pub(crate) struct TenantManagerTypes;
+
+impl timeline::handle::Types for TenantManagerTypes {
+    type TenantManagerError = GetActiveTimelineError;
+    type TenantManager = TenantManagerWrapper;
+    type Timeline = Arc<Timeline>;
+}
+
+impl timeline::handle::ArcTimeline<TenantManagerTypes> for Arc<Timeline> {
+    fn gate(&self) -> &utils::sync::gate::Gate {
+        &self.gate
+    }
+
+    fn shard_timeline_id(&self) -> timeline::handle::ShardTimelineId {
+        Timeline::shard_timeline_id(self)
+    }
+
+    fn per_timeline_state(&self) -> &timeline::handle::PerTimelineState<TenantManagerTypes> {
+        &self.handles
+    }
+
+    fn get_shard_identity(&self) -> &pageserver_api::shard::ShardIdentity {
+        Timeline::get_shard_identity(self)
+    }
+}
+
+impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrapper {
+    async fn resolve(
+        &self,
+        timeline_id: TimelineId,
+        shard_selector: ShardSelector,
+    ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
+        let tenant_id = self.tenant_id.get().expect("we set this in get()");
+        let timeout = ACTIVE_TENANT_TIMEOUT;
+        let wait_start = Instant::now();
+        let deadline = wait_start + timeout;
+        let tenant_shard = loop {
+            let resolved = self
+                .tenant_manager
+                .resolve_attached_shard(tenant_id, shard_selector);
+            match resolved {
+                ShardResolveResult::Found(tenant_shard) => break tenant_shard,
+                ShardResolveResult::NotFound => {
+                    return Err(GetActiveTimelineError::Tenant(
+                        GetActiveTenantError::NotFound(GetTenantError::NotFound(*tenant_id)),
+                    ));
+                }
+                ShardResolveResult::InProgress(barrier) => {
+                    // We can't authoritatively answer right now: wait for InProgress state
+                    // to end, then try again
+                    tokio::select!
{ + _ = barrier.wait() => { + // The barrier completed: proceed around the loop to try looking up again + }, + _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { + return Err(GetActiveTimelineError::Tenant(GetActiveTenantError::WaitForActiveTimeout { + latest_state: None, + wait_time: timeout, + })); + } + } + } + }; + }; + + tracing::debug!("Waiting for tenant to enter active state..."); + tenant_shard + .wait_to_become_active(deadline.duration_since(Instant::now())) + .await + .map_err(GetActiveTimelineError::Tenant)?; + + let timeline = tenant_shard + .get_timeline(timeline_id, true) + .map_err(GetActiveTimelineError::Timeline)?; + set_tracing_field_shard_id(&timeline); + Ok(timeline) + } } #[derive(thiserror::Error, Debug)] @@ -351,7 +471,11 @@ impl From for PageStreamError { impl From for PageStreamError { fn from(value: GetActiveTimelineError) -> Self { match value { - GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown, + GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) + | GetActiveTimelineError::Tenant(GetActiveTenantError::WillNotBecomeActive( + TenantState::Stopping { .. }, + )) + | GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) => Self::Shutdown, GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()), GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()), } @@ -363,76 +487,37 @@ impl From for PageStreamError { match value { e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e), WaitLsnError::Shutdown => Self::Shutdown, - WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()), + e @ WaitLsnError::BadState { .. } => Self::Reconnect(format!("{e}").into()), + } + } +} + +impl From for QueryError { + fn from(value: WaitLsnError) -> Self { + match value { + e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)), + WaitLsnError::Shutdown => Self::Shutdown, + WaitLsnError::BadState { .. } => Self::Reconnect, } } } impl PageServerHandler { pub fn new( - conf: &'static PageServerConf, - broker_client: storage_broker::BrokerClientChannel, + tenant_manager: Arc, auth: Option>, connection_ctx: RequestContext, + cancel: CancellationToken, ) -> Self { PageServerHandler { - _conf: conf, - broker_client, auth, claims: None, connection_ctx, - shard_timelines: HashMap::new(), + timeline_handles: TimelineHandles::new(tenant_manager), + cancel, } } - /// Future that completes when we need to shut down the connection. - /// - /// We currently need to shut down when any of the following happens: - /// 1. any of the timelines we hold GateGuards for in `shard_timelines` is cancelled - /// 2. task_mgr requests shutdown of the connection - /// - /// NB on (1): the connection's lifecycle is not actually tied to any of the - /// `shard_timelines`s' lifecycles. But it's _necessary_ in the current - /// implementation to be responsive to timeline cancellation because - /// the connection holds their `GateGuards` open (sored in `shard_timelines`). - /// We currently do the easy thing and terminate the connection if any of the - /// shard_timelines gets cancelled. But really, we cuold spend more effort - /// and simply remove the cancelled timeline from the `shard_timelines`, thereby - /// dropping the guard. - /// - /// NB: keep in sync with [`Self::is_connection_cancelled`] - async fn await_connection_cancelled(&self) { - // A short wait before we expend the cycles to walk our timeline map. 
This avoids incurring - // that cost every time we check for cancellation. - tokio::time::sleep(Duration::from_millis(10)).await; - - // This function is never called concurrently with code that adds timelines to shard_timelines, - // which is enforced by the borrow checker (the future returned by this function carries the - // immutable &self). So it's fine to evaluate shard_timelines after the sleep, we don't risk - // missing any inserts to the map. - - let mut cancellation_sources = Vec::with_capacity(1 + self.shard_timelines.len()); - use futures::future::Either; - cancellation_sources.push(Either::Left(task_mgr::shutdown_watcher())); - cancellation_sources.extend( - self.shard_timelines - .values() - .map(|ht| Either::Right(ht.timeline.cancel.cancelled())), - ); - FuturesUnordered::from_iter(cancellation_sources) - .next() - .await; - } - - /// Checking variant of [`Self::await_connection_cancelled`]. - fn is_connection_cancelled(&self) -> bool { - task_mgr::is_shutdown_requested() - || self - .shard_timelines - .values() - .any(|ht| ht.timeline.cancel.is_cancelled() || ht.timeline.is_stopping()) - } - /// This function always respects cancellation of any timeline in `[Self::shard_timelines]`. Pass in /// a cancellation token at the next scope up (such as a tenant cancellation token) to ensure we respect /// cancellation if there aren't any timelines in the cache. @@ -451,132 +536,56 @@ impl PageServerHandler { flush_r = pgb.flush() => { Ok(flush_r?) }, - _ = self.await_connection_cancelled() => { - Err(QueryError::Shutdown) - } _ = cancel.cancelled() => { Err(QueryError::Shutdown) } ) } - fn copyin_stream<'a, IO>( - &'a self, - pgb: &'a mut PostgresBackend, - cancel: &'a CancellationToken, - ) -> impl Stream> + 'a - where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, - { - async_stream::try_stream! { - loop { - let msg = tokio::select! { - biased; - - _ = cancel.cancelled() => { - // We were requested to shut down. 
- let msg = "pageserver is shutting down"; - let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None)); - Err(QueryError::Shutdown) - } - - msg = pgb.read_message() => { msg.map_err(QueryError::from)} - }; - - match msg { - Ok(Some(message)) => { - let copy_data_bytes = match message { - FeMessage::CopyData(bytes) => bytes, - FeMessage::CopyDone => { break }, - FeMessage::Sync => continue, - FeMessage::Terminate => { - let msg = "client terminated connection with Terminate message during COPY"; - let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg))); - // error can't happen here, ErrorResponse serialization should be always ok - pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; - Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; - break; - } - m => { - let msg = format!("unexpected message {m:?}"); - // error can't happen here, ErrorResponse serialization should be always ok - pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?; - Err(io::Error::new(io::ErrorKind::Other, msg))?; - break; - } - }; - - yield copy_data_bytes; - } - Ok(None) => { - let msg = "client closed connection during COPY"; - let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg))); - // error can't happen here, ErrorResponse serialization should be always ok - pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; - self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?; - Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; - } - Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => { - Err(io_error)?; - } - Err(other) => { - Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?; - } - }; - } - } - } - + /// Pagestream sub-protocol handler. + /// + /// It is a simple request-response protocol inside a COPYBOTH session. + /// + /// # Coding Discipline + /// + /// Coding discipline within this function: all interaction with the `pgb` connection + /// needs to be sensitive to connection shutdown, currently signalled via [`Self::cancel`]. + /// This is so that we can shutdown page_service quickly. #[instrument(skip_all)] async fn handle_pagerequests( &mut self, pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, + _protocol_version: PagestreamProtocolVersion, ctx: RequestContext, ) -> Result<(), QueryError> where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - debug_assert_current_span_has_tenant_and_timeline_id(); - - let tenant = mgr::get_active_tenant_with_timeout( - tenant_id, - ShardSelector::First, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await?; - - // Make request tracer if needed - let mut tracer = if tenant.get_trace_read_requests() { - let connection_id = ConnectionId::generate(); - let path = - tenant - .conf - .trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id); - Some(Tracer::new(path)) - } else { - None - }; + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); // switch client to COPYBOTH pgb.write_message_noflush(&BeMessage::CopyBothResponse)?; - self.flush_cancellable(pgb, &tenant.cancel).await?; + tokio::select! 
{ + biased; + _ = self.cancel.cancelled() => { + return Err(QueryError::Shutdown) + } + res = pgb.flush() => { + res?; + } + } loop { + // read request bytes (it's exactly 1 PagestreamFeMessage per CopyData) let msg = tokio::select! { biased; - - _ = self.await_connection_cancelled() => { - // We were requested to shut down. - info!("shutdown request received in page handler"); + _ = self.cancel.cancelled() => { return Err(QueryError::Shutdown) } - msg = pgb.read_message() => { msg } }; - let copy_data_bytes = match msg? { Some(FeMessage::CopyData(bytes)) => bytes, Some(FeMessage::Terminate) => break, @@ -589,20 +598,16 @@ impl PageServerHandler { }; trace!("query: {copy_data_bytes:?}"); + fail::fail_point!("ps::handle-pagerequest-message"); - // Trace request if needed - if let Some(t) = tracer.as_mut() { - t.trace(©_data_bytes) - } - + // parse request let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; - // TODO: We could create a new per-request context here, with unique ID. - // Currently we use the same per-timeline context for all requests - - let (response, span) = match neon_fe_msg { + // invoke handler function + let (handler_result, span) = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { - let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn); + fail::fail_point!("ps::handle-pagerequest-message::exists"); + let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); ( self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -611,7 +616,8 @@ impl PageServerHandler { ) } PagestreamFeMessage::Nblocks(req) => { - let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn); + fail::fail_point!("ps::handle-pagerequest-message::nblocks"); + let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); ( self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -620,7 +626,9 @@ impl PageServerHandler { ) } PagestreamFeMessage::GetPage(req) => { - let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn); + fail::fail_point!("ps::handle-pagerequest-message::getpage"); + // shard_id is filled in by the handler + let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn); ( self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -629,7 +637,8 @@ impl PageServerHandler { ) } PagestreamFeMessage::DbSize(req) => { - let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn); + fail::fail_point!("ps::handle-pagerequest-message::dbsize"); + let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); ( self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -637,33 +646,38 @@ impl PageServerHandler { span, ) } + PagestreamFeMessage::GetSlruSegment(req) => { + fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); + let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); + ( + self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx) + .instrument(span.clone()) + .await, + span, + ) + } }; - match response { - 
Err(PageStreamError::Shutdown) => { - // If we fail to fulfil a request during shutdown, which may be _because_ of - // shutdown, then do not send the error to the client. Instead just drop the - // connection. - span.in_scope(|| info!("dropping connection due to shutdown")); - return Err(QueryError::Shutdown); - } - Err(PageStreamError::Reconnect(reason)) => { - span.in_scope(|| info!("handler requested reconnect: {reason}")); - return Err(QueryError::Reconnect); - } - Err(e) if self.is_connection_cancelled() => { - // This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean - // shutdown error, this may be buried inside a PageReconstructError::Other for example. - // - // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet, - // because wait_lsn etc will drop out - // is_stopping(): [`Timeline::flush_and_shutdown`] has entered - // is_canceled(): [`Timeline::shutdown`]` has entered - span.in_scope(|| info!("dropped error response during shutdown: {e:#}")); - return Err(QueryError::Shutdown); - } - r => { - let response_msg = r.unwrap_or_else(|e| { + // Map handler result to protocol behavior. + // Some handler errors cause exit from pagestream protocol. + // Other handler errors are sent back as an error message and we stay in pagestream protocol. + let response_msg = match handler_result { + Err(e) => match &e { + PageStreamError::Shutdown => { + // If we fail to fulfil a request during shutdown, which may be _because_ of + // shutdown, then do not send the error to the client. Instead just drop the + // connection. + span.in_scope(|| info!("dropping connection due to shutdown")); + return Err(QueryError::Shutdown); + } + PageStreamError::Reconnect(reason) => { + span.in_scope(|| info!("handler requested reconnect: {reason}")); + return Err(QueryError::Reconnect); + } + PageStreamError::Read(_) + | PageStreamError::LsnTimeout(_) + | PageStreamError::NotFound(_) + | PageStreamError::BadRequest(_) => { // print the all details to the log with {:#}, but for the client the // error message is enough. Do not log if shutting down, as the anyhow::Error // here includes cancellation which is not an error. @@ -674,205 +688,160 @@ impl PageServerHandler { PagestreamBeMessage::Error(PagestreamErrorResponse { message: e.to_string(), }) - }); + } + }, + Ok(response_msg) => response_msg, + }; - pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; - self.flush_cancellable(pgb, &tenant.cancel).await?; + // marshal & transmit response message + pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; + tokio::select! { + biased; + _ = self.cancel.cancelled() => { + // We were requested to shut down. 
+ info!("shutdown request received in page handler"); + return Err(QueryError::Shutdown) + } + res = pgb.flush() => { + res?; } } } Ok(()) } - #[allow(clippy::too_many_arguments)] - #[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))] - async fn handle_import_basebackup( - &self, - pgb: &mut PostgresBackend, - tenant_id: TenantId, - timeline_id: TimelineId, - base_lsn: Lsn, - _end_lsn: Lsn, - pg_version: u32, - ctx: RequestContext, - ) -> Result<(), QueryError> - where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, - { - debug_assert_current_span_has_tenant_and_timeline_id(); - - // Create empty timeline - info!("creating new timeline"); - let tenant = get_active_tenant_with_timeout( - tenant_id, - ShardSelector::Zero, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await?; - let timeline = tenant - .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) - .await?; - - // TODO mark timeline as not ready until it reaches end_lsn. - // We might have some wal to import as well, and we should prevent compute - // from connecting before that and writing conflicting wal. - // - // This is not relevant for pageserver->pageserver migrations, since there's - // no wal to import. But should be fixed if we want to import from postgres. - - // TODO leave clean state on error. For now you can use detach to clean - // up broken state from a failed import. - - // Import basebackup provided via CopyData - info!("importing basebackup"); - pgb.write_message_noflush(&BeMessage::CopyInResponse)?; - self.flush_cancellable(pgb, &tenant.cancel).await?; - - let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel))); - timeline - .import_basebackup_from_tar( - &mut copyin_reader, - base_lsn, - self.broker_client.clone(), - &ctx, - ) - .await?; - - // Read the end of the tar archive. - read_tar_eof(copyin_reader).await?; - - // TODO check checksum - // Meanwhile you can verify client-side by taking fullbackup - // and checking that it matches in size with what was imported. - // It wouldn't work if base came from vanilla postgres though, - // since we discard some log files. - - info!("done"); - Ok(()) - } - - #[instrument(skip_all, fields(%start_lsn, %end_lsn))] - async fn handle_import_wal( - &self, - pgb: &mut PostgresBackend, - tenant_id: TenantId, - timeline_id: TimelineId, - start_lsn: Lsn, - end_lsn: Lsn, - ctx: RequestContext, - ) -> Result<(), QueryError> - where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, - { - debug_assert_current_span_has_tenant_and_timeline_id(); - - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - let last_record_lsn = timeline.get_last_record_lsn(); - if last_record_lsn != start_lsn { - return Err(QueryError::Other( - anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")) - ); - } - - // TODO leave clean state on error. For now you can use detach to clean - // up broken state from a failed import. - - // Import wal provided via CopyData - info!("importing wal"); - pgb.write_message_noflush(&BeMessage::CopyInResponse)?; - self.flush_cancellable(pgb, &timeline.cancel).await?; - let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel))); - import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?; - info!("wal import complete"); - - // Read the end of the tar archive. 
- read_tar_eof(copyin_reader).await?; - - // TODO Does it make sense to overshoot? - if timeline.get_last_record_lsn() < end_lsn { - return Err(QueryError::Other( - anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")) - ); - } - - // Flush data to disk, then upload to s3. No need for a forced checkpoint. - // We only want to persist the data, and it doesn't matter if it's in the - // shape of deltas or images. - info!("flushing layers"); - timeline.freeze_and_flush().await?; - - info!("done"); - Ok(()) - } - /// Helper function to handle the LSN from client request. /// /// Each GetPage (and Exists and Nblocks) request includes information about - /// which version of the page is being requested. The client can request the - /// latest version of the page, or the version that's valid at a particular - /// LSN. The primary compute node will always request the latest page - /// version, while a standby will request a version at the LSN that it's - /// currently caught up to. + /// which version of the page is being requested. The primary compute node + /// will always request the latest page version, by setting 'request_lsn' to + /// the last inserted or flushed WAL position, while a standby will request + /// a version at the LSN that it's currently caught up to. /// /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. + /// + /// In addition to the request LSN, each request carries another LSN, + /// 'not_modified_since', which is a hint to the pageserver that the client + /// knows that the page has not been modified between 'not_modified_since' + /// and the request LSN. This allows skipping the wait, as long as the WAL + /// up to 'not_modified_since' has arrived. If the client doesn't have any + /// information about when the page was modified, it will use + /// not_modified_since == lsn. If the client lies and sends a too low + /// not_modified_hint such that there are in fact later page versions, the + /// behavior is undefined: the pageserver may return any of the page versions + /// or an error. async fn wait_or_get_last_lsn( timeline: &Timeline, - mut lsn: Lsn, - latest: bool, + request_lsn: Lsn, + not_modified_since: Lsn, latest_gc_cutoff_lsn: &RcuReadGuard, ctx: &RequestContext, ) -> Result { - if latest { - // Latest page version was requested. If LSN is given, it is a hint - // to the page server that there have been no modifications to the - // page after that LSN. If we haven't received WAL up to that point, - // wait until it arrives. - let last_record_lsn = timeline.get_last_record_lsn(); + let last_record_lsn = timeline.get_last_record_lsn(); - // Note: this covers the special case that lsn == Lsn(0). That - // special case means "return the latest version whatever it is", - // and it's used for bootstrapping purposes, when the page server is - // connected directly to the compute node. That is needed because - // when you connect to the compute node, to receive the WAL, the - // walsender process will do a look up in the pg_authid catalog - // table for authentication. 
That poses a deadlock problem: the - // catalog table lookup will send a GetPage request, but the GetPage - // request will block in the page server because the recent WAL - // hasn't been received yet, and it cannot be received until the - // walsender completes the authentication and starts streaming the - // WAL. - if lsn <= last_record_lsn { - lsn = last_record_lsn; - } else { - timeline.wait_lsn(lsn, ctx).await?; - // Since we waited for 'lsn' to arrive, that is now the last - // record LSN. (Or close enough for our purposes; the - // last-record LSN can advance immediately after we return - // anyway) + // Sanity check the request + if request_lsn < not_modified_since { + return Err(PageStreamError::BadRequest( + format!( + "invalid request with request LSN {} and not_modified_since {}", + request_lsn, not_modified_since, + ) + .into(), + )); + } + + if request_lsn < **latest_gc_cutoff_lsn { + let gc_info = &timeline.gc_info.read().unwrap(); + if !gc_info.leases.contains_key(&request_lsn) { + // The requested LSN is below gc cutoff and is not guarded by a lease. + + // Check explicitly for INVALID just to get a less scary error message if the + // request is obviously bogus + return Err(if request_lsn == Lsn::INVALID { + PageStreamError::BadRequest("invalid LSN(0) in request".into()) + } else { + PageStreamError::BadRequest(format!( + "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", + request_lsn, **latest_gc_cutoff_lsn + ).into()) + }); } + } + + // Wait for WAL up to 'not_modified_since' to arrive, if necessary + if not_modified_since > last_record_lsn { + timeline + .wait_lsn( + not_modified_since, + crate::tenant::timeline::WaitLsnWaiter::PageService, + ctx, + ) + .await?; + // Since we waited for 'not_modified_since' to arrive, that is now the last + // record LSN. (Or close enough for our purposes; the last-record LSN can + // advance immediately after we return anyway) + Ok(not_modified_since) } else { - if lsn == Lsn(0) { - return Err(PageStreamError::BadRequest( - "invalid LSN(0) in request".into(), - )); - } - timeline.wait_lsn(lsn, ctx).await?; + // It might be better to use max(not_modified_since, latest_gc_cutoff_lsn) + // here instead. That would give the same result, since we know that there + // haven't been any modifications since 'not_modified_since'. Using an older + // LSN might be faster, because that could allow skipping recent layers when + // finding the page. However, we have historically used 'last_record_lsn', so + // stick to that for now. + Ok(std::cmp::min(last_record_lsn, request_lsn)) } - - if lsn < **latest_gc_cutoff_lsn { - return Err(PageStreamError::BadRequest(format!( - "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", - lsn, **latest_gc_cutoff_lsn - ).into())); - } - Ok(lsn) } + /// Handles the lsn lease request. + /// If a lease cannot be obtained, the client will receive NULL. 
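The `request_lsn` / `not_modified_since` contract described above boils down to three cases. A self-contained model of the decision (plain `u64` stands in for `Lsn`; the GC-cutoff and lease checks are elided):

```rust
// Simplified model of wait_or_get_last_lsn; not the actual pageserver code.
fn effective_lsn(
    request_lsn: u64,
    not_modified_since: u64,
    last_record_lsn: u64,
) -> Result<u64, String> {
    if request_lsn < not_modified_since {
        // Case 1: malformed request.
        return Err("request_lsn must be >= not_modified_since".into());
    }
    if not_modified_since > last_record_lsn {
        // Case 2: the real code first waits for WAL up to not_modified_since;
        // afterwards that position is (close enough to) the last record LSN.
        Ok(not_modified_since)
    } else {
        // Case 3: the page is unchanged since not_modified_since, so any LSN in
        // [not_modified_since, request_lsn] is valid; use the newest one known.
        Ok(std::cmp::min(last_record_lsn, request_lsn))
    }
}
```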
+ #[instrument(skip_all, fields(shard_id, %lsn))] + async fn handle_make_lsn_lease( + &mut self, + pgb: &mut PostgresBackend, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result<(), QueryError> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + { + let timeline = self + .timeline_handles + .get( + tenant_shard_id.tenant_id, + timeline_id, + ShardSelector::Known(tenant_shard_id.to_index()), + ) + .await?; + set_tracing_field_shard_id(&timeline); + + let lease = timeline + .make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx) + .inspect_err(|e| { + warn!("{e}"); + }) + .ok(); + let valid_until_str = lease.map(|l| { + l.valid_until + .duration_since(SystemTime::UNIX_EPOCH) + .expect("valid_until is earlier than UNIX_EPOCH") + .as_millis() + .to_string() + }); + let bytes = valid_until_str.as_ref().map(|x| x.as_bytes()); + + pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( + b"valid_until", + )]))? + .write_message_noflush(&BeMessage::DataRow(&[bytes]))?; + + Ok(()) + } + + #[instrument(skip_all, fields(shard_id))] async fn handle_get_rel_exists_request( &mut self, tenant_id: TenantId, @@ -880,18 +849,26 @@ impl PageServerHandler { req: &PagestreamExistsRequest, ctx: &RequestContext, ) -> Result { - let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetRelExists); + .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + &timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let exists = timeline - .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx) + .get_rel_exists(req.rel, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { @@ -899,6 +876,7 @@ impl PageServerHandler { })) } + #[instrument(skip_all, fields(shard_id))] async fn handle_get_nblocks_request( &mut self, tenant_id: TenantId, @@ -906,19 +884,27 @@ impl PageServerHandler { req: &PagestreamNblocksRequest, ctx: &RequestContext, ) -> Result { - let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetRelSize); + .start_timer(metrics::SmgrQueryType::GetRelSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + &timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let n_blocks = timeline - .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx) + .get_rel_size(req.rel, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { @@ -926,6 +912,7 @@ impl PageServerHandler { })) } + #[instrument(skip_all, fields(shard_id))] async fn handle_db_size_request( &mut self, tenant_id: TenantId, @@ -933,25 +920,27 @@ impl PageServerHandler { req: &PagestreamDbSizeRequest, ctx: 
&RequestContext, ) -> Result { - let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetDbSize); + .start_timer(metrics::SmgrQueryType::GetDbSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + &timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let total_blocks = timeline - .get_db_size( - DEFAULTTABLESPACE_OID, - req.dbnode, - Version::Lsn(lsn), - req.latest, - ctx, - ) + .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx) .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; @@ -960,122 +949,7 @@ impl PageServerHandler { })) } - /// For most getpage requests, we will already have a Timeline to serve the request: this function - /// looks up such a Timeline synchronously and without touching any global state. - fn get_cached_timeline_for_page( - &mut self, - req: &PagestreamGetPageRequest, - ) -> Result<&Arc, Key> { - let key = if let Some((first_idx, first_timeline)) = self.shard_timelines.iter().next() { - // Fastest path: single sharded case - if first_idx.shard_count < ShardCount(2) { - return Ok(&first_timeline.timeline); - } - - let key = rel_block_to_key(req.rel, req.blkno); - let shard_num = first_timeline - .timeline - .get_shard_identity() - .get_shard_number(&key); - - // Fast path: matched the first timeline in our local handler map. This case is common if - // only one shard per tenant is attached to this pageserver. - if first_timeline.timeline.get_shard_identity().number == shard_num { - return Ok(&first_timeline.timeline); - } - - let shard_index = ShardIndex { - shard_number: shard_num, - shard_count: first_timeline.timeline.get_shard_identity().count, - }; - - // Fast-ish path: timeline is in the connection handler's local cache - if let Some(found) = self.shard_timelines.get(&shard_index) { - return Ok(&found.timeline); - } - - key - } else { - rel_block_to_key(req.rel, req.blkno) - }; - - Err(key) - } - - /// Having looked up the [`Timeline`] instance for a particular shard, cache it to enable - /// use in future requests without having to traverse [`crate::tenant::mgr::TenantManager`] - /// again. - /// - /// Note that all the Timelines in this cache are for the same timeline_id: they're differ - /// in which shard they belong to. When we serve a getpage@lsn request, we choose a shard - /// based on key. - /// - /// The typical size of this cache is 1, as we generally create shards to distribute work - /// across pageservers, so don't tend to have multiple shards for the same tenant on the - /// same pageserver. 
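Both the removed per-connection cache and its replacement hinge on holding the timeline's gate while a request is in flight. The invariant, sketched standalone (it mirrors the `gate.enter()` call in the removed `cache_timeline` just below; the helper name is illustrative):

```rust
// Sketch of the gate-guard invariant; error mapping as in the removed code.
fn hold_timeline_gate(
    timeline: &Timeline,
) -> Result<utils::sync::gate::GateGuard, GetActiveTimelineError> {
    timeline
        .gate
        .enter() // fails once Timeline::shutdown() has closed the gate
        .map_err(|_| GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled))
}
// While the returned guard lives, Timeline::shutdown() blocks in gate.close(),
// so the timeline's resources stay valid for the duration of the request.
```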
- fn cache_timeline( - &mut self, - timeline: Arc, - ) -> Result<&Arc, GetActiveTimelineError> { - let gate_guard = timeline - .gate - .enter() - .map_err(|_| GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled))?; - - let shard_index = timeline.tenant_shard_id.to_index(); - let entry = self - .shard_timelines - .entry(shard_index) - .or_insert(HandlerTimeline { - timeline, - _guard: gate_guard, - }); - - Ok(&entry.timeline) - } - - /// If [`Self::get_cached_timeline_for_page`] missed, then this function is used to populate the cache with - /// a Timeline to serve requests for this key, if such a Timeline is present on this pageserver. If no such - /// Timeline is found, then we will return an error (this indicates that the client is talking to the wrong node). - async fn load_timeline_for_page( - &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, - key: Key, - ) -> anyhow::Result<&Arc, GetActiveTimelineError> { - // Slow path: we must call out to the TenantManager to find the timeline for this Key - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Page(key)) - .await?; - - self.cache_timeline(timeline) - } - - async fn get_timeline_shard_zero( - &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, - ) -> anyhow::Result<&Arc, GetActiveTimelineError> { - // This is a borrow-checker workaround: we can't return from inside of the `if let Some` because - // that would be an immutable-borrow-self return, whereas later in the function we will use a mutable - // ref to salf. So instead, we first build a bool, and then return while not borrowing self. - let have_cached = if let Some((idx, _tl)) = self.shard_timelines.iter().next() { - idx.shard_number == ShardNumber(0) - } else { - false - }; - - if have_cached { - let entry = self.shard_timelines.iter().next().unwrap(); - Ok(&entry.1.timeline) - } else { - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - Ok(self.cache_timeline(timeline)?) - } - } - + #[instrument(skip_all, fields(shard_id))] async fn handle_get_page_at_lsn_request( &mut self, tenant_id: TenantId, @@ -1083,43 +957,48 @@ impl PageServerHandler { req: &PagestreamGetPageRequest, ctx: &RequestContext, ) -> Result { - let timeline = match self.get_cached_timeline_for_page(req) { + let timeline = match self + .timeline_handles + .get( + tenant_id, + timeline_id, + ShardSelector::Page(rel_block_to_key(req.rel, req.blkno)), + ) + .await + { Ok(tl) => tl, - Err(key) => { - match self - .load_timeline_for_page(tenant_id, timeline_id, key) - .await - { - Ok(t) => t, - Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { - // We already know this tenant exists in general, because we resolved it at - // start of connection. Getting a NotFound here indicates that the shard containing - // the requested page is not present on this node: the client's knowledge of shard->pageserver - // mapping is out of date. - // - // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via - // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration - // and talk to a different pageserver. 
- return Err(PageStreamError::Reconnect( - "getpage@lsn request routed to wrong shard".into(), - )); - } - Err(e) => return Err(e.into()), - } + Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { + // We already know this tenant exists in general, because we resolved it at + // start of connection. Getting a NotFound here indicates that the shard containing + // the requested page is not present on this node: the client's knowledge of shard->pageserver + // mapping is out of date. + // + // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via + // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration + // and talk to a different pageserver. + return Err(PageStreamError::Reconnect( + "getpage@lsn request routed to wrong shard".into(), + )); } + Err(e) => return Err(e.into()), }; let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetPageAtLsn); + .start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + &timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let page = timeline - .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx) + .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { @@ -1127,8 +1006,57 @@ impl PageServerHandler { })) } + #[instrument(skip_all, fields(shard_id))] + async fn handle_get_slru_segment_request( + &mut self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &PagestreamGetSlruSegmentRequest, + ctx: &RequestContext, + ) -> Result { + let timeline = self + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; + + let _timer = timeline + .query_metrics + .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx); + + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let lsn = Self::wait_or_get_last_lsn( + &timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; + + let kind = SlruKind::from_repr(req.kind) + .ok_or(PageStreamError::BadRequest("invalid SLRU kind".into()))?; + let segment = timeline.get_slru_segment(kind, req.segno, lsn, ctx).await?; + + Ok(PagestreamBeMessage::GetSlruSegment( + PagestreamGetSlruSegmentResponse { segment }, + )) + } + + /// Note on "fullbackup": + /// Full basebackups should only be used for debugging purposes. + /// Originally, it was introduced to enable breaking storage format changes, + /// but that is not applicable anymore. + /// + /// # Coding Discipline + /// + /// Coding discipline within this function: all interaction with the `pgb` connection + /// needs to be sensitive to connection shutdown, currently signalled via [`Self::cancel`]. + /// This is so that we can shutdown page_service quickly. + /// + /// TODO: wrap the pgb that we pass to the basebackup handler so that it's sensitive + /// to connection cancellation. 
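One way the TODO above could go, sketched as a free helper rather than a full `AsyncWrite` wrapper (illustrative; `write_all_or_shutdown` is not part of this diff):

```rust
use tokio::io::{AsyncWrite, AsyncWriteExt};
use tokio_util::sync::CancellationToken;

// Race each write against connection shutdown so a stuck client cannot
// stall pageserver shutdown. Hypothetical helper for illustration.
async fn write_all_or_shutdown<W: AsyncWrite + Unpin>(
    w: &mut W,
    buf: &[u8],
    cancel: &CancellationToken,
) -> std::io::Result<()> {
    tokio::select! {
        biased; // prefer observing cancellation over starting another write
        _ = cancel.cancelled() => Err(std::io::Error::new(
            std::io::ErrorKind::Other,
            "page_service connection shutting down",
        )),
        res = w.write_all(buf) => res,
    }
}
```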
#[allow(clippy::too_many_arguments)] - #[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))] + #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))] async fn handle_basebackup_request( &mut self, pgb: &mut PostgresBackend, @@ -1138,24 +1066,36 @@ impl PageServerHandler { prev_lsn: Option, full_backup: bool, gzip: bool, - ctx: RequestContext, - ) -> anyhow::Result<()> + ctx: &RequestContext, + ) -> Result<(), QueryError> where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - debug_assert_current_span_has_tenant_and_timeline_id(); + fn map_basebackup_error(err: BasebackupError) -> QueryError { + match err { + BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)), + BasebackupError::Server(e) => QueryError::Other(e), + } + } let started = std::time::Instant::now(); - // check that the timeline exists let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) + .timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", lsn); - timeline.wait_lsn(lsn, &ctx).await?; + timeline + .wait_lsn( + lsn, + crate::tenant::timeline::WaitLsnWaiter::PageService, + ctx, + ) + .await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; @@ -1164,8 +1104,9 @@ impl PageServerHandler { let lsn_awaited_after = started.elapsed(); // switch client to COPYOUT - pgb.write_message_noflush(&BeMessage::CopyOutResponse)?; - self.flush_cancellable(pgb, &timeline.cancel).await?; + pgb.write_message_noflush(&BeMessage::CopyOutResponse) + .map_err(QueryError::Disconnected)?; + self.flush_cancellable(pgb, &self.cancel).await?; // Send a tarball of the latest layer on the timeline. Compress if not // fullbackup. TODO Compress in that case too (tests need to be updated) @@ -1177,9 +1118,10 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; } else { let mut writer = pgb.copyout_writer(); if gzip { @@ -1198,11 +1140,15 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; // shutdown the encoder to ensure the gzip footer is written - encoder.shutdown().await?; + encoder + .shutdown() + .await + .map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?; } else { basebackup::send_basebackup_tarball( &mut writer, @@ -1210,13 +1156,15 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; } } - pgb.write_message_noflush(&BeMessage::CopyDone)?; + pgb.write_message_noflush(&BeMessage::CopyDone) + .map_err(QueryError::Disconnected)?; self.flush_cancellable(pgb, &timeline.cancel).await?; let basebackup_after = started @@ -1249,25 +1197,6 @@ impl PageServerHandler { .expect("claims presence already checked"); check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0)) } - - /// Shorthand for getting a reference to a Timeline of an Active tenant. 
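For reference, the gzip branch above in miniature: the tarball is streamed through `async-compression`'s `GzipEncoder`, and `shutdown()` must run or the gzip footer is never emitted. The compression level here is an assumption; the encoder's construction is outside this hunk:

```rust
use async_compression::tokio::write::GzipEncoder;
use tokio::io::{AsyncWrite, AsyncWriteExt};

// Sketch: compress a payload onto any AsyncWrite sink.
async fn gzip_stream(
    mut out: impl AsyncWrite + Unpin,
    payload: &[u8],
) -> std::io::Result<()> {
    // Level::Fastest is an assumed choice; basebackup favors speed over ratio.
    let mut enc = GzipEncoder::with_quality(&mut out, async_compression::Level::Fastest);
    enc.write_all(payload).await?;
    // Without shutdown() the stream is truncated from the client's perspective.
    enc.shutdown().await?;
    Ok(())
}
```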
- async fn get_active_tenant_timeline( - &self, - tenant_id: TenantId, - timeline_id: TimelineId, - selector: ShardSelector, - ) -> Result, GetActiveTimelineError> { - let tenant = get_active_tenant_with_timeout( - tenant_id, - selector, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await - .map_err(GetActiveTimelineError::Tenant)?; - let timeline = tenant.get_timeline(timeline_id, true)?; - Ok(timeline) - } } #[async_trait::async_trait] @@ -1309,6 +1238,7 @@ where _pgb: &mut PostgresBackend, _sm: &FeStartupPacket, ) -> Result<(), QueryError> { + fail::fail_point!("ps::connection-start::startup-packet"); Ok(()) } @@ -1323,11 +1253,12 @@ where Err(QueryError::SimulatedConnectionError) }); + fail::fail_point!("ps::connection-start::process-query"); + let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); - if query_string.starts_with("pagestream ") { - let (_, params_raw) = query_string.split_at("pagestream ".len()); - let params = params_raw.split(' ').collect::>(); + let parts = query_string.split_whitespace().collect::>(); + if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) { if params.len() != 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for pagestream command" @@ -1344,12 +1275,19 @@ where self.check_permission(Some(tenant_id))?; - self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx) - .await?; - } else if query_string.starts_with("basebackup ") { - let (_, params_raw) = query_string.split_at("basebackup ".len()); - let params = params_raw.split_whitespace().collect::>(); + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::PageStreamV2) + .inc(); + self.handle_pagerequests( + pgb, + tenant_id, + timeline_id, + PagestreamProtocolVersion::V2, + ctx, + ) + .await?; + } else if let Some(params) = parts.strip_prefix(&["basebackup"]) { if params.len() < 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for basebackup command" @@ -1367,90 +1305,51 @@ where self.check_permission(Some(tenant_id))?; - let lsn = if params.len() >= 3 { + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::Basebackup) + .inc(); + + let lsn = if let Some(lsn_str) = params.get(2) { Some( - Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?, + Lsn::from_str(lsn_str) + .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?, ) } else { None }; - let gzip = if params.len() >= 4 { - if params[3] == "--gzip" { - true - } else { + let gzip = match params.get(3) { + Some(&"--gzip") => true, + None => false, + Some(third_param) => { return Err(QueryError::Other(anyhow::anyhow!( - "Parameter in position 3 unknown {}", - params[3], - ))); + "Parameter in position 3 unknown {third_param}", + ))) } - } else { - false }; - ::metrics::metric_vec_duration::observe_async_block_duration_by_result( - &*metrics::BASEBACKUP_QUERY_TIME, - async move { - self.handle_basebackup_request( - pgb, - tenant_id, - timeline_id, - lsn, - None, - false, - gzip, - ctx, - ) - .await?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - anyhow::Ok(()) - }, - ) - .await?; - } - // return pair of prev_lsn and last_lsn - else if query_string.starts_with("get_last_record_rlsn ") { - let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len()); - let params = params_raw.split_whitespace().collect::>(); - - if params.len() != 2 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for get_last_record_rlsn command" 
- ))); - } - - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) + let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx); + let res = async { + self.handle_basebackup_request( + pgb, + tenant_id, + timeline_id, + lsn, + None, + false, + gzip, + &ctx, + ) .await?; - - let end_of_timeline = timeline.get_last_record_rlsn(); - - pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::text_col(b"prev_lsn"), - RowDescriptor::text_col(b"last_lsn"), - ]))? - .write_message_noflush(&BeMessage::DataRow(&[ - Some(end_of_timeline.prev.to_string().as_bytes()), - Some(end_of_timeline.last.to_string().as_bytes()), - ]))? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + Result::<(), QueryError>::Ok(()) + } + .await; + metric_recording.observe(&res); + res?; } // same as basebackup, but result includes relational data as well - else if query_string.starts_with("fullbackup ") { - let (_, params_raw) = query_string.split_at("fullbackup ".len()); - let params = params_raw.split_whitespace().collect::>(); - + else if let Some(params) = parts.strip_prefix(&["fullbackup"]) { if params.len() < 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for fullbackup command" @@ -1467,18 +1366,18 @@ where .record("timeline_id", field::display(timeline_id)); // The caller is responsible for providing correct lsn and prev_lsn. - let lsn = if params.len() > 2 { + let lsn = if let Some(lsn_str) = params.get(2) { Some( - Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?, + Lsn::from_str(lsn_str) + .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?, ) } else { None }; - let prev_lsn = if params.len() > 3 { + let prev_lsn = if let Some(prev_lsn_str) = params.get(3) { Some( - Lsn::from_str(params[3]) - .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?, + Lsn::from_str(prev_lsn_str) + .with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?, ) } else { None @@ -1486,6 +1385,10 @@ where self.check_permission(Some(tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::Fullbackup) + .inc(); + // Check that the timeline exists self.handle_basebackup_request( pgb, @@ -1495,169 +1398,55 @@ where prev_lsn, true, false, - ctx, + &ctx, ) .await?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("import basebackup ") { - // Import the `base` section (everything but the wal) of a basebackup. - // Assumes the tenant already exists on this pageserver. - // - // Files are scheduled to be persisted to remote storage, and the - // caller should poll the http api to check when that is done. - // - // Example import command: - // 1. Get start/end LSN from backup_manifest file - // 2. 
Run: - // cat my_backup/base.tar | psql -h $PAGESERVER \ - // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION" - let (_, params_raw) = query_string.split_at("import basebackup ".len()); - let params = params_raw.split_whitespace().collect::>(); - if params.len() != 5 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for import basebackup command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - let base_lsn = Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; - let end_lsn = Lsn::from_str(params[3]) - .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?; - let pg_version = u32::from_str(params[4]) - .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - match self - .handle_import_basebackup( - pgb, - tenant_id, - timeline_id, - base_lsn, - end_lsn, - pg_version, - ctx, - ) - .await - { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, - Err(e) => { - error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))? - } - }; - } else if query_string.starts_with("import wal ") { - // Import the `pg_wal` section of a basebackup. - // - // Files are scheduled to be persisted to remote storage, and the - // caller should poll the http api to check when that is done. - let (_, params_raw) = query_string.split_at("import wal ".len()); - let params = params_raw.split_whitespace().collect::>(); - if params.len() != 4 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for import wal command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - let start_lsn = Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; - let end_lsn = Lsn::from_str(params[3]) - .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - match self - .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx) - .await - { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, - Err(e) => { - error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))? 
- } - }; } else if query_string.to_ascii_lowercase().starts_with("set ") { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("show ") { - // show - let (_, params_raw) = query_string.split_at("show ".len()); - let params = params_raw.split(' ').collect::>(); - if params.len() != 1 { + } else if query_string.starts_with("lease lsn ") { + let params = &parts[2..]; + if params.len() != 3 { return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for config command" + "invalid param number {} for lease lsn command", + params.len() ))); } - let tenant_id = TenantId::from_str(params[0]) + + let tenant_shard_id = TenantShardId::from_str(params[0]) .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - tracing::Span::current().record("tenant_id", field::display(tenant_id)); + tracing::Span::current() + .record("tenant_id", field::display(tenant_shard_id)) + .record("timeline_id", field::display(timeline_id)); - self.check_permission(Some(tenant_id))?; + self.check_permission(Some(tenant_shard_id.tenant_id))?; - let tenant = get_active_tenant_with_timeout( - tenant_id, - ShardSelector::Zero, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await?; - pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::int8_col(b"checkpoint_distance"), - RowDescriptor::int8_col(b"checkpoint_timeout"), - RowDescriptor::int8_col(b"compaction_target_size"), - RowDescriptor::int8_col(b"compaction_period"), - RowDescriptor::int8_col(b"compaction_threshold"), - RowDescriptor::int8_col(b"gc_horizon"), - RowDescriptor::int8_col(b"gc_period"), - RowDescriptor::int8_col(b"image_creation_threshold"), - RowDescriptor::int8_col(b"pitr_interval"), - ]))? - .write_message_noflush(&BeMessage::DataRow(&[ - Some(tenant.get_checkpoint_distance().to_string().as_bytes()), - Some( - tenant - .get_checkpoint_timeout() - .as_secs() - .to_string() - .as_bytes(), - ), - Some(tenant.get_compaction_target_size().to_string().as_bytes()), - Some( - tenant - .get_compaction_period() - .as_secs() - .to_string() - .as_bytes(), - ), - Some(tenant.get_compaction_threshold().to_string().as_bytes()), - Some(tenant.get_gc_horizon().to_string().as_bytes()), - Some(tenant.get_gc_period().as_secs().to_string().as_bytes()), - Some(tenant.get_image_creation_threshold().to_string().as_bytes()), - Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()), - ]))? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::LeaseLsn) + .inc(); + + // The caller is responsible for providing correct lsn. + let lsn = Lsn::from_str(params[2]) + .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; + + match self + .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx) + .await + { + Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, + Err(e) => { + error!("error obtaining lsn lease for {lsn}: {e:?}"); + pgb.write_message_noflush(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))? 
+                }
+            };
         } else {
             return Err(QueryError::Other(anyhow::anyhow!(
                 "unknown command {query_string}"
@@ -1678,13 +1467,14 @@ impl From<GetActiveTenantError> for QueryError {
                 | GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
                     QueryError::Shutdown
                 }
+                e @ GetActiveTenantError::NotFound(_) => QueryError::NotFound(format!("{e}").into()),
                 e => QueryError::Other(anyhow::anyhow!(e)),
             }
         }
     }
 
 #[derive(Debug, thiserror::Error)]
-enum GetActiveTimelineError {
+pub(crate) enum GetActiveTimelineError {
     #[error(transparent)]
     Tenant(GetActiveTenantError),
     #[error(transparent)]
@@ -1700,3 +1490,12 @@ impl From<GetActiveTimelineError> for QueryError {
         }
     }
 }
+
+fn set_tracing_field_shard_id(timeline: &Timeline) {
+    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
+    tracing::Span::current().record(
+        "shard_id",
+        tracing::field::display(timeline.tenant_shard_id.shard_slug()),
+    );
+    debug_assert_current_span_has_tenant_and_timeline_id();
+}
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index b65fe1eddd..808d4b666e 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -9,30 +9,41 @@
 use super::tenant::{PageReconstructError, Timeline};
 use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
-use crate::repository::*;
+use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::walrecord::NeonWalRecord;
+use crate::{aux_file, repository::*};
 use anyhow::{ensure, Context};
-use bytes::{Buf, Bytes};
+use bytes::{Buf, Bytes, BytesMut};
+use enum_map::Enum;
 use pageserver_api::key::{
-    dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
-    rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
+    dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
+    relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
     slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
-    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
+    CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
 };
+use pageserver_api::keyspace::SparseKeySpace;
+use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
-use postgres_ffi::{Oid, TimestampTz, TransactionId};
+use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::ControlFlow;
 use std::ops::Range;
 use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, trace, warn};
+use tracing::{debug, info, trace, warn};
 use utils::bin_ser::DeserializeError;
+use utils::pausable_failpoint;
 use utils::{bin_ser::BeSer, lsn::Lsn};
 
+/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
+pub const MAX_AUX_FILE_DELTAS: usize = 1024;
+
+/// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached.
+pub const MAX_AUX_FILE_V2_DELTAS: usize = 64; + #[derive(Debug)] pub enum LsnForTimestamp { /// Found commits both before and after the given timestamp @@ -66,11 +77,19 @@ pub enum LsnForTimestamp { } #[derive(Debug, thiserror::Error)] -pub enum CalculateLogicalSizeError { +pub(crate) enum CalculateLogicalSizeError { #[error("cancelled")] Cancelled, + + /// Something went wrong while reading the metadata we use to calculate logical size + /// Note that cancellation variants of `PageReconstructError` are transformed to [`Self::Cancelled`] + /// in the `From` implementation for this variant. #[error(transparent)] - Other(#[from] anyhow::Error), + PageRead(PageReconstructError), + + /// Something went wrong deserializing metadata that we read to calculate logical size + #[error("decode error: {0}")] + Decode(#[from] DeserializeError), } #[derive(Debug, thiserror::Error)] @@ -95,10 +114,8 @@ impl From for CollectKeySpaceError { impl From for CalculateLogicalSizeError { fn from(pre: PageReconstructError) -> Self { match pre { - PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => { - Self::Cancelled - } - _ => Self::Other(pre.into()), + PageReconstructError::Cancelled => Self::Cancelled, + _ => Self::PageRead(pre), } } } @@ -151,9 +168,13 @@ impl Timeline { DatadirModification { tline: self, pending_lsns: Vec::new(), - pending_updates: HashMap::new(), + pending_metadata_pages: HashMap::new(), + pending_data_pages: Vec::new(), + pending_zero_data_pages: Default::default(), pending_deletions: Vec::new(), pending_nblocks: 0, + pending_directory_entries: Vec::new(), + pending_bytes: 0, lsn, } } @@ -168,7 +189,6 @@ impl Timeline { tag: RelTag, blknum: BlockNumber, version: Version<'_>, - latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -177,7 +197,7 @@ impl Timeline { )); } - let nblocks = self.get_rel_size(tag, version, latest, ctx).await?; + let nblocks = self.get_rel_size(tag, version, ctx).await?; if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", @@ -199,7 +219,6 @@ impl Timeline { spcnode: Oid, dbnode: Oid, version: Version<'_>, - latest: bool, ctx: &RequestContext, ) -> Result { let mut total_blocks = 0; @@ -207,7 +226,7 @@ impl Timeline { let rels = self.list_rels(spcnode, dbnode, version, ctx).await?; for rel in rels { - let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?; + let n_blocks = self.get_rel_size(rel, version, ctx).await?; total_blocks += n_blocks as usize; } Ok(total_blocks) @@ -218,7 +237,6 @@ impl Timeline { &self, tag: RelTag, version: Version<'_>, - latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -232,7 +250,7 @@ impl Timeline { } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, version, latest, ctx).await? + && !self.get_rel_exists(tag, version, ctx).await? { // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, @@ -245,16 +263,8 @@ impl Timeline { let mut buf = version.get(self, key, ctx).await?; let nblocks = buf.get_u32_le(); - if latest { - // Update relation size cache only if "latest" flag is set. - // This flag is set by compute when it is working with most recent version of relation. - // Typically master compute node always set latest=true. 
-            // Please notice, that even if compute node "by mistake" specifies old LSN but set
-            // latest=true, then it can not cause cache corruption, because with latest=true
-            // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be
-            // associated with most recent value of LSN.
-            self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
-        }
+        self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
+
         Ok(nblocks)
     }
 
@@ -263,7 +273,6 @@ impl Timeline {
         &self,
         tag: RelTag,
         version: Version<'_>,
-        _latest: bool,
         ctx: &RequestContext,
     ) -> Result<bool, PageReconstructError> {
         if tag.relnode == 0 {
@@ -276,17 +285,19 @@
         if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
             return Ok(true);
         }
+        // then check if the database was already initialized.
+        // get_rel_exists can be called before dbdir is created.
+        let buf = version.get(self, DBDIR_KEY, ctx).await?;
+        let dbdirs = DbDirectory::des(&buf)?.dbdirs;
+        if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) {
+            return Ok(false);
+        }
         // fetch directory listing
         let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
         let buf = version.get(self, key, ctx).await?;
 
-        match RelDirectory::des(&buf).context("deserialization failure") {
-            Ok(dir) => {
-                let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some();
-                Ok(exists)
-            }
-            Err(e) => Err(PageReconstructError::from(e)),
-        }
+        let dir = RelDirectory::des(&buf)?;
+        Ok(dir.rels.contains(&(tag.relnode, tag.forknum)))
     }
 
     /// Get a list of all existing relations in given tablespace and database.
@@ -305,20 +316,37 @@
         let key = rel_dir_to_key(spcnode, dbnode);
         let buf = version.get(self, key, ctx).await?;
 
-        match RelDirectory::des(&buf).context("deserialization failure") {
-            Ok(dir) => {
-                let rels: HashSet<RelTag> =
-                    HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
-                        spcnode,
-                        dbnode,
-                        relnode: *relnode,
-                        forknum: *forknum,
-                    }));
+        let dir = RelDirectory::des(&buf)?;
+        let rels: HashSet<RelTag> =
+            HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
+                spcnode,
+                dbnode,
+                relnode: *relnode,
+                forknum: *forknum,
+            }));
 
-                Ok(rels)
-            }
-            Err(e) => Err(PageReconstructError::from(e)),
+        Ok(rels)
+    }
+
+    /// Get the whole SLRU segment
+    pub(crate) async fn get_slru_segment(
+        &self,
+        kind: SlruKind,
+        segno: u32,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<Bytes, PageReconstructError> {
+        let n_blocks = self
+            .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
+            .await?;
+        let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
+        for blkno in 0..n_blocks {
+            let block = self
+                .get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx)
+                .await?;
+            segment.extend_from_slice(&block[..BLCKSZ as usize]);
         }
+        Ok(segment.freeze())
     }
 
     /// Look up given SLRU page version.
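A minimal usage sketch for the new `get_slru_segment` helper above (illustrative only, not part of the patch; it assumes `timeline: &Timeline`, `lsn: Lsn`, and `ctx: &RequestContext` are in scope inside a function that can propagate `PageReconstructError`):

    // Fetch the whole CLOG segment 0 as of `lsn`. The helper sizes the buffer
    // from get_slru_segment_size() and appends one BLCKSZ-sized page per block.
    let segment: Bytes = timeline
        .get_slru_segment(SlruKind::Clog, 0, lsn, ctx)
        .await?;
    // The result is always a whole number of pages, in block order.
    assert_eq!(segment.len() % BLCKSZ as usize, 0);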
@@ -359,13 +387,8 @@ impl Timeline { let key = slru_dir_to_key(kind); let buf = version.get(self, key, ctx).await?; - match SlruSegmentDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => { - let exists = dir.segments.get(&segno).is_some(); - Ok(exists) - } - Err(e) => Err(PageReconstructError::from(e)), - } + let dir = SlruSegmentDirectory::des(&buf)?; + Ok(dir.segments.contains(&segno)) } /// Locate LSN, such that all transactions that committed before @@ -381,6 +404,8 @@ impl Timeline { cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { + pausable_failpoint!("find-lsn-for-timestamp-pausable"); + let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); // We use this method to figure out the branching LSN for the new branch, but the // GC cutoff could be before the branching point and we cannot create a new branch @@ -396,6 +421,7 @@ impl Timeline { let mut found_smaller = false; let mut found_larger = false; + while low < high { if cancel.is_cancelled() { return Err(PageReconstructError::Cancelled); @@ -438,6 +464,12 @@ impl Timeline { // Didn't find any commit timestamps smaller than the request Ok(LsnForTimestamp::Past(min_lsn)) } + (true, _) if commit_lsn < min_lsn => { + // the search above did set found_smaller to true but it never increased the lsn. + // Then, low is still the old min_lsn, and the subtraction above gave a value + // below the min_lsn. We should never do that. + Ok(LsnForTimestamp::Past(min_lsn)) + } (true, false) => { // Only found commits with timestamps smaller than the request. // It's still a valid case for branch creation, return it. @@ -484,7 +516,7 @@ impl Timeline { ctx: &RequestContext, ) -> Result, PageReconstructError> { let mut max: Option = None; - self.map_all_timestamps(probe_lsn, ctx, |timestamp| { + self.map_all_timestamps::<()>(probe_lsn, ctx, |timestamp| { if let Some(max_prev) = max { max = Some(max_prev.max(timestamp)); } else { @@ -572,10 +604,7 @@ impl Timeline { let key = slru_dir_to_key(kind); let buf = version.get(self, key, ctx).await?; - match SlruSegmentDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => Ok(dir.segments), - Err(e) => Err(PageReconstructError::from(e)), - } + Ok(SlruSegmentDirectory::des(&buf)?.segments) } pub(crate) async fn get_relmap_file( @@ -599,10 +628,7 @@ impl Timeline { // fetch directory entry let buf = self.get(DBDIR_KEY, lsn, ctx).await?; - match DbDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => Ok(dir.dbdirs), - Err(e) => Err(PageReconstructError::from(e)), - } + Ok(DbDirectory::des(&buf)?.dbdirs) } pub(crate) async fn get_twophase_file( @@ -624,10 +650,7 @@ impl Timeline { // fetch directory entry let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; - match TwoPhaseDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => Ok(dir.xids), - Err(e) => Err(PageReconstructError::from(e)), - } + Ok(TwoPhaseDirectory::des(&buf)?.xids) } pub(crate) async fn get_control_file( @@ -646,16 +669,13 @@ impl Timeline { self.get(CHECKPOINT_KEY, lsn, ctx).await } - pub(crate) async fn list_aux_files( + async fn list_aux_files_v1( &self, lsn: Lsn, ctx: &RequestContext, ) -> Result, PageReconstructError> { match self.get(AUX_FILES_KEY, lsn, ctx).await { - Ok(buf) => match AuxFilesDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => Ok(dir.files), - Err(e) => Err(PageReconstructError::from(e)), - }, + Ok(buf) => Ok(AuxFilesDirectory::des(&buf)?.files), Err(e) => { // This is expected: historical databases do not have the 
key. debug!("Failed to get info about AUX files: {}", e); @@ -664,6 +684,115 @@ impl Timeline { } } + async fn list_aux_files_v2( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + let kv = self + .scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx) + .await?; + let mut result = HashMap::new(); + let mut sz = 0; + for (_, v) in kv { + let v = v?; + let v = aux_file::decode_file_value_bytes(&v) + .context("value decode") + .map_err(PageReconstructError::Other)?; + for (fname, content) in v { + sz += fname.len(); + sz += content.len(); + result.insert(fname, content); + } + } + self.aux_file_size_estimator.on_initial(sz); + Ok(result) + } + + pub(crate) async fn trigger_aux_file_size_computation( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result<(), PageReconstructError> { + let current_policy = self.last_aux_file_policy.load(); + if let Some(AuxFilePolicy::V2) | Some(AuxFilePolicy::CrossValidation) = current_policy { + self.list_aux_files_v2(lsn, ctx).await?; + } + Ok(()) + } + + pub(crate) async fn list_aux_files( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + let current_policy = self.last_aux_file_policy.load(); + match current_policy { + Some(AuxFilePolicy::V1) => { + let res = self.list_aux_files_v1(lsn, ctx).await?; + let empty_str = if res.is_empty() { ", empty" } else { "" }; + warn!( + "this timeline is using deprecated aux file policy V1 (policy=v1{empty_str})" + ); + Ok(res) + } + None => { + let res = self.list_aux_files_v1(lsn, ctx).await?; + if !res.is_empty() { + warn!("this timeline is using deprecated aux file policy V1 (policy=None)"); + } + Ok(res) + } + Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await, + Some(AuxFilePolicy::CrossValidation) => { + let v1_result = self.list_aux_files_v1(lsn, ctx).await; + let v2_result = self.list_aux_files_v2(lsn, ctx).await; + match (v1_result, v2_result) { + (Ok(v1), Ok(v2)) => { + if v1 != v2 { + tracing::error!( + "unmatched aux file v1 v2 result:\nv1 {v1:?}\nv2 {v2:?}" + ); + return Err(PageReconstructError::Other(anyhow::anyhow!( + "unmatched aux file v1 v2 result" + ))); + } + Ok(v1) + } + (Ok(_), Err(v2)) => { + tracing::error!("aux file v1 returns Ok while aux file v2 returns an err"); + Err(v2) + } + (Err(v1), Ok(_)) => { + tracing::error!("aux file v2 returns Ok while aux file v1 returns an err"); + Err(v1) + } + (Err(_), Err(v2)) => Err(v2), + } + } + } + } + + pub(crate) async fn get_replorigins( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + let kv = self + .scan(KeySpace::single(repl_origin_key_range()), lsn, ctx) + .await?; + let mut result = HashMap::new(); + for (k, v) in kv { + let v = v?; + let origin_id = k.field6 as RepOriginId; + let origin_lsn = Lsn::des(&v).unwrap(); + if origin_lsn != Lsn::INVALID { + result.insert(origin_id, origin_lsn); + } + } + Ok(result) + } + /// Does the same as get_current_logical_size but counted on demand. /// Used to initialize the logical size tracking on startup. /// @@ -673,16 +802,16 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. 
-    pub async fn get_current_logical_size_non_incremental(
+    pub(crate) async fn get_current_logical_size_non_incremental(
         &self,
         lsn: Lsn,
         ctx: &RequestContext,
     ) -> Result<u64, CalculateLogicalSizeError> {
-        crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
+        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
 
         // Fetch list of database dirs and iterate them
         let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
-        let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
+        let dbdir = DbDirectory::des(&buf)?;
 
         let mut total_size: u64 = 0;
         for (spcnode, dbnode) in dbdir.dbdirs.keys() {
@@ -707,11 +836,13 @@ impl Timeline {
     /// Get a KeySpace that covers all the Keys that are in use at the given LSN.
     /// Anything that's not listed may be removed from the underlying storage (from
     /// that LSN forwards).
+    ///
+    /// The return value is (dense keyspace, sparse keyspace).
     pub(crate) async fn collect_keyspace(
         &self,
         lsn: Lsn,
         ctx: &RequestContext,
-    ) -> Result<KeySpace, CollectKeySpaceError> {
+    ) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
         // Iterate through key ranges, greedily packing them into partitions
         let mut result = KeySpaceAccum::new();
 
@@ -719,13 +850,14 @@
         result.add_key(DBDIR_KEY);
 
         // Fetch list of database dirs and iterate them
-        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
-        let dbdir = DbDirectory::des(&buf)?;
+        let dbdir = self.list_dbdirs(lsn, ctx).await?;
+        let mut dbs: Vec<((Oid, Oid), bool)> = dbdir.into_iter().collect();
 
-        let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
-        dbs.sort_unstable();
-        for (spcnode, dbnode) in dbs {
-            result.add_key(relmap_file_key(spcnode, dbnode));
+        dbs.sort_unstable_by(|(k_a, _), (k_b, _)| k_a.cmp(k_b));
+        for ((spcnode, dbnode), has_relmap_file) in dbs {
+            if has_relmap_file {
+                result.add_key(relmap_file_key(spcnode, dbnode));
+            }
             result.add_key(rel_dir_to_key(spcnode, dbnode));
 
             let mut rels: Vec<RelTag> = self
@@ -783,13 +915,66 @@
         if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
             result.add_key(AUX_FILES_KEY);
         }
-        Ok(result.to_keyspace())
+
+        // Add extra keyspaces in the test cases. Some test cases write keys into the storage without
+        // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace`
+        // and the keys will not be garbage-collected.
+        #[cfg(test)]
+        {
+            let guard = self.extra_test_dense_keyspace.load();
+            for kr in &guard.ranges {
+                result.add_range(kr.clone());
+            }
+        }
+
+        let dense_keyspace = result.to_keyspace();
+        let sparse_keyspace = SparseKeySpace(KeySpace {
+            ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()],
+        });
+
+        if cfg!(debug_assertions) {
+            // Verify that the sparse keyspaces are ordered and non-overlapping.
+
+            // We do not use KeySpaceAccum for sparse_keyspace because we want to ensure each
+            // category of sparse keys is split into its own image/delta files. If there
+            // are overlapping keyspaces, they will be automatically merged by keyspace accum,
+            // and we want the developer to keep the keyspaces separated.
+
+            let ranges = &sparse_keyspace.0.ranges;
+
+            // TODO: use a single overlaps_with across the codebase
+            fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
+                !(a.end <= b.start || b.end <= a.start)
+            }
+            for i in 0..ranges.len() {
+                for j in 0..i {
+                    if overlaps_with(&ranges[i], &ranges[j]) {
+                        panic!(
+                            "overlapping sparse keyspace: {}..{} and {}..{}",
+                            ranges[i].start, ranges[i].end, ranges[j].start, ranges[j].end
+                        );
+                    }
+                }
+            }
+            for i in 1..ranges.len() {
+                assert!(
+                    ranges[i - 1].end <= ranges[i].start,
+                    "unordered sparse keyspace: {}..{} and {}..{}",
+                    ranges[i - 1].start,
+                    ranges[i - 1].end,
+                    ranges[i].start,
+                    ranges[i].end
+                );
+            }
+        }
+
+        Ok((dense_keyspace, sparse_keyspace))
     }
 
     /// Get cached size of relation if it was not updated after the specified LSN
     pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
         let rel_size_cache = self.rel_size_cache.read().unwrap();
-        if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
+        if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
             if lsn >= *cached_lsn {
                 return Some(*nblocks);
             }
@@ -800,7 +985,16 @@ impl Timeline {
     /// Update cached relation size if there is no more recent update
     pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
         let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        match rel_size_cache.entry(tag) {
+
+        if lsn < rel_size_cache.complete_as_of {
+            // Do not cache old values. It's safe to cache the size on read, as long as
+            // the read was at an LSN since we started the WAL ingestion. Reasoning: we
+            // never evict values from the cache, so if the relation size changed after
+            // 'lsn', the new value is already in the cache.
+            return;
+        }
+
+        match rel_size_cache.map.entry(tag) {
             hash_map::Entry::Occupied(mut entry) => {
                 let cached_lsn = entry.get_mut();
                 if lsn >= cached_lsn.0 {
@@ -816,20 +1010,21 @@ impl Timeline {
     /// Store cached relation size
     pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
         let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        rel_size_cache.insert(tag, (lsn, nblocks));
+        rel_size_cache.map.insert(tag, (lsn, nblocks));
     }
 
     /// Remove cached relation size
     pub fn remove_cached_rel_size(&self, tag: &RelTag) {
         let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        rel_size_cache.remove(tag);
+        rel_size_cache.map.remove(tag);
     }
 }
 
 /// DatadirModification represents an operation to ingest an atomic set of
-/// updates to the repository. It is created by the 'begin_record'
-/// function. It is called for each WAL record, so that all the modifications
-/// by a one WAL record appear atomic.
+/// updates to the repository.
+///
+/// It is created by the 'begin_record' function. It is called for each WAL
+/// record, so that all the modifications by one WAL record appear atomic.
 pub struct DatadirModification<'a> {
     /// The timeline this modification applies to. You can access this to
     /// read the state, but note that any pending updates are *not* reflected
@@ -843,17 +1038,51 @@ pub struct DatadirModification<'a> {
     // The put-functions add the modifications here, and they are flushed to the
     // underlying key-value store by the 'finish' function.
     pending_lsns: Vec<Lsn>,
-    pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
     pending_deletions: Vec<(Range<Key>, Lsn)>,
     pending_nblocks: i64,
+
+    /// Metadata writes, indexed by key so that they can be read from not-yet-committed modifications
+    /// while ingesting subsequent records. See [`Self::is_data_key`] for the definition of 'metadata'.
+    pending_metadata_pages: HashMap<CompactKey, Vec<(Lsn, usize, Value)>>,
+
+    /// Data writes, ready to be flushed into an ephemeral layer. See [`Self::is_data_key`] for
+    /// which keys are stored here.
+    pending_data_pages: Vec<(CompactKey, Lsn, usize, Value)>,
+
+    // Sometimes during ingest, for example when extending a relation, we would like to write a zero page. However,
+    // if we encounter a write from postgres in the same wal record, we will drop this entry.
+    //
+    // Unlike other 'pending' fields, this does not last until the next call to commit(): it is flushed
+    // at the end of each wal record, and all these writes implicitly are at lsn Self::lsn
+    pending_zero_data_pages: HashSet<CompactKey>,
+
+    /// For special "directory" keys that store key-value maps, track the size of the map
+    /// if it was updated in this modification.
+    pending_directory_entries: Vec<(DirectoryKind, usize)>,
+
+    /// An **approximation** of how large our EphemeralFile write will be when committed.
+    pending_bytes: usize,
 }
 
 impl<'a> DatadirModification<'a> {
+    // When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can
+    // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
+    // additionally specify a limit on how much payload a DatadirModification may contain before it should be committed.
+    pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024;
+
     /// Get the current lsn
     pub(crate) fn get_lsn(&self) -> Lsn {
         self.lsn
     }
 
+    pub(crate) fn approx_pending_bytes(&self) -> usize {
+        self.pending_bytes
+    }
+
+    pub(crate) fn has_dirty_data_pages(&self) -> bool {
+        (!self.pending_data_pages.is_empty()) || (!self.pending_zero_data_pages.is_empty())
+    }
+
     /// Set the current lsn
     pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
         ensure!(
@@ -862,6 +1091,10 @@ impl<'a> DatadirModification<'a> {
             lsn,
             self.lsn
         );
+
+        // If we are advancing LSN, then state from previous wal record should have been flushed.
+        assert!(self.pending_zero_data_pages.is_empty());
+
         if lsn > self.lsn {
             self.pending_lsns.push(self.lsn);
             self.lsn = lsn;
@@ -869,6 +1102,17 @@ impl<'a> DatadirModification<'a> {
         Ok(())
     }
 
+    /// In this context, 'metadata' means keys that are only read by the pageserver internally, and 'data' means
+    /// keys that represent literal blocks that postgres can read. So data includes relation blocks and
+    /// SLRU blocks, which are read directly by postgres, and everything else is considered metadata.
+    ///
+    /// The distinction is important because data keys are handled on a fast path where dirty writes are
+    /// not readable until this modification is committed, whereas metadata keys are visible for read
+    /// via [`Self::get`] as soon as their record has been ingested.
+    fn is_data_key(key: &Key) -> bool {
+        key.is_rel_block_key() || key.is_slru_block_key()
+    }
+
     /// Initialize a completely new repository.
/// /// This inserts the directory metadata entries that are assumed to @@ -877,6 +1121,7 @@ impl<'a> DatadirModification<'a> { let buf = DbDirectory::ser(&DbDirectory { dbdirs: HashMap::new(), })?; + self.pending_directory_entries.push((DirectoryKind::Db, 0)); self.put(DBDIR_KEY, Value::Image(buf.into())); // Create AuxFilesDirectory @@ -885,16 +1130,24 @@ impl<'a> DatadirModification<'a> { let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory { xids: HashSet::new(), })?; + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, 0)); self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into(); let empty_dir = Value::Image(buf); self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); self.put( slru_dir_to_key(SlruKind::MultiXactMembers), empty_dir.clone(), ); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0)); Ok(()) } @@ -967,6 +1220,31 @@ impl<'a> DatadirModification<'a> { Ok(()) } + pub(crate) fn put_rel_page_image_zero(&mut self, rel: RelTag, blknum: BlockNumber) { + self.pending_zero_data_pages + .insert(rel_block_to_key(rel, blknum).to_compact()); + self.pending_bytes += ZERO_PAGE.len(); + } + + pub(crate) fn put_slru_page_image_zero( + &mut self, + kind: SlruKind, + segno: u32, + blknum: BlockNumber, + ) { + self.pending_zero_data_pages + .insert(slru_block_to_key(kind, segno, blknum).to_compact()); + self.pending_bytes += ZERO_PAGE.len(); + } + + /// Call this at the end of each WAL record. 
+ pub(crate) fn on_record_end(&mut self) { + let pending_zero_data_pages = std::mem::take(&mut self.pending_zero_data_pages); + for key in pending_zero_data_pages { + self.put_data(key, Value::Image(ZERO_PAGE.clone())); + } + } + /// Store a relmapper file (pg_filenode.map) in the repository pub async fn put_relmap_file( &mut self, @@ -995,6 +1273,7 @@ impl<'a> DatadirModification<'a> { let buf = RelDirectory::ser(&RelDirectory { rels: HashSet::new(), })?; + self.pending_directory_entries.push((DirectoryKind::Rel, 0)); self.put( rel_dir_to_key(spcnode, dbnode), Value::Image(Bytes::from(buf)), @@ -1017,6 +1296,8 @@ impl<'a> DatadirModification<'a> { if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); self.put( TWOPHASEDIR_KEY, Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), @@ -1026,6 +1307,20 @@ impl<'a> DatadirModification<'a> { Ok(()) } + pub async fn set_replorigin( + &mut self, + origin_id: RepOriginId, + origin_lsn: Lsn, + ) -> anyhow::Result<()> { + let key = repl_origin_key(origin_id); + self.put(key, Value::Image(origin_lsn.ser().unwrap().into())); + Ok(()) + } + + pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> { + self.set_replorigin(origin_id, Lsn::INVALID).await + } + pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> { self.put(CONTROLFILE_KEY, Value::Image(img)); Ok(()) @@ -1044,7 +1339,7 @@ impl<'a> DatadirModification<'a> { ) -> anyhow::Result<()> { let total_blocks = self .tline - .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx) + .get_db_size(spcnode, dbnode, Version::Modified(self), ctx) .await?; // Remove entry from dbdir @@ -1052,6 +1347,8 @@ impl<'a> DatadirModification<'a> { let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; + self.pending_directory_entries + .push((DirectoryKind::Db, dir.dbdirs.len())); self.put(DBDIR_KEY, Value::Image(buf.into())); } else { warn!( @@ -1085,24 +1382,31 @@ impl<'a> DatadirModification<'a> { let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?) .context("deserialize db")?; let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { - // Didn't exist. Update dbdir - dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false); - let buf = DbDirectory::ser(&dbdir).context("serialize db")?; - self.put(DBDIR_KEY, Value::Image(buf.into())); + let mut rel_dir = + if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { + // Didn't exist. Update dbdir + e.insert(false); + let buf = DbDirectory::ser(&dbdir).context("serialize db")?; + self.pending_directory_entries + .push((DirectoryKind::Db, dbdir.dbdirs.len())); + self.put(DBDIR_KEY, Value::Image(buf.into())); - // and create the RelDirectory - RelDirectory::default() - } else { - // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) - .context("deserialize db")? - }; + // and create the RelDirectory + RelDirectory::default() + } else { + // reldir already exists, fetch it + RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) + .context("deserialize db")? 
+ }; // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { return Err(RelationError::AlreadyExists); } + + self.pending_directory_entries + .push((DirectoryKind::Rel, rel_dir.rels.len())); + self.put( rel_dir_key, Value::Image(Bytes::from( @@ -1135,7 +1439,7 @@ impl<'a> DatadirModification<'a> { anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); if self .tline - .get_rel_exists(rel, Version::Modified(self), true, ctx) + .get_rel_exists(rel, Version::Modified(self), ctx) .await? { let size_key = rel_size_to_key(rel); @@ -1194,6 +1498,9 @@ impl<'a> DatadirModification<'a> { let buf = self.get(dir_key, ctx).await?; let mut dir = RelDirectory::des(&buf)?; + self.pending_directory_entries + .push((DirectoryKind::Rel, dir.rels.len())); + if dir.rels.remove(&(rel.relnode, rel.forknum)) { self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?))); } else { @@ -1229,6 +1536,8 @@ impl<'a> DatadirModification<'a> { if !dir.segments.insert(segno) { anyhow::bail!("slru segment {kind:?}/{segno} already exists"); } + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(kind), dir.segments.len())); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), @@ -1273,6 +1582,8 @@ impl<'a> DatadirModification<'a> { if !dir.segments.remove(&segno) { warn!("slru segment {:?}/{} does not exist", kind, segno); } + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(kind), dir.segments.len())); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), @@ -1303,6 +1614,8 @@ impl<'a> DatadirModification<'a> { if !dir.xids.remove(&xid) { warn!("twophase file for xid {} does not exist", xid); } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); self.put( TWOPHASEDIR_KEY, Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), @@ -1315,9 +1628,14 @@ impl<'a> DatadirModification<'a> { } pub fn init_aux_dir(&mut self) -> anyhow::Result<()> { + if let AuxFilePolicy::V2 = self.tline.get_switch_aux_file_policy() { + return Ok(()); + } let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { files: HashMap::new(), })?; + self.pending_directory_entries + .push((DirectoryKind::AuxFiles, 0)); self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); Ok(()) } @@ -1328,28 +1646,182 @@ impl<'a> DatadirModification<'a> { content: &[u8], ctx: &RequestContext, ) -> anyhow::Result<()> { - let mut dir = match self.get(AUX_FILES_KEY, ctx).await { - Ok(buf) => AuxFilesDirectory::des(&buf)?, - Err(e) => { - // This is expected: historical databases do not have the key. - debug!("Failed to get info about AUX files: {}", e); - AuxFilesDirectory { - files: HashMap::new(), + let switch_policy = self.tline.get_switch_aux_file_policy(); + + let policy = { + let current_policy = self.tline.last_aux_file_policy.load(); + // Allowed switch path: + // * no aux files -> v1/v2/cross-validation + // * cross-validation->v2 + + let current_policy = if current_policy.is_none() { + // This path will only be hit once per tenant: we will decide the final policy in this code block. + // The next call to `put_file` will always have `last_aux_file_policy != None`. 
+                let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
+                let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?;
+                if aux_files_key_v1.is_empty() {
+                    None
+                } else {
+                    warn!("this timeline is using deprecated aux file policy V1 (detected existing v1 files)");
+                    self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
+                    Some(AuxFilePolicy::V1)
+                }
+            } else {
+                current_policy
+            };
+
+            if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
+                self.tline.do_switch_aux_policy(switch_policy)?;
+                info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
+                switch_policy
+            } else {
+                // This branch handles invalid migration paths, and the case where switch_policy == current_policy.
+                // And actually, because the migration path always allows unspecified -> *, this unwrap_or will never be hit.
+                current_policy.unwrap_or(AuxFilePolicy::default_tenant_config())
+            }
+        };
-        let path = path.to_string();
-        if content.is_empty() {
-            dir.files.remove(&path);
-        } else {
-            dir.files.insert(path, Bytes::copy_from_slice(content));
+
+        if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy {
+            let key = aux_file::encode_aux_file_key(path);
+            // retrieve the key from the engine
+            let old_val = match self.get(key, ctx).await {
+                Ok(val) => Some(val),
+                Err(PageReconstructError::MissingKey(_)) => None,
+                Err(e) => return Err(e.into()),
+            };
+            let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
+                aux_file::decode_file_value(old_val)?
+            } else {
+                Vec::new()
+            };
+            let mut other_files = Vec::with_capacity(files.len());
+            let mut modifying_file = None;
+            for file @ (p, content) in files {
+                if path == p {
+                    assert!(
+                        modifying_file.is_none(),
+                        "duplicated entries found for {}",
+                        path
+                    );
+                    modifying_file = Some(content);
+                } else {
+                    other_files.push(file);
+                }
+            }
+            let mut new_files = other_files;
+            match (modifying_file, content.is_empty()) {
+                (Some(old_content), false) => {
+                    self.tline
+                        .aux_file_size_estimator
+                        .on_update(old_content.len(), content.len());
+                    new_files.push((path, content));
+                }
+                (Some(old_content), true) => {
+                    self.tline
+                        .aux_file_size_estimator
+                        .on_remove(old_content.len());
+                    // not adding the file key to the final `new_files` vec.
+                }
+                (None, false) => {
+                    self.tline.aux_file_size_estimator.on_add(content.len());
+                    new_files.push((path, content));
+                }
+                (None, true) => warn!("removing non-existing aux file: {}", path),
+            }
+            let new_val = aux_file::encode_file_value(&new_files)?;
+            self.put(key, Value::Image(new_val.into()));
         }
-        self.put(
-            AUX_FILES_KEY,
-            Value::Image(Bytes::from(
-                AuxFilesDirectory::ser(&dir).context("serialize")?,
-            )),
-        );
+
+        if let AuxFilePolicy::V1 | AuxFilePolicy::CrossValidation = policy {
+            let file_path = path.to_string();
+            let content = if content.is_empty() {
+                None
+            } else {
+                Some(Bytes::copy_from_slice(content))
+            };
+
+            let n_files;
+            let mut aux_files = self.tline.aux_files.lock().await;
+            if let Some(mut dir) = aux_files.dir.take() {
+                // We already updated aux files in `self`: emit a delta and update our latest value.
+ dir.upsert(file_path.clone(), content.clone()); + n_files = dir.files.len(); + if aux_files.n_deltas == MAX_AUX_FILE_DELTAS { + self.put( + AUX_FILES_KEY, + Value::Image(Bytes::from( + AuxFilesDirectory::ser(&dir).context("serialize")?, + )), + ); + aux_files.n_deltas = 0; + } else { + self.put( + AUX_FILES_KEY, + Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }), + ); + aux_files.n_deltas += 1; + } + aux_files.dir = Some(dir); + } else { + // Check if the AUX_FILES_KEY is initialized + match self.get(AUX_FILES_KEY, ctx).await { + Ok(dir_bytes) => { + let mut dir = AuxFilesDirectory::des(&dir_bytes)?; + // Key is already set, we may append a delta + self.put( + AUX_FILES_KEY, + Value::WalRecord(NeonWalRecord::AuxFile { + file_path: file_path.clone(), + content: content.clone(), + }), + ); + dir.upsert(file_path, content); + n_files = dir.files.len(); + aux_files.dir = Some(dir); + } + Err( + e @ (PageReconstructError::Cancelled + | PageReconstructError::AncestorLsnTimeout(_)), + ) => { + // Important that we do not interpret a shutdown error as "not found" and thereby + // reset the map. + return Err(e.into()); + } + // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but + // the original code assumes all other errors are missing keys. Therefore, we keep the code path + // the same for now, though in theory, we should only match the `MissingKey` variant. + Err( + e @ (PageReconstructError::Other(_) + | PageReconstructError::WalRedo(_) + | PageReconstructError::MissingKey(_)), + ) => { + // Key is missing, we must insert an image as the basis for subsequent deltas. + + if !matches!(e, PageReconstructError::MissingKey(_)) { + let e = utils::error::report_compact_sources(&e); + tracing::warn!("treating error as if it was a missing key: {}", e); + } + + let mut dir = AuxFilesDirectory { + files: HashMap::new(), + }; + dir.upsert(file_path, content); + self.put( + AUX_FILES_KEY, + Value::Image(Bytes::from( + AuxFilesDirectory::ser(&dir).context("serialize")?, + )), + ); + n_files = 1; + aux_files.dir = Some(dir); + } + } + } + + self.pending_directory_entries + .push((DirectoryKind::AuxFiles, n_files)); + } + Ok(()) } @@ -1371,7 +1843,7 @@ impl<'a> DatadirModification<'a> { /// retains all the metadata, but data pages are flushed. That's again OK /// for bulk import, where you are just loading data pages and won't try to /// modify the same pages twice. - pub async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { + pub(crate) async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { // Unless we have accumulated a decent amount of changes, it's not worth it // to scan through the pending_updates list. let pending_nblocks = self.pending_nblocks; @@ -1379,32 +1851,25 @@ impl<'a> DatadirModification<'a> { return Ok(()); } - let writer = self.tline.writer().await; + let mut writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. - let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); - for (key, values) in self.pending_updates.drain() { - for (lsn, value) in values { - if is_rel_block_key(&key) || is_slru_block_key(key) { - // This bails out on first error without modifying pending_updates. - // That's Ok, cf this function's doc comment. 
- writer.put(key, lsn, &value, ctx).await?; - } else { - retained_pending_updates - .entry(key) - .or_default() - .push((lsn, value)); - } - } - } + let pending_data_pages = std::mem::take(&mut self.pending_data_pages); - self.pending_updates = retained_pending_updates; + // This bails out on first error without modifying pending_updates. + // That's Ok, cf this function's doc comment. + writer.put_batch(pending_data_pages, ctx).await?; + self.pending_bytes = 0; if pending_nblocks != 0 { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); self.pending_nblocks = 0; } + for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { + writer.update_directory_entries_count(kind, count as u64); + } + Ok(()) } @@ -1414,18 +1879,35 @@ impl<'a> DatadirModification<'a> { /// All the modifications in this atomic update are stamped by the specified LSN. /// pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { - let writer = self.tline.writer().await; + // Commit should never be called mid-wal-record + assert!(self.pending_zero_data_pages.is_empty()); + + let mut writer = self.tline.writer().await; let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; - if !self.pending_updates.is_empty() { - writer.put_batch(&self.pending_updates, ctx).await?; - self.pending_updates.clear(); + // Ordering: the items in this batch do not need to be in any global order, but values for + // a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on + // this to do efficient updates to its index. + let mut write_batch = std::mem::take(&mut self.pending_data_pages); + + write_batch.extend( + self.pending_metadata_pages + .drain() + .flat_map(|(key, values)| { + values + .into_iter() + .map(move |(lsn, value_size, value)| (key, lsn, value_size, value)) + }), + ); + + if !write_batch.is_empty() { + writer.put_batch(write_batch, ctx).await?; } if !self.pending_deletions.is_empty() { - writer.delete_batch(&self.pending_deletions).await?; + writer.delete_batch(&self.pending_deletions, ctx).await?; self.pending_deletions.clear(); } @@ -1442,51 +1924,121 @@ impl<'a> DatadirModification<'a> { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); } + for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { + writer.update_directory_entries_count(kind, count as u64); + } + + self.pending_bytes = 0; + Ok(()) } pub(crate) fn len(&self) -> usize { - self.pending_updates.len() + self.pending_deletions.len() + self.pending_metadata_pages.len() + + self.pending_data_pages.len() + + self.pending_deletions.len() } - // Internal helper functions to batch the modifications - + /// Read a page from the Timeline we are writing to. For metadata pages, this passes through + /// a cache in Self, which makes writes earlier in this modification visible to WAL records later + /// in the modification. + /// + /// For data pages, reads pass directly to the owning Timeline: any ingest code which reads a data + /// page must ensure that the pages they read are already committed in Timeline, for example + /// DB create operations are always preceded by a call to commit(). This is special cased because + /// it's rare: all the 'normal' WAL operations will only read metadata pages such as relation sizes, + /// and not data pages. async fn get(&self, key: Key, ctx: &RequestContext) -> Result { - // Have we already updated the same key? Read the latest pending updated - // version in that case. 
- // - // Note: we don't check pending_deletions. It is an error to request a - // value that has been removed, deletion only avoids leaking storage. - if let Some(values) = self.pending_updates.get(&key) { - if let Some((_, value)) = values.last() { - return if let Value::Image(img) = value { - Ok(img.clone()) - } else { - // Currently, we never need to read back a WAL record that we - // inserted in the same "transaction". All the metadata updates - // work directly with Images, and we never need to read actual - // data pages. We could handle this if we had to, by calling - // the walredo manager, but let's keep it simple for now. - Err(PageReconstructError::from(anyhow::anyhow!( - "unexpected pending WAL record" - ))) - }; + if !Self::is_data_key(&key) { + // Have we already updated the same key? Read the latest pending updated + // version in that case. + // + // Note: we don't check pending_deletions. It is an error to request a + // value that has been removed, deletion only avoids leaking storage. + if let Some(values) = self.pending_metadata_pages.get(&key.to_compact()) { + if let Some((_, _, value)) = values.last() { + return if let Value::Image(img) = value { + Ok(img.clone()) + } else { + // Currently, we never need to read back a WAL record that we + // inserted in the same "transaction". All the metadata updates + // work directly with Images, and we never need to read actual + // data pages. We could handle this if we had to, by calling + // the walredo manager, but let's keep it simple for now. + Err(PageReconstructError::Other(anyhow::anyhow!( + "unexpected pending WAL record" + ))) + }; + } + } + } else { + // This is an expensive check, so we only do it in debug mode. If reading a data key, + // this key should never be present in pending_data_pages. We ensure this by committing + // modifications before ingesting DB create operations, which are the only kind that reads + // data pages during ingest. + if cfg!(debug_assertions) { + for (dirty_key, _, _, _) in &self.pending_data_pages { + debug_assert!(&key.to_compact() != dirty_key); + } + + debug_assert!(!self.pending_zero_data_pages.contains(&key.to_compact())) } } + + // Metadata page cache miss, or we're reading a data page. let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); self.tline.get(key, lsn, ctx).await } + /// Only used during unit tests, force putting a key into the modification. + #[cfg(test)] + pub(crate) fn put_for_test(&mut self, key: Key, val: Value) { + self.put(key, val); + } + fn put(&mut self, key: Key, val: Value) { - let values = self.pending_updates.entry(key).or_default(); + if Self::is_data_key(&key) { + self.put_data(key.to_compact(), val) + } else { + self.put_metadata(key.to_compact(), val) + } + } + + fn put_data(&mut self, key: CompactKey, val: Value) { + let val_serialized_size = val.serialized_size().unwrap() as usize; + + // If this page was previously zero'd in the same WalRecord, then drop the previous zero page write. This + // is an optimization that avoids persisting both the zero page generated by us (e.g. 
during a relation extension),
+        // and the subsequent postgres-originating write
+        if self.pending_zero_data_pages.remove(&key) {
+            self.pending_bytes -= ZERO_PAGE.len();
+        }
+
+        self.pending_bytes += val_serialized_size;
+        self.pending_data_pages
+            .push((key, self.lsn, val_serialized_size, val))
+    }
+
+    fn put_metadata(&mut self, key: CompactKey, val: Value) {
+        let values = self.pending_metadata_pages.entry(key).or_default();
         // Replace the previous value if it exists at the same lsn
-        if let Some((last_lsn, last_value)) = values.last_mut() {
+        if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
             if *last_lsn == self.lsn {
+                // Update the pending_bytes contribution from this entry, and update the serialized size in place
+                self.pending_bytes -= *last_value_ser_size;
+                *last_value_ser_size = val.serialized_size().unwrap() as usize;
+                self.pending_bytes += *last_value_ser_size;
+
+                // Use the latest value: this replaces any earlier write to the same (key,lsn), such as may
+                // have been generated by synthesized zero page writes prior to the first real write to a page.
                 *last_value = val;
                 return;
             }
         }
-        values.push((self.lsn, val));
+
+        let val_serialized_size = val.serialized_size().unwrap() as usize;
+        self.pending_bytes += val_serialized_size;
+        values.push((self.lsn, val_serialized_size, val));
     }
 
     fn delete(&mut self, key_range: Range<Key>) {
@@ -1497,6 +2049,7 @@ impl<'a> DatadirModification<'a> {
 
 /// This struct facilitates accessing either a committed key from the timeline at a
 /// specific LSN, or the latest uncommitted key from a pending modification.
+///
 /// During WAL ingestion, the records from multiple LSNs may be batched in the same
 /// modification before being flushed to the timeline. Hence, the routines in WalIngest
 /// need to look up the keys in the modification first before looking them up in the
@@ -1550,9 +2103,19 @@ struct RelDirectory {
     rels: HashSet<(Oid, u8)>,
 }
 
-#[derive(Debug, Serialize, Deserialize, Default)]
-struct AuxFilesDirectory {
-    files: HashMap<String, Bytes>,
+#[derive(Debug, Serialize, Deserialize, Default, PartialEq)]
+pub(crate) struct AuxFilesDirectory {
+    pub(crate) files: HashMap<String, Bytes>,
+}
+
+impl AuxFilesDirectory {
+    pub(crate) fn upsert(&mut self, key: String, value: Option<Bytes>) {
+        if let Some(value) = value {
+            self.files.insert(key, value);
+        } else {
+            self.files.remove(&key);
+        }
+    }
 }
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -1566,13 +2129,82 @@
 struct SlruSegmentDirectory {
     segments: HashSet<u32>,
 }
 
+#[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)]
+#[repr(u8)]
+pub(crate) enum DirectoryKind {
+    Db,
+    TwoPhase,
+    Rel,
+    AuxFiles,
+    SlruSegment(SlruKind),
+}
+
+impl DirectoryKind {
+    pub(crate) const KINDS_NUM: usize = <DirectoryKind as Enum>::LENGTH;
+    pub(crate) fn offset(&self) -> usize {
+        self.into_usize()
+    }
+}
+
 static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
 
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
-    //use super::repo_harness::*;
-    //use super::*;
+    use hex_literal::hex;
+    use utils::id::TimelineId;
+
+    use super::*;
+
+    use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};
+
+    /// Test a round trip of aux file updates, from DatadirModification to reading back from the Timeline
+    #[tokio::test]
+    async fn aux_files_round_trip() -> anyhow::Result<()> {
+        let name = "aux_files_round_trip";
+        let harness = TenantHarness::create(name).await?;
+
+        pub const TIMELINE_ID: TimelineId =
+            TimelineId::from_array(hex!("11223344556677881122334455667788"));
+
+        let (tenant, ctx) = harness.load().await;
+ let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + let tline = tline.raw_timeline().unwrap(); + + // First modification: insert two keys + let mut modification = tline.begin_modification(Lsn(0x1000)); + modification.put_file("foo/bar1", b"content1", &ctx).await?; + modification.set_lsn(Lsn(0x1008))?; + modification.put_file("foo/bar2", b"content2", &ctx).await?; + modification.commit(&ctx).await?; + let expect_1008 = HashMap::from([ + ("foo/bar1".to_string(), Bytes::from_static(b"content1")), + ("foo/bar2".to_string(), Bytes::from_static(b"content2")), + ]); + + let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?; + assert_eq!(readback, expect_1008); + + // Second modification: update one key, remove the other + let mut modification = tline.begin_modification(Lsn(0x2000)); + modification.put_file("foo/bar1", b"content3", &ctx).await?; + modification.set_lsn(Lsn(0x2008))?; + modification.put_file("foo/bar2", b"", &ctx).await?; + modification.commit(&ctx).await?; + let expect_2008 = + HashMap::from([("foo/bar1".to_string(), Bytes::from_static(b"content3"))]); + + let readback = tline.list_aux_files(Lsn(0x2008), &ctx).await?; + assert_eq!(readback, expect_2008); + + // Reading back in time works + let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?; + assert_eq!(readback, expect_1008); + + Ok(()) + } /* fn assert_current_logical_size(timeline: &DatadirTimeline, lsn: Lsn) { diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index c726139524..e4ebafd927 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -8,8 +8,7 @@ use std::time::Duration; pub use pageserver_api::key::{Key, KEY_SIZE}; /// A 'value' stored for a one Key. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(test, derive(PartialEq))] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum Value { /// An Image value contains a full copy of the value Image(Bytes), @@ -33,11 +32,53 @@ impl Value { } } +#[derive(Debug, PartialEq)] +pub(crate) enum InvalidInput { + TooShortValue, + TooShortPostgresRecord, +} + +/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets +/// use this type for querying if a slice looks some particular way. +pub(crate) struct ValueBytes; + +impl ValueBytes { + pub(crate) fn will_init(raw: &[u8]) -> Result { + if raw.len() < 12 { + return Err(InvalidInput::TooShortValue); + } + + let value_discriminator = &raw[0..4]; + + if value_discriminator == [0, 0, 0, 0] { + // Value::Image always initializes + return Ok(true); + } + + if value_discriminator != [0, 0, 0, 1] { + // not a Value::WalRecord(..) + return Ok(false); + } + + let walrecord_discriminator = &raw[4..8]; + + if walrecord_discriminator != [0, 0, 0, 0] { + // only NeonWalRecord::Postgres can have will_init + return Ok(false); + } + + if raw.len() < 17 { + return Err(InvalidInput::TooShortPostgresRecord); + } + + Ok(raw[8] == 1) + } +} + #[cfg(test)] mod test { use super::*; - use bytes::Bytes; use utils::bin_ser::BeSer; macro_rules! 
roundtrip { @@ -71,6 +112,8 @@ mod test { ]; roundtrip!(image, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); } #[test] @@ -94,6 +137,96 @@ mod test { ]; roundtrip!(rec, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); + } + + #[test] + fn bytes_inspection_too_short_image() { + let rec = Value::Image(Bytes::from_static(b"")); + + #[rustfmt::skip] + let expected = [ + // top level discriminator of 4 bytes + 0x00, 0x00, 0x00, 0x00, + // 8 byte length + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + + roundtrip!(rec, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); + assert_eq!(expected.len(), 12); + for len in 0..12 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortValue + ); + } + } + + #[test] + fn bytes_inspection_too_short_postgres_record() { + let rec = NeonWalRecord::Postgres { + will_init: false, + rec: Bytes::from_static(b""), + }; + let rec = Value::WalRecord(rec); + + #[rustfmt::skip] + let expected = [ + // flattened discriminator of total 8 bytes + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x00, + // will_init + 0x00, + // 8 byte length + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + + roundtrip!(rec, expected); + + assert!(!ValueBytes::will_init(&expected).unwrap()); + assert_eq!(expected.len(), 17); + for len in 12..17 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortPostgresRecord + ) + } + for len in 0..12 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortValue + ) + } + } + + #[test] + fn clear_visibility_map_flags_example() { + let rec = NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno: Some(0x11), + old_heap_blkno: None, + flags: 0x03, + }; + let rec = Value::WalRecord(rec); + + #[rustfmt::skip] + let expected = [ + // discriminators + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x01, + // Some == 1 followed by 4 bytes + 0x01, 0x00, 0x00, 0x00, 0x11, + // None == 0 + 0x00, + // flags + 0x03 + ]; + + roundtrip!(rec, expected); + + assert!(!ValueBytes::will_init(&expected).unwrap()); } } @@ -106,6 +239,7 @@ pub struct GcResult { pub layers_needed_by_cutoff: u64, pub layers_needed_by_pitr: u64, pub layers_needed_by_branches: u64, + pub layers_needed_by_leases: u64, pub layers_not_updated: u64, pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. 
@@ -135,6 +269,7 @@ impl AddAssign for GcResult { self.layers_needed_by_pitr += other.layers_needed_by_pitr; self.layers_needed_by_cutoff += other.layers_needed_by_cutoff; self.layers_needed_by_branches += other.layers_needed_by_branches; + self.layers_needed_by_leases += other.layers_needed_by_leases; self.layers_not_updated += other.layers_not_updated; self.layers_removed += other.layers_removed; diff --git a/pageserver/src/span.rs b/pageserver/src/span.rs new file mode 100644 index 0000000000..91fee50514 --- /dev/null +++ b/pageserver/src/span.rs @@ -0,0 +1,43 @@ +use utils::tracing_span_assert::check_fields_present; + +mod extractors { + use utils::tracing_span_assert::ConstExtractor; + + pub(super) const TENANT_ID: ConstExtractor = ConstExtractor::new("tenant_id"); + pub(super) const SHARD_ID: ConstExtractor = ConstExtractor::new("shard_id"); + pub(super) const TIMELINE_ID: ConstExtractor = ConstExtractor::new("timeline_id"); +} + +#[track_caller] +pub(crate) fn debug_assert_current_span_has_tenant_id() { + if cfg!(debug_assertions) { + if let Err(missing) = check_fields_present!([&extractors::TENANT_ID, &extractors::SHARD_ID]) + { + panic!("missing extractors: {missing:?}") + } + } +} + +#[track_caller] +pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { + if cfg!(debug_assertions) { + if let Err(missing) = check_fields_present!([ + &extractors::TENANT_ID, + &extractors::SHARD_ID, + &extractors::TIMELINE_ID, + ]) { + panic!("missing extractors: {missing:?}") + } + } +} + +#[track_caller] +pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id() { + if cfg!(debug_assertions) { + if let Err(missing) = + check_fields_present!([&extractors::TENANT_ID, &extractors::TIMELINE_ID,]) + { + panic!("missing extractors: {missing:?}") + } + } +} diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 45a516566f..5a6f6e5176 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -56,37 +56,11 @@ impl Statvfs { } pub mod mock { - use anyhow::Context; use camino::Utf8Path; use regex::Regex; use tracing::log::info; - #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] - #[serde(tag = "type")] - pub enum Behavior { - Success { - blocksize: u64, - total_blocks: u64, - name_filter: Option, - }, - Failure { - mocked_error: MockedError, - }, - } - - #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] - #[allow(clippy::upper_case_acronyms)] - pub enum MockedError { - EIO, - } - - impl From for nix::Error { - fn from(e: MockedError) -> Self { - match e { - MockedError::EIO => nix::Error::EIO, - } - } - } + pub use pageserver_api::config::statvfs::mock::Behavior; pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result { info!("running mocked statvfs"); @@ -117,6 +91,7 @@ pub mod mock { block_size: *blocksize, }) } + #[cfg(feature = "testing")] Behavior::Failure { mocked_error } => Err((*mocked_error).into()), } } @@ -135,14 +110,30 @@ pub mod mock { { continue; } - total += entry - .metadata() - .with_context(|| format!("get metadata of {:?}", entry.path()))? 
- .len(); + let m = match entry.metadata() { + Ok(m) => m, + Err(e) if is_not_found(&e) => { + // some temp file which got removed right as we are walking + continue; + } + Err(e) => { + return Err(anyhow::Error::new(e) + .context(format!("get metadata of {:?}", entry.path()))) + } + }; + total += m.len(); } Ok(total) } + fn is_not_found(e: &walkdir::Error) -> bool { + let Some(io_error) = e.io_error() else { + return false; + }; + let kind = io_error.kind(); + matches!(kind, std::io::ErrorKind::NotFound) + } + pub struct Statvfs { pub blocks: u64, pub blocks_available: u64, diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 5a06a97525..6a4e90dd55 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -30,20 +30,17 @@ //! only a single tenant or timeline. //! -// Clippy 1.60 incorrectly complains about the tokio::task_local!() macro. -// Silence it. See https://github.com/rust-lang/rust-clippy/issues/9224. -#![allow(clippy::declare_interior_mutable_const)] - use std::collections::HashMap; use std::fmt; use std::future::Future; +use std::num::NonZeroUsize; use std::panic::AssertUnwindSafe; +use std::str::FromStr; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use futures::FutureExt; use pageserver_api::shard::TenantShardId; -use tokio::runtime::Runtime; use tokio::task::JoinHandle; use tokio::task_local; use tokio_util::sync::CancellationToken; @@ -52,9 +49,10 @@ use tracing::{debug, error, info, warn}; use once_cell::sync::Lazy; +use utils::env; use utils::id::TimelineId; -use crate::shutdown_pageserver; +use crate::metrics::set_tokio_runtime_setup; // // There are four runtimes: @@ -104,52 +102,128 @@ use crate::shutdown_pageserver; // other operations, if the upload tasks e.g. get blocked on locks. It shouldn't // happen, but still. // -pub static COMPUTE_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("compute request worker") - .enable_all() - .build() - .expect("Failed to create compute request runtime") -}); -pub static MGMT_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("mgmt request worker") - .enable_all() - .build() - .expect("Failed to create mgmt request runtime") -}); - -pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("walreceiver worker") - .enable_all() - .build() - .expect("Failed to create walreceiver runtime") -}); - -pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("background op worker") - // if you change the number of worker threads please change the constant below - .enable_all() - .build() - .expect("Failed to create background op runtime") -}); - -pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| { - // force init and thus panics - let _ = BACKGROUND_RUNTIME.handle(); +pub(crate) static TOKIO_WORKER_THREADS: Lazy<NonZeroUsize> = Lazy::new(|| { // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly // tokio would have already panicked for parsing errors or NotUnicode // // this will be wrong if any of the runtimes get their worker threads configured to something // else, but that has not been needed in a long time.
- std::env::var("TOKIO_WORKER_THREADS") - .map(|s| s.parse::<usize>().unwrap()) - .unwrap_or_else(|_e| usize::max(2, num_cpus::get())) + NonZeroUsize::new( + std::env::var("TOKIO_WORKER_THREADS") + .map(|s| s.parse::<usize>().unwrap()) + .unwrap_or_else(|_e| usize::max(2, num_cpus::get())), + ) + .expect("the max() ensures that this is not zero") }); +enum TokioRuntimeMode { + SingleThreaded, + MultiThreaded { num_workers: NonZeroUsize }, +} + +impl FromStr for TokioRuntimeMode { + type Err = String; + + fn from_str(s: &str) -> Result<Self, Self::Err> { + match s { + "current_thread" => Ok(TokioRuntimeMode::SingleThreaded), + s => match s.strip_prefix("multi_thread:") { + Some("default") => Ok(TokioRuntimeMode::MultiThreaded { + num_workers: *TOKIO_WORKER_THREADS, + }), + Some(suffix) => { + let num_workers = suffix.parse::<NonZeroUsize>().map_err(|e| { + format!( + "invalid number of multi-threaded runtime workers ({suffix:?}): {e}", + ) + })?; + Ok(TokioRuntimeMode::MultiThreaded { num_workers }) + } + None => Err(format!("invalid runtime config: {s:?}")), + }, + } + } +} + +static TOKIO_THREAD_STACK_SIZE: Lazy<NonZeroUsize> = Lazy::new(|| { + env::var("NEON_PAGESERVER_TOKIO_THREAD_STACK_SIZE") + // the default 2MiB is insufficient, especially in debug mode + .unwrap_or_else(|| NonZeroUsize::new(4 * 1024 * 1024).unwrap()) +}); + +static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| { + let thread_name = "pageserver-tokio"; + let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else { + // If the env var is not set, leave this static as None. + set_tokio_runtime_setup( + "multiple-runtimes", + NUM_MULTIPLE_RUNTIMES + .checked_mul(*TOKIO_WORKER_THREADS) + .unwrap(), + ); + return None; + }; + Some(match mode { + TokioRuntimeMode::SingleThreaded => { + set_tokio_runtime_setup("one-runtime-single-threaded", NonZeroUsize::new(1).unwrap()); + tokio::runtime::Builder::new_current_thread() + .thread_name(thread_name) + .enable_all() + .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get()) + .build() + .expect("failed to create one single runtime") + } + TokioRuntimeMode::MultiThreaded { num_workers } => { + set_tokio_runtime_setup("one-runtime-multi-threaded", num_workers); + tokio::runtime::Builder::new_multi_thread() + .thread_name(thread_name) + .enable_all() + .worker_threads(num_workers.get()) + .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get()) + .build() + .expect("failed to create one multi-threaded runtime") + } + }) +}); + +/// Declare a lazy static variable named `$varname` that will resolve +/// to a tokio runtime handle. If the env var `NEON_PAGESERVER_USE_ONE_RUNTIME` +/// is set, this will resolve to `ONE_RUNTIME`. Otherwise, the macro invocation +/// declares a separate runtime and the lazy static variable `$varname` +/// will resolve to that separate runtime. +/// +/// The result is that `$varname.spawn()` will use `ONE_RUNTIME` if +/// `NEON_PAGESERVER_USE_ONE_RUNTIME` is set, and will use the separate runtime +/// otherwise. macro_rules!
pageserver_runtime { + ($varname:ident, $name:literal) => { + pub static $varname: Lazy<&'static tokio::runtime::Runtime> = Lazy::new(|| { + if let Some(runtime) = &*ONE_RUNTIME { + return runtime; + } + static RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name($name) + .worker_threads(TOKIO_WORKER_THREADS.get()) + .enable_all() + .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get()) + .build() + .expect(std::concat!("Failed to create runtime ", $name)) + }); + &*RUNTIME + }); + }; +} + +pageserver_runtime!(COMPUTE_REQUEST_RUNTIME, "compute request worker"); +pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker"); +pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker"); +pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker"); +// Bump this number when adding a new pageserver_runtime! +// SAFETY: it's obviously correct +const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) }; + #[derive(Debug, Clone, Copy)] pub struct PageserverTaskId(u64); @@ -192,6 +266,7 @@ task_local! { serde::Serialize, serde::Deserialize, strum_macros::IntoStaticStr, + strum_macros::EnumString, )] pub enum TaskKind { // Pageserver startup, i.e., `main` @@ -219,13 +294,12 @@ pub enum TaskKind { /// Internally, `Client` hands over requests to the `Connection` object. /// The `Connection` object is responsible for speaking the wire protocol. /// - /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection. - /// That abstraction doesn't use `task_mgr`. + /// Walreceiver uses a legacy abstraction called `TaskHandle` to represent the activity of establishing and handling a connection. /// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task. /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind. /// - /// Once the connection is established, the `TaskHandle` task creates a - /// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling + /// Once the connection is established, the `TaskHandle` task spawns a + /// [`WalReceiverConnectionPoller`] task that is responsible for polling /// the `Connection` object. /// A `CancellationToken` created by the `TaskHandle` task ensures /// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped. @@ -235,7 +309,6 @@ pub enum TaskKind { WalReceiverManager, /// The `TaskHandle` task that executes `handle_walreceiver_connection`. - /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`. /// See the comment on [`WalReceiverManager`]. /// /// [`WalReceiverManager`]: Self::WalReceiverManager @@ -255,6 +328,9 @@ pub enum TaskKind { // Eviction. One per timeline. Eviction, + // Ingest housekeeping (flushing ephemeral layers on time threshold or disk pressure) + IngestHousekeeping, + /// See [`crate::disk_usage_eviction_task`]. 
DiskUsageEviction, @@ -275,9 +351,6 @@ pub enum TaskKind { // Task that uploads a file to remote storage RemoteUploadTask, - // Task that downloads a file from remote storage - RemoteDownloadTask, - // task that handles the initial downloading of all tenants InitialLoad, @@ -300,8 +373,14 @@ pub enum TaskKind { DebugTool, + EphemeralFilePreWarmPageCache, + + LayerDownload, + #[cfg(test)] UnitTest, + + DetachAncestor, } #[derive(Default)] @@ -312,7 +391,6 @@ struct MutableTaskState { } struct PageServerTask { - #[allow(dead_code)] // unused currently task_id: PageserverTaskId, kind: TaskKind, @@ -324,7 +402,7 @@ struct PageServerTask { /// Tasks may optionally be launched for a particular tenant/timeline, enabling /// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`] - tenant_shard_id: Option, + tenant_shard_id: TenantShardId, timeline_id: Option, mutable: Mutex, @@ -336,10 +414,9 @@ struct PageServerTask { pub fn spawn( runtime: &tokio::runtime::Handle, kind: TaskKind, - tenant_shard_id: Option, + tenant_shard_id: TenantShardId, timeline_id: Option, name: &str, - shutdown_process_on_error: bool, future: F, ) -> PageserverTaskId where @@ -368,7 +445,6 @@ where task_id, task_cloned, cancel, - shutdown_process_on_error, future, )); task_mut.join_handle = Some(join_handle); @@ -385,82 +461,78 @@ async fn task_wrapper( task_id: u64, task: Arc, shutdown_token: CancellationToken, - shutdown_process_on_error: bool, future: F, ) where F: Future> + Send + 'static, { debug!("Starting task '{}'", task_name); - let result = SHUTDOWN_TOKEN - .scope( - shutdown_token, - CURRENT_TASK.scope(task, { - // We use AssertUnwindSafe here so that the payload function - // doesn't need to be UnwindSafe. We don't do anything after the - // unwinding that would expose us to unwind-unsafe behavior. - AssertUnwindSafe(future).catch_unwind() - }), - ) - .await; - task_finish(result, task_name, task_id, shutdown_process_on_error).await; -} - -async fn task_finish( - result: std::result::Result< - anyhow::Result<()>, - std::boxed::Box, - >, - task_name: String, - task_id: u64, - shutdown_process_on_error: bool, -) { - // Remove our entry from the global hashmap. - let task = TASKS - .lock() - .unwrap() - .remove(&task_id) - .expect("no task in registry"); - - let mut shutdown_process = false; - { + // wrap the future so we log panics and errors + let tenant_shard_id = task.tenant_shard_id; + let timeline_id = task.timeline_id; + let fut = async move { + // We use AssertUnwindSafe here so that the payload function + // doesn't need to be UnwindSafe. We don't do anything after the + // unwinding that would expose us to unwind-unsafe behavior. 
+ let result = AssertUnwindSafe(future).catch_unwind().await; match result { Ok(Ok(())) => { debug!("Task '{}' exited normally", task_name); } Ok(Err(err)) => { - if shutdown_process_on_error { - error!( - "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - shutdown_process = true; - } else { - error!( - "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - } + error!( + "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, tenant_shard_id, timeline_id, err + ); } Err(err) => { - if shutdown_process_on_error { - error!( - "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - shutdown_process = true; - } else { - error!( - "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", - task_name, task.tenant_shard_id, task.timeline_id, err - ); - } + error!( + "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, tenant_shard_id, timeline_id, err + ); } } - } + }; - if shutdown_process { - shutdown_pageserver(None, 1).await; + // add the task-locals + let fut = CURRENT_TASK.scope(task, fut); + let fut = SHUTDOWN_TOKEN.scope(shutdown_token, fut); + + // poll future to completion + fut.await; + + // Remove our entry from the global hashmap. + TASKS + .lock() + .unwrap() + .remove(&task_id) + .expect("no task in registry"); +} + +pub async fn exit_on_panic_or_error( + task_name: &'static str, + future: impl Future>, +) -> T +where + E: std::fmt::Debug, +{ + // We use AssertUnwindSafe here so that the payload function + // doesn't need to be UnwindSafe. We don't do anything after the + // unwinding that would expose us to unwind-unsafe behavior. + let result = AssertUnwindSafe(future).catch_unwind().await; + match result { + Ok(Ok(val)) => val, + Ok(Err(err)) => { + error!( + task_name, + "Task exited with error, exiting process: {err:?}" + ); + std::process::exit(1); + } + Err(panic_obj) => { + error!(task_name, "Task panicked, exiting process: {panic_obj:?}"); + std::process::exit(1); + } } } @@ -487,7 +559,7 @@ pub async fn shutdown_tasks( let tasks = TASKS.lock().unwrap(); for task in tasks.values() { if (kind.is_none() || Some(task.kind) == kind) - && (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id) + && (tenant_shard_id.is_none() || Some(task.tenant_shard_id) == tenant_shard_id) && (timeline_id.is_none() || task.timeline_id == timeline_id) { task.cancel.cancel(); @@ -510,13 +582,8 @@ pub async fn shutdown_tasks( }; if let Some(mut join_handle) = join_handle { if log_all { - if tenant_shard_id.is_none() { - // there are quite few of these - info!(name = task.name, kind = ?task_kind, "stopping global task"); - } else { - // warn to catch these in tests; there shouldn't be any - warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over"); - } + // warn to catch these in tests; there shouldn't be any + warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over"); } if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle) .await @@ -576,8 +643,8 @@ pub fn shutdown_token() -> CancellationToken { /// Has the current task been requested to shut down? 
pub fn is_shutdown_requested() -> bool { - if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) { - cancel.is_cancelled() + if let Ok(true_or_false) = SHUTDOWN_TOKEN.try_with(|t| t.is_cancelled()) { + true_or_false } else { if !cfg!(test) { warn!("is_shutdown_requested() called in an unexpected task or thread"); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7bb5881aab..fd2520a42e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1,8 +1,9 @@ +//! Timeline repository implementation that keeps old data in layer files, and +//! the recent changes in ephemeral files. //! -//! Timeline repository implementation that keeps old data in files on disk, and -//! the recent changes in memory. See tenant/*_layer.rs files. -//! The functions here are responsible for locating the correct layer for the -//! get/put call, walking back the timeline branching history as needed. +//! See tenant/*_layer.rs files. The functions here are responsible for locating +//! the correct layer for the get/put call, walking back the timeline branching +//! history as needed. //! //! The files are stored in the .neon/tenants//timelines/ //! directory. See docs/pageserver-storage.md for how the files are managed. @@ -12,6 +13,7 @@ //! use anyhow::{bail, Context}; +use arc_swap::ArcSwap; use camino::Utf8Path; use camino::Utf8PathBuf; use enumset::EnumSet; @@ -19,44 +21,56 @@ use futures::stream::FuturesUnordered; use futures::FutureExt; use futures::StreamExt; use pageserver_api::models; +use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::TimelineArchivalState; use pageserver_api::models::TimelineState; +use pageserver_api::models::TopTenantShardItem; +use pageserver_api::models::WalRedoManagerStatus; use pageserver_api::shard::ShardIdentity; +use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; +use remote_storage::TimeoutOrCancel; +use std::collections::BTreeMap; use std::fmt; +use std::sync::Weak; +use std::time::SystemTime; use storage_broker::BrokerClientChannel; use tokio::io::BufReader; -use tokio::runtime::Handle; use tokio::sync::watch; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; +use upload_queue::NotInitialized; use utils::backoff; +use utils::circuit_breaker::CircuitBreaker; use utils::completion; use utils::crashsafe::path_with_suffix_extension; use utils::failpoint_support; use utils::fs_ext; +use utils::pausable_failpoint; use utils::sync::gate::Gate; use utils::sync::gate::GateGuard; use utils::timeout::timeout_cancellable; use utils::timeout::TimeoutCancellableError; +use utils::zstd::create_zst_tarball; +use utils::zstd::extract_zst_tarball; use self::config::AttachedLocationConfig; use self::config::AttachmentMode; use self::config::LocationConf; use self::config::TenantConf; -use self::delete::DeleteTenantFlow; -use self::metadata::LoadMetadataError; use self::metadata::TimelineMetadata; use self::mgr::GetActiveTenantError; use self::mgr::GetTenantError; -use self::mgr::TenantsMap; +use self::remote_timeline_client::upload::upload_index_part; use self::remote_timeline_client::RemoteTimelineClient; +use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; -use self::timeline::uninit::TimelineUninitMark; use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; +use self::timeline::GcCutoffs; use 
self::timeline::TimelineResources; use self::timeline::WaitLsnError; use crate::config::PageServerConf; @@ -65,38 +79,38 @@ use crate::deletion_queue::DeletionQueueClient; use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use crate::is_uninit_mark; +use crate::l0_flush::L0FlushGlobalState; use crate::metrics::TENANT; -use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC}; +use crate::metrics::{ + remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, + TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, +}; use crate::repository::GcResult; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::config::LocationMode; use crate::tenant::config::TenantConfOpt; -use crate::tenant::metadata::load_metadata; pub use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::remote_initdb_archive_path; use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; +use crate::walredo; use crate::InitializationOrder; -use std::cmp::min; use std::collections::hash_map::Entry; -use std::collections::BTreeSet; use std::collections::HashMap; use std::collections::HashSet; use std::fmt::Debug; use std::fmt::Display; use std::fs; use std::fs::File; -use std::io; -use std::ops::Bound::Included; -use std::sync::atomic::AtomicU64; -use std::sync::atomic::Ordering; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; -use std::sync::{Mutex, RwLock}; +use std::sync::Mutex; use std::time::{Duration, Instant}; +use crate::span; use crate::tenant::timeline::delete::DeleteTimelineFlow; use crate::tenant::timeline::uninit::cleanup_timeline_directory; use crate::virtual_file::VirtualFile; @@ -107,7 +121,6 @@ pub use pageserver_api::models::TenantState; use tokio::sync::Semaphore; static INIT_DB_SEMAPHORE: Lazy = Lazy::new(|| Semaphore::new(8)); -use toml_edit; use utils::{ crashsafe, generation::Generation, @@ -115,47 +128,19 @@ use utils::{ lsn::{Lsn, RecordLsn}, }; -/// Declare a failpoint that can use the `pause` failpoint action. -/// We don't want to block the executor thread, hence, spawn_blocking + await. -macro_rules! 
pausable_failpoint { - ($name:literal) => { - if cfg!(feature = "testing") { - tokio::task::spawn_blocking({ - let current = tracing::Span::current(); - move || { - let _entered = current.entered(); - tracing::info!("at failpoint {}", $name); - fail::fail_point!($name); - } - }) - .await - .expect("spawn_blocking") - } - }; - ($name:literal, $cond:expr) => { - if cfg!(feature = "testing") { - if $cond { - pausable_failpoint!($name) - } - } - }; -} - pub mod blob_io; pub mod block_io; +pub mod vectored_blob_io; pub mod disk_btree; pub(crate) mod ephemeral_file; pub mod layer_map; -mod span; pub mod metadata; -mod par_fsync; pub mod remote_timeline_client; pub mod storage_layer; pub mod config; -pub mod delete; pub mod mgr; pub mod secondary; pub mod tasks; @@ -165,11 +150,11 @@ pub(crate) mod timeline; pub mod size; -pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; -pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; +mod gc_block; +pub(crate) mod throttle; -// re-export for use in remote_timeline_client.rs -pub use crate::tenant::metadata::save_metadata; +pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; // re-export for use in walreceiver pub use crate::tenant::timeline::WalReceiverInfo; @@ -180,15 +165,14 @@ pub const TENANTS_SEGMENT_NAME: &str = "tenants"; /// Parts of the `.neon/tenants//timelines/` directory prefix. pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; -pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted"; - /// References to shared objects that are passed into each tenant, such /// as the shared remote storage client and process initialization state. #[derive(Clone)] pub struct TenantSharedResources { pub broker_client: storage_broker::BrokerClientChannel, - pub remote_storage: Option<GenericRemoteStorage>, + pub remote_storage: GenericRemoteStorage, pub deletion_queue_client: DeletionQueueClient, + pub l0_flush_global_state: L0FlushGlobalState, } /// A [`Tenant`] is really an _attached_ tenant. The configuration @@ -200,11 +184,18 @@ pub(super) struct AttachedTenantConf { } impl AttachedTenantConf { + fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self { + Self { + tenant_conf, + location, + } + } + fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> { match &location_conf.mode { LocationMode::Attached(attach_conf) => Ok(Self { tenant_conf: location_conf.tenant_conf, - location: attach_conf.clone(), + location: *attach_conf, }), LocationMode::Secondary(_) => { anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode") } } } } @@ -219,15 +210,16 @@ struct TimelinePreload { } pub(crate) struct TenantPreload { - deleting: bool, timelines: HashMap<TimelineId, TimelinePreload>, } /// When we spawn a tenant, there is a special mode for tenant creation that /// avoids trying to read anything from remote storage. pub(crate) enum SpawnMode { - Normal, - Create, + /// Activate as soon as possible + Eager, + /// Lazy activation in the background, with the option to skip the queue if the need comes up + Lazy, } /// @@ -247,7 +239,7 @@ pub struct Tenant { // We keep TenantConfOpt struct here to preserve the information // about parameters that are not set. // This is necessary to allow global config updates.
- tenant_conf: Arc>, + tenant_conf: Arc>, tenant_shard_id: TenantShardId, @@ -275,10 +267,10 @@ pub struct Tenant { // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn // timeout... gc_cs: tokio::sync::Mutex<()>, - walredo_mgr: Arc, + walredo_mgr: Option>, // provides access to timeline data sitting in the remote storage - pub(crate) remote_storage: Option, + pub(crate) remote_storage: GenericRemoteStorage, // Access to global deletion queue for when this tenant wants to schedule a deletion deletion_queue_client: DeletionQueueClient, @@ -289,13 +281,15 @@ pub struct Tenant { eviction_task_tenant_state: tokio::sync::Mutex, + /// Track repeated failures to compact, so that we can back off. + /// Overhead of mutex is acceptable because compaction is done with a multi-second period. + compaction_circuit_breaker: std::sync::Mutex, + /// If the tenant is in Activating state, notify this to encourage it /// to proceed to Active as soon as possible, rather than waiting for lazy /// background warmup. pub(crate) activate_now_sem: tokio::sync::Semaphore, - pub(crate) delete_progress: Arc>, - // Cancellation token fires when we have entered shutdown(). This is a parent of // Timelines' cancellation token. pub(crate) cancel: CancellationToken, @@ -303,6 +297,26 @@ pub struct Tenant { // Users of the Tenant such as the page service must take this Gate to avoid // trying to use a Tenant which is shutting down. pub(crate) gate: Gate, + + /// Throttle applied at the top of [`Timeline::get`]. + /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance. + pub(crate) timeline_get_throttle: + Arc>, + + /// An ongoing timeline detach concurrency limiter. + /// + /// As a tenant will likely be restarted as part of timeline detach ancestor it makes no sense + /// to have two running at the same time. A different one can be started if an earlier one + /// has failed for whatever reason. + ongoing_timeline_detach: std::sync::Mutex>, + + /// `index_part.json` based gc blocking reason tracking. + /// + /// New gc iterations must start a new iteration by acquiring `GcBlock::start` before + /// proceeding. 
+ pub(crate) gc_block: gc_block::GcBlock, + + l0_flush_global_state: L0FlushGlobalState, } impl std::fmt::Debug for Tenant { @@ -312,14 +326,66 @@ impl std::fmt::Debug for Tenant { } pub(crate) enum WalRedoManager { - Prod(PostgresRedoManager), + Prod(WalredoManagerId, PostgresRedoManager), #[cfg(test)] Test(harness::TestRedoManager), } -impl From for WalRedoManager { - fn from(mgr: PostgresRedoManager) -> Self { - Self::Prod(mgr) +#[derive(thiserror::Error, Debug)] +#[error("pageserver is shutting down")] +pub(crate) struct GlobalShutDown; + +impl WalRedoManager { + pub(crate) fn new(mgr: PostgresRedoManager) -> Result, GlobalShutDown> { + let id = WalredoManagerId::next(); + let arc = Arc::new(Self::Prod(id, mgr)); + let mut guard = WALREDO_MANAGERS.lock().unwrap(); + match &mut *guard { + Some(map) => { + map.insert(id, Arc::downgrade(&arc)); + Ok(arc) + } + None => Err(GlobalShutDown), + } + } +} + +impl Drop for WalRedoManager { + fn drop(&mut self) { + match self { + Self::Prod(id, _) => { + let mut guard = WALREDO_MANAGERS.lock().unwrap(); + if let Some(map) = &mut *guard { + map.remove(id).expect("new() registers, drop() unregisters"); + } + } + #[cfg(test)] + Self::Test(_) => { + // Not applicable to test redo manager + } + } + } +} + +/// Global registry of all walredo managers so that [`crate::shutdown_pageserver`] can shut down +/// the walredo processes outside of the regular order. +/// +/// This is necessary to work around a systemd bug where it freezes if there are +/// walredo processes left => +#[allow(clippy::type_complexity)] +pub(crate) static WALREDO_MANAGERS: once_cell::sync::Lazy< + Mutex>>>, +> = once_cell::sync::Lazy::new(|| Mutex::new(Some(HashMap::new()))); +#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)] +pub(crate) struct WalredoManagerId(u64); +impl WalredoManagerId { + pub fn next() -> Self { + static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1); + let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + if id == 0 { + panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique"); + } + Self(id) } } @@ -331,9 +397,20 @@ impl From for WalRedoManager { } impl WalRedoManager { + pub(crate) async fn shutdown(&self) -> bool { + match self { + Self::Prod(_, mgr) => mgr.shutdown().await, + #[cfg(test)] + Self::Test(_) => { + // Not applicable to test redo manager + true + } + } + } + pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) { match self { - Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout), + Self::Prod(_, mgr) => mgr.maybe_quiesce(idle_timeout), #[cfg(test)] Self::Test(_) => { // Not applicable to test redo manager @@ -351,9 +428,9 @@ impl WalRedoManager { base_img: Option<(Lsn, bytes::Bytes)>, records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>, pg_version: u32, - ) -> anyhow::Result { + ) -> Result { match self { - Self::Prod(mgr) => { + Self::Prod(_, mgr) => { mgr.request_redo(key, lsn, base_img, records, pg_version) .await } @@ -364,10 +441,20 @@ impl WalRedoManager { } } } + + pub(crate) fn status(&self) -> Option { + match self { + WalRedoManager::Prod(_, m) => Some(m.status()), + #[cfg(test)] + WalRedoManager::Test(_) => None, + } + } } #[derive(Debug, thiserror::Error, PartialEq, Eq)] pub enum GetTimelineError { + #[error("Timeline is shutting down")] + ShuttingDown, #[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")] NotActive { tenant_id: TenantShardId, @@ -415,6 +502,42 @@ impl Debug for DeleteTimelineError { } } 
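A sketch of how the `WALREDO_MANAGERS` registry above could be drained at shutdown (hypothetical caller code, not part of the patch; `WalRedoManager::shutdown` is defined just below):

    // Take the map out under the lock: any later WalRedoManager::new() then
    // observes None and fails with GlobalShutDown; shut down the survivors.
    let managers = WALREDO_MANAGERS.lock().unwrap().take().unwrap_or_default();
    for (_id, weak) in managers {
        if let Some(mgr) = weak.upgrade() {
            mgr.shutdown().await; // terminates the walredo child process
        }
    }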
+#[derive(thiserror::Error)] +pub enum TimelineArchivalError { + #[error("NotFound")] + NotFound, + + #[error("Timeout")] + Timeout, + + #[error("ancestor is archived: {}", .0)] + HasArchivedParent(TimelineId), + + #[error("HasUnarchivedChildren")] + HasUnarchivedChildren(Vec<TimelineId>), + + #[error("Timeline archival is already in progress")] + AlreadyInProgress, + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl Debug for TimelineArchivalError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::NotFound => write!(f, "NotFound"), + Self::Timeout => write!(f, "Timeout"), + Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(), + Self::HasUnarchivedChildren(c) => { + f.debug_tuple("HasUnarchivedChildren").field(c).finish() + } + Self::AlreadyInProgress => f.debug_tuple("AlreadyInProgress").finish(), + Self::Other(e) => f.debug_tuple("Other").field(e).finish(), + } + } +} + pub enum SetStoppingError { AlreadyStopping(completion::Barrier), Broken, @@ -475,16 +598,75 @@ impl From for InitdbError { } } -struct TenantDirectoryScan { - sorted_timelines_to_load: Vec<(TimelineId, TimelineMetadata)>, - timelines_to_resume_deletion: Vec<(TimelineId, Option<TimelineMetadata>)>, -} - enum CreateTimelineCause { Load, Delete, } +#[derive(thiserror::Error, Debug)] +pub(crate) enum GcError { + // The tenant is shutting down + #[error("tenant shutting down")] + TenantCancelled, + + // The timeline is shutting down + #[error("timeline shutting down")] + TimelineCancelled, + + // The tenant is in a state ineligible to run GC + #[error("not active")] + NotActive, + + // A requested GC cutoff LSN was invalid, for example it tried to move backwards + #[error("bad lsn: {why}")] + BadLsn { why: String }, + + // A remote storage error while scheduling updates after compaction + #[error(transparent)] + Remote(anyhow::Error), + + // An error reading while calculating GC cutoffs + #[error(transparent)] + GcCutoffs(PageReconstructError), + + // If GC was invoked for a particular timeline, this error means it didn't exist + #[error("timeline not found")] + TimelineNotFound, +} + +impl From<PageReconstructError> for GcError { + fn from(value: PageReconstructError) -> Self { + match value { + PageReconstructError::Cancelled => Self::TimelineCancelled, + other => Self::GcCutoffs(other), + } + } +} + +impl From<NotInitialized> for GcError { + fn from(value: NotInitialized) -> Self { + match value { + NotInitialized::Uninitialized => GcError::Remote(value.into()), + NotInitialized::Stopped | NotInitialized::ShuttingDown => GcError::TimelineCancelled, + } + } +} + +impl From<timeline::layer_manager::Shutdown> for GcError { + fn from(_: timeline::layer_manager::Shutdown) -> Self { + GcError::TimelineCancelled + } +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum LoadConfigError { + #[error("TOML deserialization error: '{0}'")] + DeserializeToml(#[from] toml_edit::de::Error), + + #[error("Config not found at {0}")] + NotFound(Utf8PathBuf), +} + impl Tenant { /// Yet another helper for timeline initialization. /// @@ -503,6 +685,7 @@ impl Tenant { index_part: Option<IndexPart>, metadata: TimelineMetadata, ancestor: Option<Arc<Timeline>>, + last_aux_file_policy: Option<AuxFilePolicy>, _ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_shard_id; @@ -513,6 +696,10 @@ impl Tenant { ancestor.clone(), resources, CreateTimelineCause::Load, + // This could be derived from ancestor branch + index part.
Though the only caller of `timeline_init_and_sync` is `load_remote_timeline`, + // there will potentially be other callers of this function in the future, and we don't know whether `index_part` or `ancestor` takes precedence. + // Therefore, we pass this field explicitly for now, and remove it once we fully migrate to aux file v2. + last_aux_file_policy, )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( ... ); if let Some(index_part) = index_part.as_ref() { + timeline.remote_client.init_upload_queue(index_part)?; + timeline - .remote_client - .as_ref() - .unwrap() - .init_upload_queue(index_part)?; - } else if self.remote_storage.is_some() { + .last_aux_file_policy + .store(index_part.last_aux_file_policy()); + } else { // No data on the remote storage, but we have local metadata file. We can end up // here with timeline_create being interrupted before finishing index part upload. // By doing what we do here, the index part upload is retried. // If control plane retries timeline creation in the meantime, the mgmt API handler // for timeline creation will coalesce on the upload we queue here. - let rtc = timeline.remote_client.as_ref().unwrap(); - rtc.init_upload_queue_for_empty_remote(&metadata)?; - rtc.schedule_index_upload_for_metadata_update(&metadata)?; + + // FIXME: this branch should be dead code as we no longer write local metadata. + + timeline + .remote_client + .init_upload_queue_for_empty_remote(&metadata)?; + timeline + .remote_client + .schedule_index_upload_for_full_metadata_update(&metadata)?; } timeline @@ -553,9 +746,8 @@ impl Tenant { // avoiding holding it across awaits let mut timelines_accessor = self.timelines.lock().unwrap(); match timelines_accessor.entry(timeline_id) { + // We should never try to load the same timeline twice during startup Entry::Occupied(_) => { - // The uninit mark file acts as a lock that prevents another task from - // initializing the timeline at the same time. unreachable!( "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" ); } @@ -575,6 +767,7 @@ impl Tenant { .read() .await .layer_map() + .expect("currently loading, layer manager cannot be shutdown already") .iter_historic_layers() .next() .is_some(), @@ -601,38 +794,40 @@ impl Tenant { attached_conf: AttachedTenantConf, shard_identity: ShardIdentity, init_order: Option<InitializationOrder>, - tenants: &'static std::sync::RwLock<TenantsMap>, mode: SpawnMode, ctx: &RequestContext, - ) -> anyhow::Result<Arc<Tenant>> { - let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( - conf, - tenant_shard_id, - ))); + ) -> Result<Arc<Tenant>, GlobalShutDown> { + let wal_redo_manager = + WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?; let TenantSharedResources { broker_client, remote_storage, deletion_queue_client, + l0_flush_global_state, } = resources; + let attach_mode = attached_conf.location.attach_mode; + let generation = attached_conf.location.generation; + let tenant = Arc::new(Tenant::new( TenantState::Attaching, conf, attached_conf, shard_identity, - wal_redo_manager, + Some(wal_redo_manager), tenant_shard_id, remote_storage.clone(), deletion_queue_client, + l0_flush_global_state, )); // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if // we shut down while attaching.
- let Ok(attach_gate_guard) = tenant.gate.enter() else { - // We just created the Tenant: nothing else can have shut it down yet - unreachable!(); - }; + let attach_gate_guard = tenant + .gate + .enter() + .expect("We just created the Tenant: nothing else can have shut it down yet"); // Do all the hard work in the background let tenant_clone = Arc::clone(&tenant); @@ -640,11 +835,16 @@ impl Tenant { task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::Attach, - Some(tenant_shard_id), + tenant_shard_id, None, "attach tenant", - false, async move { + + info!( + ?attach_mode, + "Attaching tenant" + ); + let _gate_guard = attach_gate_guard; // Is this tenant being spawned as part of process startup? @@ -656,21 +856,38 @@ impl Tenant { } // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state. + enum BrokenVerbosity { + Error, + Info + } let make_broken = - |t: &Tenant, err: anyhow::Error| { - error!("attach failed, setting tenant state to Broken: {err:?}"); + |t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| { + match verbosity { + BrokenVerbosity::Info => { + info!("attach cancelled, setting tenant state to Broken: {err}"); + }, + BrokenVerbosity::Error => { + error!("attach failed, setting tenant state to Broken: {err:?}"); + } + } t.state.send_modify(|state| { // The Stopping case is for when we have passed control on to DeleteTenantFlow: // if it errors, we will call make_broken when tenant is already in Stopping. assert!( - matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }), - "the attach task owns the tenant state until activation is complete" - ); + matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }), + "the attach task owns the tenant state until activation is complete" + ); *state = TenantState::broken_from_reason(err.to_string()); }); }; + // TODO: should also be rejecting tenant conf changes that violate this check. + if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) { + make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); + return Ok(()); + } + let mut init_order = init_order; // take the completion because initial tenant loading will complete when all of // these tasks complete. @@ -682,41 +899,37 @@ impl Tenant { .and_then(|x| x.initial_tenant_load_remote.take()); enum AttachType<'a> { - // During pageserver startup, we are attaching this tenant lazily in the background - Warmup(tokio::sync::SemaphorePermit<'a>), - // During pageserver startup, we are attaching this tenant as soon as we can, - // because a client tried to access it. + /// We are attaching this tenant lazily in the background. + Warmup { + _permit: tokio::sync::SemaphorePermit<'a>, + during_startup: bool + }, + /// We are attaching this tenant as soon as we can, because for example an + /// endpoint tried to access it. OnDemand, - // During normal operations after startup, we are attaching a tenant. + /// During normal operations after startup, we are attaching a tenant, and + /// eager attach was requested. Normal, } - // Before doing any I/O, wait for either or: - // - A client to attempt to access to this tenant (on-demand loading) - // - A permit to become available in the warmup semaphore (background warmup) - // - // Some-ness of init_order is how we know if we're attaching during startup or later - // in process lifetime. 
- let attach_type = if init_order.is_some() { + let attach_type = if matches!(mode, SpawnMode::Lazy) { + // Before doing any I/O, wait for at least one of: + // - A client attempting to access to this tenant (on-demand loading) + // - A permit becoming available in the warmup semaphore (background warmup) + tokio::select!( - _ = tenant_clone.activate_now_sem.acquire() => { + permit = tenant_clone.activate_now_sem.acquire() => { + let _ = permit.expect("activate_now_sem is never closed"); tracing::info!("Activating tenant (on-demand)"); AttachType::OnDemand }, - permit_result = conf.concurrent_tenant_warmup.inner().acquire() => { - match permit_result { - Ok(p) => { - tracing::info!("Activating tenant (warmup)"); - AttachType::Warmup(p) - } - Err(_) => { - // This is unexpected: the warmup semaphore should stay alive - // for the lifetime of init_order. Log a warning and proceed. - tracing::warn!("warmup_limit semaphore unexpectedly closed"); - AttachType::Normal - } + permit = conf.concurrent_tenant_warmup.inner().acquire() => { + let _permit = permit.expect("concurrent_tenant_warmup semaphore is never closed"); + tracing::info!("Activating tenant (warmup)"); + AttachType::Warmup { + _permit, + during_startup: init_order.is_some() } - } _ = tenant_clone.cancel.cancelled() => { // This is safe, but should be pretty rare: it is interesting if a tenant @@ -726,110 +939,49 @@ impl Tenant { // Make the tenant broken so that set_stopping will not hang waiting for it to leave // the Attaching state. This is an over-reaction (nothing really broke, the tenant is // just shutting down), but ensures progress. - make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching")); + make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info); return Ok(()); }, ) } else { + // SpawnMode::{Create,Eager} always cause jumping ahead of the + // concurrent_tenant_warmup queue AttachType::Normal }; - let preload_timer = TENANT.preload.start_timer(); - let preload = match mode { - SpawnMode::Create => { - // Don't count the skipped preload into the histogram of preload durations - preload_timer.stop_and_discard(); - None - }, - SpawnMode::Normal => { - match &remote_storage { - Some(remote_storage) => Some( - match tenant_clone - .preload(remote_storage, task_mgr::shutdown_token()) - .instrument( - tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()), - ) - .await { - Ok(p) => { - preload_timer.observe_duration(); - p - } - , - Err(e) => { - make_broken(&tenant_clone, anyhow::anyhow!(e)); - return Ok(()); - } - }, - ), - None => None, + let preload = match &mode { + SpawnMode::Eager | SpawnMode::Lazy => { + let _preload_timer = TENANT.preload.start_timer(); + let res = tenant_clone + .preload(&remote_storage, task_mgr::shutdown_token()) + .await; + match res { + Ok(p) => Some(p), + Err(e) => { + make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); + return Ok(()); + } } } + }; // Remote preload is complete. 
drop(remote_load_completion); - let pending_deletion = { - match DeleteTenantFlow::should_resume_deletion( - conf, - preload.as_ref().map(|p| p.deleting).unwrap_or(false), - &tenant_clone, - ) - .await - { - Ok(should_resume_deletion) => should_resume_deletion, - Err(err) => { - make_broken(&tenant_clone, anyhow::anyhow!(err)); - return Ok(()); - } - } - }; - - info!("pending_deletion {}", pending_deletion.is_some()); - - if let Some(deletion) = pending_deletion { - // as we are no longer loading, signal completion by dropping - // the completion while we resume deletion - drop(_completion); - let background_jobs_can_start = - init_order.as_ref().map(|x| &x.background_jobs_can_start); - if let Some(background) = background_jobs_can_start { - info!("waiting for backgound jobs barrier"); - background.clone().wait().await; - info!("ready for backgound jobs barrier"); - } - - match DeleteTenantFlow::resume_from_attach( - deletion, - &tenant_clone, - preload, - tenants, - &ctx, - ) - .await - { - Err(err) => { - make_broken(&tenant_clone, anyhow::anyhow!(err)); - return Ok(()); - } - Ok(()) => return Ok(()), - } - } - // We will time the duration of the attach phase unless this is a creation (attach will do no work) - let attach_timer = match mode { - SpawnMode::Create => None, - SpawnMode::Normal => {Some(TENANT.attach.start_timer())} + let attached = { + let _attach_timer = Some(TENANT.attach.start_timer()); + tenant_clone.attach(preload, &ctx).await }; + + match attached { Ok(()) => { info!("attach finished, activating"); - if let Some(t)= attach_timer {t.observe_duration();} tenant_clone.activate(broker_client, None, &ctx); } Err(e) => { - if let Some(t)= attach_timer {t.observe_duration();} - make_broken(&tenant_clone, anyhow::anyhow!(e)); + make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); } } @@ -840,35 +992,27 @@ // It also prevents the warmup process competing with the concurrency limit on // logical size calculations: if logical size calculation semaphore is saturated, // then warmup will wait for that before proceeding to the next tenant. - if let AttachType::Warmup(_permit) = attach_type { - let mut futs = FuturesUnordered::new(); - let timelines: Vec<_> = tenant_clone.timelines.lock().unwrap().values().cloned().collect(); - for t in timelines { - futs.push(t.await_initial_logical_size()) - } + if matches!(attach_type, AttachType::Warmup { during_startup: true, ..
}) { + let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect(); tracing::info!("Waiting for initial logical sizes while warming up..."); - while futs.next().await.is_some() { - - } + while futs.next().await.is_some() {} tracing::info!("Warm-up complete"); } Ok(()) } - .instrument({ - let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()); - span.follows_from(Span::current()); - span - }), + .instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)), ); Ok(tenant) } + #[instrument(skip_all)] pub(crate) async fn preload( - self: &Arc, + self: &Arc, remote_storage: &GenericRemoteStorage, cancel: CancellationToken, ) -> anyhow::Result { + span::debug_assert_current_span_has_tenant_id(); // Get list of remote timelines // download index files for every tenant timeline info!("listing remote timelines"); @@ -879,24 +1023,20 @@ impl Tenant { ) .await?; - let deleting = other_keys.contains(TENANT_DELETED_MARKER_FILE_NAME); - info!( - "found {} timelines, deleting={}", - remote_timeline_ids.len(), - deleting - ); + info!("found {} timelines", remote_timeline_ids.len(),); for k in other_keys { - if k != TENANT_DELETED_MARKER_FILE_NAME { - warn!("Unexpected non timeline key {k}"); - } + warn!("Unexpected non timeline key {k}"); } Ok(TenantPreload { - deleting, - timelines: self - .load_timeline_metadata(remote_timeline_ids, remote_storage, cancel) - .await?, + timelines: Self::load_timeline_metadata( + self, + remote_timeline_ids, + remote_storage, + cancel, + ) + .await?, }) } @@ -908,24 +1048,14 @@ impl Tenant { async fn attach( self: &Arc, preload: Option, - mode: SpawnMode, ctx: &RequestContext, ) -> anyhow::Result<()> { span::debug_assert_current_span_has_tenant_id(); failpoint_support::sleep_millis_async!("before-attaching-tenant"); - let preload = match (preload, mode) { - (Some(p), _) => p, - (None, SpawnMode::Create) => TenantPreload { - deleting: false, - timelines: HashMap::new(), - }, - (None, SpawnMode::Normal) => { - // Deprecated dev mode: load from local disk state instead of remote storage - // https://github.com/neondatabase/neon/issues/5624 - return self.load_local(ctx).await; - } + let Some(preload) = preload else { + anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); }; let mut timelines_to_resume_deletions = vec![]; @@ -977,6 +1107,8 @@ impl Tenant { } } + let mut gc_blocks = HashMap::new(); + // For every timeline, download the metadata file, scan the local directory, // and build a layer map that contains an entry for each remote and local // layer file. 
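For the reworked `SpawnMode` above, a hypothetical call site (a sketch only; the leading parameter names are assumptions, and `Eager`/`Lazy` replace the old `Normal`/`Create` pair):

    // During process startup, attach lazily behind the warmup semaphore;
    // for an on-demand attach (e.g. an incoming location_config), go eager.
    let mode = if during_startup { SpawnMode::Lazy } else { SpawnMode::Eager };
    let tenant = Tenant::spawn(
        conf, tenant_shard_id, resources, attached_conf,
        shard_identity, init_order, mode, &ctx,
    )?;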
@@ -986,14 +1118,25 @@ impl Tenant { .remove(&timeline_id) .expect("just put it in above"); + if let Some(blocking) = index_part.gc_blocking.as_ref() { + // could just filter these away, but it helps while testing + anyhow::ensure!( + !blocking.reasons.is_empty(), + "index_part for {timeline_id} is malformed: it should not have gc blocking with zero reasons" + ); + let prev = gc_blocks.insert(timeline_id, blocking.reasons); + assert!(prev.is_none()); + } + // TODO again handle early failure self.load_remote_timeline( timeline_id, index_part, remote_metadata, TimelineResources { - remote_client: Some(remote_client), - deletion_queue_client: self.deletion_queue_client.clone(), + remote_client, + timeline_get_throttle: self.timeline_get_throttle.clone(), + l0_flush_global_state: self.l0_flush_global_state.clone(), }, ctx, ) @@ -1017,9 +1160,9 @@ impl Tenant { Arc::clone(self), timeline_id, &index_part.metadata, - Some(remote_timeline_client), - self.deletion_queue_client.clone(), + remote_timeline_client, ) + .instrument(tracing::info_span!("timeline_delete", %timeline_id)) .await .context("resume_deletion") .map_err(LoadLocalTimelineError::ResumeDeletion)?; @@ -1029,6 +1172,8 @@ impl Tenant { // IndexPart is the source of truth. self.clean_up_timelines(&existent_timelines)?; + self.gc_block.set_scanned(gc_blocks); + fail::fail_point!("attach-before-activate", |_| { anyhow::bail!("attach-before-activate"); }); @@ -1061,8 +1206,7 @@ impl Tenant { let entry_path = entry.path(); let purge = if crate::is_temporary(entry_path) - // TODO: uninit_mark isn't needed any more, since uninitialized timelines are already - // covered by the check that the timeline must exist in remote storage. + // TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718) || is_uninit_mark(entry_path) || crate::is_delete_mark(entry_path) { @@ -1109,9 +1253,7 @@ impl Tenant { let mut size = 0; for timeline in self.list_timelines() { - if let Some(remote_client) = &timeline.remote_client { - size += remote_client.get_remote_physical_size(); - } + size += timeline.remote_client.get_remote_physical_size(); } size @@ -1146,16 +1288,7 @@ impl Tenant { None }; - // timeline loading after attach expects to find metadata file for each metadata - save_metadata( - self.conf, - &self.tenant_shard_id, - &timeline_id, - &remote_metadata, - ) - .await - .context("save_metadata") - .map_err(LoadLocalTimelineError::Load)?; + let last_aux_file_policy = index_part.last_aux_file_policy(); self.timeline_init_and_sync( timeline_id, @@ -1163,181 +1296,12 @@ impl Tenant { Some(index_part), remote_metadata, ancestor, + last_aux_file_policy, ctx, ) .await } - /// Create a placeholder Tenant object for a broken tenant - pub fn create_broken_tenant( - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - reason: String, - ) -> Arc { - let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( - conf, - tenant_shard_id, - ))); - Arc::new(Tenant::new( - TenantState::Broken { - reason, - backtrace: String::new(), - }, - conf, - AttachedTenantConf::try_from(LocationConf::default()).unwrap(), - // Shard identity isn't meaningful for a broken tenant: it's just a placeholder - // to occupy the slot for this TenantShardId. 
- ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count), - wal_redo_manager, - tenant_shard_id, - None, - DeletionQueueClient::broken(), - )) - } - - fn scan_and_sort_timelines_dir(self: Arc) -> anyhow::Result { - let mut timelines_to_load: HashMap = HashMap::new(); - // Note timelines_to_resume_deletion needs to be separate because it can be not sortable - // from the point of `tree_sort_timelines`. I e some parents can be missing because deletion - // completed in non topological order (for example because parent has smaller number of layer files in it) - let mut timelines_to_resume_deletion: Vec<(TimelineId, Option)> = vec![]; - - let timelines_dir = self.conf.timelines_path(&self.tenant_shard_id); - - for entry in timelines_dir - .read_dir_utf8() - .context("list timelines directory for tenant")? - { - let entry = entry.context("read timeline dir entry")?; - let timeline_dir = entry.path(); - - if crate::is_temporary(timeline_dir) { - info!("Found temporary timeline directory, removing: {timeline_dir}"); - if let Err(e) = std::fs::remove_dir_all(timeline_dir) { - error!("Failed to remove temporary directory '{timeline_dir}': {e:?}"); - } - } else if is_uninit_mark(timeline_dir) { - if !timeline_dir.exists() { - warn!("Timeline dir entry become invalid: {timeline_dir}"); - continue; - } - - let timeline_uninit_mark_file = &timeline_dir; - info!( - "Found an uninit mark file {timeline_uninit_mark_file}, removing the timeline and its uninit mark", - ); - let timeline_id = - TimelineId::try_from(timeline_uninit_mark_file.file_stem()) - .with_context(|| { - format!( - "Could not parse timeline id out of the timeline uninit mark name {timeline_uninit_mark_file}", - ) - })?; - let timeline_dir = self.conf.timeline_path(&self.tenant_shard_id, &timeline_id); - if let Err(e) = - remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) - { - error!("Failed to clean up uninit marked timeline: {e:?}"); - } - } else if crate::is_delete_mark(timeline_dir) { - // If metadata exists, load as usual, continue deletion - let timeline_id = TimelineId::try_from(timeline_dir.file_stem()) - .with_context(|| { - format!( - "Could not parse timeline id out of the timeline uninit mark name {timeline_dir}", - ) - })?; - - info!("Found deletion mark for timeline {}", timeline_id); - - match load_metadata(self.conf, &self.tenant_shard_id, &timeline_id) { - Ok(metadata) => { - timelines_to_resume_deletion.push((timeline_id, Some(metadata))) - } - Err(e) => match &e { - LoadMetadataError::Read(r) => { - if r.kind() != io::ErrorKind::NotFound { - return Err(anyhow::anyhow!(e)).with_context(|| { - format!("Failed to load metadata for timeline_id {timeline_id}") - }); - } - - // If metadata doesnt exist it means that we've crashed without - // completing cleanup_remaining_timeline_fs_traces in DeleteTimelineFlow. - // So save timeline_id for later call to `DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`. - // We cant do it here because the method is async so we'd need block_on - // and here we're in spawn_blocking. cleanup_remaining_timeline_fs_traces uses fs operations - // so that basically results in a cycle: - // spawn_blocking - // - block_on - // - spawn_blocking - // which can lead to running out of threads in blocing pool. 
- timelines_to_resume_deletion.push((timeline_id, None)); - } - _ => { - return Err(anyhow::anyhow!(e)).with_context(|| { - format!("Failed to load metadata for timeline_id {timeline_id}") - }) - } - }, - } - } else { - if !timeline_dir.exists() { - warn!("Timeline dir entry become invalid: {timeline_dir}"); - continue; - } - let timeline_id = TimelineId::try_from(timeline_dir.file_name()) - .with_context(|| { - format!( - "Could not parse timeline id out of the timeline dir name {timeline_dir}", - ) - })?; - let timeline_uninit_mark_file = self - .conf - .timeline_uninit_mark_file_path(self.tenant_shard_id, timeline_id); - if timeline_uninit_mark_file.exists() { - info!( - %timeline_id, - "Found an uninit mark file, removing the timeline and its uninit mark", - ); - if let Err(e) = - remove_timeline_and_uninit_mark(timeline_dir, &timeline_uninit_mark_file) - { - error!("Failed to clean up uninit marked timeline: {e:?}"); - } - continue; - } - - let timeline_delete_mark_file = self - .conf - .timeline_delete_mark_file_path(self.tenant_shard_id, timeline_id); - if timeline_delete_mark_file.exists() { - // Cleanup should be done in `is_delete_mark` branch above - continue; - } - - let file_name = entry.file_name(); - if let Ok(timeline_id) = file_name.parse::() { - let metadata = load_metadata(self.conf, &self.tenant_shard_id, &timeline_id) - .context("failed to load metadata")?; - timelines_to_load.insert(timeline_id, metadata); - } else { - // A file or directory that doesn't look like a timeline ID - warn!("unexpected file or directory in timelines directory: {file_name}"); - } - } - } - - // Sort the array of timeline IDs into tree-order, so that parent comes before - // all its children. - tree_sort_timelines(timelines_to_load, |m| m.ancestor_timeline()).map(|sorted_timelines| { - TenantDirectoryScan { - sorted_timelines_to_load: sorted_timelines, - timelines_to_resume_deletion, - } - }) - } - async fn load_timeline_metadata( self: &Arc, timeline_ids: HashSet, @@ -1359,7 +1323,7 @@ impl Tenant { async move { debug!("starting index part download"); - let index_part = client.download_index_file(cancel_clone).await; + let index_part = client.download_index_file(&cancel_clone).await; debug!("finished index part download"); @@ -1401,139 +1365,65 @@ impl Tenant { Ok(timeline_preloads) } - /// - /// Background task to load in-memory data structures for this tenant, from - /// files on disk. Used at pageserver startup. - /// - /// No background tasks are started as part of this routine. - async fn load_local(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { - span::debug_assert_current_span_has_tenant_id(); - - debug!("loading tenant task"); - - // Load in-memory state to reflect the local files on disk - // - // Scan the directory, peek into the metadata file of each timeline, and - // collect a list of timelines and their ancestors. - let span = info_span!("blocking"); - let cloned = Arc::clone(self); - - let scan = tokio::task::spawn_blocking(move || { - let _g = span.entered(); - cloned.scan_and_sort_timelines_dir() - }) - .await - .context("load spawn_blocking") - .and_then(|res| res)?; - - // FIXME original collect_timeline_files contained one more check: - // 1. 
"Timeline has no ancestor and no layer files" - - // Process loadable timelines first - for (timeline_id, local_metadata) in scan.sorted_timelines_to_load { - if let Err(e) = self - .load_local_timeline(timeline_id, local_metadata, ctx, false) - .await - { - match e { - LoadLocalTimelineError::Load(source) => { - return Err(anyhow::anyhow!(source)).with_context(|| { - format!("Failed to load local timeline: {timeline_id}") - }) - } - LoadLocalTimelineError::ResumeDeletion(source) => { - // Make sure resumed deletion wont fail loading for entire tenant. - error!("Failed to resume timeline deletion: {source:#}") - } - } - } - } - - // Resume deletion ones with deleted_mark - for (timeline_id, maybe_local_metadata) in scan.timelines_to_resume_deletion { - match maybe_local_metadata { - None => { - // See comment in `scan_and_sort_timelines_dir`. - if let Err(e) = - DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces(self, timeline_id) - .await - { - warn!( - "cannot clean up deleted timeline dir timeline_id: {} error: {:#}", - timeline_id, e - ); - } - } - Some(local_metadata) => { - if let Err(e) = self - .load_local_timeline(timeline_id, local_metadata, ctx, true) - .await - { - match e { - LoadLocalTimelineError::Load(source) => { - // We tried to load deleted timeline, this is a bug. - return Err(anyhow::anyhow!(source).context( - format!("This is a bug. We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}") - )); - } - LoadLocalTimelineError::ResumeDeletion(source) => { - // Make sure resumed deletion wont fail loading for entire tenant. - error!("Failed to resume timeline deletion: {source:#}") - } - } - } - } - } - } - - trace!("Done"); - - Ok(()) - } - - /// Subroutine of `load_tenant`, to load an individual timeline - /// - /// NB: The parent is assumed to be already loaded! - #[instrument(skip(self, local_metadata, ctx))] - async fn load_local_timeline( - self: &Arc, + pub(crate) async fn apply_timeline_archival_config( + &self, timeline_id: TimelineId, - local_metadata: TimelineMetadata, - ctx: &RequestContext, - found_delete_mark: bool, - ) -> Result<(), LoadLocalTimelineError> { - span::debug_assert_current_span_has_tenant_id(); + state: TimelineArchivalState, + ) -> Result<(), TimelineArchivalError> { + info!("setting timeline archival config"); + let timeline = { + let timelines = self.timelines.lock().unwrap(); - let resources = self.build_timeline_resources(timeline_id); + let Some(timeline) = timelines.get(&timeline_id) else { + return Err(TimelineArchivalError::NotFound); + }; - if found_delete_mark { - // There is no remote client, we found local metadata. - // Continue cleaning up local disk. 
- DeleteTimelineFlow::resume_deletion( - Arc::clone(self), - timeline_id, - &local_metadata, - None, - self.deletion_queue_client.clone(), - ) - .await - .context("resume deletion") - .map_err(LoadLocalTimelineError::ResumeDeletion)?; - return Ok(()); - } + if state == TimelineArchivalState::Unarchived { + if let Some(ancestor_timeline) = timeline.ancestor_timeline() { + if ancestor_timeline.is_archived() == Some(true) { + return Err(TimelineArchivalError::HasArchivedParent( + ancestor_timeline.timeline_id, + )); + } + } + } - let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { - let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) - .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}")) - .map_err(LoadLocalTimelineError::Load)?; - Some(ancestor_timeline) - } else { - None + // Ensure that there are no non-archived child timelines + let children: Vec = timelines + .iter() + .filter_map(|(id, entry)| { + if entry.get_ancestor_timeline_id() != Some(timeline_id) { + return None; + } + if entry.is_archived() == Some(true) { + return None; + } + Some(*id) + }) + .collect(); + + if !children.is_empty() && state == TimelineArchivalState::Archived { + return Err(TimelineArchivalError::HasUnarchivedChildren(children)); + } + Arc::clone(timeline) }; - self.timeline_init_and_sync(timeline_id, resources, None, local_metadata, ancestor, ctx) - .await - .map_err(LoadLocalTimelineError::Load) + let upload_needed = timeline + .remote_client + .schedule_index_upload_for_timeline_archival_state(state)?; + + if upload_needed { + info!("Uploading new state"); + const MAX_WAIT: Duration = Duration::from_secs(10); + let Ok(v) = + tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await + else { + tracing::warn!("reached timeout for waiting on upload queue"); + return Err(TimelineArchivalError::Timeout); + }; + v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?; + } + Ok(()) } pub(crate) fn tenant_shard_id(&self) -> TenantShardId { @@ -1588,11 +1478,6 @@ impl Tenant { /// Until that happens, the on-disk state is invalid (disk_consistent_lsn=Lsn(0)) /// and the timeline will fail to load at a restart. /// - /// That's why we add an uninit mark file, and wrap it together witht the Timeline - /// in-memory object into UninitializedTimeline. - /// Once the caller is done setting up the timeline, they should call - /// `UninitializedTimeline::initialize_with_lock` to remove the uninit mark. - /// /// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the /// minimum amount of keys required to get a writable timeline. /// (Without it, `put` might fail due to `repartition` failing.) 
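[Editor's sketch] `apply_timeline_archival_config` above schedules an index upload and then bounds its wait on the remote upload queue with `tokio::time::timeout`, mapping the elapsed case to `TimelineArchivalError::Timeout`. The same shape in isolation; `wait_completion` here is a stand-in for the remote client's method:

use std::time::Duration;

async fn wait_completion() -> Result<(), String> {
    // Pretend the upload queue drains quickly.
    tokio::time::sleep(Duration::from_millis(50)).await;
    Ok(())
}

#[tokio::main]
async fn main() {
    const MAX_WAIT: Duration = Duration::from_secs(10);
    // timeout() resolves to Err(Elapsed) if the inner future is not ready in
    // time; on Ok it still carries the inner future's own Result.
    match tokio::time::timeout(MAX_WAIT, wait_completion()).await {
        Err(_elapsed) => eprintln!("reached timeout for waiting on upload queue"),
        Ok(inner) => inner.expect("upload queue reported an error"),
    }
}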
@@ -1608,7 +1493,9 @@ impl Tenant { "Cannot create empty timelines on inactive tenant" ); - let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id)?; + // Protect against concurrent attempts to use this TimelineId + let create_guard = self.create_timeline_create_guard(new_timeline_id)?; + let new_metadata = TimelineMetadata::new( // Initialize disk_consistent LSN to 0, The caller must import some data to // make it valid, before calling finish_creation() @@ -1623,9 +1510,10 @@ impl Tenant { self.prepare_new_timeline( new_timeline_id, &new_metadata, - timeline_uninit_mark, + create_guard, initdb_lsn, None, + None, ) .await } @@ -1664,13 +1552,7 @@ impl Tenant { tline.freeze_and_flush().await.context("freeze_and_flush")?; // Make sure the freeze_and_flush reaches remote storage. - tline - .remote_client - .as_ref() - .unwrap() - .wait_completion() - .await - .unwrap(); + tline.remote_client.wait_completion().await.unwrap(); let tl = uninit_tl.finish_creation()?; // The non-test code would call tl.activate() here. @@ -1678,6 +1560,36 @@ impl Tenant { Ok(tl) } + /// Helper for unit tests to create a timeline with some pre-loaded states. + #[cfg(test)] + #[allow(clippy::too_many_arguments)] + pub async fn create_test_timeline_with_layers( + &self, + new_timeline_id: TimelineId, + initdb_lsn: Lsn, + pg_version: u32, + ctx: &RequestContext, + delta_layer_desc: Vec, + image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, + end_lsn: Lsn, + ) -> anyhow::Result> { + let tline = self + .create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx) + .await?; + tline.force_advance_lsn(end_lsn); + for deltas in delta_layer_desc { + tline + .force_create_delta_layer(deltas, Some(initdb_lsn), ctx) + .await?; + } + for (lsn, images) in image_layer_desc { + tline + .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx) + .await?; + } + Ok(tline) + } + /// Create a new timeline. /// /// Returns the new timeline ID and reference to its Timeline object. @@ -1686,7 +1598,7 @@ impl Tenant { /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists. #[allow(clippy::too_many_arguments)] pub(crate) async fn create_timeline( - &self, + self: &Arc, new_timeline_id: TimelineId, ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, @@ -1711,9 +1623,8 @@ impl Tenant { .map_err(|_| CreateTimelineError::ShuttingDown)?; // Get exclusive access to the timeline ID: this ensures that it does not already exist, - // and that no other creation attempts will be allowed in while we are working. The - // uninit_mark is a guard. - let uninit_mark = match self.create_timeline_uninit_mark(new_timeline_id) { + // and that no other creation attempts will be allowed in while we are working. + let create_guard = match self.create_timeline_create_guard(new_timeline_id) { Ok(m) => m, Err(TimelineExclusionError::AlreadyCreating) => { // Creation is in progress, we cannot create it again, and we cannot @@ -1737,25 +1648,26 @@ impl Tenant { return Err(CreateTimelineError::Conflict); } - if let Some(remote_client) = existing.remote_client.as_ref() { - // Wait for uploads to complete, so that when we return Ok, the timeline - // is known to be durable on remote storage. Just like we do at the end of - // this function, after we have created the timeline ourselves. - // - // We only really care that the initial version of `index_part.json` has - // been uploaded. That's enough to remember that the timeline - // exists. 
However, there is no function to wait specifically for that so - // we just wait for all in-progress uploads to finish. - remote_client - .wait_completion() - .await - .context("wait for timeline uploads to complete")?; - } + // Wait for uploads to complete, so that when we return Ok, the timeline + // is known to be durable on remote storage. Just like we do at the end of + // this function, after we have created the timeline ourselves. + // + // We only really care that the initial version of `index_part.json` has + // been uploaded. That's enough to remember that the timeline + // exists. However, there is no function to wait specifically for that so + // we just wait for all in-progress uploads to finish. + existing + .remote_client + .wait_completion() + .await + .context("wait for timeline uploads to complete")?; return Ok(existing); } }; + pausable_failpoint!("timeline-creation-after-uninit"); + let loaded_timeline = match ancestor_timeline_id { Some(ancestor_timeline_id) => { let ancestor_timeline = self @@ -1789,10 +1701,10 @@ impl Tenant { // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. ancestor_timeline - .wait_lsn(*lsn, ctx) + .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx) .await .map_err(|e| match e { - e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => { + e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => { CreateTimelineError::AncestorLsn(anyhow::anyhow!(e)) } WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown, @@ -1803,7 +1715,7 @@ impl Tenant { &ancestor_timeline, new_timeline_id, ancestor_start_lsn, - uninit_mark, + create_guard, ctx, ) .await? @@ -1813,7 +1725,7 @@ impl Tenant { new_timeline_id, pg_version, load_existing_initdb, - uninit_mark, + create_guard, ctx, ) .await? @@ -1824,16 +1736,16 @@ impl Tenant { // the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must // not send a success to the caller until it is. The same applies to handling retries, // see the handling of [`TimelineExclusionError::AlreadyExists`] above. - if let Some(remote_client) = loaded_timeline.remote_client.as_ref() { - let kind = ancestor_timeline_id - .map(|_| "branched") - .unwrap_or("bootstrapped"); - remote_client.wait_completion().await.with_context(|| { - format!("wait for {} timeline initial uploads to complete", kind) - })?; - } + let kind = ancestor_timeline_id + .map(|_| "branched") + .unwrap_or("bootstrapped"); + loaded_timeline + .remote_client + .wait_completion() + .await + .with_context(|| format!("wait for {} timeline initial uploads to complete", kind))?; - loaded_timeline.activate(broker_client, None, ctx); + loaded_timeline.activate(self.clone(), broker_client, None, ctx); Ok(loaded_timeline) } @@ -1842,7 +1754,7 @@ impl Tenant { self: Arc, timeline_id: TimelineId, ) -> Result<(), DeleteTimelineError> { - DeleteTimelineFlow::run(&self, timeline_id, false).await?; + DeleteTimelineFlow::run(&self, timeline_id).await?; Ok(()) } @@ -1860,27 +1772,26 @@ impl Tenant { /// GC cutoff point is determined conservatively by either `horizon` and `pitr`, whichever /// requires more history to be retained. // - pub async fn gc_iteration( + pub(crate) async fn gc_iteration( &self, target_timeline_id: Option, horizon: u64, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { // Don't start doing work during shutdown if let TenantState::Stopping { .. 
} = self.current_state() {
            return Ok(GcResult::default());
        }

        // there is a global allowed_error for this
-        anyhow::ensure!(
-            self.is_active(),
-            "Cannot run GC iteration on inactive tenant"
-        );
+        if !self.is_active() {
+            return Err(GcError::NotActive);
+        }

        {
-            let conf = self.tenant_conf.read().unwrap();
+            let conf = self.tenant_conf.load();

            if !conf.location.may_delete_layers_hint() {
                info!("Skipping GC in location state {:?}", conf.location);
@@ -1888,6 +1799,14 @@ impl Tenant {
            }
        }

+        let _guard = match self.gc_block.start().await {
+            Ok(guard) => guard,
+            Err(reasons) => {
+                info!("Skipping GC: {reasons}");
+                return Ok(GcResult::default());
+            }
+        };
+
        self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx)
            .await
    }

@@ -1896,21 +1815,23 @@ impl Tenant {
    /// This function is periodically called by compactor task.
    /// Also it can be explicitly requested per timeline through page server
    /// api's 'compact' command.
+    ///
+    /// Returns whether we have a pending compaction task.
    async fn compaction_iteration(
        &self,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> anyhow::Result<(), timeline::CompactionError> {
+    ) -> Result<bool, timeline::CompactionError> {
        // Don't start doing work during shutdown, or when broken, we do not need those in the logs
        if !self.is_active() {
-            return Ok(());
+            return Ok(false);
        }

        {
-            let conf = self.tenant_conf.read().unwrap();
+            let conf = self.tenant_conf.load();
            if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
                info!("Skipping compaction in location state {:?}", conf.location);
-                return Ok(());
+                return Ok(false);
            }
        }

@@ -1934,14 +1855,64 @@ impl Tenant {
            timelines_to_compact
        };

-        for (timeline_id, timeline) in &timelines_to_compact {
-            timeline
-                .compact(cancel, EnumSet::empty(), ctx)
-                .instrument(info_span!("compact_timeline", %timeline_id))
-                .await?;
+        // Before doing any I/O work, check our circuit breaker
+        if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
+            info!("Skipping compaction due to previous failures");
+            return Ok(false);
        }

-        Ok(())
+        let mut has_pending_task = false;
+
+        for (timeline_id, timeline) in &timelines_to_compact {
+            has_pending_task |= timeline
+                .compact(cancel, EnumSet::empty(), ctx)
+                .instrument(info_span!("compact_timeline", %timeline_id))
+                .await
+                .inspect_err(|e| match e {
+                    timeline::CompactionError::ShuttingDown => (),
+                    timeline::CompactionError::Other(e) => {
+                        self.compaction_circuit_breaker
+                            .lock()
+                            .unwrap()
+                            .fail(&CIRCUIT_BREAKERS_BROKEN, e);
+                    }
+                })?;
+        }
+
+        self.compaction_circuit_breaker
+            .lock()
+            .unwrap()
+            .success(&CIRCUIT_BREAKERS_UNBROKEN);
+
+        Ok(has_pending_task)
+    }
+
+    // Call through to all timelines to freeze ephemeral layers if needed. Usually
+    // this happens during ingest: this background housekeeping is for freezing layers
+    // that are open but haven't been written to for some time.
+    async fn ingest_housekeeping(&self) {
+        // Scan through the hashmap and collect a list of all the timelines,
+        // while holding the lock. Then drop the lock and actually perform the
+        // layer freezes. We don't want to block everything else while the
+        // freezes run.
+ let timelines = { + self.timelines + .lock() + .unwrap() + .values() + .filter_map(|timeline| { + if timeline.is_active() { + Some(timeline.clone()) + } else { + None + } + }) + .collect::>() + }; + + for timeline in &timelines { + timeline.maybe_freeze_ephemeral_layer().await; + } } pub fn current_state(&self) -> TenantState { @@ -1956,6 +1927,10 @@ impl Tenant { self.generation } + pub(crate) fn wal_redo_manager_status(&self) -> Option { + self.walredo_mgr.as_ref().and_then(|mgr| mgr.status()) + } + /// Changes tenant status to active, unless shutdown was already requested. /// /// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup @@ -1994,6 +1969,9 @@ impl Tenant { .values() .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping())); + // Before activation, populate each Timeline's GcInfo with information about its children + self.initialize_gc_info(&timelines_accessor); + // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. tasks::start_background_loops(self, background_jobs_can_start); @@ -2001,7 +1979,12 @@ impl Tenant { let mut activated_timelines = 0; for timeline in timelines_to_activate { - timeline.activate(broker_client.clone(), background_jobs_can_start, ctx); + timeline.activate( + self.clone(), + broker_client.clone(), + background_jobs_can_start, + ctx, + ); activated_timelines += 1; } @@ -2047,7 +2030,7 @@ impl Tenant { async fn shutdown( &self, shutdown_progress: completion::Barrier, - freeze_and_flush: bool, + shutdown_mode: timeline::ShutdownMode, ) -> Result<(), completion::Barrier> { span::debug_assert_current_span_has_tenant_id(); @@ -2072,9 +2055,15 @@ impl Tenant { // If we're still attaching, fire the cancellation token early to drop out: this // will prevent us flushing, but ensures timely shutdown if some I/O during attach // is very slow. - if matches!(self.current_state(), TenantState::Attaching) { + let shutdown_mode = if matches!(self.current_state(), TenantState::Attaching) { self.cancel.cancel(); - } + + // Having fired our cancellation token, do not try and flush timelines: their cancellation tokens + // are children of ours, so their flush loops will have shut down already + timeline::ShutdownMode::Hard + } else { + shutdown_mode + }; match self.set_stopping(shutdown_progress, false, false).await { Ok(()) => {} @@ -2093,14 +2082,9 @@ impl Tenant { let timelines = self.timelines.lock().unwrap(); timelines.values().for_each(|timeline| { let timeline = Arc::clone(timeline); - let span = Span::current(); - js.spawn(async move { - if freeze_and_flush { - timeline.flush_and_shutdown().instrument(span).await - } else { - timeline.shutdown().instrument(span).await - } - }); + let timeline_id = timeline.timeline_id; + let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode); + js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await }); }) }; // test_long_timeline_create_then_tenant_delete is leaning on this message @@ -2126,9 +2110,15 @@ impl Tenant { tracing::debug!("Waiting for tasks..."); task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), None).await; + if let Some(walredo_mgr) = self.walredo_mgr.as_ref() { + walredo_mgr.shutdown().await; + } + // Wait for any in-flight operations to complete self.gate.close().await; + remove_tenant_metrics(&self.tenant_shard_id); + Ok(()) } @@ -2328,7 +2318,12 @@ impl Tenant { TenantState::Active { .. } => { return Ok(()); } - TenantState::Broken { .. 
} | TenantState::Stopping { .. } => { + TenantState::Broken { reason, .. } => { + // This is fatal, and reported distinctly from the general case of "will never be active" because + // it's logically a 500 to external API users (broken is always a bug). + return Err(GetActiveTenantError::Broken(reason)); + } + TenantState::Stopping { .. } => { // There's no chance the tenant can transition back into ::Active return Err(GetActiveTenantError::WillNotBecomeActive(current_state)); } @@ -2337,19 +2332,14 @@ impl Tenant { } pub(crate) fn get_attach_mode(&self) -> AttachmentMode { - self.tenant_conf - .read() - .unwrap() - .location - .attach_mode - .clone() + self.tenant_conf.load().location.attach_mode } /// For API access: generate a LocationConfig equivalent to the one that would be used to /// create a Tenant in the same state. Do not use this in hot paths: it's for relatively /// rare external API calls, like a reconciliation at startup. pub(crate) fn get_location_conf(&self) -> models::LocationConfig { - let conf = self.tenant_conf.read().unwrap(); + let conf = self.tenant_conf.load(); let location_config_mode = match conf.location.attach_mode { AttachmentMode::Single => models::LocationConfigMode::AttachedSingle, @@ -2358,14 +2348,14 @@ impl Tenant { }; // We have a pageserver TenantConf, we need the API-facing TenantConfig. - let tenant_config: models::TenantConfig = conf.tenant_conf.into(); + let tenant_config: models::TenantConfig = conf.tenant_conf.clone().into(); models::LocationConfig { mode: location_config_mode, generation: self.generation.into(), secondary_conf: None, shard_number: self.shard_identity.number.0, - shard_count: self.shard_identity.count.0, + shard_count: self.shard_identity.count.literal(), shard_stripe_size: self.shard_identity.stripe_size.0, tenant_conf: tenant_config, } @@ -2375,9 +2365,97 @@ impl Tenant { &self.tenant_shard_id } + pub(crate) fn get_shard_stripe_size(&self) -> ShardStripeSize { + self.shard_identity.stripe_size + } + pub(crate) fn get_generation(&self) -> Generation { self.generation } + + /// This function partially shuts down the tenant (it shuts down the Timelines) and is fallible, + /// and can leave the tenant in a bad state if it fails. The caller is responsible for + /// resetting this tenant to a valid state if we fail. + pub(crate) async fn split_prepare( + &self, + child_shards: &Vec, + ) -> anyhow::Result<()> { + let timelines = self.timelines.lock().unwrap().clone(); + for timeline in timelines.values() { + // We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels + // to ensure that they do not start a split if currently in the process of doing these. + + // Upload an index from the parent: this is partly to provide freshness for the + // child tenants that will copy it, and partly for general ease-of-debugging: there will + // always be a parent shard index in the same generation as we wrote the child shard index. + tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index"); + timeline + .remote_client + .schedule_index_upload_for_file_changes()?; + timeline.remote_client.wait_completion().await?; + + // Shut down the timeline's remote client: this means that the indices we write + // for child shards will not be invalidated by the parent shard deleting layers. 
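[Editor's sketch] The `compaction_iteration` changes earlier in this patch skip work while `compaction_circuit_breaker.is_broken()`, trip the breaker via `fail()` on a non-shutdown error, and reset it via `success()`. The patch's `CircuitBreaker` additionally takes a name and metric counters (`CIRCUIT_BREAKERS_BROKEN`/`CIRCUIT_BREAKERS_UNBROKEN`); this simplified stand-in shows only the trip/backoff/reset mechanics:

use std::time::{Duration, Instant};

struct CircuitBreaker {
    failures: u32,
    threshold: u32,                // consecutive failures before tripping
    broken_at: Option<Instant>,
    reset_after: Option<Duration>, // optional backoff before retrying
}

impl CircuitBreaker {
    fn new(threshold: u32, reset_after: Option<Duration>) -> Self {
        Self { failures: 0, threshold, broken_at: None, reset_after }
    }

    fn is_broken(&mut self) -> bool {
        if let (Some(at), Some(after)) = (self.broken_at, self.reset_after) {
            if at.elapsed() >= after {
                // Backoff elapsed: close the circuit and allow another attempt.
                self.broken_at = None;
                self.failures = 0;
            }
        }
        self.broken_at.is_some()
    }

    fn fail(&mut self) {
        self.failures += 1;
        if self.failures >= self.threshold && self.broken_at.is_none() {
            self.broken_at = Some(Instant::now());
        }
    }

    fn success(&mut self) {
        self.failures = 0;
        self.broken_at = None;
    }
}

fn main() {
    // Mirrors the patch's parameters: five failures trip it, 24h backoff.
    let mut breaker = CircuitBreaker::new(5, Some(Duration::from_secs(3600 * 24)));
    for _ in 0..5 {
        breaker.fail();
    }
    assert!(breaker.is_broken()); // compaction would now be skipped
    breaker.success(); // a successful pass closes the circuit again
    assert!(!breaker.is_broken());
}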
+            tracing::info!(timeline_id=%timeline.timeline_id, "Shutting down remote storage client");
+            timeline.remote_client.shutdown().await;
+
+            // Download methods can still be used after shutdown, as they don't flow through the remote client's
+            // queue. In principle the RemoteTimelineClient could provide this without downloading it, but this
+            // operation is rare, so it's simpler to just download it (and robustly guarantees that the index
+            // we use here really is the remotely persistent one).
+            tracing::info!(timeline_id=%timeline.timeline_id, "Downloading index_part from parent");
+            let result = timeline.remote_client
+                .download_index_file(&self.cancel)
+                .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
+                .await?;
+            let index_part = match result {
+                MaybeDeletedIndexPart::Deleted(_) => {
+                    anyhow::bail!("Timeline deletion happened concurrently with split")
+                }
+                MaybeDeletedIndexPart::IndexPart(p) => p,
+            };
+
+            for child_shard in child_shards {
+                tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index_part for child {}", child_shard.to_index());
+                upload_index_part(
+                    &self.remote_storage,
+                    child_shard,
+                    &timeline.timeline_id,
+                    self.generation,
+                    &index_part,
+                    &self.cancel,
+                )
+                .await?;
+            }
+        }
+
+        Ok(())
+    }
+
+    pub(crate) fn get_sizes(&self) -> TopTenantShardItem {
+        let mut result = TopTenantShardItem {
+            id: self.tenant_shard_id,
+            resident_size: 0,
+            physical_size: 0,
+            max_logical_size: 0,
+        };
+
+        for timeline in self.timelines.lock().unwrap().values() {
+            result.resident_size += timeline.metrics.resident_physical_size_gauge.get();
+
+            result.physical_size += timeline
+                .remote_client
+                .metrics
+                .remote_physical_size_gauge
+                .get();
+            result.max_logical_size = std::cmp::max(
+                result.max_logical_size,
+                timeline.metrics.current_logical_size_gauge.get(),
+            );
+        }
+
+        result
+    }
 }

 /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
@@ -2431,93 +2509,86 @@ where
 impl Tenant {
     pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
-        self.tenant_conf.read().unwrap().tenant_conf
+        self.tenant_conf.load().tenant_conf.clone()
     }

     pub fn effective_config(&self) -> TenantConf {
         self.tenant_specific_overrides()
-            .merge(self.conf.default_tenant_conf)
+            .merge(self.conf.default_tenant_conf.clone())
     }

     pub fn get_checkpoint_distance(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
             .checkpoint_distance
             .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
     }

     pub fn get_checkpoint_timeout(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
             .checkpoint_timeout
             .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
     }

     pub fn get_compaction_target_size(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
             .compaction_target_size
             .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
     }

     pub fn get_compaction_period(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
             .compaction_period
             .unwrap_or(self.conf.default_tenant_conf.compaction_period)
     }

     pub fn get_compaction_threshold(&self)
-> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .compaction_threshold .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } pub fn get_gc_horizon(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .gc_horizon .unwrap_or(self.conf.default_tenant_conf.gc_horizon) } pub fn get_gc_period(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .gc_period .unwrap_or(self.conf.default_tenant_conf.gc_period) } pub fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .image_creation_threshold .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } pub fn get_pitr_interval(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .pitr_interval .unwrap_or(self.conf.default_tenant_conf.pitr_interval) } - pub fn get_trace_read_requests(&self) -> bool { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; - tenant_conf - .trace_read_requests - .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) - } - pub fn get_min_resident_size_override(&self) -> Option { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .min_resident_size_override .or(self.conf.default_tenant_conf.min_resident_size_override) } pub fn get_heatmap_period(&self) -> Option { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); let heatmap_period = tenant_conf .heatmap_period .unwrap_or(self.conf.default_tenant_conf.heatmap_period); @@ -2528,28 +2599,66 @@ impl Tenant { } } + pub fn get_lsn_lease_length(&self) -> Duration { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .lsn_lease_length + .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length) + } + pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { - self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf; + // Use read-copy-update in order to avoid overwriting the location config + // state if this races with [`Tenant::set_new_location_config`]. Note that + // this race is not possible if both request types come from the storage + // controller (as they should!) because an exclusive op lock is required + // on the storage controller side. + self.tenant_conf.rcu(|inner| { + Arc::new(AttachedTenantConf { + tenant_conf: new_tenant_conf.clone(), + location: inner.location, + }) + }); + + self.tenant_conf_updated(&new_tenant_conf); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. 
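[Editor's sketch] `set_new_tenant_config` above replaces the old `RwLock` around `AttachedTenantConf` with an `ArcSwap`: readers take a lock-free snapshot via `load()`, and the writer uses `rcu()` so it rebuilds the new value from whatever is currently stored instead of clobbering a concurrent `set_new_location_config`. A minimal sketch of that pattern, assuming the `arc_swap` crate (version 1); `Conf` and its fields are illustrative only:

use arc_swap::ArcSwap;
use std::sync::Arc;

#[derive(Debug)]
struct Conf {
    gc_horizon: u64,
    location: &'static str,
}

fn main() {
    let conf = ArcSwap::from_pointee(Conf { gc_horizon: 64, location: "attached" });

    // Lock-free read: load() returns a cheap guard to the current snapshot.
    println!("horizon = {}", conf.load().gc_horizon);

    // Read-copy-update: derive the new value from the one currently stored,
    // so another writer's change to `location` is carried forward, not lost.
    conf.rcu(|inner| {
        Arc::new(Conf {
            gc_horizon: 128,
            location: inner.location,
        })
    });

    assert_eq!(conf.load().gc_horizon, 128);
}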
let timelines = self.list_timelines(); for timeline in timelines { - timeline.tenant_conf_updated(); + timeline.tenant_conf_updated(&new_tenant_conf); } } pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) { - *self.tenant_conf.write().unwrap() = new_conf; + let new_tenant_conf = new_conf.tenant_conf.clone(); + + self.tenant_conf.store(Arc::new(new_conf)); + + self.tenant_conf_updated(&new_tenant_conf); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. let timelines = self.list_timelines(); for timeline in timelines { - timeline.tenant_conf_updated(); + timeline.tenant_conf_updated(&new_tenant_conf); } } + fn get_timeline_get_throttle_config( + psconf: &'static PageServerConf, + overrides: &TenantConfOpt, + ) -> throttle::Config { + overrides + .timeline_get_throttle + .clone() + .unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone()) + } + + pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { + let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf); + self.timeline_get_throttle.reconfigure(conf) + } + /// Helper function to create a new Timeline struct. /// /// The returned Timeline is in Loading state. The caller is responsible for @@ -2566,6 +2675,7 @@ impl Tenant { ancestor: Option>, resources: TimelineResources, cause: CreateTimelineCause, + last_aux_file_policy: Option, ) -> anyhow::Result> { let state = match cause { CreateTimelineCause::Load => { @@ -2590,10 +2700,11 @@ impl Tenant { self.tenant_shard_id, self.generation, self.shard_identity, - Arc::clone(&self.walredo_mgr), + self.walredo_mgr.clone(), resources, pg_version, state, + last_aux_file_policy, self.cancel.child_token(), ); @@ -2608,17 +2719,29 @@ impl Tenant { conf: &'static PageServerConf, attached_conf: AttachedTenantConf, shard_identity: ShardIdentity, - walredo_mgr: Arc, + walredo_mgr: Option>, tenant_shard_id: TenantShardId, - remote_storage: Option, + remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, + l0_flush_global_state: L0FlushGlobalState, ) -> Tenant { + debug_assert!( + !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none() + ); + let (state, mut rx) = watch::channel(state); tokio::spawn(async move { - // Strings for metric labels + // reflect tenant state in metrics: + // - global per tenant state: TENANT_STATE_METRIC + // - "set" of broken tenants: BROKEN_TENANTS_SET + // + // set of broken tenants should not have zero counts so that it remains accessible for + // alerting. + let tid = tenant_shard_id.to_string(); - let shard_id_str = format!("{}", tenant_shard_id.shard_slug()); + let shard_id = tenant_shard_id.shard_slug().to_string(); + let set_key = &[tid.as_str(), shard_id.as_str()][..]; fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) { ([state.into()], matches!(state, TenantState::Broken { .. })) @@ -2627,21 +2750,13 @@ impl Tenant { let mut tuple = inspect_state(&rx.borrow_and_update()); let is_broken = tuple.1; - let mut counted_broken = if !is_broken { - // the tenant might be ignored and reloaded, so first remove any previous set - // element. it most likely has already been scraped, as these are manual operations - // right now. most likely we will add it back very soon. 
-                drop(
-                    crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid, &shard_id_str]),
-                );
-                false
-            } else {
+            let mut counted_broken = if is_broken {
                 // add the id to the set right away, there should not be any updates on the channel
-                // after
-                crate::metrics::BROKEN_TENANTS_SET
-                    .with_label_values(&[&tid, &shard_id_str])
-                    .set(1);
+                // afterwards, before the tenant is removed, if ever
+                BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
                 true
+            } else {
+                false
             };

             loop {
@@ -2650,10 +2765,9 @@
                 current.inc();

                 if rx.changed().await.is_err() {
-                    // tenant has been dropped; decrement the counter because a tenant with that
-                    // state is no longer in tenant map, but allow any broken set item to exist
-                    // still.
+                    // tenant has been dropped
                     current.dec();
+                    drop(BROKEN_TENANTS_SET.remove_label_values(set_key));
                     break;
                 }
@@ -2663,10 +2777,9 @@
                 let is_broken = tuple.1;
                 if is_broken && !counted_broken {
                     counted_broken = true;
-                    // insert the tenant_id (back) into the set
-                    crate::metrics::BROKEN_TENANTS_SET
-                        .with_label_values(&[&tid, &shard_id_str])
-                        .inc();
+                    // insert the tenant_id (back) into the set while avoiding needless counter
+                    // access
+                    BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
                 }
             }
         });
@@ -2679,7 +2792,6 @@ impl Tenant {
             // using now here is good enough approximation to catch tenants with really long
             // activation times.
             constructed_at: Instant::now(),
-            tenant_conf: Arc::new(RwLock::new(attached_conf)),
             timelines: Mutex::new(HashMap::new()),
             timelines_creating: Mutex::new(HashSet::new()),
             gc_cs: tokio::sync::Mutex::new(()),
@@ -2690,10 +2802,25 @@ impl Tenant {
             cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
             cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
             eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
+            compaction_circuit_breaker: std::sync::Mutex::new(CircuitBreaker::new(
+                format!("compaction-{tenant_shard_id}"),
+                5,
+                // Compaction can be a very expensive operation, and might leak disk space. It also ought
+                // to be infallible, as long as remote storage is available. So if it repeatedly fails,
+                // use an extremely long backoff.
+                Some(Duration::from_secs(3600 * 24)),
+            )),
             activate_now_sem: tokio::sync::Semaphore::new(0),
-            delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
             cancel: CancellationToken::default(),
-            gate: Gate::new(format!("Tenant<{tenant_shard_id}>")),
+            gate: Gate::default(),
+            timeline_get_throttle: Arc::new(throttle::Throttle::new(
+                Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
+                &crate::metrics::tenant_throttling::TIMELINE_GET,
+            )),
+            tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
+            ongoing_timeline_detach: std::sync::Mutex::default(),
+            gc_block: Default::default(),
+            l0_flush_global_state,
         }
     }

@@ -2701,59 +2828,35 @@ impl Tenant {
     pub(super) fn load_tenant_config(
         conf: &'static PageServerConf,
         tenant_shard_id: &TenantShardId,
-    ) -> anyhow::Result<LocationConf> {
-        let legacy_config_path = conf.tenant_config_path(tenant_shard_id);
+    ) -> Result<LocationConf, LoadConfigError> {
         let config_path = conf.tenant_location_config_path(tenant_shard_id);

-        if config_path.exists() {
-            // New-style config takes precedence
-            let deserialized = Self::read_config(&config_path)?;
-            Ok(toml_edit::de::from_document::<LocationConf>(deserialized)?)
- } else if legacy_config_path.exists() { - // Upgrade path: found an old-style configuration only - let deserialized = Self::read_config(&legacy_config_path)?; - - let mut tenant_conf = TenantConfOpt::default(); - for (key, item) in deserialized.iter() { - match key { - "tenant_config" => { - tenant_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("Failed to parse config from file '{legacy_config_path}' as pageserver config"))?; - } - _ => bail!( - "config file {legacy_config_path} has unrecognized pageserver option '{key}'" - ), - } - } - - // Legacy configs are implicitly in attached state, and do not support sharding - Ok(LocationConf::attached_single( - tenant_conf, - Generation::none(), - &models::ShardParameters::default(), - )) - } else { - // FIXME If the config file is not found, assume that we're attaching - // a detached tenant and config is passed via attach command. - // https://github.com/neondatabase/neon/issues/1555 - // OR: we're loading after incomplete deletion that managed to remove config. - info!( - "tenant config not found in {} or {}", - config_path, legacy_config_path - ); - Ok(LocationConf::default()) - } - } - - fn read_config(path: &Utf8Path) -> anyhow::Result { - info!("loading tenant configuration from {path}"); + info!("loading tenant configuration from {config_path}"); // load and parse file - let config = fs::read_to_string(path) - .with_context(|| format!("Failed to load config from path '{path}'"))?; + let config = fs::read_to_string(&config_path).map_err(|e| { + match e.kind() { + std::io::ErrorKind::NotFound => { + // The config should almost always exist for a tenant directory: + // - When attaching a tenant, the config is the first thing we write + // - When detaching a tenant, we atomically move the directory to a tmp location + // before deleting contents. + // + // The very rare edge case that can result in a missing config is if we crash during attach + // between creating directory and writing config. Callers should handle that as if the + // directory didn't exist. - config - .parse::() - .with_context(|| format!("Failed to parse config from file '{path}' as toml file")) + LoadConfigError::NotFound(config_path) + } + _ => { + // No IO errors except NotFound are acceptable here: other kinds of error indicate local storage or permissions issues + // that we cannot cleanly recover + crate::virtual_file::on_fatal_io_error(&e, "Reading tenant config file") + } + } + })?; + + Ok(toml_edit::de::from_str::(&config)?) 
} #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] @@ -2761,43 +2864,18 @@ impl Tenant { conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, location_conf: &LocationConf, - ) -> anyhow::Result<()> { - let legacy_config_path = conf.tenant_config_path(tenant_shard_id); + ) -> std::io::Result<()> { let config_path = conf.tenant_location_config_path(tenant_shard_id); - Self::persist_tenant_config_at( - tenant_shard_id, - &config_path, - &legacy_config_path, - location_conf, - ) - .await + Self::persist_tenant_config_at(tenant_shard_id, &config_path, location_conf).await } #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(super) async fn persist_tenant_config_at( tenant_shard_id: &TenantShardId, config_path: &Utf8Path, - legacy_config_path: &Utf8Path, location_conf: &LocationConf, - ) -> anyhow::Result<()> { - // Forward compat: write out an old-style configuration that old versions can read, in case we roll back - Self::persist_tenant_config_legacy( - tenant_shard_id, - legacy_config_path, - &location_conf.tenant_conf, - ) - .await?; - - if let LocationMode::Attached(attach_conf) = &location_conf.mode { - // Once we use LocationMode, generations are mandatory. If we aren't using generations, - // then drop out after writing legacy-style config. - if attach_conf.generation.is_none() { - tracing::debug!("Running without generations, not writing new-style LocationConf"); - return Ok(()); - } - } - + ) -> std::io::Result<()> { debug!("persisting tenantconf to {config_path}"); let mut conf_content = r#"# This file contains a specific per-tenant's config. @@ -2806,65 +2884,20 @@ impl Tenant { .to_string(); fail::fail_point!("tenant-config-before-write", |_| { - anyhow::bail!("tenant-config-before-write"); + Err(std::io::Error::new( + std::io::ErrorKind::Other, + "tenant-config-before-write", + )) }); // Convert the config to a toml file. - conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?; + conf_content += + &toml_edit::ser::to_string_pretty(&location_conf).expect("Config serialization failed"); let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX); - let tenant_shard_id = *tenant_shard_id; - let config_path = config_path.to_owned(); - tokio::task::spawn_blocking(move || { - Handle::current().block_on(async move { - let conf_content = conf_content.as_bytes(); - VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content) - .await - .with_context(|| { - format!("write tenant {tenant_shard_id} config to {config_path}") - }) - }) - }) - .await??; - - Ok(()) - } - - #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] - async fn persist_tenant_config_legacy( - tenant_shard_id: &TenantShardId, - target_config_path: &Utf8Path, - tenant_conf: &TenantConfOpt, - ) -> anyhow::Result<()> { - debug!("persisting tenantconf to {target_config_path}"); - - let mut conf_content = r#"# This file contains a specific per-tenant's config. -# It is read in case of pageserver restart. - -[tenant_config] -"# - .to_string(); - - // Convert the config to a toml file. 
-        conf_content += &toml_edit::ser::to_string(&tenant_conf)?;
-
-        let temp_path = path_with_suffix_extension(target_config_path, TEMP_FILE_SUFFIX);
-
-        let tenant_shard_id = *tenant_shard_id;
-        let target_config_path = target_config_path.to_owned();
-        tokio::task::spawn_blocking(move || {
-            Handle::current().block_on(async move {
-                let conf_content = conf_content.as_bytes();
-                VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content)
-                    .await
-                    .with_context(|| {
-                        format!("write tenant {tenant_shard_id} config to {target_config_path}")
-                    })
-            })
-        })
-        .await??;
-        Ok(())
+        let conf_content = conf_content.into_bytes();
+        VirtualFile::crashsafe_overwrite(config_path.to_owned(), temp_path, conf_content).await
    }

    //
@@ -2899,28 +2932,13 @@ impl Tenant {
        pitr: Duration,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> anyhow::Result<GcResult> {
+    ) -> Result<GcResult, GcError> {
        let mut totals: GcResult = Default::default();
        let now = Instant::now();

-        let gc_timelines = match self
+        let gc_timelines = self
            .refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx)
-            .await
-        {
-            Ok(result) => result,
-            Err(e) => {
-                if let Some(PageReconstructError::Cancelled) =
-                    e.downcast_ref::<PageReconstructError>()
-                {
-                    // Handle cancellation
-                    totals.elapsed = now.elapsed();
-                    return Ok(totals);
-                } else {
-                    // Propagate other errors
-                    return Err(e);
-                }
-            }
-        };
+            .await?;

        failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");

@@ -2940,12 +2958,24 @@ impl Tenant {
        // See comments in [`Tenant::branch_timeline`] for more information about why branch
        // creation task can run concurrently with timeline's GC iteration.
        for timeline in gc_timelines {
-            if task_mgr::is_shutdown_requested() || cancel.is_cancelled() {
+            if cancel.is_cancelled() {
                // We were requested to shut down. Stop and return with the progress we
                // made.
                break;
            }
-            let result = timeline.gc().await?;
+            let result = match timeline.gc().await {
+                Err(GcError::TimelineCancelled) => {
+                    if target_timeline_id.is_some() {
+                        // If we were targeting this specific timeline, surface cancellation to caller
+                        return Err(GcError::TimelineCancelled);
+                    } else {
+                        // A timeline may be shutting down independently of the tenant's lifecycle: we should
+                        // skip past this and proceed to try GC on other timelines.
+                        continue;
+                    }
+                }
+                r => r?,
+            };
            totals += result;
        }

@@ -2958,11 +2988,11 @@ impl Tenant {
    /// [`Tenant::get_gc_horizon`].
    ///
    /// This is usually executed as part of periodic gc, but can now be triggered more often.
-    pub async fn refresh_gc_info(
+    pub(crate) async fn refresh_gc_info(
        &self,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<Arc<Timeline>>> {
+    ) -> Result<Vec<Arc<Timeline>>, GcError> {
        // since this method can now be called at different rates than the configured gc loop, it
        // might be that these configuration values get applied faster than what it was previously,
        // since these were only read from the gc task.
@@ -2976,6 +3006,55 @@ impl Tenant {
            .await
    }

+    /// Populate all Timelines' `GcInfo` with information about their children. We do not set the
+    /// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`].
+    ///
+    /// Subsequently, parent-child relationships are updated incrementally during timeline creation/deletion.
+ fn initialize_gc_info( + &self, + timelines: &std::sync::MutexGuard>>, + ) { + // This function must be called before activation: after activation timeline create/delete operations + // might happen, and this function is not safe to run concurrently with those. + assert!(!self.is_active()); + + // Scan all timelines. For each timeline, remember the timeline ID and + // the branch point where it was created. + let mut all_branchpoints: BTreeMap> = BTreeMap::new(); + timelines.iter().for_each(|(timeline_id, timeline_entry)| { + if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { + let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default(); + ancestor_children.push((timeline_entry.get_ancestor_lsn(), *timeline_id)); + } + }); + + // The number of bytes we always keep, irrespective of PITR: this is a constant across timelines + let horizon = self.get_gc_horizon(); + + // Populate each timeline's GcInfo with information about its child branches + for timeline in timelines.values() { + let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints + .remove(&timeline.timeline_id) + .unwrap_or_default(); + + branchpoints.sort_by_key(|b| b.0); + + let mut target = timeline.gc_info.write().unwrap(); + + target.retain_lsns = branchpoints; + + let space_cutoff = timeline + .get_last_record_lsn() + .checked_sub(horizon) + .unwrap_or(Lsn(0)); + + target.cutoffs = GcCutoffs { + space: space_cutoff, + time: Lsn::INVALID, + }; + } + } + async fn refresh_gc_info_internal( &self, target_timeline_id: Option, @@ -2983,83 +3062,103 @@ impl Tenant { pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result>> { - // grab mutex to prevent new timelines from being created here. + ) -> Result>, GcError> { + // before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for + // currently visible timelines. + let timelines = self + .timelines + .lock() + .unwrap() + .values() + .filter(|tl| match target_timeline_id.as_ref() { + Some(target) => &tl.timeline_id == target, + None => true, + }) + .cloned() + .collect::>(); + + if target_timeline_id.is_some() && timelines.is_empty() { + // We were to act on a particular timeline and it wasn't found + return Err(GcError::TimelineNotFound); + } + + let mut gc_cutoffs: HashMap = + HashMap::with_capacity(timelines.len()); + + for timeline in timelines.iter() { + let cutoff = timeline + .get_last_record_lsn() + .checked_sub(horizon) + .unwrap_or(Lsn(0)); + + let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?; + let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs); + assert!(old.is_none()); + } + + if !self.is_active() || self.cancel.is_cancelled() { + return Err(GcError::TenantCancelled); + } + + // grab mutex to prevent new timelines from being created here; avoid doing long operations + // because that will stall branch creation. let gc_cs = self.gc_cs.lock().await; - // Scan all timelines. For each timeline, remember the timeline ID and - // the branch point where it was created. 
- let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = { - let timelines = self.timelines.lock().unwrap(); - let mut all_branchpoints = BTreeSet::new(); - let timeline_ids = { - if let Some(target_timeline_id) = target_timeline_id.as_ref() { - if timelines.get(target_timeline_id).is_none() { - bail!("gc target timeline does not exist") - } - }; - - timelines - .iter() - .map(|(timeline_id, timeline_entry)| { - if let Some(ancestor_timeline_id) = - &timeline_entry.get_ancestor_timeline_id() - { - // If target_timeline is specified, we only need to know branchpoints of its children - if let Some(timeline_id) = target_timeline_id { - if ancestor_timeline_id == &timeline_id { - all_branchpoints.insert(( - *ancestor_timeline_id, - timeline_entry.get_ancestor_lsn(), - )); - } - } - // Collect branchpoints for all timelines - else { - all_branchpoints.insert(( - *ancestor_timeline_id, - timeline_entry.get_ancestor_lsn(), - )); - } - } - - *timeline_id - }) - .collect::>() - }; - (all_branchpoints, timeline_ids) - }; - // Ok, we now know all the branch points. // Update the GC information for each timeline. - let mut gc_timelines = Vec::with_capacity(timeline_ids.len()); - for timeline_id in timeline_ids { - // Timeline is known to be local and loaded. - let timeline = self - .get_timeline(timeline_id, false) - .with_context(|| format!("Timeline {timeline_id} was not found"))?; - - // If target_timeline is specified, ignore all other timelines + let mut gc_timelines = Vec::with_capacity(timelines.len()); + for timeline in timelines { + // We filtered the timeline list above if let Some(target_timeline_id) = target_timeline_id { - if timeline_id != target_timeline_id { - continue; + assert_eq!(target_timeline_id, timeline.timeline_id); + } + + { + let mut target = timeline.gc_info.write().unwrap(); + + // Cull any expired leases + let now = SystemTime::now(); + target.leases.retain(|_, lease| !lease.is_expired(&now)); + + timeline + .metrics + .valid_lsn_lease_count_gauge + .set(target.leases.len() as u64); + + // Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR + if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { + if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) { + target.within_ancestor_pitr = + timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time; + } + } + + // Update metrics that depend on GC state + timeline + .metrics + .archival_size + .set(if target.within_ancestor_pitr { + timeline.metrics.current_logical_size_gauge.get() + } else { + 0 + }); + timeline.metrics.pitr_history_size.set( + timeline + .get_last_record_lsn() + .checked_sub(target.cutoffs.time) + .unwrap_or(Lsn(0)) + .0, + ); + + // Apply the cutoffs we found to the Timeline's GcInfo. Why might we _not_ have cutoffs for a timeline? 
+ // - this timeline was created while we were finding cutoffs
+ // - lsn for timestamp search fails for this timeline repeatedly
+ if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) {
+ target.cutoffs = cutoffs.clone();
 }
 }
- if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) {
- let branchpoints: Vec<Lsn> = all_branchpoints
- .range((
- Included((timeline_id, Lsn(0))),
- Included((timeline_id, Lsn(u64::MAX))),
- ))
- .map(|&x| x.1)
- .collect();
- timeline
- .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
- .await?;
-
- gc_timelines.push(timeline);
- }
+ gc_timelines.push(timeline);
 }
 drop(gc_cs);
 Ok(gc_timelines)
@@ -3074,17 +3173,53 @@ impl Tenant {
 &self,
 src_timeline: &Arc<Timeline>,
 dst_id: TimelineId,
- start_lsn: Option<Lsn>,
+ ancestor_lsn: Option<Lsn>,
 ctx: &RequestContext,
 ) -> Result<Arc<Timeline>, CreateTimelineError> {
- let uninit_mark = self.create_timeline_uninit_mark(dst_id).unwrap();
+ let create_guard = self.create_timeline_create_guard(dst_id).unwrap();
 let tl = self
- .branch_timeline_impl(src_timeline, dst_id, start_lsn, uninit_mark, ctx)
+ .branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, create_guard, ctx)
 .await?;
 tl.set_state(TimelineState::Active);
 Ok(tl)
 }
+
+ /// Helper for unit tests to branch a timeline with some pre-loaded states.
+ #[cfg(test)]
+ #[allow(clippy::too_many_arguments)]
+ pub async fn branch_timeline_test_with_layers(
+ &self,
+ src_timeline: &Arc<Timeline>,
+ dst_id: TimelineId,
+ ancestor_lsn: Option<Lsn>,
+ ctx: &RequestContext,
+ delta_layer_desc: Vec<DeltaLayerTestDesc>,
+ image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
+ end_lsn: Lsn,
+ ) -> anyhow::Result<Arc<Timeline>> {
+ let tline = self
+ .branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx)
+ .await?;
+ let ancestor_lsn = if let Some(ancestor_lsn) = ancestor_lsn {
+ ancestor_lsn
+ } else {
+ tline.get_last_record_lsn()
+ };
+ assert!(end_lsn >= ancestor_lsn);
+ tline.force_advance_lsn(end_lsn);
+ for deltas in delta_layer_desc {
+ tline
+ .force_create_delta_layer(deltas, Some(ancestor_lsn), ctx)
+ .await?;
+ }
+ for (lsn, images) in image_layer_desc {
+ tline
+ .force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx)
+ .await?;
+ }
+ Ok(tline)
+ }
+
 /// Branch an existing timeline.
 ///
 /// The caller is responsible for activating the returned timeline.
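Note on the hunk below: it swaps the explicit `min(gc_info.pitr_cutoff, gc_info.horizon_cutoff)` for a `gc_info.min_cutoff()` call. The helper's definition is not part of the lines shown in this diff; the following is only a sketch of what it plausibly looks like, assuming the `space`/`time` fields of the `GcCutoffs` struct populated in `initialize_gc_info` above and that `Lsn` implements `Ord`:

    impl GcCutoffs {
        /// Sketch only: the effective GC cutoff is the older (smaller) of the
        /// space-based and time-based (PITR) cutoffs, since data becomes
        /// eligible for collection only once it falls below both of them.
        fn min_cutoff(&self) -> Lsn {
            std::cmp::min(self.space, self.time)
        }
    }

Branch creation then validates the requested start LSN against this single value, matching the old behaviour of taking the `min` of the two cutoffs inline.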
@@ -3093,10 +3228,10 @@ impl Tenant {
 src_timeline: &Arc<Timeline>,
 dst_id: TimelineId,
 start_lsn: Option<Lsn>,
- timeline_uninit_mark: TimelineUninitMark<'_>,
+ timeline_create_guard: TimelineCreateGuard<'_>,
 ctx: &RequestContext,
 ) -> Result<Arc<Timeline>, CreateTimelineError> {
- self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_uninit_mark, ctx)
+ self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_create_guard, ctx)
 .await
 }
@@ -3105,7 +3240,7 @@ impl Tenant {
 src_timeline: &Arc<Timeline>,
 dst_id: TimelineId,
 start_lsn: Option<Lsn>,
- timeline_uninit_mark: TimelineUninitMark<'_>,
+ timeline_create_guard: TimelineCreateGuard<'_>,
 _ctx: &RequestContext,
 ) -> Result<Arc<Timeline>, CreateTimelineError> {
 let src_id = src_timeline.timeline_id;
@@ -3146,7 +3281,7 @@ impl Tenant {
 // and then the planned GC cutoff
 {
 let gc_info = src_timeline.gc_info.read().unwrap();
- let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff);
+ let cutoff = gc_info.min_cutoff();
 if start_lsn < cutoff {
 return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
 "invalid branch start lsn: less than planned GC cutoff {cutoff}"
@@ -3189,9 +3324,10 @@ impl Tenant {
 .prepare_new_timeline(
 dst_id,
 &metadata,
- timeline_uninit_mark,
+ timeline_create_guard,
 start_lsn + 1,
 Some(Arc::clone(src_timeline)),
+ src_timeline.last_aux_file_policy.load(),
 )
 .await?;
@@ -3202,20 +3338,17 @@ impl Tenant {
 // We still need to upload its metadata eagerly: if other nodes `attach` the tenant and miss this timeline, their GC
 // could get incorrect information and remove more layers, than needed.
 // See also https://github.com/neondatabase/neon/issues/3865
- if let Some(remote_client) = new_timeline.remote_client.as_ref() {
- remote_client
- .schedule_index_upload_for_metadata_update(&metadata)
- .context("branch initial metadata upload")?;
- }
-
- info!("branched timeline {dst_id} from {src_id} at {start_lsn}");
+ new_timeline
+ .remote_client
+ .schedule_index_upload_for_full_metadata_update(&metadata)
+ .context("branch initial metadata upload")?;
 Ok(new_timeline)
 }
 /// For unit tests, make this visible so that other modules can directly create timelines
 #[cfg(test)]
- #[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
+ #[tracing::instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
 pub(crate) async fn bootstrap_timeline_test(
 &self,
 timeline_id: TimelineId,
@@ -3223,12 +3356,12 @@ impl Tenant {
 load_existing_initdb: Option<TimelineId>,
 ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Timeline>> {
- let uninit_mark = self.create_timeline_uninit_mark(timeline_id).unwrap();
+ let create_guard = self.create_timeline_create_guard(timeline_id).unwrap();
 self.bootstrap_timeline(
 timeline_id,
 pg_version,
 load_existing_initdb,
- uninit_mark,
+ create_guard,
 ctx,
 )
 .await
@@ -3240,11 +3373,6 @@ impl Tenant {
 pgdata_path: &Utf8PathBuf,
 timeline_id: &TimelineId,
 ) -> anyhow::Result<()> {
- let Some(storage) = &self.remote_storage else {
- // No remote storage? No upload.
- return Ok(());
- };
-
 let temp_path = timelines_path.join(format!(
 "{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
 ));
@@ -3255,15 +3383,20 @@ impl Tenant {
 }
 }
- let (pgdata_zstd, tar_zst_size) =
- import_datadir::create_tar_zst(pgdata_path, &temp_path).await?;
+ let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?;
+
+ const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
+ if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT {
+ warn!(
+ "compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."
+ );
+ }
 pausable_failpoint!("before-initdb-upload");
 backoff::retry(
 || async {
 self::remote_timeline_client::upload_initdb_dir(
- storage,
+ &self.remote_storage,
 &self.tenant_shard_id.tenant_id,
 timeline_id,
 pgdata_zstd.try_clone().await?,
@@ -3276,11 +3409,11 @@ impl Tenant {
 3,
 u32::MAX,
 "persist_initdb_tar_zst",
- backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
+ &self.cancel,
 )
- .await?;
-
- Ok(())
+ .await
+ .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
+ .and_then(|x| x)
 }
 /// - run initdb to init temporary instance and get bootstrap data
@@ -3292,7 +3425,7 @@ impl Tenant {
 timeline_id: TimelineId,
 pg_version: u32,
 load_existing_initdb: Option<TimelineId>,
- timeline_uninit_mark: TimelineUninitMark<'_>,
+ timeline_create_guard: TimelineCreateGuard<'_>,
 ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Timeline>> {
 // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
@@ -3304,13 +3437,14 @@ impl Tenant {
 TEMP_FILE_SUFFIX,
 );
- // an uninit mark was placed before, nothing else can access this timeline files
- // current initdb was not run yet, so remove whatever was left from the previous runs
+ // Remove whatever was left from the previous runs: safe because TimelineCreateGuard guarantees
+ // we won't race with other creations or existent timelines with the same path.
 if pgdata_path.exists() {
 fs::remove_dir_all(&pgdata_path).with_context(|| {
 format!("Failed to remove already existing initdb directory: {pgdata_path}")
 })?;
 }
+
+ // this new directory is only temporary: we remove it immediately after bootstrap, since we don't need it afterwards
 scopeguard::defer!
{
 if let Err(e) = fs::remove_dir_all(&pgdata_path) {
@@ -3319,9 +3453,6 @@ impl Tenant {
 }
 }
 if let Some(existing_initdb_timeline_id) = load_existing_initdb {
- let Some(storage) = &self.remote_storage else {
- bail!("no storage configured but load_existing_initdb set to {existing_initdb_timeline_id}");
- };
 if existing_initdb_timeline_id != timeline_id {
 let source_path = &remote_initdb_archive_path(
 &self.tenant_shard_id.tenant_id,
@@ -3329,15 +3460,17 @@ impl Tenant {
 );
 let dest_path =
 &remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id);
- storage
- .copy_object(source_path, dest_path)
+
+ // if this fails, it will get retried by subsequent control plane requests
+ self.remote_storage
+ .copy_object(source_path, dest_path, &self.cancel)
 .await
 .context("copy initdb tar")?;
 }
 let (initdb_tar_zst_path, initdb_tar_zst) =
 self::remote_timeline_client::download_initdb_tar_zst(
 self.conf,
- storage,
+ &self.remote_storage,
 &self.tenant_shard_id,
 &existing_initdb_timeline_id,
 &self.cancel,
@@ -3353,7 +3486,7 @@ impl Tenant {
 let buf_read =
 BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
- import_datadir::extract_tar_zst(&pgdata_path, buf_read)
+ extract_zst_tarball(&pgdata_path, buf_read)
 .await
 .context("extract initdb tar")?;
 } else {
@@ -3361,7 +3494,7 @@ impl Tenant {
 run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
 // Upload the created data dir to S3
- if self.tenant_shard_id().is_zero() {
+ if self.tenant_shard_id().is_shard_zero() {
 self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id)
 .await?;
 }
@@ -3385,15 +3518,22 @@ impl Tenant {
 .prepare_new_timeline(
 timeline_id,
 &new_metadata,
- timeline_uninit_mark,
+ timeline_create_guard,
 pgdata_lsn,
 None,
+ None,
 )
 .await?;
 let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id;
 let unfinished_timeline = raw_timeline.raw_timeline()?;
+
+ // Flush the new layer files to disk, before we make the timeline available to
+ // the outside world.
+ //
+ // Flush loop needs to be spawned in order to be able to flush.
+ unfinished_timeline.maybe_spawn_flush_loop();
+
 import_datadir::import_timeline_from_postgres_datadir(
 unfinished_timeline,
 &pgdata_path,
@@ -3405,12 +3545,6 @@ impl Tenant {
 format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}")
 })?;
- // Flush the new layer files to disk, before we make the timeline as available to
- // the outside world.
- //
- // Flush loop needs to be spawned in order to be able to flush.
- unfinished_timeline.maybe_spawn_flush_loop();
-
 fail::fail_point!("before-checkpoint-new-timeline", |_| {
 anyhow::bail!("failpoint before-checkpoint-new-timeline");
 });
@@ -3427,34 +3561,23 @@ impl Tenant {
 // All done!
 let timeline = raw_timeline.finish_creation()?;
- info!(
- "created root timeline {} timeline.lsn {}",
- timeline_id,
- timeline.get_last_record_lsn()
- );
-
 Ok(timeline)
 }
 /// Call this before constructing a timeline, to build its required structures
 fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
- let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
- let remote_client = RemoteTimelineClient::new(
- remote_storage.clone(),
- self.deletion_queue_client.clone(),
- self.conf,
- self.tenant_shard_id,
- timeline_id,
- self.generation,
- );
- Some(remote_client)
- } else {
- None
- };
-
+ let remote_client = RemoteTimelineClient::new(
+ self.remote_storage.clone(),
+ self.deletion_queue_client.clone(),
+ self.conf,
+ self.tenant_shard_id,
+ timeline_id,
+ self.generation,
+ );
 TimelineResources {
 remote_client,
- deletion_queue_client: self.deletion_queue_client.clone(),
+ timeline_get_throttle: self.timeline_get_throttle.clone(),
+ l0_flush_global_state: self.l0_flush_global_state.clone(),
 }
 }
@@ -3462,22 +3585,22 @@ impl Tenant {
 ///
 /// An empty layer map is initialized, and new data and WAL can be imported starting
 /// at 'disk_consistent_lsn'. After any initial data has been imported, call
- /// `finish_creation` to insert the Timeline into the timelines map and to remove the
- /// uninit mark file.
+ /// `finish_creation` to insert the Timeline into the timelines map.
 async fn prepare_new_timeline<'a>(
 &'a self,
 new_timeline_id: TimelineId,
 new_metadata: &TimelineMetadata,
- uninit_mark: TimelineUninitMark<'a>,
+ create_guard: TimelineCreateGuard<'a>,
 start_lsn: Lsn,
 ancestor: Option<Arc<Timeline>>,
+ last_aux_file_policy: Option<AuxFilePolicy>,
 ) -> anyhow::Result<UninitializedTimeline> {
 let tenant_shard_id = self.tenant_shard_id;
 let resources = self.build_timeline_resources(new_timeline_id);
- if let Some(remote_client) = &resources.remote_client {
- remote_client.init_upload_queue_for_empty_remote(new_metadata)?;
- }
+ resources
+ .remote_client
+ .init_upload_queue_for_empty_remote(new_metadata)?;
 let timeline_struct = self
 .create_timeline_struct(
@@ -3486,17 +3609,18 @@ impl Tenant {
 ancestor,
 resources,
 CreateTimelineCause::Load,
+ last_aux_file_policy,
 )
 .context("Failed to create timeline data structure")?;
 timeline_struct.init_empty_layer_map(start_lsn);
 if let Err(e) = self
- .create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata)
+ .create_timeline_files(&create_guard.timeline_path)
 .await
 {
 error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}");
- cleanup_timeline_directory(uninit_mark);
+ cleanup_timeline_directory(create_guard);
 return Err(e);
 }
@@ -3507,54 +3631,31 @@ impl Tenant {
 Ok(UninitializedTimeline::new(
 self,
 new_timeline_id,
- Some((timeline_struct, uninit_mark)),
+ Some((timeline_struct, create_guard)),
 ))
 }
- async fn create_timeline_files(
- &self,
- timeline_path: &Utf8Path,
- new_timeline_id: &TimelineId,
- new_metadata: &TimelineMetadata,
- ) -> anyhow::Result<()> {
+ async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> {
 crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;
- fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
- anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
+ fail::fail_point!("after-timeline-dir-creation", |_| {
+ anyhow::bail!("failpoint after-timeline-dir-creation");
 });
- save_metadata(
- self.conf,
- &self.tenant_shard_id,
- new_timeline_id,
 new_metadata,
- )
- .await
- .context("Failed to create timeline metadata")?;
 Ok(())
 }
- /// Attempts to create an uninit mark file for the timeline initialization.
- /// Bails, if the timeline is already loaded into the memory (i.e. initialized before), or the uninit mark file already exists.
- ///
- /// This way, we need to hold the timelines lock only for small amount of time during the mark check/creation per timeline init.
- fn create_timeline_uninit_mark(
+ /// Get a guard that provides exclusive access to the timeline directory, preventing
+ /// concurrent attempts to create the same timeline.
+ fn create_timeline_create_guard(
 &self,
 timeline_id: TimelineId,
- ) -> Result<TimelineUninitMark, TimelineExclusionError> {
+ ) -> Result<TimelineCreateGuard, TimelineExclusionError> {
 let tenant_shard_id = self.tenant_shard_id;
- let uninit_mark_path = self
- .conf
- .timeline_uninit_mark_file_path(tenant_shard_id, timeline_id);
 let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
- let uninit_mark = TimelineUninitMark::new(
- self,
- timeline_id,
- uninit_mark_path.clone(),
- timeline_path.clone(),
- )?;
+ let create_guard = TimelineCreateGuard::new(self, timeline_id, timeline_path.clone())?;
 // At this stage, we have got exclusive access to in-memory state for this timeline ID
 // for creation.
@@ -3570,23 +3671,7 @@ impl Tenant {
 )));
 }
- // Create the on-disk uninit mark _after_ the in-memory acquisition of the tenant ID: guarantees
- // that during process runtime, colliding creations will be caught in-memory without getting
- // as far as failing to write a file.
- fs::OpenOptions::new()
- .write(true)
- .create_new(true)
- .open(&uninit_mark_path)
- .context("Failed to create uninit mark file")
- .and_then(|_| {
- crashsafe::fsync_file_and_parent(&uninit_mark_path)
- .context("Failed to fsync uninit mark file")
- })
- .with_context(|| {
- format!("Failed to crate uninit mark for timeline {tenant_shard_id}/{timeline_id}")
- })?;
-
- Ok(uninit_mark)
+ Ok(create_guard)
 }
 /// Gathers inputs from all of the timelines to produce a sizing model input.
@@ -3601,7 +3686,7 @@ impl Tenant {
 cause: LogicalSizeCalculationCause,
 cancel: &CancellationToken,
 ctx: &RequestContext,
- ) -> anyhow::Result<size::ModelInputs> {
+ ) -> Result<size::ModelInputs, size::CalculateSyntheticSizeError> {
 let logical_sizes_at_once = self
 .conf
 .concurrent_tenant_size_logical_size_queries
@@ -3614,7 +3699,11 @@ impl Tenant {
 // is in progress (which is not a common case).
 //
 // See more for on the issue #2748 condenced out of the initial PR review.
- let mut shared_cache = self.cached_logical_sizes.lock().await;
+ let mut shared_cache = tokio::select!
{
+ locked = self.cached_logical_sizes.lock() => locked,
+ _ = cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled),
+ _ = self.cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled),
+ };
 size::gather_inputs(
 self,
@@ -3637,10 +3726,10 @@ impl Tenant {
 cause: LogicalSizeCalculationCause,
 cancel: &CancellationToken,
 ctx: &RequestContext,
- ) -> anyhow::Result<u64> {
+ ) -> Result<u64, size::CalculateSyntheticSizeError> {
 let inputs = self.gather_size_inputs(None, cause, cancel, ctx).await?;
- let size = inputs.calculate()?;
+ let size = inputs.calculate();
 self.set_cached_synthetic_size(size);
@@ -3653,7 +3742,7 @@ impl Tenant {
 .store(size, Ordering::Relaxed);
 // Only shard zero should be calculating synthetic sizes
- debug_assert!(self.shard_identity.is_zero());
+ debug_assert!(self.shard_identity.is_shard_zero());
 TENANT_SYNTHETIC_SIZE_METRIC
 .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
@@ -3680,9 +3769,7 @@ impl Tenant {
 tracing::info!(timeline_id=%timeline.timeline_id, "Flushing...");
 timeline.freeze_and_flush().await?;
 tracing::info!(timeline_id=%timeline.timeline_id, "Waiting for uploads...");
- if let Some(client) = &timeline.remote_client {
- client.wait_completion().await?;
- }
+ timeline.remote_client.wait_completion().await?;
 Ok(())
 }
@@ -3697,9 +3784,8 @@ impl Tenant {
 // Run each timeline's flush in a task holding the timeline's gate: this
 // means that if this function's future is cancelled, the Timeline shutdown
 // will still wait for any I/O in here to complete.
- let gate = match timeline.gate.enter() {
- Ok(g) => g,
- Err(_) => continue,
+ let Ok(gate) = timeline.gate.enter() else {
+ continue;
 };
 let jh = tokio::task::spawn(async move { flush_timeline(gate, timeline).await });
 results.push(jh);
@@ -3724,29 +3810,31 @@ impl Tenant {
 Ok(())
 }
-}
-fn remove_timeline_and_uninit_mark(
- timeline_dir: &Utf8Path,
- uninit_mark: &Utf8Path,
-) -> anyhow::Result<()> {
- fs::remove_dir_all(timeline_dir)
- .or_else(|e| {
- if e.kind() == std::io::ErrorKind::NotFound {
- // we can leave the uninit mark without a timeline dir,
- // just remove the mark then
- Ok(())
- } else {
- Err(e)
- }
- })
- .with_context(|| {
- format!("Failed to remove unit marked timeline directory {timeline_dir}")
- })?;
- fs::remove_file(uninit_mark)
- .with_context(|| format!("Failed to remove timeline uninit mark file {uninit_mark}"))?;
+ pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
+ self.tenant_conf.load().tenant_conf.clone()
+ }
- Ok(())
+ /// How much local storage would this tenant like to have? It can cope with
+ /// less than this (via eviction and on-demand downloads), but this function enables
+ /// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
+ /// by keeping important things on local disk.
+ ///
+ /// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
+ /// than they report here, due to layer eviction. Tenants with many active branches may
+ /// actually use more than they report here.
+ pub(crate) fn local_storage_wanted(&self) -> u64 {
+ let timelines = self.timelines.lock().unwrap();
+
+ // Heuristic: we use the max() of the timelines' visible sizes, rather than the sum. This
+ // reflects the observation that on tenants with multiple large branches, typically only one
+ // of them is used actively enough to occupy space on disk.
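+ //
+ // As an illustration (hypothetical numbers, not part of this change): with three
+ // branches whose visible sizes are 100 GiB, 2 GiB and 1 GiB, sum() would advertise
+ // 103 GiB of wanted disk, while max() advertises 100 GiB, reflecting that only the
+ // largest branch is expected to be hot at any one time.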
+ timelines + .values() + .map(|t| t.metrics.visible_physical_size_gauge.get()) + .max() + .unwrap_or(0) + } } /// Create the cluster temporarily in 'initdbpath' directory inside the repository @@ -3778,6 +3866,11 @@ async fn run_initdb( .env_clear() .env("LD_LIBRARY_PATH", &initdb_lib_dir) .env("DYLD_LIBRARY_PATH", &initdb_lib_dir) + .stdin(std::process::Stdio::null()) + // stdout invocation produces the same output every time, we don't need it + .stdout(std::process::Stdio::null()) + // we would be interested in the stderr output, if there was any + .stderr(std::process::Stdio::piped()) .spawn()?; // Ideally we'd select here with the cancellation token, but the problem is that @@ -3802,11 +3895,6 @@ async fn run_initdb( Ok(()) } -impl Drop for Tenant { - fn drop(&mut self) { - remove_tenant_metrics(&self.tenant_shard_id); - } -} /// Dump contents of a layer file to stdout. pub async fn dump_layerfile_from_path( path: &Utf8Path, @@ -3841,24 +3929,19 @@ pub async fn dump_layerfile_from_path( #[cfg(test)] pub(crate) mod harness { use bytes::{Bytes, BytesMut}; - use camino::Utf8PathBuf; use once_cell::sync::OnceCell; use pageserver_api::models::ShardParameters; use pageserver_api::shard::ShardIndex; - use std::fs; - use std::sync::Arc; use utils::logging; - use utils::lsn::Lsn; use crate::deletion_queue::mock::MockDeletionQueue; - use crate::{ - config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord, - }; + use crate::l0_flush::L0FlushConfig; + use crate::walredo::apply_neon; + use crate::{repository::Key, walrecord::NeonWalRecord}; use super::*; - use crate::tenant::config::{TenantConf, TenantConfOpt}; use hex_literal::hex; - use utils::id::{TenantId, TimelineId}; + use utils::id::TenantId; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); @@ -3866,8 +3949,7 @@ pub(crate) mod harness { TimelineId::from_array(hex!("AA223344556677881122334455667788")); /// Convenience function to create a page image with given string as the only content - #[allow(non_snake_case)] - pub fn TEST_IMG(s: &str) -> Bytes { + pub fn test_img(s: &str) -> Bytes { let mut buf = BytesMut::new(); buf.extend_from_slice(s.as_bytes()); buf.resize(64, 0); @@ -3883,6 +3965,7 @@ pub(crate) mod harness { compaction_target_size: Some(tenant_conf.compaction_target_size), compaction_period: Some(tenant_conf.compaction_period), compaction_threshold: Some(tenant_conf.compaction_threshold), + compaction_algorithm: Some(tenant_conf.compaction_algorithm), gc_horizon: Some(tenant_conf.gc_horizon), gc_period: Some(tenant_conf.gc_period), image_creation_threshold: Some(tenant_conf.image_creation_threshold), @@ -3890,23 +3973,24 @@ pub(crate) mod harness { walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout), lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout), max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), - trace_read_requests: Some(tenant_conf.trace_read_requests), eviction_policy: Some(tenant_conf.eviction_policy), min_resident_size_override: tenant_conf.min_resident_size_override, evictions_low_residence_duration_metric_threshold: Some( tenant_conf.evictions_low_residence_duration_metric_threshold, ), - gc_feedback: Some(tenant_conf.gc_feedback), heatmap_period: Some(tenant_conf.heatmap_period), + lazy_slru_download: Some(tenant_conf.lazy_slru_download), + timeline_get_throttle: Some(tenant_conf.timeline_get_throttle), + image_layer_creation_check_threshold: Some( + tenant_conf.image_layer_creation_check_threshold, + ), + 
switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy),
+ lsn_lease_length: Some(tenant_conf.lsn_lease_length),
+ lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
 }
 }
 }
- enum LoadMode {
- Local,
- Remote,
- }
-
 pub struct TenantHarness {
 pub conf: &'static PageServerConf,
 pub tenant_conf: TenantConf,
@@ -3934,7 +4018,13 @@ pub(crate) mod harness {
 }
 impl TenantHarness {
- pub fn create(test_name: &'static str) -> anyhow::Result<Self> {
+ pub async fn create_custom(
+ test_name: &'static str,
+ tenant_conf: TenantConf,
+ tenant_id: TenantId,
+ shard_identity: ShardIdentity,
+ generation: Generation,
+ ) -> anyhow::Result<Self> {
 setup_logging();
 let repo_dir = PageServerConf::test_repo_dir(test_name);
@@ -3946,16 +4036,12 @@ pub(crate) mod harness {
 // OK in a test.
 let conf: &'static PageServerConf = Box::leak(Box::new(conf));
- // Disable automatic GC and compaction to make the unit tests more deterministic.
- // The tests perform them manually if needed.
- let tenant_conf = TenantConf {
- gc_period: Duration::ZERO,
- compaction_period: Duration::ZERO,
- ..TenantConf::default()
+ let shard = shard_identity.shard_index();
+ let tenant_shard_id = TenantShardId {
+ tenant_id,
+ shard_number: shard.shard_number,
+ shard_count: shard.shard_count,
 };
-
- let tenant_id = TenantId::generate();
- let tenant_shard_id = TenantShardId::unsharded(tenant_id);
 fs::create_dir_all(conf.tenant_path(&tenant_shard_id))?;
 fs::create_dir_all(conf.timelines_path(&tenant_shard_id))?;
@@ -3963,62 +4049,64 @@ pub(crate) mod harness {
 let remote_fs_dir = conf.workdir.join("localfs");
 std::fs::create_dir_all(&remote_fs_dir).unwrap();
 let config = RemoteStorageConfig {
- storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
+ storage: RemoteStorageKind::LocalFs {
+ local_path: remote_fs_dir.clone(),
+ },
+ timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
 };
- let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
+ let remote_storage = GenericRemoteStorage::from_config(&config).await.unwrap();
 let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));
 Ok(Self {
 conf,
 tenant_conf,
 tenant_shard_id,
- generation: Generation::new(0xdeadbeef),
- shard: ShardIndex::unsharded(),
+ generation,
+ shard,
 remote_storage,
 remote_fs_dir,
 deletion_queue,
 })
 }
- pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
+ pub async fn create(test_name: &'static str) -> anyhow::Result<Self> {
+ // Disable automatic GC and compaction to make the unit tests more deterministic.
+ // The tests perform them manually if needed.
+ let tenant_conf = TenantConf {
+ gc_period: Duration::ZERO,
+ compaction_period: Duration::ZERO,
+ ..TenantConf::default()
+ };
+ let tenant_id = TenantId::generate();
+ let shard = ShardIdentity::unsharded();
+ Self::create_custom(
+ test_name,
+ tenant_conf,
+ tenant_id,
+ shard,
+ Generation::new(0xdeadbeef),
+ )
+ .await
+ }
+
+ pub fn span(&self) -> tracing::Span {
+ info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
+ }
+
+ pub(crate) async fn load(&self) -> (Arc<Tenant>, RequestContext) {
 let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
 (
- self.try_load(&ctx)
+ self.do_try_load(&ctx)
 .await
 .expect("failed to load test tenant"),
 ctx,
 )
 }
- fn remote_empty(&self) -> bool {
- let tenant_path = self.conf.tenant_path(&self.tenant_shard_id);
- let remote_tenant_dir = self
- .remote_fs_dir
- .join(tenant_path.strip_prefix(&self.conf.workdir).unwrap());
- if std::fs::metadata(&remote_tenant_dir).is_err() {
- return true;
- }
-
- match std::fs::read_dir(remote_tenant_dir)
- .unwrap()
- .flatten()
- .next()
- {
- Some(entry) => {
- tracing::debug!(
- "remote_empty: not empty, found file {}",
- entry.file_name().to_string_lossy(),
- );
- false
- }
- None => true,
- }
- }
-
- async fn do_try_load(
+ #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
+ pub(crate) async fn do_try_load(
 &self,
 ctx: &RequestContext,
- mode: LoadMode,
 ) -> anyhow::Result<Arc<Tenant>> {
 let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
@@ -4026,37 +4114,25 @@ pub(crate) mod harness {
 TenantState::Loading,
 self.conf,
 AttachedTenantConf::try_from(LocationConf::attached_single(
- TenantConfOpt::from(self.tenant_conf),
+ TenantConfOpt::from(self.tenant_conf.clone()),
 self.generation,
 &ShardParameters::default(),
 ))
 .unwrap(),
 // This is a legacy/test code path: sharding isn't supported here.
 ShardIdentity::unsharded(),
- walredo_mgr,
+ Some(walredo_mgr),
 self.tenant_shard_id,
- Some(self.remote_storage.clone()),
+ self.remote_storage.clone(),
 self.deletion_queue.new_client(),
+ // TODO: ideally we should run all unit tests with both configs
+ L0FlushGlobalState::new(L0FlushConfig::default()),
 ));
- match mode {
- LoadMode::Local => {
- tenant
- .load_local(ctx)
- .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
- .await?;
- }
- LoadMode::Remote => {
- let preload = tenant
- .preload(&self.remote_storage, CancellationToken::new())
- .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
- .await?;
- tenant
- .attach(Some(preload), SpawnMode::Normal, ctx)
- .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
- .await?;
- }
- }
+ let preload = tenant
+ .preload(&self.remote_storage, CancellationToken::new())
+ .await?;
+ tenant.attach(Some(preload), ctx).await?;
 tenant.state.send_replace(TenantState::Active);
 for timeline in tenant.timelines.lock().unwrap().values() {
@@ -4065,27 +4141,6 @@ pub(crate) mod harness {
 Ok(tenant)
 }
- /// For tests that specifically want to exercise the local load path, which does
- /// not use remote storage.
- pub async fn try_load_local(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
- self.do_try_load(ctx, LoadMode::Local).await
- }
-
- /// The 'load' in this function is either a local load or a normal attachment,
- pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
- // If we have nothing in remote storage, must use load_local instead of attach: attach
- // will error out if there are no timelines.
- //
- // See https://github.com/neondatabase/neon/issues/5456 for how we will eliminate
- // this weird state of a Tenant which exists but doesn't have any timelines.
- let mode = match self.remote_empty() {
- true => LoadMode::Local,
- false => LoadMode::Remote,
- };
-
- self.do_try_load(ctx, mode).await
- }
-
 pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf {
 self.conf.timeline_path(&self.tenant_shard_id, timeline_id)
 }
@@ -4105,67 +4160,93 @@ pub(crate) mod harness {
 base_img: Option<(Lsn, Bytes)>,
 records: Vec<(Lsn, NeonWalRecord)>,
 _pg_version: u32,
- ) -> anyhow::Result<Bytes> {
- let s = format!(
- "redo for {} to get to {}, with {} and {} records",
- key,
- lsn,
- if base_img.is_some() {
- "base image"
- } else {
- "no base image"
- },
- records.len()
- );
- println!("{s}");
+ ) -> Result<Bytes, walredo::Error> {
+ let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
+ if records_neon {
+ // For Neon wal records, we can decode without spawning postgres, so do so.
+ let base_img = base_img.expect("Neon WAL redo requires base image").1;
+ let mut page = BytesMut::new();
+ page.extend_from_slice(&base_img);
+ for (record_lsn, record) in records {
+ apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?;
+ }
+ Ok(page.freeze())
+ } else {
+ // We never spawn a postgres walredo process in unit tests: just log what we might have done.
+ let s = format!(
+ "redo for {} to get to {}, with {} and {} records",
+ key,
+ lsn,
+ if base_img.is_some() {
+ "base image"
+ } else {
+ "no base image"
+ },
+ records.len()
+ );
+ println!("{s}");
- Ok(TEST_IMG(&s))
+ Ok(test_img(&s))
+ }
 }
 }
}
#[cfg(test)]
mod tests {
+ use std::collections::{BTreeMap, BTreeSet};
+
 use super::*;
 use crate::keyspace::KeySpaceAccum;
+ use crate::pgdatadir_mapping::AuxFilesDirectory;
 use crate::repository::{Key, Value};
 use crate::tenant::harness::*;
+ use crate::tenant::timeline::CompactFlags;
+ use crate::walrecord::NeonWalRecord;
 use crate::DEFAULT_PG_VERSION;
- use crate::METADATA_FILE_NAME;
- use bytes::BytesMut;
+ use bytes::{Bytes, BytesMut};
 use hex_literal::hex;
- use once_cell::sync::Lazy;
+ use itertools::Itertools;
+ use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
+ use pageserver_api::keyspace::KeySpace;
+ use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
 use rand::{thread_rng, Rng};
- use tokio_util::sync::CancellationToken;
+ use storage_layer::PersistentLayerKey;
+ use tests::storage_layer::ValuesReconstructState;
+ use tests::timeline::{GetVectoredError, ShutdownMode};
+ use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
+ use timeline::{DeltaLayerTestDesc, GcInfo};
+ use utils::bin_ser::BeSer;
+ use utils::id::TenantId;
 static TEST_KEY: Lazy<Key> =
 Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
 #[tokio::test]
 async fn test_basic() -> anyhow::Result<()> {
- let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await;
+ let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await;
 let tline = tenant
 .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
 .await?;
- let writer = tline.writer().await;
+ let mut writer = tline.writer().await;
 writer
 .put(
 *TEST_KEY,
 Lsn(0x10),
- &Value::Image(TEST_IMG("foo at 0x10")),
+ &Value::Image(test_img("foo at 0x10")),
 &ctx,
 )
 .await?;
 writer.finish_write(Lsn(0x10));
 drop(writer);
- let writer = tline.writer().await;
+ let mut writer = tline.writer().await;
 writer
 .put(
 *TEST_KEY,
 Lsn(0x20),
- &Value::Image(TEST_IMG("foo at 0x20")),
+ &Value::Image(test_img("foo at 0x20")),
 &ctx,
 )
 .await?;
@@ -4174,15 +4255,15 @@ mod tests {
 assert_eq!(
 tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
- TEST_IMG("foo at 0x10")
+ test_img("foo at 0x10")
 );
 assert_eq!(
 tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?,
- TEST_IMG("foo at 0x10")
+ test_img("foo at 0x10")
 );
 assert_eq!(
 tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?,
- TEST_IMG("foo at 0x20")
+ test_img("foo at 0x20")
 );
 Ok(())
@@ -4190,7 +4271,8 @@ mod tests {
 #[tokio::test]
 async fn no_duplicate_timelines() -> anyhow::Result<()> {
- let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")?
+ let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")
+ .await?
.load() .await; let _ = tenant @@ -4222,11 +4304,11 @@ mod tests { async fn test_branch() -> anyhow::Result<()> { use std::str::from_utf8; - let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_branch").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; #[allow(non_snake_case)] let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap(); @@ -4260,7 +4342,7 @@ mod tests { let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); - let new_writer = newtline.writer().await; + let mut new_writer = newtline.writer().await; new_writer .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx) .await?; @@ -4291,15 +4373,14 @@ mod tests { ctx: &RequestContext, ) -> anyhow::Result<()> { let mut lsn = start_lsn; - #[allow(non_snake_case)] { - let writer = tline.writer().await; + let mut writer = tline.writer().await; // Create a relation on the timeline writer .put( *TEST_KEY, lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {}", lsn))), ctx, ) .await?; @@ -4309,7 +4390,7 @@ mod tests { .put( *TEST_KEY, lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {}", lsn))), ctx, ) .await?; @@ -4318,12 +4399,12 @@ mod tests { } tline.freeze_and_flush().await?; { - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {}", lsn))), ctx, ) .await?; @@ -4333,19 +4414,20 @@ mod tests { .put( *TEST_KEY, lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {}", lsn))), ctx, ) .await?; writer.finish_write(lsn); } - tline.freeze_and_flush().await + tline.freeze_and_flush().await.map_err(|e| e.into()) } #[tokio::test] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? + TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data") + .await? .load() .await; let tline = tenant @@ -4392,7 +4474,8 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? + TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn") + .await? .load() .await; @@ -4414,7 +4497,7 @@ mod tests { .source() .unwrap() .to_string() - .contains("is earlier than latest GC horizon")); + .contains("is earlier than latest GC cutoff")); } } @@ -4447,7 +4530,8 @@ mod tests { #[tokio::test] async fn test_get_branchpoints_from_an_inactive_timeline() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")? + TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline") + .await? 
.load() .await; let tline = tenant @@ -4481,21 +4565,25 @@ mod tests { { let branchpoints = &tline.gc_info.read().unwrap().retain_lsns; assert_eq!(branchpoints.len(), 1); - assert_eq!(branchpoints[0], Lsn(0x40)); + assert_eq!(branchpoints[0], (Lsn(0x40), NEW_TIMELINE_ID)); } // You can read the key from the child branch even though the parent is // Broken, as long as you don't need to access data from the parent. assert_eq!( newtline.get(*TEST_KEY, Lsn(0x70), &ctx).await?, - TEST_IMG(&format!("foo at {}", Lsn(0x70))) + test_img(&format!("foo at {}", Lsn(0x70))) ); // This needs to traverse to the parent, and fails. let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err(); - assert!(err - .to_string() - .contains("will not become active. Current state: Broken")); + assert!( + err.to_string().starts_with(&format!( + "bad state on timeline {}: Broken", + tline.timeline_id + )), + "{err}" + ); Ok(()) } @@ -4503,7 +4591,8 @@ mod tests { #[tokio::test] async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { let (tenant, ctx) = - TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? + TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child") + .await? .load() .await; let tline = tenant @@ -4533,10 +4622,10 @@ mod tests { } #[tokio::test] async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { - let (tenant, ctx) = - TenantHarness::create("test_parent_keeps_data_forever_after_branching")? - .load() - .await; + let (tenant, ctx) = TenantHarness::create("test_parent_keeps_data_forever_after_branching") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4565,7 +4654,7 @@ mod tests { // Check that the data is still accessible on the branch. 
assert_eq!( newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?, - TEST_IMG(&format!("foo at {}", Lsn(0x40))) + test_img(&format!("foo at {}", Lsn(0x40))) ); Ok(()) @@ -4574,7 +4663,7 @@ mod tests { #[tokio::test] async fn timeline_load() -> anyhow::Result<()> { const TEST_NAME: &str = "timeline_load"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::create(TEST_NAME).await?; { let (tenant, ctx) = harness.load().await; let tline = tenant @@ -4583,8 +4672,8 @@ mod tests { make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?; // so that all uploads finish & we can call harness.load() below again tenant - .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) + .shutdown(Default::default(), ShutdownMode::FreezeAndFlush) + .instrument(harness.span()) .await .ok() .unwrap(); @@ -4601,7 +4690,7 @@ mod tests { #[tokio::test] async fn timeline_load_with_ancestor() -> anyhow::Result<()> { const TEST_NAME: &str = "timeline_load_with_ancestor"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::create(TEST_NAME).await?; // create two timelines { let (tenant, ctx) = harness.load().await; @@ -4624,8 +4713,8 @@ mod tests { // so that all uploads finish & we can call harness.load() below again tenant - .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) + .shutdown(Default::default(), ShutdownMode::FreezeAndFlush) + .instrument(harness.span()) .await .ok() .unwrap(); @@ -4649,7 +4738,10 @@ mod tests { #[tokio::test] async fn delta_layer_dumping() -> anyhow::Result<()> { use storage_layer::AsLayerDesc; - let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_layer_dumping") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4657,10 +4749,10 @@ mod tests { let layer_map = tline.layers.read().await; let level0_deltas = layer_map - .layer_map() - .get_level0_deltas()? - .into_iter() - .map(|desc| layer_map.get_from_desc(&desc)) + .layer_map()? 
+ .level0_deltas()
+ .iter()
+ .map(|desc| layer_map.get_from_desc(desc))
 .collect::<Vec<_>>();
 assert!(!level0_deltas.is_empty());
@@ -4674,73 +4766,19 @@ mod tests {
 Ok(())
 }
- #[tokio::test]
- async fn corrupt_local_metadata() -> anyhow::Result<()> {
- const TEST_NAME: &str = "corrupt_metadata";
- let harness = TenantHarness::create(TEST_NAME)?;
- let (tenant, ctx) = harness.load().await;
-
- let tline = tenant
- .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
- .await?;
- drop(tline);
- // so that all uploads finish & we can call harness.try_load() below again
- tenant
- .shutdown(Default::default(), true)
- .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id))
- .await
- .ok()
- .unwrap();
- drop(tenant);
-
- // Corrupt local metadata
- let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
- assert!(metadata_path.is_file());
- let mut metadata_bytes = std::fs::read(&metadata_path)?;
- assert_eq!(metadata_bytes.len(), 512);
- metadata_bytes[8] ^= 1;
- std::fs::write(metadata_path, metadata_bytes)?;
-
- let err = harness.try_load_local(&ctx).await.expect_err("should fail");
- // get all the stack with all .context, not only the last one
- let message = format!("{err:#}");
- let expected = "failed to load metadata";
- assert!(
- message.contains(expected),
- "message '{message}' expected to contain {expected}"
- );
-
- let mut found_error_message = false;
- let mut err_source = err.source();
- while let Some(source) = err_source {
- if source.to_string().contains("metadata checksum mismatch") {
- found_error_message = true;
- break;
- }
- err_source = source.source();
- }
- assert!(
- found_error_message,
- "didn't find the corrupted metadata error in {}",
- message
- );
-
- Ok(())
- }
-
 #[tokio::test]
 async fn test_images() -> anyhow::Result<()> {
- let (tenant, ctx) = TenantHarness::create("test_images")?.load().await;
+ let (tenant, ctx) = TenantHarness::create("test_images").await?.load().await;
 let tline = tenant
 .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
 .await?;
- let writer = tline.writer().await;
+ let mut writer = tline.writer().await;
 writer
 .put(
 *TEST_KEY,
 Lsn(0x10),
- &Value::Image(TEST_IMG("foo at 0x10")),
+ &Value::Image(test_img("foo at 0x10")),
 &ctx,
 )
 .await?;
@@ -4752,12 +4790,12 @@ mod tests {
 .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
 .await?;
- let writer = tline.writer().await;
+ let mut writer = tline.writer().await;
 writer
 .put(
 *TEST_KEY,
 Lsn(0x20),
- &Value::Image(TEST_IMG("foo at 0x20")),
+ &Value::Image(test_img("foo at 0x20")),
 &ctx,
 )
 .await?;
@@ -4769,12 +4807,12 @@ mod tests {
 .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
 .await?;
- let writer = tline.writer().await;
+ let mut writer = tline.writer().await;
 writer
 .put(
 *TEST_KEY,
 Lsn(0x30),
- &Value::Image(TEST_IMG("foo at 0x30")),
+ &Value::Image(test_img("foo at 0x30")),
 &ctx,
 )
 .await?;
@@ -4786,12 +4824,12 @@ mod tests {
 .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
 .await?;
- let writer = tline.writer().await;
+ let mut writer = tline.writer().await;
 writer
 .put(
 *TEST_KEY,
 Lsn(0x40),
- &Value::Image(TEST_IMG("foo at 0x40")),
+ &Value::Image(test_img("foo at 0x40")),
 &ctx,
 )
 .await?;
@@ -4805,58 +4843,72 @@ mod tests {
 assert_eq!(
 tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
- TEST_IMG("foo at 0x10")
+ test_img("foo at 0x10")
 );
 assert_eq!(
 tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?,
- TEST_IMG("foo at 0x10")
+ test_img("foo at 0x10")
 );
 assert_eq!(
 tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?,
- TEST_IMG("foo at 0x20")
+ test_img("foo at 0x20")
 );
 assert_eq!(
 tline.get(*TEST_KEY, Lsn(0x30), &ctx).await?,
- TEST_IMG("foo at 0x30")
+ test_img("foo at 0x30")
 );
 assert_eq!(
 tline.get(*TEST_KEY, Lsn(0x40), &ctx).await?,
- TEST_IMG("foo at 0x40")
+ test_img("foo at 0x40")
 );
 Ok(())
 }
- //
- // Insert 1000 key-value pairs with increasing keys, flush, compact, GC.
- // Repeat 50 times.
- //
- #[tokio::test]
- async fn test_bulk_insert() -> anyhow::Result<()> {
- let harness = TenantHarness::create("test_bulk_insert")?;
- let (tenant, ctx) = harness.load().await;
- let tline = tenant
- .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
- .await?;
+ async fn bulk_insert_compact_gc(
+ tenant: &Tenant,
+ timeline: &Arc<Timeline>,
+ ctx: &RequestContext,
+ lsn: Lsn,
+ repeat: usize,
+ key_count: usize,
+ ) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
+ let compact = true;
+ bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await
+ }
- let mut lsn = Lsn(0x10);
-
- let mut keyspace = KeySpaceAccum::new();
+ async fn bulk_insert_maybe_compact_gc(
+ tenant: &Tenant,
+ timeline: &Arc<Timeline>,
+ ctx: &RequestContext,
+ mut lsn: Lsn,
+ repeat: usize,
+ key_count: usize,
+ compact: bool,
+ ) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
+ let mut inserted: HashMap<Key, BTreeSet<Lsn>> = Default::default();
 let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
 let mut blknum = 0;
- for _ in 0..50 {
- for _ in 0..10000 {
+
+ // Enforce that key range is monotonically increasing
+ let mut keyspace = KeySpaceAccum::new();
+
+ let cancel = CancellationToken::new();
+
+ for _ in 0..repeat {
+ for _ in 0..key_count {
 test_key.field6 = blknum;
- let writer = tline.writer().await;
+ let mut writer = timeline.writer().await;
 writer
 .put(
 test_key,
 lsn,
- &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
- &ctx,
+ &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
+ ctx,
 )
 .await?;
+ inserted.entry(test_key).or_default().insert(lsn);
 writer.finish_write(lsn);
 drop(writer);
@@ -4866,22 +4918,525 @@ mod tests {
 blknum += 1;
 }
- let cutoff = tline.get_last_record_lsn();
+ timeline.freeze_and_flush().await?;
+ if compact {
+ // this requires timeline to be &Arc<Timeline>
+ timeline.compact(&cancel, EnumSet::empty(), ctx).await?;
+ }
- tline
- .update_gc_info(
- Vec::new(),
- cutoff,
- Duration::ZERO,
- &CancellationToken::new(),
+ // this doesn't really need to use the timeline_id target, but it is closer to what it
+ // originally was.
+ let res = tenant
+ .gc_iteration(Some(timeline.timeline_id), 0, Duration::ZERO, &cancel, ctx)
+ .await?;
+
+ assert_eq!(res.layers_removed, 0, "this never removes anything");
+ }
+
+ Ok(inserted)
+ }
+
+ //
+ // Insert 1000 key-value pairs with increasing keys, flush, compact, GC.
+ // Repeat 50 times.
+ //
+ #[tokio::test]
+ async fn test_bulk_insert() -> anyhow::Result<()> {
+ let harness = TenantHarness::create("test_bulk_insert").await?;
+ let (tenant, ctx) = harness.load().await;
+ let tline = tenant
+ .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
+ .await?;
+
+ let lsn = Lsn(0x10);
+ bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
+
+ Ok(())
+ }
+
+ // Test the vectored get real implementation against a simple sequential implementation.
+ //
+ // The test generates a keyspace by repeatedly flushing the in-memory layer and compacting.
+ // Projected to 2D the key space looks like below. Lsn grows upwards on the Y axis and keys
+ // grow to the right on the X axis.
+ // [Delta]
+ // [Delta]
+ // [Delta]
+ // [Delta]
+ // ------------ Image ---------------
+ //
+ // After layer generation we pick the ranges to query as follows:
+ // 1. The beginning of each delta layer
+ // 2. At the seam between two adjacent delta layers
+ //
+ // There's one major downside to this test: delta layers contain only images,
+ // so the search can stop at the first delta layer and doesn't traverse any deeper.
+ #[tokio::test]
+ async fn test_get_vectored() -> anyhow::Result<()> {
+ let harness = TenantHarness::create("test_get_vectored").await?;
+ let (tenant, ctx) = harness.load().await;
+ let tline = tenant
+ .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
+ .await?;
+
+ let lsn = Lsn(0x10);
+ let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
+
+ let guard = tline.layers.read().await;
+ let lm = guard.layer_map()?;
+
+ lm.dump(true, &ctx).await?;
+
+ let mut reads = Vec::new();
+ let mut prev = None;
+ lm.iter_historic_layers().for_each(|desc| {
+ if !desc.is_delta() {
+ prev = Some(desc.clone());
+ return;
+ }
+
+ let start = desc.key_range.start;
+ let end = desc
+ .key_range
+ .start
+ .add(Timeline::MAX_GET_VECTORED_KEYS.try_into().unwrap());
+ reads.push(KeySpace {
+ ranges: vec![start..end],
+ });
+
+ if let Some(prev) = &prev {
+ if !prev.is_delta() {
+ return;
+ }
+
+ let first_range = Key {
+ field6: prev.key_range.end.field6 - 4,
+ ..prev.key_range.end
+ }..prev.key_range.end;
+
+ let second_range = desc.key_range.start..Key {
+ field6: desc.key_range.start.field6 + 4,
+ ..desc.key_range.start
+ };
+
+ reads.push(KeySpace {
+ ranges: vec![first_range, second_range],
+ });
+ };
+
+ prev = Some(desc.clone());
+ });
+
+ drop(guard);
+
+ // Pick a big LSN such that we query over all the changes.
+ let reads_lsn = Lsn(u64::MAX - 1);
+
+ for read in reads {
+ info!("Doing vectored read on {:?}", read);
+
+ let vectored_res = tline
+ .get_vectored_impl(
+ read.clone(),
+ reads_lsn,
+ &mut ValuesReconstructState::new(),
+ &ctx,
+ )
+ .await;
+
+ let mut expected_lsns: HashMap<Key, Lsn> = Default::default();
+ let mut expect_missing = false;
+ let mut key = read.start().unwrap();
+ while key != read.end().unwrap() {
+ if let Some(lsns) = inserted.get(&key) {
+ let expected_lsn = lsns.iter().rfind(|lsn| **lsn <= reads_lsn);
+ match expected_lsn {
+ Some(lsn) => {
+ expected_lsns.insert(key, *lsn);
+ }
+ None => {
+ expect_missing = true;
+ break;
+ }
+ }
+ } else {
+ expect_missing = true;
+ break;
+ }
+
+ key = key.next();
+ }
+
+ if expect_missing {
+ assert!(matches!(vectored_res, Err(GetVectoredError::MissingKey(_))));
+ } else {
+ for (key, image) in vectored_res?
{ + let expected_lsn = expected_lsns.get(&key).expect("determined above"); + let expected_image = test_img(&format!("{} at {}", key.field6, expected_lsn)); + assert_eq!(image?, expected_image); + } + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_get_vectored_aux_files() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_get_vectored_aux_files").await?; + + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) + .await?; + let tline = tline.raw_timeline().unwrap(); + + let mut modification = tline.begin_modification(Lsn(0x1000)); + modification.put_file("foo/bar1", b"content1", &ctx).await?; + modification.set_lsn(Lsn(0x1008))?; + modification.put_file("foo/bar2", b"content2", &ctx).await?; + modification.commit(&ctx).await?; + + let child_timeline_id = TimelineId::generate(); + tenant + .branch_timeline_test( + tline, + child_timeline_id, + Some(tline.get_last_record_lsn()), + &ctx, + ) + .await?; + + let child_timeline = tenant + .get_timeline(child_timeline_id, true) + .expect("Should have the branched timeline"); + + let aux_keyspace = KeySpace { + ranges: vec![NON_INHERITED_RANGE], + }; + let read_lsn = child_timeline.get_last_record_lsn(); + + let vectored_res = child_timeline + .get_vectored_impl( + aux_keyspace.clone(), + read_lsn, + &mut ValuesReconstructState::new(), + &ctx, + ) + .await; + + let images = vectored_res?; + assert!(images.is_empty()); + Ok(()) + } + + // Test that vectored get handles layer gaps correctly + // by advancing into the next ancestor timeline if required. + // + // The test generates timelines that look like the diagram below. + // We leave a gap in one of the L1 layers at `gap_at_key` (`/` in the diagram). + // The reconstruct data for that key lies in the ancestor timeline (`X` in the diagram). + // + // ``` + //-------------------------------+ + // ... | + // [ L1 ] | + // [ / L1 ] | Child Timeline + // ... 
| + // ------------------------------+ + // [ X L1 ] | Parent Timeline + // ------------------------------+ + // ``` + #[tokio::test] + async fn test_get_vectored_key_gap() -> anyhow::Result<()> { + let tenant_conf = TenantConf { + // Make compaction deterministic + gc_period: Duration::ZERO, + compaction_period: Duration::ZERO, + // Encourage creation of L1 layers + checkpoint_distance: 16 * 1024, + compaction_target_size: 8 * 1024, + ..TenantConf::default() + }; + + let harness = TenantHarness::create_custom( + "test_get_vectored_key_gap", + tenant_conf, + TenantId::generate(), + ShardIdentity::unsharded(), + Generation::new(0xdeadbeef), + ) + .await?; + let (tenant, ctx) = harness.load().await; + + let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let gap_at_key = current_key.add(100); + let mut current_lsn = Lsn(0x10); + + const KEY_COUNT: usize = 10_000; + + let timeline_id = TimelineId::generate(); + let current_timeline = tenant + .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx) + .await?; + + current_lsn += 0x100; + + let mut writer = current_timeline.writer().await; + writer + .put( + gap_at_key, + current_lsn, + &Value::Image(test_img(&format!("{} at {}", gap_at_key, current_lsn))), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + drop(writer); + + let mut latest_lsns = HashMap::new(); + latest_lsns.insert(gap_at_key, current_lsn); + + current_timeline.freeze_and_flush().await?; + + let child_timeline_id = TimelineId::generate(); + + tenant + .branch_timeline_test( + ¤t_timeline, + child_timeline_id, + Some(current_lsn), + &ctx, + ) + .await?; + let child_timeline = tenant + .get_timeline(child_timeline_id, true) + .expect("Should have the branched timeline"); + + for i in 0..KEY_COUNT { + if current_key == gap_at_key { + current_key = current_key.next(); + continue; + } + + current_lsn += 0x10; + + let mut writer = child_timeline.writer().await; + writer + .put( + current_key, + current_lsn, + &Value::Image(test_img(&format!("{} at {}", current_key, current_lsn))), &ctx, ) .await?; - tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + writer.finish_write(current_lsn); + drop(writer); + + latest_lsns.insert(current_key, current_lsn); + current_key = current_key.next(); + + // Flush every now and then to encourage layer file creation. + if i % 500 == 0 { + child_timeline.freeze_and_flush().await?; + } + } + + child_timeline.freeze_and_flush().await?; + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceRepartition); + child_timeline + .compact(&CancellationToken::new(), flags, &ctx) + .await?; + + let key_near_end = { + let mut tmp = current_key; + tmp.field6 -= 10; + tmp + }; + + let key_near_gap = { + let mut tmp = gap_at_key; + tmp.field6 -= 10; + tmp + }; + + let read = KeySpace { + ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key], + }; + let results = child_timeline + .get_vectored_impl( + read.clone(), + current_lsn, + &mut ValuesReconstructState::new(), + &ctx, + ) + .await?; + + for (key, img_res) in results { + let expected = test_img(&format!("{} at {}", key, latest_lsns[&key])); + assert_eq!(img_res?, expected); + } + + Ok(()) + } + + // Test that vectored get descends into ancestor timelines correctly and + // does not return an image that's newer than requested. + // + // The diagram below ilustrates an interesting case. We have a parent timeline + // (top of the Lsn range) and a child timeline. 
+ // from the child timeline, so the parent timeline must be visited. When advancing into
+ // the child timeline, the read path needs to remember what the requested Lsn was in
+ // order to avoid returning an image that's too new. The test below constructs such
+ // a timeline setup and does a few queries around the Lsn of each page image.
+ // ```
+ // LSN
+ // ^
+ // |
+ // |
+ // 500 | --------------------------------------> branch point
+ // 400 | X
+ // 300 | X
+ // 200 | --------------------------------------> requested lsn
+ // 100 | X
+ // |---------------------------------------> Key
+ // |
+ // ------> requested key
+ //
+ // Legend:
+ // * X - page images
+ // ```
+ #[tokio::test]
+ async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> {
+ let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?;
+ let (tenant, ctx) = harness.load().await;
+
+ let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
+ let end_key = start_key.add(1000);
+ let child_gap_at_key = start_key.add(500);
+ let mut parent_gap_lsns: BTreeMap<Lsn, String> = BTreeMap::new();
+
+ let mut current_lsn = Lsn(0x10);
+
+ let timeline_id = TimelineId::generate();
+ let parent_timeline = tenant
+ .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx)
+ .await?;
+
+ current_lsn += 0x100;
+
+ for _ in 0..3 {
+ let mut key = start_key;
+ while key < end_key {
+ current_lsn += 0x10;
+
+ let image_value = format!("{} at {}", child_gap_at_key, current_lsn);
+
+ let mut writer = parent_timeline.writer().await;
+ writer
+ .put(
+ key,
+ current_lsn,
+ &Value::Image(test_img(&image_value)),
+ &ctx,
+ )
+ .await?;
+ writer.finish_write(current_lsn);
+
+ if key == child_gap_at_key {
+ parent_gap_lsns.insert(current_lsn, image_value);
+ }
+
+ key = key.next();
+ }
+
+ parent_timeline.freeze_and_flush().await?;
+ }
+
+ let child_timeline_id = TimelineId::generate();
+
+ let child_timeline = tenant
+ .branch_timeline_test(&parent_timeline, child_timeline_id, Some(current_lsn), &ctx)
+ .await?;
+
+ let mut key = start_key;
+ while key < end_key {
+ if key == child_gap_at_key {
+ key = key.next();
+ continue;
+ }
+
+ current_lsn += 0x10;
+
+ let mut writer = child_timeline.writer().await;
+ writer
+ .put(
+ key,
+ current_lsn,
+ &Value::Image(test_img(&format!("{} at {}", key, current_lsn))),
+ &ctx,
+ )
 .await?;
- tline.freeze_and_flush().await?;
- tline
- .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
 .await?;
- tline.gc().await?;
+ writer.finish_write(current_lsn);
+
+ key = key.next();
+ }
+
+ child_timeline.freeze_and_flush().await?;
+
+ let lsn_offsets: [i64; 5] = [-10, -1, 0, 1, 10];
+ let mut query_lsns = Vec::new();
+ for image_lsn in parent_gap_lsns.keys().rev() {
+ for offset in lsn_offsets {
+ query_lsns.push(Lsn(image_lsn
+ .0
+ .checked_add_signed(offset)
+ .expect("Shouldn't overflow")));
+ }
+ }
+
+ for query_lsn in query_lsns {
+ let results = child_timeline
+ .get_vectored_impl(
+ KeySpace {
+ ranges: vec![child_gap_at_key..child_gap_at_key.next()],
+ },
+ query_lsn,
+ &mut ValuesReconstructState::new(),
+ &ctx,
+ )
+ .await;
+
+ let expected_item = parent_gap_lsns
+ .iter()
+ .rev()
+ .find(|(lsn, _)| **lsn <= query_lsn);
+
+ info!(
+ "Doing vectored read at LSN {}.
Expecting image to be: {:?}", + query_lsn, expected_item + ); + + match expected_item { + Some((_, img_value)) => { + let key_results = results.expect("No vectored get error expected"); + let key_result = &key_results[&child_gap_at_key]; + let returned_img = key_result + .as_ref() + .expect("No page reconstruct error expected"); + + info!( + "Vectored read at LSN {} returned image {}", + query_lsn, + std::str::from_utf8(returned_img)? + ); + assert_eq!(*returned_img, test_img(img_value)); + } + None => { + assert!(matches!(results, Err(GetVectoredError::MissingKey(_)))); + } + } } Ok(()) @@ -4889,15 +5444,36 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_random_updates")?; + let names_algorithms = [ + ("test_random_updates_legacy", CompactionAlgorithm::Legacy), + ("test_random_updates_tiered", CompactionAlgorithm::Tiered), + ]; + for (name, algorithm) in names_algorithms { + test_random_updates_algorithm(name, algorithm).await?; + } + Ok(()) + } + + async fn test_random_updates_algorithm( + name: &'static str, + compaction_algorithm: CompactionAlgorithm, + ) -> anyhow::Result<()> { + let mut harness = TenantHarness::create(name).await?; + harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { + kind: compaction_algorithm, + }; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; const NUM_KEYS: usize = 1000; + let cancel = CancellationToken::new(); let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let mut test_key_end = test_key; + test_key_end.field6 = NUM_KEYS as u32; + tline.add_extra_test_dense_keyspace(KeySpace::single(test_key..test_key_end)); let mut keyspace = KeySpaceAccum::new(); @@ -4910,12 +5486,12 @@ mod tests { for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), &ctx, ) .await?; @@ -4931,12 +5507,12 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), &ctx, ) .await?; @@ -4950,26 +5526,15 @@ mod tests { test_key.field6 = blknum as u32; assert_eq!( tline.get(test_key, lsn, &ctx).await?, - TEST_IMG(&format!("{} at {}", blknum, last_lsn)) + test_img(&format!("{} at {}", blknum, last_lsn)) ); } - // Perform a cycle of flush, compact, and GC - let cutoff = tline.get_last_record_lsn(); - tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - &ctx, - ) - .await?; + // Perform a cycle of flush and GC tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; - tline.gc().await?; } Ok(()) @@ -4977,7 +5542,8 @@ mod tests { #[tokio::test] async fn test_traverse_branches() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_traverse_branches")?
+ let (tenant, ctx) = TenantHarness::create("test_traverse_branches") + .await? .load() .await; let mut tline = tenant @@ -4990,6 +5556,8 @@ mod tests { let mut keyspace = KeySpaceAccum::new(); + let cancel = CancellationToken::new(); + // Track when each page was last modified. Used to assert that // a read sees the latest page version. let mut updated = [Lsn(0); NUM_KEYS]; @@ -4999,12 +5567,12 @@ mod tests { for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), &ctx, ) .await?; @@ -5028,12 +5596,12 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), &ctx, ) .await?; @@ -5048,26 +5616,16 @@ mod tests { test_key.field6 = blknum as u32; assert_eq!( tline.get(test_key, lsn, &ctx).await?, - TEST_IMG(&format!("{} at {}", blknum, last_lsn)) + test_img(&format!("{} at {}", blknum, last_lsn)) ); } // Perform a cycle of flush, compact, and GC - let cutoff = tline.get_last_record_lsn(); - tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - &ctx, - ) - .await?; tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; - tline.gc().await?; } Ok(()) @@ -5075,7 +5633,8 @@ mod tests { #[tokio::test] async fn test_traverse_ancestors() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")? + let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors") + .await? .load() .await; let mut tline = tenant @@ -5105,12 +5664,12 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))), + &Value::Image(test_img(&format!("{} {} at {}", idx, blknum, lsn))), &ctx, ) .await?; @@ -5132,7 +5691,7 @@ mod tests { test_key.field6 = blknum as u32; assert_eq!( tline.get(test_key, *lsn, &ctx).await?, - TEST_IMG(&format!("{idx} {blknum} at {lsn}")) + test_img(&format!("{idx} {blknum} at {lsn}")) ); } } @@ -5141,7 +5700,8 @@ mod tests { #[tokio::test] async fn test_write_at_initdb_lsn_takes_optimization_code_path() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")? + let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable") + .await? 
.load() .await; @@ -5208,19 +5768,19 @@ mod tests { } #[tokio::test] - async fn test_uninit_mark_crash() -> anyhow::Result<()> { - let name = "test_uninit_mark_crash"; - let harness = TenantHarness::create(name)?; + async fn test_create_guard_crash() -> anyhow::Result<()> { + let name = "test_create_guard_crash"; + let harness = TenantHarness::create(name).await?; { let (tenant, ctx) = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) .await?; - // Keeps uninit mark in place + // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude further attempts to create it let raw_tline = tline.raw_timeline().unwrap(); raw_tline - .shutdown() - .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id)) + .shutdown(super::timeline::ShutdownMode::Hard) + .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID)) .await; std::mem::forget(tline); } @@ -5244,10 +5804,2669 @@ mod tests { .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID) .exists()); - assert!(!harness - .conf - .timeline_uninit_mark_file_path(tenant.tenant_shard_id, TIMELINE_ID) - .exists()); + Ok(()) + } + + #[tokio::test] + async fn test_read_at_max_lsn() -> anyhow::Result<()> { + let names_algorithms = [ + ("test_read_at_max_lsn_legacy", CompactionAlgorithm::Legacy), + ("test_read_at_max_lsn_tiered", CompactionAlgorithm::Tiered), + ]; + for (name, algorithm) in names_algorithms { + test_read_at_max_lsn_algorithm(name, algorithm).await?; + } + Ok(()) + } + + async fn test_read_at_max_lsn_algorithm( + name: &'static str, + compaction_algorithm: CompactionAlgorithm, + ) -> anyhow::Result<()> { + let mut harness = TenantHarness::create(name).await?; + harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { + kind: compaction_algorithm, + }; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) + .await?; + + let lsn = Lsn(0x10); + let compact = false; + bulk_insert_maybe_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000, compact).await?; + + let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let read_lsn = Lsn(u64::MAX - 1); + + let result = tline.get(test_key, read_lsn, &ctx).await; + assert!(result.is_ok(), "result is not Ok: {}", result.unwrap_err()); + + Ok(()) + } + + #[tokio::test] + async fn test_metadata_scan() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_scan").await?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + const NUM_KEYS: usize = 1000; + const STEP: usize = 10000; // random update + scan base_key + idx * STEP + + let cancel = CancellationToken::new(); + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + let mut test_key = base_key; + + // Track when each page was last modified. Used to assert that + // a read sees the latest page version.
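+ // Keys are written at `base_key + blknum * STEP`, so the keyspace scanned below is sparse: the vectored scan has to step over large unpopulated gaps between the populated keys.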
+ let mut updated = [Lsn(0); NUM_KEYS]; + + let mut lsn = Lsn(0x10); + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + updated[blknum] = lsn; + drop(writer); + } + + let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); + + for iter in 0..=10 { + // Read all the blocks + for (blknum, last_lsn) in updated.iter().enumerate() { + test_key.field6 = (blknum * STEP) as u32; + assert_eq!( + tline.get(test_key, lsn, &ctx).await?, + test_img(&format!("{} at {}", blknum, last_lsn)) + ); + } + + let mut cnt = 0; + for (key, value) in tline + .get_vectored_impl( + keyspace.clone(), + lsn, + &mut ValuesReconstructState::default(), + &ctx, + ) + .await? + { + let blknum = key.field6 as usize; + let value = value?; + assert!(blknum % STEP == 0); + let blknum = blknum / STEP; + assert_eq!( + value, + test_img(&format!("{} at {}", blknum, updated[blknum])) + ); + cnt += 1; + } + + assert_eq!(cnt, NUM_KEYS); + + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + updated[blknum] = lsn; + } + + // Perform two cycles of flush, compact, and GC + for round in 0..2 { + tline.freeze_and_flush().await?; + tline + .compact( + &cancel, + if iter % 5 == 0 && round == 0 { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + } else { + EnumSet::empty() + }, + &ctx, + ) + .await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_metadata_compaction_trigger() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_compaction_trigger").await?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + let cancel = CancellationToken::new(); + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + let test_key = base_key; + let mut lsn = Lsn(0x10); + + for _ in 0..20 { + lsn = Lsn(lsn.0 + 0x10); + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", 0, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + tline.freeze_and_flush().await?; // force create a delta layer + } + + let before_num_l0_delta_files = + tline.layers.read().await.layer_map()?.level0_deltas().len(); + + tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + + let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); + + assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}"); + + assert_eq!( + tline.get(test_key, lsn, &ctx).await?, + test_img(&format!("{} at {}", 0, lsn)) + ); + + Ok(()) + } + + #[tokio::test] + async fn 
test_branch_copies_dirty_aux_file_flag() { + let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag") + .await + .unwrap(); + + // the default aux file policy to switch is v2 if not set by the admins + assert_eq!( + harness.tenant_conf.switch_aux_file_policy, + AuxFilePolicy::default_tenant_config() + ); + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc<Timeline> = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + // no aux file is written at this point, so the persistent flag should be unset + assert_eq!(tline.last_aux_file_policy.load(), None); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + // there is no tenant manager to pass the configuration through, so let's mimic it + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + assert_eq!( + tline.get_switch_aux_file_policy(), + AuxFilePolicy::V2, + "wanted state has been updated" + ); + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V2), + "aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there" + ); + + // we can read everything from the storage + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"second", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V2), + "keep v2 storage format when new files are written" + ); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"second")) + ); + + let child = tenant + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .await + .unwrap(); + + // child copies the last flag even if that is not on remote storage yet + assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2); + assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2)); + + let files = child.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!(files.get("pg_logical/mappings/test1"), None); + assert_eq!(files.get("pg_logical/mappings/test2"), None); + + // even if we crash here without flushing the parent timeline with its new + // last_aux_file_policy we are safe, because the child was never meant to access the ancestor's + // files. The ancestor can even safely switch back to V1 because of a migration.
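+ // (The two `None` asserts above demonstrate this: the child carries the policy flag forward but starts without the parent's aux file contents.)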
+ } + + #[tokio::test] + async fn aux_file_policy_switch() { + let mut harness = TenantHarness::create("aux_file_policy_switch") + .await + .unwrap(); + harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc<Timeline> = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + None, + "no aux file is written so it should be unset" + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + // there is no tenant manager to pass the configuration through, so let's mimic it + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + assert_eq!( + tline.get_switch_aux_file_policy(), + AuxFilePolicy::V2, + "wanted state has been updated" + ); + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::CrossValidation), + "dirty index_part.json reflected state is yet to be updated" + ); + + // we can still read the auxfile v1 before we ingest anything new + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"second", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V2), + "ingesting a file should apply the wanted switch state when applicable" + ); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")), + "cross validation writes to both v1 and v2 so this should be available in v2" + ); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"second")) + ); + + // mimic again by trying to flip it from V2 to V1 (not switched to while ingesting a file) + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V1), + ..Default::default() + }, + tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"third", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!( + tline.get_switch_aux_file_policy(), + AuxFilePolicy::V1, + "wanted state has been updated again, even for an invalid request" + ); + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V2), + "ingesting a file should apply the wanted switch state when applicable" + ); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + assert_eq!( + files.get("pg_logical/mappings/test2"),
Some(&bytes::Bytes::from_static(b"third")) + ); + + // mimic again by trying to flip it from V1 to V2 (not switched to while ingesting a file) + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test3", b"last", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!(tline.get_switch_aux_file_policy(), AuxFilePolicy::V2); + + assert_eq!(tline.last_aux_file_policy.load(), Some(AuxFilePolicy::V2)); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"third")) + ); + assert_eq!( + files.get("pg_logical/mappings/test3"), + Some(&bytes::Bytes::from_static(b"last")) + ); + } + + #[tokio::test] + async fn aux_file_policy_force_switch() { + let mut harness = TenantHarness::create("aux_file_policy_force_switch") + .await + .unwrap(); + harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1; + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc<Timeline> = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + None, + "no aux file is written so it should be unset" + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + tline.do_switch_aux_policy(AuxFilePolicy::V2).unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V2), + "dirty index_part.json reflected state is yet to be updated" + ); + + // lose all data from v1 + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!(files.get("pg_logical/mappings/test1"), None); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"second", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + // read data ingested in v2 + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"second")) + ); + // lose all data from v1 + assert_eq!(files.get("pg_logical/mappings/test1"), None); + } + + #[tokio::test] + async fn aux_file_policy_auto_detect() { + let mut harness = TenantHarness::create("aux_file_policy_auto_detect") + .await + .unwrap(); + harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to v2 mode + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc<Timeline> = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + None, + "no aux file is written so it should be unset" + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { + files: vec![( + "test_file".to_string(),
Bytes::copy_from_slice(b"test_file"), + )] + .into_iter() + .collect(), + }) + .unwrap(); + modification.put_for_test(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); + modification.commit(&ctx).await.unwrap(); + } + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V1), + "keep using v1 because there are aux files written with v1" + ); + + // we can still read the auxfile v1 + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + assert_eq!( + files.get("test_file"), + Some(&bytes::Bytes::from_static(b"test_file")) + ); + } + + #[tokio::test] + async fn test_metadata_image_creation() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_image_creation").await?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + const NUM_KEYS: usize = 1000; + const STEP: usize = 10000; // random update + scan base_key + idx * STEP + + let cancel = CancellationToken::new(); + + let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix... + let mut test_key = base_key; + let mut lsn = Lsn(0x10); + + async fn scan_with_statistics( + tline: &Timeline, + keyspace: &KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result<(BTreeMap<Key, Result<Bytes, PageReconstructError>>, usize)> { + let mut reconstruct_state = ValuesReconstructState::default(); + let res = tline + .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) + .await?; + Ok((res, reconstruct_state.get_delta_layers_visited() as usize)) + } + + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + } + + let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); + + for iter in 1..=10 { + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + } + + tline.freeze_and_flush().await?; + + if iter % 5 == 0 { + let (_, before_delta_file_accessed) = + scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + tline + .compact( + &cancel, + { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + }, + &ctx, + ) + .await?; + let (_, after_delta_file_accessed) = + scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}"); + // Given that we already produced an image layer, there should be no delta layer needed for
the scan, but still setting a low threshold there for unforeseen circumstances. + assert!( + after_delta_file_accessed <= 2, + "after_delta_file_accessed={after_delta_file_accessed}" + ); + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_vectored_missing_data_key_reads").await?; + let (tenant, ctx) = harness.load().await; + + let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap(); + let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap(); + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + Vec::new(), // delta layers + vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers + Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN + ) + .await?; + tline.add_extra_test_dense_keyspace(KeySpace::single(base_key..(base_key_nonexist.next()))); + + let child = tenant + .branch_timeline_test_with_layers( + &tline, + NEW_TIMELINE_ID, + Some(Lsn(0x20)), + &ctx, + Vec::new(), // delta layers + vec![(Lsn(0x30), vec![(base_key_child, test_img("data key 2"))])], // image layers + Lsn(0x30), + ) + .await + .unwrap(); + + let lsn = Lsn(0x30); + + // test vectored get on parent timeline + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, + Some(test_img("data key 1")) + ); + assert!(get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error()); + assert!( + get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error() + ); + + // test vectored get on child timeline + assert_eq!( + get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?, + Some(test_img("data key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?, + Some(test_img("data key 2")) + ); + assert!( + get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error() + ); + + Ok(()) + } + + #[tokio::test] + async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads").await?; + let (tenant, ctx) = harness.load().await; + + let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap(); + let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap(); + assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix... 
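+ // Unlike the data-key test above, where a missing key surfaces as `GetVectoredError::MissingKey`, a missing metadata key is expected to read back as `None` (see the asserts below).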
+ + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + Vec::new(), // delta layers + vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers + Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN + ) + .await?; + + let child = tenant + .branch_timeline_test_with_layers( + &tline, + NEW_TIMELINE_ID, + Some(Lsn(0x20)), + &ctx, + Vec::new(), // delta layers + vec![( + Lsn(0x30), + vec![(base_key_child, test_img("metadata key 2"))], + )], // image layers + Lsn(0x30), + ) + .await + .unwrap(); + + let lsn = Lsn(0x30); + + // test vectored get on parent timeline + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, + Some(test_img("metadata key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?, + None + ); + + // test vectored get on child timeline + assert_eq!( + get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?, + Some(test_img("metadata key 2")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?, + None + ); + + Ok(()) + } + + async fn get_vectored_impl_wrapper( + tline: &Arc<Timeline>, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result<Option<Bytes>, GetVectoredError> { + let mut reconstruct_state = ValuesReconstructState::new(); + let mut res = tline + .get_vectored_impl( + KeySpace::single(key..key.next()), + lsn, + &mut reconstruct_state, + ctx, + ) + .await?; + Ok(res.pop_last().map(|(k, v)| { + assert_eq!(k, key); + v.unwrap() + })) + } + + #[tokio::test] + async fn test_metadata_tombstone_reads() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_tombstone_reads").await?; + let (tenant, ctx) = harness.load().await; + let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); + let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); + let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap(); + let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap(); + + // We emulate the situation where the compaction algorithm creates an image layer that removes the tombstones + // Lsn 0x30 key0, key3, no key1+key2 + // Lsn 0x20 key1+key2 tombstones + // Lsn 0x10 key1 in image, key2 in delta + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + // delta layers + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), + ], + // image layers + vec![ + (Lsn(0x10), vec![(key1, test_img("metadata key 1"))]), + ( + Lsn(0x30), + vec![ + (key0, test_img("metadata key 0")), + (key3, test_img("metadata key 3")), + ], + ), + ], + Lsn(0x30), + ) + .await?; + + let lsn = Lsn(0x30); + let old_lsn = Lsn(0x20); + + assert_eq!( + get_vectored_impl_wrapper(&tline, key0, lsn, &ctx).await?, + Some(test_img("metadata key 0")) + ); +
assert_eq!( + get_vectored_impl_wrapper(&tline, key1, lsn, &ctx).await?, + None, + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, key2, lsn, &ctx).await?, + None, + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, key1, old_lsn, &ctx).await?, + Some(Bytes::new()), + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, key2, old_lsn, &ctx).await?, + Some(Bytes::new()), + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, key3, lsn, &ctx).await?, + Some(test_img("metadata key 3")) + ); + + Ok(()) + } + + #[tokio::test] + async fn test_metadata_tombstone_image_creation() { + let harness = TenantHarness::create("test_metadata_tombstone_image_creation") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); + let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); + let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap(); + let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap(); + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + // delta layers + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x30)..Lsn(0x40), + vec![ + (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))), + (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))), + ], + ), + ], + // image layers + vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], + Lsn(0x40), + ) + .await + .unwrap(); + + let cancel = CancellationToken::new(); + + tline + .compact( + &cancel, + { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + }, + &ctx, + ) + .await + .unwrap(); + + // Image layers are created at last_record_lsn + let images = tline + .inspect_image_layers(Lsn(0x40), &ctx) + .await + .unwrap() + .into_iter() + .filter(|(k, _)| k.is_metadata_key()) + .collect::<Vec<_>>(); + assert_eq!(images.len(), 2); // the image layer should only contain the two existing keys; tombstones should be removed.
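+ // (key1 and key2 were last written as empty tombstone images at Lsn(0x20), so only key0 and key3 survive into the new image layer.)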
+ } + + #[tokio::test] + async fn test_metadata_tombstone_empty_image_creation() { + let harness = TenantHarness::create("test_metadata_tombstone_empty_image_creation") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); + let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap(); + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + // delta layers + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), + ], + // image layers + vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], + Lsn(0x30), + ) + .await + .unwrap(); + + let cancel = CancellationToken::new(); + + tline + .compact( + &cancel, + { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + }, + &ctx, + ) + .await + .unwrap(); + + // Image layers are created at last_record_lsn + let images = tline + .inspect_image_layers(Lsn(0x30), &ctx) + .await + .unwrap() + .into_iter() + .filter(|(k, _)| k.is_metadata_key()) + .collect::<Vec<_>>(); + assert_eq!(images.len(), 0); // the image layer should not contain tombstones; if it would be empty, it is not created at all + } + + #[tokio::test] + async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. + let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + // We create + // - one bottom-most image layer, + // - a delta layer D1 crossing the GC horizon with data below and above the horizon, + // - a delta layer D2 crossing the GC horizon with data only below the horizon, + // - a delta layer D3 above the horizon.
+ // + // | D3 | + // | D1 | + // -| |-- gc horizon ----------------- + // | | | D2 | + // --------- img layer ------------------ + // + // What we should expect from this compaction is: + // | D3 | + // | Part of D1 | + // --------- img layer with D1+D2 at GC horizon------------------ + + // img layer at 0x10 + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::Image(Bytes::from("value 1@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::Image(Bytes::from("value 2@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::Image(Bytes::from("value 3@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::Image(Bytes::from("value 5@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::Image(Bytes::from("value 6@0x20")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::Image(Bytes::from("value 8@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::Image(Bytes::from("value 9@0x48")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.time = Lsn(0x30); + guard.cutoffs.space = Lsn(0x30); + } + + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x20"), + Bytes::from_static(b"value 2@0x30"), + Bytes::from_static(b"value 3@0x40"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x20"), + Bytes::from_static(b"value 6@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x48"), + Bytes::from_static(b"value 9@0x48"), + ]; + + for (idx, expected) in expected_result.iter().enumerate() { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + expected + ); + } + + let cancel = CancellationToken::new(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + + for (idx, expected) in expected_result.iter().enumerate() { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + expected + ); + } + + // Check if the image layer at the GC horizon contains exactly what we want + let image_at_gc_horizon = tline + .inspect_image_layers(Lsn(0x30), &ctx) + .await + .unwrap() + .into_iter() + .filter(|(k, _)| k.is_metadata_key()) + .collect::<Vec<_>>(); + + assert_eq!(image_at_gc_horizon.len(), 10); + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x20"), + Bytes::from_static(b"value 2@0x30"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x20"), + Bytes::from_static(b"value 6@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + for idx in 0..10 { + assert_eq!( + image_at_gc_horizon[idx], + (get_key(idx as u32), expected_result[idx].clone()) + ); + } + + // Check if old layers are removed / new layers have the expected LSN + let mut all_layers =
tline.inspect_historic_layers().await.unwrap(); + all_layers.sort_by(|k1, k2| { + ( + k1.is_delta, + k1.key_range.start, + k1.key_range.end, + k1.lsn_range.start, + k1.lsn_range.end, + ) + .cmp(&( + k2.is_delta, + k2.key_range.start, + k2.key_range.end, + k2.lsn_range.start, + k2.lsn_range.end, + )) + }); + assert_eq!( + all_layers, + vec![ + // Image layer at GC horizon + PersistentLayerKey { + key_range: Key::MIN..Key::NON_L0_MAX, + lsn_range: Lsn(0x30)..Lsn(0x31), + is_delta: false + }, + // The delta layer covers the full range (with the layer key hack to avoid being recognized as L0) + PersistentLayerKey { + key_range: Key::MIN..Key::NON_L0_MAX, + lsn_range: Lsn(0x30)..Lsn(0x48), + is_delta: true + }, + // The delta3 layer that should not be picked for the compaction + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x48)..Lsn(0x50), + is_delta: true + } + ] + ); + + // increase GC horizon and compact again + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.time = Lsn(0x40); + guard.cutoffs.space = Lsn(0x40); + } + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + + Ok(()) + } + + #[tokio::test] + async fn test_neon_test_record() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_neon_test_record").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. + let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(",0x20")), + ), + ( + get_key(1), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(",0x30")), + ), + (get_key(2), Lsn(0x10), Value::Image("0x10".into())), + ( + get_key(2), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(",0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(",0x30")), + ), + (get_key(3), Lsn(0x10), Value::Image("0x10".into())), + ( + get_key(3), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_clear()), + ), + (get_key(4), Lsn(0x10), Value::Image("0x10".into())), + ( + get_key(4), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_init()), + ), + ]; + let image1 = vec![(get_key(1), "0x10".into())]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x40), + delta1, + )], // delta layers + vec![(Lsn(0x10), image1)], // image layers + Lsn(0x50), + ) + .await?; + + assert_eq!( + tline.get(get_key(1), Lsn(0x50), &ctx).await?, + Bytes::from_static(b"0x10,0x20,0x30") + ); + assert_eq!( + tline.get(get_key(2), Lsn(0x50), &ctx).await?, + Bytes::from_static(b"0x10,0x20,0x30") + ); + + // Need to remove the "Neon WAL redo requires base image" limitation before the asserts below can be enabled.
+ + // assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new()); + // assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new()); + + Ok(()) + } + + #[tokio::test] + async fn test_lsn_lease() -> anyhow::Result<()> { + let (tenant, ctx) = TenantHarness::create("test_lsn_lease").await?.load().await; + let key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + + let end_lsn = Lsn(0x100); + let image_layers = (0x20..=0x90) + .step_by(0x10) + .map(|n| { + ( + Lsn(n), + vec![(key, test_img(&format!("data key at {:x}", n)))], + ) + }) + .collect(); + + let timeline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + Vec::new(), + image_layers, + end_lsn, + ) + .await?; + + let leased_lsns = [0x30, 0x50, 0x70]; + let mut leases = Vec::new(); + let _: anyhow::Result<_> = leased_lsns.iter().try_for_each(|n| { + leases.push(timeline.make_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx)?); + Ok(()) + }); + + // Renewing with a shorter lease should not change the lease. + let updated_lease_0 = + timeline.make_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx)?; + assert_eq!(updated_lease_0.valid_until, leases[0].valid_until); + + // Renewing with a long lease should renew the lease with a later expiration time. + let updated_lease_1 = timeline.make_lsn_lease( + Lsn(leased_lsns[1]), + timeline.get_lsn_lease_length() * 2, + &ctx, + )?; + + assert!(updated_lease_1.valid_until > leases[1].valid_until); + + // Force set disk consistent lsn so we can get the cutoff at `end_lsn`. + info!( + "latest_gc_cutoff_lsn: {}", + *timeline.get_latest_gc_cutoff_lsn() + ); + timeline.force_set_disk_consistent_lsn(end_lsn); + + let res = tenant + .gc_iteration( + Some(TIMELINE_ID), + 0, + Duration::ZERO, + &CancellationToken::new(), + &ctx, + ) + .await?; + + // Keeping everything <= Lsn(0x80) b/c leases: + // 0/10: initdb layer + // (0/20..=0/70).step_by(0x10): image layers added when creating the timeline. + assert_eq!(res.layers_needed_by_leases, 7); + // Keeping 0/90 b/c it is the latest layer. + assert_eq!(res.layers_not_updated, 1); + // Removed 0/80. + assert_eq!(res.layers_removed, 1); + + // Make a lease on an already GC-ed LSN. + // 0/80 does not have a valid lease + is below latest_gc_cutoff + assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn()); + let res = timeline.make_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx); + assert!(res.is_err()); + + // Should still be able to renew a currently valid lease + // Assumption: the original lease is still valid for 0/50. + let _ = + timeline.make_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx)?; + + Ok(()) + } + + #[tokio::test] + async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. + let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + // We create + // - one bottom-most image layer, + // - a delta layer D1 crossing the GC horizon with data below and above the horizon, + // - a delta layer D2 crossing the GC horizon with data only below the horizon, + // - a delta layer D3 above the horizon.
+ // + // | D3 | + // | D1 | + // -| |-- gc horizon ----------------- + // | | | D2 | + // --------- img layer ------------------ + // + // What we should expect from this compaction is: + // | D3 | + // | Part of D1 | + // --------- img layer with D1+D2 at GC horizon------------------ + + // img layer at 0x10 + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + ), + ( + get_key(3), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), + ]; + + let expected_result_at_gc_horizon = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + for idx in 0..10 { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x30), &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + } + + let cancel = CancellationToken::new(); + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + + for idx in 0..10 { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result[idx] + ); + assert_eq!( + tline
.get(get_key(idx as u32), Lsn(0x30), &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + } + + // increase GC horizon and compact again + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.time = Lsn(0x40); + guard.cutoffs.space = Lsn(0x40); + } + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + + Ok(()) + } + + #[tokio::test] + async fn test_generate_key_retention() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_generate_key_retention").await?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + tline.force_advance_lsn(Lsn(0x70)); + let key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let history = vec![ + ( + key, + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"0x10")), + ), + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + key, + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ( + key, + Lsn(0x50), + Value::WalRecord(NeonWalRecord::wal_append(";0x50")), + ), + ( + key, + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + ), + ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + key, + Lsn(0x80), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), + ), + ( + key, + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]; + let res = tline + .generate_key_retention( + key, + &history, + Lsn(0x60), + &[Lsn(0x20), Lsn(0x40), Lsn(0x50)], + 3, + None, + ) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![ + ( + Lsn(0x20), + KeyLogAtLsn(vec![( + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20")), + )]), + ), + ( + Lsn(0x40), + KeyLogAtLsn(vec![ + ( + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ]), + ), + ( + Lsn(0x50), + KeyLogAtLsn(vec![( + Lsn(0x50), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40;0x50")), + )]), + ), + ( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + )]), + ), + ], + above_horizon: KeyLogAtLsn(vec![ + ( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + Lsn(0x80), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), + ), + ( + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]), + }; + assert_eq!(res, expected_res); + + // We expect GC-compaction to run with the original GC. This would create a situation where + // the original GC algorithm removes some delta layers b/c there is full image coverage, + // therefore causing some keys to have an incomplete history below the lowest retain LSN. + // For example, we have + // ```plain + // init delta @ 0x10, image @ 0x20, delta @ 0x30 (gc_horizon), image @ 0x40. + // ``` + // Now the GC horizon moves up, and we have + // ```plain + // init delta @ 0x10, image @ 0x20, delta @ 0x30, image @ 0x40 (gc_horizon) + // ``` + // The original GC algorithm kicks in, and removes delta @ 0x10, image @ 0x20.
+ // We will end up with + // ```plain + // delta @ 0x30, image @ 0x40 (gc_horizon) + // ``` + // Now we run the GC-compaction, and this key does not have a full history. + // We should be able to handle this partial history and drop everything before the + // gc_horizon image. + + let history = vec![ + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + key, + Lsn(0x40), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), + ), + ( + key, + Lsn(0x50), + Value::WalRecord(NeonWalRecord::wal_append(";0x50")), + ), + ( + key, + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + ), + ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + key, + Lsn(0x80), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), + ), + ( + key, + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]; + let res = tline + .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![ + ( + Lsn(0x40), + KeyLogAtLsn(vec![( + Lsn(0x40), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), + )]), + ), + ( + Lsn(0x50), + KeyLogAtLsn(vec![( + Lsn(0x50), + Value::WalRecord(NeonWalRecord::wal_append(";0x50")), + )]), + ), + ( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + )]), + ), + ], + above_horizon: KeyLogAtLsn(vec![ + ( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ( + Lsn(0x80), + Value::Image(Bytes::copy_from_slice( + b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80", + )), + ), + ( + Lsn(0x90), + Value::WalRecord(NeonWalRecord::wal_append(";0x90")), + ), + ]), + }; + assert_eq!(res, expected_res); + + // In case of branch compaction, the branch itself does not have the full history, and we need to provide + // the ancestor image in the test case. 
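+ // The `Some((key, Lsn(0x10), ...))` argument passed to `generate_key_retention` below plays that role: it stands in for the image that would otherwise have to be read from the ancestor timeline.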
+ + let history = vec![ + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(";0x30")), + ), + ( + key, + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ]; + let res = tline + .generate_key_retention( + key, + &history, + Lsn(0x60), + &[], + 3, + Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))), + ) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), // use the ancestor image to reconstruct the page + )]), + )], + above_horizon: KeyLogAtLsn(vec![( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + )]), + }; + assert_eq!(res, expected_res); + + let history = vec![ + ( + key, + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + ), + ( + key, + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append(";0x40")), + ), + ( + key, + Lsn(0x60), + Value::WalRecord(NeonWalRecord::wal_append(";0x60")), + ), + ( + key, + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + ), + ]; + let res = tline + .generate_key_retention( + key, + &history, + Lsn(0x60), + &[Lsn(0x30)], + 3, + Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))), + ) + .await + .unwrap(); + let expected_res = KeyHistoryRetention { + below_horizon: vec![ + ( + Lsn(0x30), + KeyLogAtLsn(vec![( + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(";0x20")), + )]), + ), + ( + Lsn(0x60), + KeyLogAtLsn(vec![( + Lsn(0x60), + Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x40;0x60")), + )]), + ), + ], + above_horizon: KeyLogAtLsn(vec![( + Lsn(0x70), + Value::WalRecord(NeonWalRecord::wal_append(";0x70")), + )]), + }; + assert_eq!(res, expected_res); + + Ok(()) + } + + #[tokio::test] + async fn test_simple_bottom_most_compaction_with_retain_lsns() -> anyhow::Result<()> { + let harness = + TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. 
+ let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + ), + ( + get_key(3), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![ + (Lsn(0x10), tline.timeline_id), + (Lsn(0x20), tline.timeline_id), + ], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), + ]; + + let expected_result_at_gc_horizon = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10@0x30"), + Bytes::from_static(b"value 3@0x10@0x28@0x30"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_20 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10@0x20"), + Bytes::from_static(b"value 6@0x10@0x20"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_10 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + 
Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let verify_result = || async { + let gc_horizon = { + let gc_info = tline.gc_info.read().unwrap(); + gc_info.cutoffs.time + }; + for idx in 0..10 { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), gc_horizon, &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x20), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_20[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x10), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_10[idx] + ); + } + }; + + verify_result().await; + + let cancel = CancellationToken::new(); + let mut dryrun_flags = EnumSet::new(); + dryrun_flags.insert(CompactFlags::DryRun); + + tline + .compact_with_gc(&cancel, dryrun_flags, &ctx) + .await + .unwrap(); + // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs + // cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests. + verify_result().await; + + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + verify_result().await; + + // compact again + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + verify_result().await; + + // increase GC horizon and compact again + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.time = Lsn(0x38); + guard.cutoffs.space = Lsn(0x38); + } + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result + + // not increasing the GC horizon and compact again + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + verify_result().await; + + Ok(()) + } + + #[tokio::test] + async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()> + { + let harness = + TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns_single_key") + .await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. 
+ let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(1), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + ), + ]; + let delta2 = vec![ + ( + get_key(1), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(1), + Lsn(0x38), + Value::WalRecord(NeonWalRecord::wal_append("@0x38")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![ + // delta1 and delta 2 only contain a single key but multiple updates + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x50), delta3), + ], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![ + (Lsn(0x10), tline.timeline_id), + (Lsn(0x20), tline.timeline_id), + ], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), + ]; + + let expected_result_at_gc_horizon = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_20 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_10 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let verify_result = || async { + let gc_horizon = { + let 
gc_info = tline.gc_info.read().unwrap(); + gc_info.cutoffs.time + }; + for idx in 0..10 { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), gc_horizon, &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x20), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_20[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x10), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_10[idx] + ); + } + }; + + verify_result().await; + + let cancel = CancellationToken::new(); + let mut dryrun_flags = EnumSet::new(); + dryrun_flags.insert(CompactFlags::DryRun); + + tline + .compact_with_gc(&cancel, dryrun_flags, &ctx) + .await + .unwrap(); + // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs + // cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests. + verify_result().await; + + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + verify_result().await; + + // compact again + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + verify_result().await; + + Ok(()) + } + + #[tokio::test] + async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + ), + ( + get_key(3), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ]; + + let parent_tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![], // delta layers + vec![(Lsn(0x18), img_layer)], // image layers + Lsn(0x18), + ) + .await?; + + parent_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10))); + + let branch_tline = tenant + .branch_timeline_test_with_layers( + &parent_tline, + NEW_TIMELINE_ID, + Some(Lsn(0x18)), + &ctx, + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers + vec![], // image layers + Lsn(0x50), + ) + .await?; + + 
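+        // Register the same test keyspace on the branch as on the parent above.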
+        branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
+
+        {
+            // Update GC info
+            let mut guard = parent_tline.gc_info.write().unwrap();
+            *guard = GcInfo {
+                retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)],
+                cutoffs: GcCutoffs {
+                    time: Lsn(0x10),
+                    space: Lsn(0x10),
+                },
+                leases: Default::default(),
+                within_ancestor_pitr: false,
+            };
+        }
+
+        {
+            // Update GC info
+            let mut guard = branch_tline.gc_info.write().unwrap();
+            *guard = GcInfo {
+                retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)],
+                cutoffs: GcCutoffs {
+                    time: Lsn(0x50),
+                    space: Lsn(0x50),
+                },
+                leases: Default::default(),
+                within_ancestor_pitr: false,
+            };
+        }
+
+        let expected_result_at_gc_horizon = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20"),
+            Bytes::from_static(b"value 2@0x10@0x30"),
+            Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10@0x20"),
+            Bytes::from_static(b"value 6@0x10@0x20"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10@0x48"),
+            Bytes::from_static(b"value 9@0x10@0x48"),
+        ];
+
+        let expected_result_at_lsn_40 = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20"),
+            Bytes::from_static(b"value 2@0x10@0x30"),
+            Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10@0x20"),
+            Bytes::from_static(b"value 6@0x10@0x20"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10"),
+            Bytes::from_static(b"value 9@0x10"),
+        ];
+
+        let verify_result = || async {
+            for idx in 0..10 {
+                assert_eq!(
+                    branch_tline
+                        .get(get_key(idx as u32), Lsn(0x50), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_gc_horizon[idx]
+                );
+                assert_eq!(
+                    branch_tline
+                        .get(get_key(idx as u32), Lsn(0x40), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_lsn_40[idx]
+                );
+            }
+        };
+
+        verify_result().await;
+
+        let cancel = CancellationToken::new();
+        branch_tline
+            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .await
+            .unwrap();
+
+        verify_result().await;

         Ok(())
     }
diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs
index 6de2e95055..dd70f6bbff 100644
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -6,18 +6,35 @@
 //! is written as a one byte. If it's larger than that, the length
 //! is written as a four-byte integer, in big-endian, with the high
 //! bit set. This way, we can detect whether it's 1- or 4-byte header
-//! by peeking at the first byte.
+//! by peeking at the first byte. For blobs of 128 bytes or larger,
+//! we also specify three reserved bits; only one of the possible bit
+//! patterns is currently in use (0b001), and it signifies compression
+//! with zstd.
 //!
 //! len < 128: 0XXXXXXX
-//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
+//! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 //!
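+//! As a rough illustration, here is a self-contained sketch of decoding the
+//! 4-byte header (not part of this module's API; `0x80`/`0x90` mirror the
+//! `BYTE_UNCOMPRESSED`/`BYTE_ZSTD` constants defined below):
+//!
+//! ```ignore
+//! const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; // high bit plus the `CCC` bits
+//!
+//! /// Decode a 4-byte header into (compression bits, payload length).
+//! fn decode_big_header(hdr: [u8; 4]) -> (u8, usize) {
+//!     let compression_bits = hdr[0] & LEN_COMPRESSION_BIT_MASK;
+//!     let mut len_buf = hdr;
+//!     len_buf[0] &= !LEN_COMPRESSION_BIT_MASK; // keep only the 28 length bits
+//!     (compression_bits, u32::from_be_bytes(len_buf) as usize)
+//! }
+//!
+//! assert_eq!(decode_big_header([0x80, 0x00, 0x02, 0x00]), (0x80, 512)); // uncompressed
+//! assert_eq!(decode_big_header([0x90, 0x00, 0x02, 0x00]), (0x90, 512)); // zstd
+//! ```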
+use async_compression::Level; +use bytes::{BufMut, BytesMut}; +use pageserver_api::models::ImageCompressionAlgorithm; +use tokio::io::AsyncWriteExt; +use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use tracing::warn; + use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; use crate::tenant::block_io::BlockCursor; +use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::VirtualFile; use std::cmp::min; use std::io::{Error, ErrorKind}; +#[derive(Copy, Clone, Debug)] +pub struct CompressionInfo { + pub written_compressed: bool, + pub compressed_size: Option, +} + impl<'a> BlockCursor<'a> { /// Read a blob into a new buffer. pub async fn read_blob( @@ -63,12 +80,37 @@ impl<'a> BlockCursor<'a> { len_buf.copy_from_slice(&buf[off..off + 4]); off += 4; } - len_buf[0] &= 0x7f; + let bit_mask = if self.read_compressed { + !LEN_COMPRESSION_BIT_MASK + } else { + 0x7f + }; + len_buf[0] &= bit_mask; u32::from_be_bytes(len_buf) as usize }; + let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK; - dstbuf.clear(); - dstbuf.reserve(len); + let mut tmp_buf = Vec::new(); + let buf_to_write; + let compression = if compression_bits <= BYTE_UNCOMPRESSED || !self.read_compressed { + if compression_bits > BYTE_UNCOMPRESSED { + warn!("reading key above future limit ({len} bytes)"); + } + buf_to_write = dstbuf; + None + } else if compression_bits == BYTE_ZSTD { + buf_to_write = &mut tmp_buf; + Some(dstbuf) + } else { + let error = std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("invalid compression byte {compression_bits:x}"), + ); + return Err(error); + }; + + buf_to_write.clear(); + buf_to_write.reserve(len); // Read the payload let mut remain = len; @@ -82,14 +124,35 @@ impl<'a> BlockCursor<'a> { page_remain = PAGE_SZ; } let this_blk_len = min(remain, page_remain); - dstbuf.extend_from_slice(&buf[off..off + this_blk_len]); + buf_to_write.extend_from_slice(&buf[off..off + this_blk_len]); remain -= this_blk_len; off += this_blk_len; } + + if let Some(dstbuf) = compression { + if compression_bits == BYTE_ZSTD { + let mut decoder = async_compression::tokio::write::ZstdDecoder::new(dstbuf); + decoder.write_all(buf_to_write).await?; + decoder.flush().await?; + } else { + unreachable!("already checked above") + } + } + Ok(()) } } +/// Reserved bits for length and compression +pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; + +/// The maximum size of blobs we support. The highest few bits +/// are reserved for compression and other further uses. +pub(crate) const MAX_SUPPORTED_BLOB_LEN: usize = 0x0fff_ffff; + +pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80; +pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10; + /// A wrapper of `VirtualFile` that allows users to write blobs. /// /// If a `BlobWriter` is dropped, the internal buffer will be @@ -100,6 +163,8 @@ pub struct BlobWriter { offset: u64, /// A buffer to save on write calls, only used if BUFFERED=true buf: Vec, + /// We do tiny writes for the length headers; they need to be in an owned buffer; + io_buf: Option, } impl BlobWriter { @@ -108,6 +173,7 @@ impl BlobWriter { inner, offset: start_offset, buf: Vec::with_capacity(Self::CAPACITY), + io_buf: Some(BytesMut::new()), } } @@ -115,23 +181,35 @@ impl BlobWriter { self.offset } - const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 }; + const CAPACITY: usize = if BUFFERED { 64 * 1024 } else { 0 }; - #[inline(always)] /// Writes the given buffer directly to the underlying `VirtualFile`. 
/// You need to make sure that the internal buffer is empty, otherwise /// data will be written in wrong order. - async fn write_all_unbuffered(&mut self, src_buf: &[u8]) -> Result<(), Error> { - self.inner.write_all(src_buf).await?; - self.offset += src_buf.len() as u64; - Ok(()) + #[inline(always)] + async fn write_all_unbuffered( + &mut self, + src_buf: FullSlice, + ctx: &RequestContext, + ) -> (FullSlice, Result<(), Error>) { + let (src_buf, res) = self.inner.write_all(src_buf, ctx).await; + let nbytes = match res { + Ok(nbytes) => nbytes, + Err(e) => return (src_buf, Err(e)), + }; + self.offset += nbytes as u64; + (src_buf, Ok(())) } #[inline(always)] /// Flushes the internal buffer to the underlying `VirtualFile`. - pub async fn flush_buffer(&mut self) -> Result<(), Error> { - self.inner.write_all(&self.buf).await?; - self.buf.clear(); + pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> { + let buf = std::mem::take(&mut self.buf); + let (slice, res) = self.inner.write_all(buf.slice_len(), ctx).await; + res?; + let mut buf = slice.into_raw_slice().into_inner(); + buf.clear(); + self.buf = buf; Ok(()) } @@ -146,62 +224,165 @@ impl BlobWriter { } /// Internal, possibly buffered, write function - async fn write_all(&mut self, mut src_buf: &[u8]) -> Result<(), Error> { + async fn write_all( + &mut self, + src_buf: FullSlice, + ctx: &RequestContext, + ) -> (FullSlice, Result<(), Error>) { + let src_buf = src_buf.into_raw_slice(); + let src_buf_bounds = src_buf.bounds(); + let restore = move |src_buf_slice: Slice<_>| { + FullSlice::must_new(Slice::from_buf_bounds( + src_buf_slice.into_inner(), + src_buf_bounds, + )) + }; + if !BUFFERED { assert!(self.buf.is_empty()); - self.write_all_unbuffered(src_buf).await?; - return Ok(()); + return self + .write_all_unbuffered(FullSlice::must_new(src_buf), ctx) + .await; } let remaining = Self::CAPACITY - self.buf.len(); + let src_buf_len = src_buf.bytes_init(); + if src_buf_len == 0 { + return (restore(src_buf), Ok(())); + } + let mut src_buf = src_buf.slice(0..src_buf_len); // First try to copy as much as we can into the buffer if remaining > 0 { - let copied = self.write_into_buffer(src_buf); - src_buf = &src_buf[copied..]; + let copied = self.write_into_buffer(&src_buf); + src_buf = src_buf.slice(copied..); } // Then, if the buffer is full, flush it out if self.buf.len() == Self::CAPACITY { - self.flush_buffer().await?; + if let Err(e) = self.flush_buffer(ctx).await { + return (restore(src_buf), Err(e)); + } } // Finally, write the tail of src_buf: // If it wholly fits into the buffer without // completely filling it, then put it there. // If not, write it out directly. - if !src_buf.is_empty() { + let src_buf = if !src_buf.is_empty() { assert_eq!(self.buf.len(), 0); if src_buf.len() < Self::CAPACITY { - let copied = self.write_into_buffer(src_buf); + let copied = self.write_into_buffer(&src_buf); // We just verified above that src_buf fits into our internal buffer. assert_eq!(copied, src_buf.len()); + restore(src_buf) } else { - self.write_all_unbuffered(src_buf).await?; + let (src_buf, res) = self + .write_all_unbuffered(FullSlice::must_new(src_buf), ctx) + .await; + if let Err(e) = res { + return (src_buf, Err(e)); + } + src_buf } - } - Ok(()) + } else { + restore(src_buf) + }; + (src_buf, Ok(())) } /// Write a blob of data. Returns the offset that it was written to, /// which can be used to retrieve the data later. 
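+    /// The buffer is passed in by value and handed back alongside the result
+    /// (the owned-buffer I/O convention used throughout this file), so the
+    /// caller can reuse the allocation.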
- pub async fn write_blob(&mut self, srcbuf: &[u8]) -> Result { - let offset = self.offset; + pub async fn write_blob( + &mut self, + srcbuf: FullSlice, + ctx: &RequestContext, + ) -> (FullSlice, Result) { + let (buf, res) = self + .write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled) + .await; + (buf, res.map(|(off, _compression_info)| off)) + } - if srcbuf.len() < 128 { - // Short blob. Write a 1-byte length header - let len_buf = srcbuf.len() as u8; - self.write_all(&[len_buf]).await?; - } else { - // Write a 4-byte length header - if srcbuf.len() > 0x7fff_ffff { - return Err(Error::new( - ErrorKind::Other, - format!("blob too large ({} bytes)", srcbuf.len()), - )); + /// Write a blob of data. Returns the offset that it was written to, + /// which can be used to retrieve the data later. + pub(crate) async fn write_blob_maybe_compressed( + &mut self, + srcbuf: FullSlice, + ctx: &RequestContext, + algorithm: ImageCompressionAlgorithm, + ) -> (FullSlice, Result<(u64, CompressionInfo), Error>) { + let offset = self.offset; + let mut compression_info = CompressionInfo { + written_compressed: false, + compressed_size: None, + }; + + let len = srcbuf.len(); + + let mut io_buf = self.io_buf.take().expect("we always put it back below"); + io_buf.clear(); + let mut compressed_buf = None; + let ((io_buf_slice, hdr_res), srcbuf) = async { + if len < 128 { + // Short blob. Write a 1-byte length header + io_buf.put_u8(len as u8); + (self.write_all(io_buf.slice_len(), ctx).await, srcbuf) + } else { + // Write a 4-byte length header + if len > MAX_SUPPORTED_BLOB_LEN { + return ( + ( + io_buf.slice_len(), + Err(Error::new( + ErrorKind::Other, + format!("blob too large ({len} bytes)"), + )), + ), + srcbuf, + ); + } + let (high_bit_mask, len_written, srcbuf) = match algorithm { + ImageCompressionAlgorithm::Zstd { level } => { + let mut encoder = if let Some(level) = level { + async_compression::tokio::write::ZstdEncoder::with_quality( + Vec::new(), + Level::Precise(level.into()), + ) + } else { + async_compression::tokio::write::ZstdEncoder::new(Vec::new()) + }; + encoder.write_all(&srcbuf[..]).await.unwrap(); + encoder.shutdown().await.unwrap(); + let compressed = encoder.into_inner(); + compression_info.compressed_size = Some(compressed.len()); + if compressed.len() < len { + compression_info.written_compressed = true; + let compressed_len = compressed.len(); + compressed_buf = Some(compressed); + (BYTE_ZSTD, compressed_len, srcbuf) + } else { + (BYTE_UNCOMPRESSED, len, srcbuf) + } + } + ImageCompressionAlgorithm::Disabled => (BYTE_UNCOMPRESSED, len, srcbuf), + }; + let mut len_buf = (len_written as u32).to_be_bytes(); + assert_eq!(len_buf[0] & 0xf0, 0); + len_buf[0] |= high_bit_mask; + io_buf.extend_from_slice(&len_buf[..]); + (self.write_all(io_buf.slice_len(), ctx).await, srcbuf) } - let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes(); - len_buf[0] |= 0x80; - self.write_all(&len_buf).await?; } - self.write_all(srcbuf).await?; - Ok(offset) + .await; + self.io_buf = Some(io_buf_slice.into_raw_slice().into_inner()); + match hdr_res { + Ok(_) => (), + Err(e) => return (srcbuf, Err(e)), + } + let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf { + let (_buf, res) = self.write_all(compressed_buf.slice_len(), ctx).await; + (srcbuf, res) + } else { + self.write_all(srcbuf, ctx).await + }; + (srcbuf, res.map(|_| (offset, compression_info))) } } @@ -210,8 +391,8 @@ impl BlobWriter { /// /// This function flushes the internal buffer before giving access /// to the 
underlying `VirtualFile`. - pub async fn into_inner(mut self) -> Result { - self.flush_buffer().await?; + pub async fn into_inner(mut self, ctx: &RequestContext) -> Result { + self.flush_buffer(ctx).await?; Ok(self.inner) } @@ -232,35 +413,67 @@ impl BlobWriter { } #[cfg(test)] -mod tests { +pub(crate) mod tests { use super::*; use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef}; + use camino::Utf8PathBuf; + use camino_tempfile::Utf8TempDir; use rand::{Rng, SeedableRng}; async fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { + round_trip_test_compressed::(blobs, false).await + } + + pub(crate) async fn write_maybe_compressed( + blobs: &[Vec], + compression: bool, + ctx: &RequestContext, + ) -> Result<(Utf8TempDir, Utf8PathBuf, Vec), Error> { let temp_dir = camino_tempfile::tempdir()?; let pathbuf = temp_dir.path().join("file"); - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); // Write part (in block to drop the file) let mut offsets = Vec::new(); { - let file = VirtualFile::create(pathbuf.as_path()).await?; + let file = VirtualFile::create(pathbuf.as_path(), ctx).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let offs = wtr.write_blob(blob).await?; + let (_, res) = if compression { + let res = wtr + .write_blob_maybe_compressed( + blob.clone().slice_len(), + ctx, + ImageCompressionAlgorithm::Zstd { level: Some(1) }, + ) + .await; + (res.0, res.1.map(|(off, _)| off)) + } else { + wtr.write_blob(blob.clone().slice_len(), ctx).await + }; + let offs = res?; offsets.push(offs); } // Write out one page worth of zeros so that we can // read again with read_blk - let offs = wtr.write_blob(&vec![0; PAGE_SZ]).await?; + let (_, res) = wtr.write_blob(vec![0; PAGE_SZ].slice_len(), ctx).await; + let offs = res?; println!("Writing final blob at offs={offs}"); - wtr.flush_buffer().await?; + wtr.flush_buffer(ctx).await?; } + Ok((temp_dir, pathbuf, offsets)) + } - let file = VirtualFile::open(pathbuf.as_path()).await?; + async fn round_trip_test_compressed( + blobs: &[Vec], + compression: bool, + ) -> Result<(), Error> { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let (_temp_dir, pathbuf, offsets) = + write_maybe_compressed::(blobs, compression, &ctx).await?; + + let file = VirtualFile::open(pathbuf, &ctx).await?; let rdr = BlockReaderRef::VirtualFile(&file); - let rdr = BlockCursor::new(rdr); + let rdr = BlockCursor::new_with_compression(rdr, compression); for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { let blob_read = rdr.read_blob(*offset, &ctx).await?; assert_eq!( @@ -271,7 +484,7 @@ mod tests { Ok(()) } - fn random_array(len: usize) -> Vec { + pub(crate) fn random_array(len: usize) -> Vec { let mut rng = rand::thread_rng(); (0..len).map(|_| rng.gen()).collect::<_>() } @@ -294,6 +507,8 @@ mod tests { ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; + round_trip_test_compressed::(blobs, true).await?; + round_trip_test_compressed::(blobs, true).await?; Ok(()) } @@ -302,10 +517,15 @@ mod tests { let blobs = &[ b"test".to_vec(), random_array(10 * PAGE_SZ), + b"hello".to_vec(), + random_array(66 * PAGE_SZ), + vec![0xf3; 24 * PAGE_SZ], b"foobar".to_vec(), ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; + round_trip_test_compressed::(blobs, true).await?; + round_trip_test_compressed::(blobs, true).await?; Ok(()) } diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs 
index 0617017528..3afa3a86b9 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -2,13 +2,12 @@ //! Low-level Block-oriented I/O functions //! -use super::ephemeral_file::EphemeralFile; use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::context::RequestContext; -use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ}; +use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; use crate::virtual_file::VirtualFile; use bytes::Bytes; -use std::ops::{Deref, DerefMut}; +use std::ops::Deref; /// This is implemented by anything that can read 8 kB (PAGE_SZ) /// blocks, using the page cache @@ -37,8 +36,11 @@ where pub enum BlockLease<'a> { PageReadGuard(PageReadGuard<'static>), EphemeralFileMutableTail(&'a [u8; PAGE_SZ]), + Slice(&'a [u8; PAGE_SZ]), #[cfg(test)] Arc(std::sync::Arc<[u8; PAGE_SZ]>), + #[cfg(test)] + Vec(Vec), } impl From> for BlockLease<'static> { @@ -61,8 +63,13 @@ impl<'a> Deref for BlockLease<'a> { match self { BlockLease::PageReadGuard(v) => v.deref(), BlockLease::EphemeralFileMutableTail(v) => v, + BlockLease::Slice(v) => v, #[cfg(test)] BlockLease::Arc(v) => v.deref(), + #[cfg(test)] + BlockLease::Vec(v) => { + TryFrom::try_from(&v[..]).expect("caller must ensure that v has PAGE_SZ") + } } } } @@ -72,8 +79,7 @@ impl<'a> Deref for BlockLease<'a> { /// /// Unlike traits, we also support the read function to be async though. pub(crate) enum BlockReaderRef<'a> { - FileBlockReader(&'a FileBlockReader), - EphemeralFile(&'a EphemeralFile), + FileBlockReader(&'a FileBlockReader<'a>), Adapter(Adapter<&'a DeltaLayerInner>), #[cfg(test)] TestDisk(&'a super::disk_btree::tests::TestDisk), @@ -91,12 +97,11 @@ impl<'a> BlockReaderRef<'a> { use BlockReaderRef::*; match self { FileBlockReader(r) => r.read_blk(blknum, ctx).await, - EphemeralFile(r) => r.read_blk(blknum, ctx).await, Adapter(r) => r.read_blk(blknum, ctx).await, #[cfg(test)] TestDisk(r) => r.read_blk(blknum), #[cfg(test)] - VirtualFile(r) => r.read_blk(blknum).await, + VirtualFile(r) => r.read_blk(blknum, ctx).await, } } } @@ -121,16 +126,24 @@ impl<'a> BlockReaderRef<'a> { /// ``` /// pub struct BlockCursor<'a> { + pub(super) read_compressed: bool, reader: BlockReaderRef<'a>, } impl<'a> BlockCursor<'a> { pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self { - BlockCursor { reader } + Self::new_with_compression(reader, false) + } + pub(crate) fn new_with_compression(reader: BlockReaderRef<'a>, read_compressed: bool) -> Self { + BlockCursor { + read_compressed, + reader, + } } // Needed by cli pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self { BlockCursor { + read_compressed: false, reader: BlockReaderRef::FileBlockReader(reader), } } @@ -154,25 +167,35 @@ impl<'a> BlockCursor<'a> { /// /// The file is assumed to be immutable. This doesn't provide any functions /// for modifying the file, nor for invalidating the cache if it is modified. -pub struct FileBlockReader { - pub file: VirtualFile, +#[derive(Clone)] +pub struct FileBlockReader<'a> { + pub file: &'a VirtualFile, /// Unique ID of this file, used as key in the page cache. 
file_id: page_cache::FileId, + + compressed_reads: bool, } -impl FileBlockReader { - pub fn new(file: VirtualFile) -> Self { - let file_id = page_cache::next_file_id(); - - FileBlockReader { file_id, file } +impl<'a> FileBlockReader<'a> { + pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self { + FileBlockReader { + file_id, + file, + compressed_reads: true, + } } /// Read a page from the underlying file into given buffer. - async fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> { + async fn fill_buffer( + &self, + buf: PageWriteGuard<'static>, + blkno: u32, + ctx: &RequestContext, + ) -> Result, std::io::Error> { assert!(buf.len() == PAGE_SZ); self.file - .read_exact_at(buf, blkno as u64 * PAGE_SZ as u64) + .read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64, ctx) .await } /// Read a block. @@ -180,11 +203,11 @@ impl FileBlockReader { /// Returns a "lease" object that can be used to /// access to the contents of the page. (For the page cache, the /// lease object represents a lock on the buffer.) - pub async fn read_blk( + pub async fn read_blk<'b>( &self, blknum: u32, ctx: &RequestContext, - ) -> Result { + ) -> Result, std::io::Error> { let cache = page_cache::get(); match cache .read_immutable_buf(self.file_id, blknum, ctx) @@ -196,18 +219,21 @@ impl FileBlockReader { ) })? { ReadBufResult::Found(guard) => Ok(guard.into()), - ReadBufResult::NotFound(mut write_guard) => { + ReadBufResult::NotFound(write_guard) => { // Read the page from disk into the buffer - self.fill_buffer(write_guard.deref_mut(), blknum).await?; + let write_guard = self.fill_buffer(write_guard, blknum, ctx).await?; Ok(write_guard.mark_valid().into()) } } } } -impl BlockReader for FileBlockReader { +impl BlockReader for FileBlockReader<'_> { fn block_cursor(&self) -> BlockCursor<'_> { - BlockCursor::new(BlockReaderRef::FileBlockReader(self)) + BlockCursor::new_with_compression( + BlockReaderRef::FileBlockReader(self), + self.compressed_reads, + ) } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index c44164c12d..7e0344666b 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -9,8 +9,11 @@ //! may lead to a data loss. //! use anyhow::bail; -use pageserver_api::models; +pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; +use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::CompactionAlgorithmSettings; use pageserver_api::models::EvictionPolicy; +use pageserver_api::models::{self, ThrottleConfig}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::de::IntoDeserializer; use serde::{Deserialize, Serialize}; @@ -19,39 +22,7 @@ use std::num::NonZeroU64; use std::time::Duration; use utils::generation::Generation; -pub mod defaults { - // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB - // would be more appropriate. But a low value forces the code to be exercised more, - // which is good for now to trigger bugs. - // This parameter actually determines L0 layer file size. - pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; - pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m"; - - // Target file size, when creating image and delta layers. - // This parameter determines L1 layer file size. 
- pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; - - pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; - pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; - - pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; - - // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. - // If there's a need to decrease this value, first make sure that GC - // doesn't hold a layer map write lock for non-trivial operations. - // Relevant: https://github.com/neondatabase/neon/issues/3394 - pub const DEFAULT_GC_PERIOD: &str = "1 hr"; - pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; - pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; - pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; - pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; - pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024; - pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; - - pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; -} - -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) enum AttachmentMode { /// Our generation is current as far as we know, and as far as we know we are the only attached /// pageserver. This is the "normal" attachment mode. @@ -66,7 +37,7 @@ pub(crate) enum AttachmentMode { Stale, } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) struct AttachedLocationConfig { pub(crate) generation: Generation, pub(crate) attach_mode: AttachmentMode, @@ -186,16 +157,17 @@ impl LocationConf { /// For use when attaching/re-attaching: update the generation stored in this /// structure. If we were in a secondary state, promote to attached (posession /// of a fresh generation implies this). - pub(crate) fn attach_in_generation(&mut self, generation: Generation) { + pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) { match &mut self.mode { LocationMode::Attached(attach_conf) => { attach_conf.generation = generation; + attach_conf.attach_mode = mode; } LocationMode::Secondary(_) => { // We are promoted to attached by the control plane's re-attach response self.mode = LocationMode::Attached(AttachedLocationConfig { generation, - attach_mode: AttachmentMode::Single, + attach_mode: mode, }) } } @@ -251,7 +223,7 @@ impl LocationConf { } else { ShardIdentity::new( ShardNumber(conf.shard_number), - ShardCount(conf.shard_count), + ShardCount::new(conf.shard_count), ShardStripeSize(conf.shard_stripe_size), )? }; @@ -280,76 +252,9 @@ impl Default for LocationConf { } } -/// A tenant's calcuated configuration, which is the result of merging a -/// tenant's TenantConfOpt with the global TenantConf from PageServerConf. -/// -/// For storing and transmitting individual tenant's configuration, see -/// TenantConfOpt. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub struct TenantConf { - // Flush out an inmemory layer, if it's holding WAL older than this - // This puts a backstop on how much WAL needs to be re-digested if the - // page server crashes. - // This parameter actually determines L0 layer file size. - pub checkpoint_distance: u64, - // Inmemory layer is also flushed at least once in checkpoint_timeout to - // eventually upload WAL after activity is stopped. 
- #[serde(with = "humantime_serde")] - pub checkpoint_timeout: Duration, - // Target file size, when creating image and delta layers. - // This parameter determines L1 layer file size. - pub compaction_target_size: u64, - // How often to check if there's compaction work to be done. - // Duration::ZERO means automatic compaction is disabled. - #[serde(with = "humantime_serde")] - pub compaction_period: Duration, - // Level0 delta layer threshold for compaction. - pub compaction_threshold: usize, - // Determines how much history is retained, to allow - // branching and read replicas at an older point in time. - // The unit is #of bytes of WAL. - // Page versions older than this are garbage collected away. - pub gc_horizon: u64, - // Interval at which garbage collection is triggered. - // Duration::ZERO means automatic GC is disabled - #[serde(with = "humantime_serde")] - pub gc_period: Duration, - // Delta layer churn threshold to create L1 image layers. - pub image_creation_threshold: usize, - // Determines how much history is retained, to allow - // branching and read replicas at an older point in time. - // The unit is time. - // Page versions older than this are garbage collected away. - #[serde(with = "humantime_serde")] - pub pitr_interval: Duration, - /// Maximum amount of time to wait while opening a connection to receive wal, before erroring. - #[serde(with = "humantime_serde")] - pub walreceiver_connect_timeout: Duration, - /// Considers safekeepers stalled after no WAL updates were received longer than this threshold. - /// A stalled safekeeper will be changed to a newer one when it appears. - #[serde(with = "humantime_serde")] - pub lagging_wal_timeout: Duration, - /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold. - /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update, - /// to avoid eager reconnects. - pub max_lsn_wal_lag: NonZeroU64, - pub trace_read_requests: bool, - pub eviction_policy: EvictionPolicy, - pub min_resident_size_override: Option, - // See the corresponding metric's help string. - #[serde(with = "humantime_serde")] - pub evictions_low_residence_duration_metric_threshold: Duration, - pub gc_feedback: bool, - - /// If non-zero, the period between uploads of a heatmap from attached tenants. This - /// may be disabled if a Tenant will not have secondary locations: only secondary - /// locations will use the heatmap uploaded by attached locations. - pub heatmap_period: Duration, -} - /// Same as TenantConf, but this struct preserves the information about /// which parameters are set and which are not. 
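+/// Unset (`None`) fields fall back to the corresponding global `TenantConf`
+/// value when the per-tenant and global configurations are merged, roughly:
+///
+/// ```ignore
+/// let gc_horizon = tenant_conf_opt.gc_horizon.unwrap_or(global_conf.gc_horizon);
+/// ```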
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)] pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] @@ -373,6 +278,10 @@ pub struct TenantConfOpt { #[serde(default)] pub compaction_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub compaction_algorithm: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub gc_horizon: Option, @@ -405,10 +314,6 @@ pub struct TenantConfOpt { #[serde(default)] pub max_lsn_wal_lag: Option, - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub trace_read_requests: Option, - #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub eviction_policy: Option, @@ -423,13 +328,33 @@ pub struct TenantConfOpt { pub evictions_low_residence_duration_metric_threshold: Option, #[serde(skip_serializing_if = "Option::is_none")] + #[serde(with = "humantime_serde")] #[serde(default)] - pub gc_feedback: Option, + pub heatmap_period: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub lazy_slru_download: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub timeline_get_throttle: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub image_layer_creation_check_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub switch_aux_file_policy: Option, #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] #[serde(default)] - pub heatmap_period: Option, + pub lsn_lease_length: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(with = "humantime_serde")] + #[serde(default)] + pub lsn_lease_length_for_ts: Option, } impl TenantConfOpt { @@ -450,6 +375,11 @@ impl TenantConfOpt { compaction_threshold: self .compaction_threshold .unwrap_or(global_conf.compaction_threshold), + compaction_algorithm: self + .compaction_algorithm + .as_ref() + .unwrap_or(&global_conf.compaction_algorithm) + .clone(), gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon), gc_period: self.gc_period.unwrap_or(global_conf.gc_period), image_creation_threshold: self @@ -463,9 +393,6 @@ impl TenantConfOpt { .lagging_wal_timeout .unwrap_or(global_conf.lagging_wal_timeout), max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag), - trace_read_requests: self - .trace_read_requests - .unwrap_or(global_conf.trace_read_requests), eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy), min_resident_size_override: self .min_resident_size_override @@ -473,46 +400,26 @@ impl TenantConfOpt { evictions_low_residence_duration_metric_threshold: self .evictions_low_residence_duration_metric_threshold .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold), - gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback), heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period), - } - } -} - -impl Default for TenantConf { - fn default() -> Self { - use defaults::*; - Self { - checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, - checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT) - .expect("cannot parse default checkpoint timeout"), - compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE, - compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) - .expect("cannot parse default compaction period"), - 
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, - gc_horizon: DEFAULT_GC_HORIZON, - gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) - .expect("cannot parse default gc period"), - image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD, - pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL) - .expect("cannot parse default PITR interval"), - walreceiver_connect_timeout: humantime::parse_duration( - DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, - ) - .expect("cannot parse default walreceiver connect timeout"), - lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT) - .expect("cannot parse default walreceiver lagging wal timeout"), - max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) - .expect("cannot parse default max walreceiver Lsn wal lag"), - trace_read_requests: false, - eviction_policy: EvictionPolicy::NoEviction, - min_resident_size_override: None, - evictions_low_residence_duration_metric_threshold: humantime::parse_duration( - DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, - ) - .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), - gc_feedback: false, - heatmap_period: Duration::ZERO, + lazy_slru_download: self + .lazy_slru_download + .unwrap_or(global_conf.lazy_slru_download), + timeline_get_throttle: self + .timeline_get_throttle + .clone() + .unwrap_or(global_conf.timeline_get_throttle), + image_layer_creation_check_threshold: self + .image_layer_creation_check_threshold + .unwrap_or(global_conf.image_layer_creation_check_threshold), + switch_aux_file_policy: self + .switch_aux_file_policy + .unwrap_or(global_conf.switch_aux_file_policy), + lsn_lease_length: self + .lsn_lease_length + .unwrap_or(global_conf.lsn_lease_length), + lsn_lease_length_for_ts: self + .lsn_lease_length_for_ts + .unwrap_or(global_conf.lsn_lease_length_for_ts), } } } @@ -566,6 +473,7 @@ impl From for models::TenantConfig { Self { checkpoint_distance: value.checkpoint_distance, checkpoint_timeout: value.checkpoint_timeout.map(humantime), + compaction_algorithm: value.compaction_algorithm, compaction_target_size: value.compaction_target_size, compaction_period: value.compaction_period.map(humantime), compaction_threshold: value.compaction_threshold, @@ -576,14 +484,18 @@ impl From for models::TenantConfig { walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime), lagging_wal_timeout: value.lagging_wal_timeout.map(humantime), max_lsn_wal_lag: value.max_lsn_wal_lag, - trace_read_requests: value.trace_read_requests, eviction_policy: value.eviction_policy, min_resident_size_override: value.min_resident_size_override, evictions_low_residence_duration_metric_threshold: value .evictions_low_residence_duration_metric_threshold .map(humantime), - gc_feedback: value.gc_feedback, heatmap_period: value.heatmap_period.map(humantime), + lazy_slru_download: value.lazy_slru_download, + timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), + image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, + switch_aux_file_policy: value.switch_aux_file_policy, + lsn_lease_length: value.lsn_lease_length.map(humantime), + lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), } } } diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs deleted file mode 100644 index 97de0cdcf9..0000000000 --- a/pageserver/src/tenant/delete.rs +++ /dev/null @@ -1,628 +0,0 @@ -use std::sync::Arc; - -use anyhow::Context; -use 
camino::{Utf8Path, Utf8PathBuf}; -use pageserver_api::{models::TenantState, shard::TenantShardId}; -use remote_storage::{GenericRemoteStorage, RemotePath}; -use tokio::sync::OwnedMutexGuard; -use tokio_util::sync::CancellationToken; -use tracing::{error, instrument, Instrument, Span}; - -use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId}; - -use crate::{ - config::PageServerConf, - context::RequestContext, - task_mgr::{self, TaskKind}, - tenant::mgr::{TenantSlot, TenantsMapRemoveResult}, -}; - -use super::{ - mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap}, - remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, - span, - timeline::delete::DeleteTimelineFlow, - tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload, -}; - -#[derive(Debug, thiserror::Error)] -pub(crate) enum DeleteTenantError { - #[error("GetTenant {0}")] - Get(#[from] GetTenantError), - - #[error("Tenant not attached")] - NotAttached, - - #[error("Invalid state {0}. Expected Active or Broken")] - InvalidState(TenantState), - - #[error("Tenant deletion is already in progress")] - AlreadyInProgress, - - #[error("Tenant map slot error {0}")] - SlotError(#[from] TenantSlotError), - - #[error("Tenant map slot upsert error {0}")] - SlotUpsertError(#[from] TenantSlotUpsertError), - - #[error("Timeline {0}")] - Timeline(#[from] DeleteTimelineError), - - #[error("Cancelled")] - Cancelled, - - #[error(transparent)] - Other(#[from] anyhow::Error), -} - -type DeletionGuard = tokio::sync::OwnedMutexGuard; - -fn remote_tenant_delete_mark_path( - conf: &PageServerConf, - tenant_shard_id: &TenantShardId, -) -> anyhow::Result { - let tenant_remote_path = conf - .tenant_path(tenant_shard_id) - .strip_prefix(&conf.workdir) - .context("Failed to strip workdir prefix") - .and_then(RemotePath::new) - .context("tenant path")?; - Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted"))) -} - -async fn create_remote_delete_mark( - conf: &PageServerConf, - remote_storage: &GenericRemoteStorage, - tenant_shard_id: &TenantShardId, - cancel: &CancellationToken, -) -> Result<(), DeleteTenantError> { - let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; - - let data: &[u8] = &[]; - backoff::retry( - || async { - let data = bytes::Bytes::from_static(data); - let stream = futures::stream::once(futures::future::ready(Ok(data))); - remote_storage - .upload(stream, 0, &remote_mark_path, None) - .await - }, - |_e| false, - FAILED_UPLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "mark_upload", - backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), - ) - .await - .context("mark_upload")?; - - Ok(()) -} - -async fn create_local_delete_mark( - conf: &PageServerConf, - tenant_shard_id: &TenantShardId, -) -> Result<(), DeleteTenantError> { - let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id); - - // Note: we're ok to replace existing file. - let _ = std::fs::OpenOptions::new() - .write(true) - .create(true) - .open(&marker_path) - .with_context(|| format!("could not create delete marker file {marker_path:?}"))?; - - crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?; - - Ok(()) -} - -async fn schedule_ordered_timeline_deletions( - tenant: &Arc, -) -> Result>, TimelineId)>, DeleteTenantError> { - // Tenant is stopping at this point. We know it will be deleted. - // No new timelines should be created. - // Tree sort timelines to delete from leafs to the root. 
- // NOTE: by calling clone we release the mutex which creates a possibility for a race: pending deletion - // can complete and remove timeline from the map in between our call to clone - // and `DeleteTimelineFlow::run`, so `run` wont find timeline in `timelines` map. - // timelines.lock is currently synchronous so we cant hold it across await point. - // So just ignore NotFound error if we get it from `run`. - // Beware: in case it becomes async and we try to hold it here, `run` also locks it, which can create a deadlock. - let timelines = tenant.timelines.lock().unwrap().clone(); - let sorted = - tree_sort_timelines(timelines, |t| t.get_ancestor_timeline_id()).context("tree sort")?; - - let mut already_running_deletions = vec![]; - - for (timeline_id, _) in sorted.into_iter().rev() { - if let Err(e) = DeleteTimelineFlow::run(tenant, timeline_id, true).await { - match e { - DeleteTimelineError::NotFound => { - // Timeline deletion finished after call to clone above but before call - // to `DeleteTimelineFlow::run` and removed timeline from the map. - continue; - } - DeleteTimelineError::AlreadyInProgress(guard) => { - already_running_deletions.push((guard, timeline_id)); - continue; - } - e => return Err(DeleteTenantError::Timeline(e)), - } - } - } - - Ok(already_running_deletions) -} - -async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), DeleteTenantError> { - // Assert timelines dir is empty. - if !fs_ext::is_directory_empty(timelines_path).await? { - // Display first 10 items in directory - let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?; - let list = &list.into_iter().take(10).collect::>(); - return Err(DeleteTenantError::Other(anyhow::anyhow!( - "Timelines directory is not empty after all timelines deletion: {list:?}" - ))); - } - - Ok(()) -} - -async fn remove_tenant_remote_delete_mark( - conf: &PageServerConf, - remote_storage: Option<&GenericRemoteStorage>, - tenant_shard_id: &TenantShardId, - cancel: &CancellationToken, -) -> Result<(), DeleteTenantError> { - if let Some(remote_storage) = remote_storage { - let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; - backoff::retry( - || async { remote_storage.delete(&path).await }, - |_e| false, - FAILED_UPLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "remove_tenant_remote_delete_mark", - backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), - ) - .await - .context("remove_tenant_remote_delete_mark")?; - } - Ok(()) -} - -// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir -async fn cleanup_remaining_fs_traces( - conf: &PageServerConf, - tenant_shard_id: &TenantShardId, -) -> Result<(), DeleteTenantError> { - let rm = |p: Utf8PathBuf, is_dir: bool| async move { - if is_dir { - tokio::fs::remove_dir(&p).await - } else { - tokio::fs::remove_file(&p).await - } - .or_else(fs_ext::ignore_not_found) - .with_context(|| format!("failed to delete {p}")) - }; - - rm(conf.tenant_config_path(tenant_shard_id), false).await?; - rm(conf.tenant_location_config_path(tenant_shard_id), false).await?; - - fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-remove-timelines-dir" - ))? - }); - - rm(conf.timelines_path(tenant_shard_id), true).await?; - - fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-remove-deleted-mark" - ))? 
- }); - - // Make sure previous deletions are ordered before mark removal. - // Otherwise there is no guarantee that they reach the disk before mark deletion. - // So its possible for mark to reach disk first and for other deletions - // to be reordered later and thus missed if a crash occurs. - // Note that we dont need to sync after mark file is removed - // because we can tolerate the case when mark file reappears on startup. - let tenant_path = &conf.tenant_path(tenant_shard_id); - if tenant_path.exists() { - crashsafe::fsync_async(&conf.tenant_path(tenant_shard_id)) - .await - .context("fsync_pre_mark_remove")?; - } - - rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?; - - fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-remove-tenant-dir" - ))? - }); - - rm(conf.tenant_path(tenant_shard_id), true).await?; - - Ok(()) -} - -/// Orchestrates tenant shut down of all tasks, removes its in-memory structures, -/// and deletes its data from both disk and s3. -/// The sequence of steps: -/// 1. Upload remote deletion mark. -/// 2. Create local mark file. -/// 3. Shutdown tasks -/// 4. Run ordered timeline deletions -/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested -/// 6. Remove remote mark -/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark -/// It is resumable from any step in case a crash/restart occurs. -/// There are two entrypoints to the process: -/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler. -/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process. -/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function. -#[derive(Default)] -pub enum DeleteTenantFlow { - #[default] - NotStarted, - InProgress, - Finished, -} - -impl DeleteTenantFlow { - // These steps are run in the context of management api request handler. - // Long running steps are continued to run in the background. - // NB: If this fails half-way through, and is retried, the retry will go through - // all the same steps again. Make sure the code here is idempotent, and don't - // error out if some of the shutdown tasks have already been completed! - // NOTE: static needed for background part. - // We assume that calling code sets up the span with tenant_id. - #[instrument(skip_all)] - pub(crate) async fn run( - conf: &'static PageServerConf, - remote_storage: Option, - tenants: &'static std::sync::RwLock, - tenant: Arc, - ) -> Result<(), DeleteTenantError> { - span::debug_assert_current_span_has_tenant_id(); - - pausable_failpoint!("tenant-delete-before-run"); - - let mut guard = Self::prepare(&tenant).await?; - - if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await { - tenant.set_broken(format!("{e:#}")).await; - return Err(e); - } - - Self::schedule_background(guard, conf, remote_storage, tenants, tenant); - - Ok(()) - } - - // Helper function needed to be able to match once on returned error and transition tenant into broken state. - // This is needed because tenant.shutwodn is not idempotent. If tenant state is set to stopping another call to tenant.shutdown - // will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried. - // So the solution is to set tenant state to broken. 
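// A minimal, self-contained sketch of the retry contract described above
// (hypothetical `FlowState`/`run_steps` names, not the pageserver's actual
// API): every step must be idempotent, and any failure parks the flow in a
// broken state from which a later retry may re-run the sequence from the top.

#[derive(Debug, PartialEq)]
enum FlowState {
    NotStarted,
    InProgress,
    Broken(String),
    Finished,
}

fn run_steps(state: &mut FlowState, steps: &[fn() -> Result<(), String>]) -> Result<(), String> {
    *state = FlowState::InProgress;
    for step in steps {
        if let Err(e) = step() {
            // Mirrors the pattern above: mark broken so a retried deletion is
            // allowed to start over, then surface the error to the caller.
            *state = FlowState::Broken(e.clone());
            return Err(e);
        }
    }
    *state = FlowState::Finished;
    Ok(())
}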
- async fn run_inner( - guard: &mut OwnedMutexGuard, - conf: &'static PageServerConf, - remote_storage: Option<&GenericRemoteStorage>, - tenant: &Tenant, - ) -> Result<(), DeleteTenantError> { - guard.mark_in_progress()?; - - fail::fail_point!("tenant-delete-before-create-remote-mark", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-create-remote-mark" - ))? - }); - - // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend. - // Though sounds scary, different mark name? - // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state. - if let Some(remote_storage) = &remote_storage { - create_remote_delete_mark( - conf, - remote_storage, - &tenant.tenant_shard_id, - // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token - &CancellationToken::new(), - ) - .await - .context("remote_mark")? - } - - fail::fail_point!("tenant-delete-before-create-local-mark", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-create-local-mark" - ))? - }); - - create_local_delete_mark(conf, &tenant.tenant_shard_id) - .await - .context("local delete mark")?; - - fail::fail_point!("tenant-delete-before-background", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-background" - ))? - }); - - Ok(()) - } - - fn mark_in_progress(&mut self) -> anyhow::Result<()> { - match self { - Self::Finished => anyhow::bail!("Bug. Is in finished state"), - Self::InProgress { .. } => { /* We're in a retry */ } - Self::NotStarted => { /* Fresh start */ } - } - - *self = Self::InProgress; - - Ok(()) - } - - pub(crate) async fn should_resume_deletion( - conf: &'static PageServerConf, - remote_mark_exists: bool, - tenant: &Tenant, - ) -> Result, DeleteTenantError> { - let acquire = |t: &Tenant| { - Some( - Arc::clone(&t.delete_progress) - .try_lock_owned() - .expect("we're the only owner during init"), - ) - }; - - if remote_mark_exists { - return Ok(acquire(tenant)); - } - - // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists. - if conf - .tenant_deleted_mark_file_path(&tenant.tenant_shard_id) - .exists() - { - Ok(acquire(tenant)) - } else { - Ok(None) - } - } - - pub(crate) async fn resume_from_attach( - guard: DeletionGuard, - tenant: &Arc, - preload: Option, - tenants: &'static std::sync::RwLock, - ctx: &RequestContext, - ) -> Result<(), DeleteTenantError> { - let (_, progress) = completion::channel(); - - tenant - .set_stopping(progress, false, true) - .await - .expect("cant be stopping or broken"); - - tenant - .attach(preload, super::SpawnMode::Normal, ctx) - .await - .context("attach")?; - - Self::background( - guard, - tenant.conf, - tenant.remote_storage.clone(), - tenants, - tenant, - ) - .await - } - - async fn prepare( - tenant: &Arc, - ) -> Result, DeleteTenantError> { - // FIXME: unsure about active only. Our init jobs may not be cancellable properly, - // so at least for now allow deletions only for active tenants. TODO recheck - // Broken and Stopping is needed for retries. - if !matches!( - tenant.current_state(), - TenantState::Active | TenantState::Broken { .. 
} - ) { - return Err(DeleteTenantError::InvalidState(tenant.current_state())); - } - - let guard = Arc::clone(&tenant.delete_progress) - .try_lock_owned() - .map_err(|_| DeleteTenantError::AlreadyInProgress)?; - - fail::fail_point!("tenant-delete-before-shutdown", |_| { - Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))? - }); - - // make pageserver shutdown not to wait for our completion - let (_, progress) = completion::channel(); - - // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent. - // i e it is an error to do: - // tenant.set_stopping - // tenant.shutdown - // Its also bad that we're holding tenants.read here. - // TODO relax set_stopping to be idempotent? - if tenant.shutdown(progress, false).await.is_err() { - return Err(DeleteTenantError::Other(anyhow::anyhow!( - "tenant shutdown is already in progress" - ))); - } - - Ok(guard) - } - - fn schedule_background( - guard: OwnedMutexGuard, - conf: &'static PageServerConf, - remote_storage: Option, - tenants: &'static std::sync::RwLock, - tenant: Arc, - ) { - let tenant_shard_id = tenant.tenant_shard_id; - - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::TimelineDeletionWorker, - Some(tenant_shard_id), - None, - "tenant_delete", - false, - async move { - if let Err(err) = - Self::background(guard, conf, remote_storage, tenants, &tenant).await - { - error!("Error: {err:#}"); - tenant.set_broken(format!("{err:#}")).await; - }; - Ok(()) - } - .instrument({ - let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()); - span.follows_from(Span::current()); - span - }), - ); - } - - async fn background( - mut guard: OwnedMutexGuard, - conf: &PageServerConf, - remote_storage: Option, - tenants: &'static std::sync::RwLock, - tenant: &Arc, - ) -> Result<(), DeleteTenantError> { - // Tree sort timelines, schedule delete for them. Mention retries from the console side. - // Note that if deletion fails we dont mark timelines as broken, - // the whole tenant will become broken as by `Self::schedule_background` logic - let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant) - .await - .context("schedule_ordered_timeline_deletions")?; - - fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-polling-ongoing-deletions" - ))? - }); - - // Wait for deletions that were already running at the moment when tenant deletion was requested. - // When we can lock deletion guard it means that corresponding timeline deletion finished. - for (guard, timeline_id) in already_running_timeline_deletions { - let flow = guard.lock().await; - if !flow.is_finished() { - return Err(DeleteTenantError::Other(anyhow::anyhow!( - "already running timeline deletion failed: {timeline_id}" - ))); - } - } - - let timelines_path = conf.timelines_path(&tenant.tenant_shard_id); - // May not exist if we fail in cleanup_remaining_fs_traces after removing it - if timelines_path.exists() { - // sanity check to guard against layout changes - ensure_timelines_dir_empty(&timelines_path) - .await - .context("timelines dir not empty")?; - } - - remove_tenant_remote_delete_mark( - conf, - remote_storage.as_ref(), - &tenant.tenant_shard_id, - // Can't use tenant.cancel, it's already shut down. 
TODO: wire in an appropriate token - &CancellationToken::new(), - ) - .await?; - - pausable_failpoint!("tenant-delete-before-cleanup-remaining-fs-traces-pausable"); - fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-cleanup-remaining-fs-traces" - ))? - }); - - cleanup_remaining_fs_traces(conf, &tenant.tenant_shard_id) - .await - .context("cleanup_remaining_fs_traces")?; - - { - pausable_failpoint!("tenant-delete-before-map-remove"); - - // This block is simply removing the TenantSlot for this tenant. It requires a loop because - // we might conflict with a TenantSlot::InProgress marker and need to wait for it. - // - // This complexity will go away when we simplify how deletion works: - // https://github.com/neondatabase/neon/issues/5080 - loop { - // Under the TenantMap lock, try to remove the tenant. We usually succeed, but if - // we encounter an InProgress marker, yield the barrier it contains and wait on it. - let barrier = { - let mut locked = tenants.write().unwrap(); - let removed = locked.remove(tenant.tenant_shard_id); - - // FIXME: we should not be modifying this from outside of mgr.rs. - // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080) - crate::metrics::TENANT_MANAGER - .tenant_slots - .set(locked.len() as u64); - - match removed { - TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => { - match tenant.current_state() { - TenantState::Stopping { .. } | TenantState::Broken { .. } => { - // Expected: we put the tenant into stopping state before we start deleting it - } - state => { - // Unexpected state - tracing::warn!( - "Tenant in unexpected state {state} after deletion" - ); - } - } - break; - } - TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => { - // This is unexpected: this secondary tenants should not have been created, and we - // are not in a position to shut it down from here. - tracing::warn!("Tenant transitioned to secondary mode while deleting!"); - break; - } - TenantsMapRemoveResult::Occupied(TenantSlot::InProgress(_)) => { - unreachable!("TenantsMap::remove handles InProgress separately, should never return it here"); - } - TenantsMapRemoveResult::Vacant => { - tracing::warn!( - "Tenant removed from TenantsMap before deletion completed" - ); - break; - } - TenantsMapRemoveResult::InProgress(barrier) => { - // An InProgress entry was found, we must wait on its barrier - barrier - } - } - }; - - tracing::info!( - "Waiting for competing operation to complete before deleting state for tenant" - ); - barrier.wait().await; - } - } - - *guard = Self::Finished; - - Ok(()) - } -} diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 06a04bf536..0107b0ac7e 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -18,11 +18,19 @@ //! - An Iterator interface would be more convenient for the callers than the //! 'visit' function //! 
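// The hunks below add exactly such an interface: `DiskBtreeReader::iter` wraps
// `into_stream` in a pinned stream whose `next()` is async. A hedged sketch of
// the intended consumption pattern (mirroring the test added further down;
// `reader` and `ctx` are assumed to be a DiskBtreeReader over 16-byte keys and
// a RequestContext already in scope):
//
//     let mut iter = reader.iter(&[0u8; 16], &ctx);
//     while let Some(res) = iter.next().await {
//         let (key, value) = res?;
//         // keys arrive in ascending order, starting at the search key
//     }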
+use async_stream::try_stream;
 use byteorder::{ReadBytesExt, BE};
 use bytes::{BufMut, Bytes, BytesMut};
 use either::Either;
+use futures::{Stream, StreamExt};
 use hex;
-use std::{cmp::Ordering, io, result};
+use std::{
+    cmp::Ordering,
+    io,
+    iter::Rev,
+    ops::{Range, RangeInclusive},
+    result,
+};
 use thiserror::Error;
 use tracing::error;
@@ -36,7 +44,6 @@ use crate::{
 pub const VALUE_SZ: usize = 5;
 pub const MAX_VALUE: u64 = 0x007f_ffff_ffff;
-#[allow(dead_code)]
 pub const PAGE_SZ: usize = 8192;
 #[derive(Clone, Copy, Debug)]
@@ -205,6 +212,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
 ///
 /// Public reader object, to search the tree.
 ///
+#[derive(Clone)]
 pub struct DiskBtreeReader<R, const L: usize>
 where
     R: BlockReader,
@@ -252,6 +260,118 @@ where
         Ok(result)
     }
+    pub fn iter<'a>(self, start_key: &'a [u8; L], ctx: &'a RequestContext) -> DiskBtreeIterator<'a>
+    where
+        R: 'a + Send,
+    {
+        DiskBtreeIterator {
+            stream: Box::pin(self.into_stream(start_key, ctx)),
+        }
+    }
+
+    /// Return a stream which yields all key, value pairs from the index
+    /// starting from the first key greater than or equal to `start_key`.
+    ///
+    /// Note 1: this is a copy of [`Self::visit`].
+    /// TODO: Once the sequential read path is removed this will become
+    /// the only index traversal method.
+    ///
+    /// Note 2: this function used to take `&self` but it now consumes `self`. This is due to
+    /// the lifetime constraints of the reader and the stream / iterator it creates. Using `&self`
+    /// requires the reader to be present when the stream is used, and this creates a lifetime
+    /// dependency between the reader and the stream. Now if we want to create an iterator that
+    /// holds the stream, someone will need to keep a reference to the reader, which is inconvenient
+    /// to use from the image/delta layer APIs.
+    ///
+    /// Feel free to add the `&self` variant back if it's necessary.
+    pub fn into_stream<'a>(
+        self,
+        start_key: &'a [u8; L],
+        ctx: &'a RequestContext,
+    ) -> impl Stream<Item = Result<(Vec<u8>, u64), DiskBtreeError>> + 'a
+    where
+        R: 'a,
+    {
+        try_stream! {
+            let mut stack = Vec::new();
+            stack.push((self.root_blk, None));
+            let block_cursor = self.reader.block_cursor();
+            let mut node_buf = [0_u8; PAGE_SZ];
+            while let Some((node_blknum, opt_iter)) = stack.pop() {
+                // Read the node, through the PS PageCache, into local variable `node_buf`.
+                // We could keep the page cache read guard alive, but, at the time of writing,
+                // we run quite small PS PageCaches => can't risk running out of
+                // PageCache space because this stream isn't consumed fast enough.
+                let page_read_guard = block_cursor
+                    .read_blk(self.start_blk + node_blknum, ctx)
+                    .await?;
+                node_buf.copy_from_slice(page_read_guard.as_ref());
+                drop(page_read_guard); // drop page cache read guard early
+
+                let node = OnDiskNode::deparse(&node_buf)?;
+                let prefix_len = node.prefix_len as usize;
+                let suffix_len = node.suffix_len as usize;
+
+                assert!(node.num_children > 0);
+
+                let mut keybuf = Vec::new();
+                keybuf.extend(node.prefix);
+                keybuf.resize(prefix_len + suffix_len, 0);
+
+                let mut iter: Either<Range<usize>, Rev<Range<usize>>> = if let Some(iter) = opt_iter {
+                    iter
+                } else {
+                    // Locate the first match
+                    let idx = match node.binary_search(start_key, keybuf.as_mut_slice()) {
+                        Ok(idx) => idx,
+                        Err(idx) => {
+                            if node.level == 0 {
+                                // Imagine that the node contains the following keys:
+                                //
+                                // 1
+                                // 3 <-- idx
+                                // 5
+                                //
+                                // If the search key is '2' and there is no exact match,
+                                // the binary search would return the index of key
+                                // '3'. That's cool, '3' is the first key to return.
+ idx + } else { + // This is an internal page, so each key represents a lower + // bound for what's in the child page. If there is no exact + // match, we have to return the *previous* entry. + // + // 1 <-- return this + // 3 <-- idx + // 5 + idx.saturating_sub(1) + } + } + }; + Either::Left(idx..node.num_children.into()) + }; + + + // idx points to the first match now. Keep going from there + while let Some(idx) = iter.next() { + let key_off = idx * suffix_len; + let suffix = &node.keys[key_off..key_off + suffix_len]; + keybuf[prefix_len..].copy_from_slice(suffix); + let value = node.value(idx); + #[allow(clippy::collapsible_if)] + if node.level == 0 { + // leaf + yield (keybuf.clone(), value.to_u64()); + } else { + stack.push((node_blknum, Some(iter))); + stack.push((value.to_blknum(), None)); + break; + } + } + } + } + } + /// /// Scan the tree, starting from 'search_key', in the given direction. 'visitor' /// will be called for every key >= 'search_key' (or <= 'search_key', if scanning @@ -405,6 +525,19 @@ where } } +pub struct DiskBtreeIterator<'a> { + #[allow(clippy::type_complexity)] + stream: std::pin::Pin< + Box, u64), DiskBtreeError>> + 'a + Send>, + >, +} + +impl<'a> DiskBtreeIterator<'a> { + pub async fn next(&mut self) -> Option, u64), DiskBtreeError>> { + self.stream.next().await + } +} + /// /// Public builder object, for creating a new tree. /// @@ -424,10 +557,10 @@ where /// We maintain the length of the stack to be always greater than zero. /// Two exceptions are: /// 1. `Self::flush_node`. The method will push the new node if it extracted the last one. - /// So because other methods cannot see the intermediate state invariant still holds. + /// So because other methods cannot see the intermediate state invariant still holds. /// 2. `Self::finish`. It consumes self and does not return it back, - /// which means that this is where the structure is destroyed. - /// Thus stack of zero length cannot be observed by other methods. + /// which means that this is where the structure is destroyed. + /// Thus stack of zero length cannot be observed by other methods. stack: Vec>, /// Last key that was appended to the tree. Used to sanity check that append @@ -701,8 +834,6 @@ impl BuildNode { #[cfg(test)] pub(crate) mod tests { use super::*; - use crate::context::DownloadBehavior; - use crate::task_mgr::TaskKind; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef}; use rand::Rng; use std::collections::BTreeMap; @@ -999,6 +1130,17 @@ pub(crate) mod tests { == all_data.get(&u128::MAX).cloned() ); + // Test iterator and get_stream API + let mut iter = reader.iter(&[0; 16], &ctx); + let mut cnt = 0; + while let Some(res) = iter.next().await { + let (key, val) = res?; + let key = u128::from_be_bytes(key.as_slice().try_into().unwrap()); + assert_eq!(val, *all_data.get(&key).unwrap()); + cnt += 1; + } + assert_eq!(cnt, all_data.len()); + Ok(()) } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 591eacd104..5324e1807d 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -1,39 +1,48 @@ //! Implementation of append-only file data structure //! used to keep in-memory layers spilled on disk. 
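// A minimal, self-contained sketch (hypothetical `BufferedAppend` type,
// std-only; not the pageserver's API) of the buffering scheme the rewritten
// EphemeralFile below adopts: appends go into a fixed-capacity tail buffer
// that is flushed to the underlying file when it fills up, so a read may have
// to be stitched together from the flushed prefix and the still-buffered tail.

struct BufferedAppend {
    flushed: Vec<u8>, // stands in for the VirtualFile contents
    tail: Vec<u8>,    // stands in for the in-memory write buffer
    cap: usize,       // flush threshold, like TAIL_SZ below
}

impl BufferedAppend {
    fn write(&mut self, src: &[u8]) {
        for &b in src {
            self.tail.push(b);
            if self.tail.len() == self.cap {
                // buffer full: flush it to the "file" and start over
                self.flushed.extend_from_slice(&self.tail);
                self.tail.clear();
            }
        }
    }

    /// Logical length: flushed bytes plus the buffered tail.
    fn len(&self) -> usize {
        self.flushed.len() + self.tail.len()
    }

    /// Read from `start` (assumed <= self.len()) to the logical end,
    /// splitting at the flush boundary like `read_exact_at_eof_ok` below.
    fn read_at(&self, start: usize, dst: &mut Vec<u8>) {
        let end = self.len();
        let split = self.flushed.len();
        if start < split {
            dst.extend_from_slice(&self.flushed[start..end.min(split)]);
        }
        if end > split {
            dst.extend_from_slice(&self.tail[start.max(split) - split..end - split]);
        }
    }
}

// E.g. with cap = 4, write(b"hello") leaves "hell" flushed and "o" buffered;
// read_at(2, ..) then stitches "ll" from the file with "o" from the tail.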
+use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::RequestContext; -use crate::page_cache::{self, PAGE_SZ}; -use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader}; -use crate::virtual_file::VirtualFile; +use crate::page_cache; +use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; +use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; +use crate::virtual_file::owned_buffers_io::util::size_tracking_writer; +use crate::virtual_file::owned_buffers_io::write::Buffer; +use crate::virtual_file::{self, owned_buffers_io, VirtualFile}; +use bytes::BytesMut; use camino::Utf8PathBuf; +use num_traits::Num; use pageserver_api::shard::TenantShardId; -use std::cmp::min; -use std::fs::OpenOptions; -use std::io::{self, ErrorKind}; -use std::ops::DerefMut; +use tokio_epoll_uring::{BoundedBuf, Slice}; +use tracing::error; + +use std::io; use std::sync::atomic::AtomicU64; -use tracing::*; use utils::id::TimelineId; pub struct EphemeralFile { - page_cache_file_id: page_cache::FileId, - _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, - file: VirtualFile, - len: u64, - /// An ephemeral file is append-only. - /// We keep the last page, which can still be modified, in [`Self::mutable_tail`]. - /// The other pages, which can no longer be modified, are accessed through the page cache. - mutable_tail: [u8; PAGE_SZ], + page_cache_file_id: page_cache::FileId, + bytes_written: u64, + buffered_writer: owned_buffers_io::write::BufferedWriter< + BytesMut, + size_tracking_writer::Writer, + >, + /// Gate guard is held on as long as we need to do operations in the path (delete on drop) + _gate_guard: utils::sync::gate::GateGuard, } +const TAIL_SZ: usize = 64 * 1024; + impl EphemeralFile { pub async fn create( conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + gate_guard: utils::sync::gate::GateGuard, + ctx: &RequestContext, ) -> Result { static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); let filename_disambiguator = @@ -47,190 +56,186 @@ impl EphemeralFile { let file = VirtualFile::open_with_options( &filename, - OpenOptions::new().read(true).write(true).create(true), + virtual_file::OpenOptions::new() + .read(true) + .write(true) + .create(true), + ctx, ) .await?; + let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore + Ok(EphemeralFile { - page_cache_file_id: page_cache::next_file_id(), _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - file, - len: 0, - mutable_tail: [0u8; PAGE_SZ], + page_cache_file_id, + bytes_written: 0, + buffered_writer: owned_buffers_io::write::BufferedWriter::new( + size_tracking_writer::Writer::new(file), + BytesMut::with_capacity(TAIL_SZ), + ), + _gate_guard: gate_guard, }) } +} - pub(crate) fn len(&self) -> u64 { - self.len - } - - pub(crate) async fn read_blk( - &self, - blknum: u32, - ctx: &RequestContext, - ) -> Result { - let flushed_blknums = 0..self.len / PAGE_SZ as u64; - if flushed_blknums.contains(&(blknum as u64)) { - let cache = page_cache::get(); - match cache - .read_immutable_buf(self.page_cache_file_id, blknum, ctx) - .await - .map_err(|e| { - std::io::Error::new( - std::io::ErrorKind::Other, - // order path before error because error is anyhow::Error => might have many contexts - format!( - "ephemeral file: read immutable page #{}: {}: {:#}", - blknum, self.file.path, e, - ), - ) - })? 
{ - page_cache::ReadBufResult::Found(guard) => { - return Ok(BlockLease::PageReadGuard(guard)) - } - page_cache::ReadBufResult::NotFound(mut write_guard) => { - let buf: &mut [u8] = write_guard.deref_mut(); - debug_assert_eq!(buf.len(), PAGE_SZ); - self.file - .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64) - .await?; - let read_guard = write_guard.mark_valid(); - return Ok(BlockLease::PageReadGuard(read_guard)); - } - }; - } else { - debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64); - Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail)) +impl Drop for EphemeralFile { + fn drop(&mut self) { + // unlink the file + // we are clear to do this, because we have entered a gate + let path = &self.buffered_writer.as_inner().as_inner().path; + let res = std::fs::remove_file(path); + if let Err(e) = res { + if e.kind() != std::io::ErrorKind::NotFound { + // just never log the not found errors, we cannot do anything for them; on detach + // the tenant directory is already gone. + // + // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 + error!("could not remove ephemeral file '{path}': {e}"); + } } } +} - pub(crate) async fn write_blob( +impl EphemeralFile { + pub(crate) fn len(&self) -> u64 { + self.bytes_written + } + + pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId { + self.page_cache_file_id + } + + pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { + let size = self.len().into_usize(); + let vec = Vec::with_capacity(size); + let (slice, nread) = self.read_exact_at_eof_ok(0, vec.slice_full(), ctx).await?; + assert_eq!(nread, size); + let vec = slice.into_inner(); + assert_eq!(vec.len(), nread); + assert_eq!(vec.capacity(), size, "we shouldn't be reallocating"); + Ok(vec) + } + + /// Returns the offset at which the first byte of the input was written, for use + /// in constructing indices over the written value. + /// + /// Panics if the write is short because there's no way we can recover from that. + /// TODO: make upstack handle this as an error. + pub(crate) async fn write_raw( &mut self, srcbuf: &[u8], ctx: &RequestContext, - ) -> Result { - struct Writer<'a> { - ephemeral_file: &'a mut EphemeralFile, - /// The block to which the next [`push_bytes`] will write. - blknum: u32, - /// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write. - off: usize, - } - impl<'a> Writer<'a> { - fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result> { - Ok(Writer { - blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32, - off: (ephemeral_file.len % PAGE_SZ as u64) as usize, - ephemeral_file, - }) - } - #[inline(always)] - async fn push_bytes( - &mut self, - src: &[u8], - ctx: &RequestContext, - ) -> Result<(), io::Error> { - let mut src_remaining = src; - while !src_remaining.is_empty() { - let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..]; - let n = min(dst_remaining.len(), src_remaining.len()); - dst_remaining[..n].copy_from_slice(&src_remaining[..n]); - self.off += n; - src_remaining = &src_remaining[n..]; - if self.off == PAGE_SZ { - match self - .ephemeral_file - .file - .write_all_at( - &self.ephemeral_file.mutable_tail, - self.blknum as u64 * PAGE_SZ as u64, - ) - .await - { - Ok(_) => { - // Pre-warm the page cache with what we just wrote. - // This isn't necessary for coherency/correctness, but it's how we've always done it. 
-                            let cache = page_cache::get();
-                            match cache
-                                .read_immutable_buf(
-                                    self.ephemeral_file.page_cache_file_id,
-                                    self.blknum,
-                                    ctx,
-                                )
-                                .await
-                            {
-                                Ok(page_cache::ReadBufResult::Found(_guard)) => {
-                                    // This function takes &mut self, so, it shouldn't be possible to reach this point.
-                                    unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum);
-                                }
-                                Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => {
-                                    let buf: &mut [u8] = write_guard.deref_mut();
-                                    debug_assert_eq!(buf.len(), PAGE_SZ);
-                                    buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
-                                    let _ = write_guard.mark_valid();
-                                    // pre-warm successful
-                                }
-                                Err(e) => {
-                                    error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
-                                    // fail gracefully, it's not the end of the world if we can't pre-warm the cache here
-                                }
-                            }
-                            // Zero the buffer for re-use.
-                            // Zeroing is critical for correcntess because the write_blob code below
-                            // and similarly read_blk expect zeroed pages.
-                            self.ephemeral_file.mutable_tail.fill(0);
-                            // This block is done, move to next one.
-                            self.blknum += 1;
-                            self.off = 0;
-                        }
-                        Err(e) => {
-                            return Err(std::io::Error::new(
-                                ErrorKind::Other,
-                                // order error before path because path is long and error is short
-                                format!(
-                                    "ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}",
-                                    self.blknum,
-                                    e,
-                                    self.ephemeral_file.file.path,
-                                ),
-                            ));
-                        }
-                    }
-                }
-            }
-            Ok(())
-        }
-    }
+    ) -> std::io::Result<u64> {
+        let pos = self.bytes_written;

-        let pos = self.len;
-        let mut writer = Writer::new(self)?;
-
-        // Write the length field
-        if srcbuf.len() < 0x80 {
-            // short one-byte length header
-            let len_buf = [srcbuf.len() as u8];
-            writer.push_bytes(&len_buf, ctx).await?;
-        } else {
-            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
-            len_buf[0] |= 0x80;
-            writer.push_bytes(&len_buf, ctx).await?;
-        }
+        let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| {
+            std::io::Error::new(
+                std::io::ErrorKind::Other,
+                format!(
+                    "write would grow EphemeralFile beyond u64::MAX: len={pos} written={srcbuf_len}",
+                    srcbuf_len = srcbuf.len(),
+                ),
+            )
+        })?;

         // Write the payload
-        writer.push_bytes(srcbuf, ctx).await?;
+        let nwritten = self
+            .buffered_writer
+            .write_buffered_borrowed(srcbuf, ctx)
+            .await?;
+        assert_eq!(
+            nwritten,
+            srcbuf.len(),
+            "buffered writer has no short writes"
+        );

-        if srcbuf.len() < 0x80 {
-            self.len += 1;
-        } else {
-            self.len += 4;
-        }
-        self.len += srcbuf.len() as u64;
+        self.bytes_written = new_bytes_written;

         Ok(pos)
     }
 }

+impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile {
+    async fn read_exact_at_eof_ok<'a, 'b, B: tokio_epoll_uring::IoBufMut + Send>(
+        &'b self,
+        start: u64,
+        dst: tokio_epoll_uring::Slice<B>,
+        ctx: &'a RequestContext,
+    ) -> std::io::Result<(tokio_epoll_uring::Slice<B>, usize)> {
+        let file_size_tracking_writer = self.buffered_writer.as_inner();
+        let flushed_offset = file_size_tracking_writer.bytes_written();
+
+        let buffer = self.buffered_writer.inspect_buffer();
+        let buffered = &buffer[0..buffer.pending()];
+
+        let dst_cap = dst.bytes_total().into_u64();
+        let end = {
+            // saturating_add is correct here because the max file size is u64::MAX, so,
+            // if start + dst.len() > u64::MAX, then we know it will be a short read
+            let mut end: u64 = start.saturating_add(dst_cap);
+            if end > self.bytes_written {
+                end = self.bytes_written;
+            }
+            end
+        };
+
+        // inclusive, exclusive
+        #[derive(Debug)]
+ struct Range(N, N); + impl Range { + fn len(&self) -> N { + if self.0 > self.1 { + N::zero() + } else { + self.1 - self.0 + } + } + } + let written_range = Range(start, std::cmp::min(end, flushed_offset)); + let buffered_range = Range(std::cmp::max(start, flushed_offset), end); + + let dst = if written_range.len() > 0 { + let file: &VirtualFile = file_size_tracking_writer.as_inner(); + let bounds = dst.bounds(); + let slice = file + .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx) + .await?; + Slice::from_buf_bounds(Slice::into_inner(slice), bounds) + } else { + dst + }; + + let dst = if buffered_range.len() > 0 { + let offset_in_buffer = buffered_range + .0 + .checked_sub(flushed_offset) + .unwrap() + .into_usize(); + let to_copy = + &buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())]; + let bounds = dst.bounds(); + let mut view = dst.slice({ + let start = written_range.len().into_usize(); + let end = start + .checked_add(buffered_range.len().into_usize()) + .unwrap(); + start..end + }); + view.as_mut_rust_slice_full_zeroed() + .copy_from_slice(to_copy); + Slice::from_buf_bounds(Slice::into_inner(view), bounds) + } else { + dst + }; + + // TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs + + Ok((dst, (end - start).into_usize())) + } +} + /// Does the given filename look like an ephemeral file? pub fn is_ephemeral_file(filename: &str) -> bool { if let Some(rest) = filename.strip_prefix("ephemeral-") { @@ -240,41 +245,13 @@ pub fn is_ephemeral_file(filename: &str) -> bool { } } -impl Drop for EphemeralFile { - fn drop(&mut self) { - // There might still be pages in the [`crate::page_cache`] for this file. - // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. - - // unlink the file - let res = std::fs::remove_file(&self.file.path); - if let Err(e) = res { - if e.kind() != std::io::ErrorKind::NotFound { - // just never log the not found errors, we cannot do anything for them; on detach - // the tenant directory is already gone. - // - // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 - error!( - "could not remove ephemeral file '{}': {}", - self.file.path, e - ); - } - } - } -} - -impl BlockReader for EphemeralFile { - fn block_cursor(&self) -> super::block_io::BlockCursor<'_> { - BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self)) - } -} - #[cfg(test)] mod tests { + use rand::Rng; + use super::*; use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; - use crate::tenant::block_io::{BlockCursor, BlockReaderRef}; - use rand::{thread_rng, RngCore}; use std::fs; use std::str::FromStr; @@ -306,61 +283,183 @@ mod tests { } #[tokio::test] - async fn test_ephemeral_blobs() -> Result<(), io::Error> { - let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?; + async fn ephemeral_file_holds_gate_open() { + const FOREVER: std::time::Duration = std::time::Duration::from_secs(5); - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id).await?; + let (conf, tenant_id, timeline_id, ctx) = + harness("ephemeral_file_holds_gate_open").unwrap(); - let pos_foo = file.write_blob(b"foo", &ctx).await?; - assert_eq!( - b"foo", - file.block_cursor() - .read_blob(pos_foo, &ctx) - .await? - .as_slice() - ); - let pos_bar = file.write_blob(b"bar", &ctx).await?; - assert_eq!( - b"foo", - file.block_cursor() - .read_blob(pos_foo, &ctx) - .await? 
-                .as_slice()
-        );
-        assert_eq!(
-            b"bar",
-            file.block_cursor()
-                .read_blob(pos_bar, &ctx)
-                .await?
-                .as_slice()
-        );
+        let gate = utils::sync::gate::Gate::default();

-        let mut blobs = Vec::new();
-        for i in 0..10000 {
-            let data = Vec::from(format!("blob{}", i).as_bytes());
-            let pos = file.write_blob(&data, &ctx).await?;
-            blobs.push((pos, data));
-        }
-        // also test with a large blobs
-        for i in 0..100 {
-            let data = format!("blob{}", i).as_bytes().repeat(100);
-            let pos = file.write_blob(&data, &ctx).await?;
-            blobs.push((pos, data));
+        let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
+            .await
+            .unwrap();
+
+        let mut closing = tokio::task::spawn(async move {
+            gate.close().await;
+        });
+
+        // gate is entered until the ephemeral file is dropped
+        // do not start paused; tokio-epoll-uring has a sleep loop
+        tokio::time::pause();
+        tokio::time::timeout(FOREVER, &mut closing)
+            .await
+            .expect_err("closing cannot complete before dropping");
+
+        // this is a requirement of the reset_tenant functionality: we have to be able to restart a
+        // tenant fast, and for that, we need all tenant_dir operations to be guarded by entering a gate
+        drop(file);
+
+        tokio::time::timeout(FOREVER, &mut closing)
+            .await
+            .expect("closing completes right away")
+            .expect("closing does not panic");
+    }
+
+    #[tokio::test]
+    async fn test_ephemeral_file_basics() {
+        let (conf, tenant_id, timeline_id, ctx) = harness("test_ephemeral_file_basics").unwrap();
+
+        let gate = utils::sync::gate::Gate::default();
+
+        let mut file =
+            EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
+                .await
+                .unwrap();
+
+        let cap = file.buffered_writer.inspect_buffer().capacity();
+
+        let write_nbytes = cap + cap / 2;
+
+        let content: Vec<u8> = rand::thread_rng()
+            .sample_iter(rand::distributions::Standard)
+            .take(write_nbytes)
+            .collect();
+
+        let mut value_offsets = Vec::new();
+        for i in 0..write_nbytes {
+            let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap();
+            value_offsets.push(off);
         }

-        let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
-        for (pos, expected) in blobs {
-            let actual = cursor.read_blob(pos, &ctx).await?;
-            assert_eq!(actual, expected);
+        assert!(file.len() as usize == write_nbytes);
+        for i in 0..write_nbytes {
+            assert_eq!(value_offsets[i], i.into_u64());
+            let buf = Vec::with_capacity(1);
+            let (buf_slice, nread) = file
+                .read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx)
+                .await
+                .unwrap();
+            let buf = buf_slice.into_inner();
+            assert_eq!(nread, 1);
+            assert_eq!(&buf, &content[i..i + 1]);
         }

-        // Test a large blob that spans multiple pages
-        let mut large_data = vec![0; 20000];
-        thread_rng().fill_bytes(&mut large_data);
-        let pos_large = file.write_blob(&large_data, &ctx).await?;
-        let result = file.block_cursor().read_blob(pos_large, &ctx).await?;
-        assert_eq!(result, large_data);
+        let file_contents =
+            std::fs::read(&file.buffered_writer.as_inner().as_inner().path).unwrap();
+        assert_eq!(file_contents, &content[0..cap]);

-        Ok(())
+        let buffer_contents = file.buffered_writer.inspect_buffer();
+        assert_eq!(buffer_contents, &content[cap..write_nbytes]);
+    }
+
+    #[tokio::test]
+    async fn test_flushes_do_happen() {
+        let (conf, tenant_id, timeline_id, ctx) = harness("test_flushes_do_happen").unwrap();
+
+        let gate = utils::sync::gate::Gate::default();
+
+        let mut file =
+            EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
+                .await
+                .unwrap();
+
+        let cap =
file.buffered_writer.inspect_buffer().capacity();
+
+        let content: Vec<u8> = rand::thread_rng()
+            .sample_iter(rand::distributions::Standard)
+            .take(cap + cap / 2)
+            .collect();
+
+        file.write_raw(&content, &ctx).await.unwrap();
+
+        // assert the state is as this test expects it to be
+        assert_eq!(
+            &file.load_to_vec(&ctx).await.unwrap(),
+            &content[0..cap + cap / 2]
+        );
+        let md = file
+            .buffered_writer
+            .as_inner()
+            .as_inner()
+            .path
+            .metadata()
+            .unwrap();
+        assert_eq!(
+            md.len(),
+            cap.into_u64(),
+            "buffered writer does one write if we write 1.5x buffer capacity"
+        );
+        assert_eq!(
+            &file.buffered_writer.inspect_buffer()[0..cap / 2],
+            &content[cap..cap + cap / 2]
+        );
+    }
+
+    #[tokio::test]
+    async fn test_read_split_across_file_and_buffer() {
+        // This test exercises the logic on the read path that splits the logical read
+        // into a read from the flushed part (= the file) and a copy from the buffered writer's buffer.
+        //
+        // This test builds on the assertions in test_flushes_do_happen.
+
+        let (conf, tenant_id, timeline_id, ctx) =
+            harness("test_read_split_across_file_and_buffer").unwrap();
+
+        let gate = utils::sync::gate::Gate::default();
+
+        let mut file =
+            EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
+                .await
+                .unwrap();
+
+        let cap = file.buffered_writer.inspect_buffer().capacity();
+
+        let content: Vec<u8> = rand::thread_rng()
+            .sample_iter(rand::distributions::Standard)
+            .take(cap + cap / 2)
+            .collect();
+
+        file.write_raw(&content, &ctx).await.unwrap();
+
+        let test_read = |start: usize, len: usize| {
+            let file = &file;
+            let ctx = &ctx;
+            let content = &content;
+            async move {
+                let (buf, nread) = file
+                    .read_exact_at_eof_ok(
+                        start.into_u64(),
+                        Vec::with_capacity(len).slice_full(),
+                        ctx,
+                    )
+                    .await
+                    .unwrap();
+                assert_eq!(nread, len);
+                assert_eq!(&buf.into_inner(), &content[start..(start + len)]);
+            }
+        };
+
+        // completely within the file range
+        assert!(20 < cap, "test assumption");
+        test_read(10, 10).await;
+        // end exactly at the edge of the file
+        test_read(cap - 10, 10).await;
+        // read across file and buffer
+        test_read(cap - 10, 20).await;
+        // start exactly at the beginning of the buffer
+        test_read(cap, 10).await;
+        // completely within buffer
+        test_read(cap + 10, 10).await;
     }
 }
diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs
new file mode 100644
index 0000000000..8b41ba1746
--- /dev/null
+++ b/pageserver/src/tenant/gc_block.rs
@@ -0,0 +1,213 @@
+use std::collections::HashMap;
+
+use utils::id::TimelineId;
+
+use super::remote_timeline_client::index::GcBlockingReason;
+
+type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
+
+#[derive(Default)]
+pub(crate) struct GcBlock {
+    /// The timelines which have current reasons to block gc.
+    ///
+    /// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
+    /// to keep this field up to date with RemoteTimelineClient `upload_queue.dirty`.
+    reasons: std::sync::Mutex<Storage>,
+    blocking: tokio::sync::Mutex<()>,
+}
+
+impl GcBlock {
+    /// Start another gc iteration.
+    ///
+    /// Returns a guard to be held for the duration of gc iteration to allow synchronizing with
+    /// its ending, or if not currently possible, a value describing the reasons why not.
+    ///
+    /// Cancellation safe.
+    pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
+        let reasons = {
+            let g = self.reasons.lock().unwrap();
+
+            // TODO: the assumption is that this method gets called periodically. in prod, we use 1h, in
+            // tests, we use everything.
we should warn if the gc has been consecutively blocked
+            // for more than 1h (within single tenant session?).
+            BlockingReasons::clean_and_summarize(g)
+        };
+
+        if let Some(reasons) = reasons {
+            Err(reasons)
+        } else {
+            Ok(Guard {
+                _inner: self.blocking.lock().await,
+            })
+        }
+    }
+
+    pub(crate) fn summary(&self) -> Option<BlockingReasons> {
+        let g = self.reasons.lock().unwrap();
+
+        BlockingReasons::summarize(&g)
+    }
+
+    /// Start blocking gc for this one timeline for the given reason.
+    ///
+    /// This is not a guard-based API; instead, it mimics a set API. The returned future will not
+    /// resolve until an existing gc round has completed.
+    ///
+    /// Returns true if this block was new, false if gc was already blocked for this reason.
+    ///
+    /// Cancellation safe: cancelling after the first poll does not undo the insertion; the gc
+    /// blocking reason stays in place.
+    pub(crate) async fn insert(
+        &self,
+        timeline: &super::Timeline,
+        reason: GcBlockingReason,
+    ) -> anyhow::Result<bool> {
+        let (added, uploaded) = {
+            let mut g = self.reasons.lock().unwrap();
+            let set = g.entry(timeline.timeline_id).or_default();
+            let added = set.insert(reason);
+
+            // LOCK ORDER: intentionally hold the lock, see self.reasons.
+            let uploaded = timeline
+                .remote_client
+                .schedule_insert_gc_block_reason(reason)?;
+
+            (added, uploaded)
+        };
+
+        uploaded.await?;
+
+        // ensure that any ongoing gc iteration has completed
+        drop(self.blocking.lock().await);
+
+        Ok(added)
+    }
+
+    /// Remove blocking gc for this one timeline and the given reason.
+    pub(crate) async fn remove(
+        &self,
+        timeline: &super::Timeline,
+        reason: GcBlockingReason,
+    ) -> anyhow::Result<()> {
+        use std::collections::hash_map::Entry;
+
+        super::span::debug_assert_current_span_has_tenant_and_timeline_id();
+
+        let (remaining_blocks, uploaded) = {
+            let mut g = self.reasons.lock().unwrap();
+            match g.entry(timeline.timeline_id) {
+                Entry::Occupied(mut oe) => {
+                    let set = oe.get_mut();
+                    set.remove(reason);
+                    if set.is_empty() {
+                        oe.remove();
+                    }
+                }
+                Entry::Vacant(_) => {
+                    // we must still do the index_part.json update regardless, in case we had earlier
+                    // been cancelled
+                }
+            }
+
+            let remaining_blocks = g.len();
+
+            // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
+            let uploaded = timeline
+                .remote_client
+                .schedule_remove_gc_block_reason(reason)?;
+
+            (remaining_blocks, uploaded)
+        };
+        uploaded.await?;
+
+        // no need to synchronize with gc iteration again
+
+        if remaining_blocks > 0 {
+            tracing::info!(remaining_blocks, removed=?reason, "gc blocking removed, but gc remains blocked");
+        } else {
+            tracing::info!("gc is now unblocked for the tenant");
+        }
+
+        Ok(())
+    }
+
+    pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
+        let unblocked = {
+            let mut g = self.reasons.lock().unwrap();
+            if g.is_empty() {
+                return;
+            }
+
+            g.remove(&timeline.timeline_id);
+
+            BlockingReasons::clean_and_summarize(g).is_none()
+        };
+
+        if unblocked {
+            tracing::info!("gc is now unblocked following deletion");
+        }
+    }
+
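// A hedged usage sketch (hypothetical call site, not part of this file):
// block gc for one timeline while some operation runs, then unblock. Both
// calls schedule an index_part.json update before resolving, and `insert`
// additionally waits out any gc iteration that is already running:
//
//     async fn with_gc_blocked(
//         gc_block: &GcBlock,
//         timeline: &super::Timeline,
//         reason: GcBlockingReason, // assumed Copy, as enumset-compatible enums are
//     ) -> anyhow::Result<()> {
//         let _was_new = gc_block.insert(timeline, reason).await?;
//         // ... work that must not race with gc ...
//         gc_block.remove(timeline, reason).await?;
//         Ok(())
//     }
+    /// Initialize with the non-deleted timelines of this tenant.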
+ pub(crate) fn set_scanned(&self, scanned: Storage) { + let mut g = self.reasons.lock().unwrap(); + assert!(g.is_empty()); + g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty())); + + if let Some(reasons) = BlockingReasons::clean_and_summarize(g) { + tracing::info!(summary=?reasons, "initialized with gc blocked"); + } + } +} + +pub(super) struct Guard<'a> { + _inner: tokio::sync::MutexGuard<'a, ()>, +} + +#[derive(Debug)] +pub(crate) struct BlockingReasons { + timelines: usize, + reasons: enumset::EnumSet, +} + +impl std::fmt::Display for BlockingReasons { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{} timelines block for {:?}", + self.timelines, self.reasons + ) + } +} + +impl BlockingReasons { + fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option { + let mut reasons = enumset::EnumSet::empty(); + g.retain(|_key, value| { + reasons = reasons.union(*value); + !value.is_empty() + }); + if !g.is_empty() { + Some(BlockingReasons { + timelines: g.len(), + reasons, + }) + } else { + None + } + } + + fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option { + if g.is_empty() { + None + } else { + let reasons = g + .values() + .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next)); + Some(BlockingReasons { + timelines: g.len(), + reasons, + }) + } + } +} diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index c31d401e84..707233b003 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -51,7 +51,10 @@ use crate::keyspace::KeyPartitioning; use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use anyhow::Result; -use std::collections::VecDeque; +use pageserver_api::keyspace::{KeySpace, KeySpaceAccum}; +use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze}; +use std::collections::{HashMap, VecDeque}; +use std::iter::Peekable; use std::ops::Range; use std::sync::Arc; use utils::lsn::Lsn; @@ -59,7 +62,7 @@ use utils::lsn::Lsn; use historic_layer_coverage::BufferedHistoricLayerCoverage; pub use historic_layer_coverage::LayerKey; -use super::storage_layer::PersistentLayerDesc; +use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc}; /// /// LayerMap tracks what layers exist on a timeline. @@ -144,11 +147,206 @@ impl Drop for BatchedUpdates<'_> { } /// Return value of LayerMap::search +#[derive(Eq, PartialEq, Debug, Hash)] pub struct SearchResult { pub layer: Arc, pub lsn_floor: Lsn, } +/// Return value of [`LayerMap::range_search`] +/// +/// Contains a mapping from a layer description to a keyspace +/// accumulator that contains all the keys which intersect the layer +/// from the original search space. Keys that were not found are accumulated +/// in a separate key space accumulator. +#[derive(Debug)] +pub struct RangeSearchResult { + pub found: HashMap, + pub not_found: KeySpaceAccum, +} + +impl RangeSearchResult { + fn new() -> Self { + Self { + found: HashMap::new(), + not_found: KeySpaceAccum::new(), + } + } +} + +/// Collector for results of range search queries on the LayerMap. +/// It should be provided with two iterators for the delta and image coverage +/// that contain all the changes for layers which intersect the range. 
+struct RangeSearchCollector<Iter>
+where
+    Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
+{
+    delta_coverage: Peekable<Iter>,
+    image_coverage: Peekable<Iter>,
+    key_range: Range<Key>,
+    end_lsn: Lsn,
+
+    current_delta: Option<Arc<PersistentLayerDesc>>,
+    current_image: Option<Arc<PersistentLayerDesc>>,
+
+    result: RangeSearchResult,
+}
+
+#[derive(Debug)]
+enum NextLayerType {
+    Delta(i128),
+    Image(i128),
+    Both(i128),
+}
+
+impl NextLayerType {
+    fn next_change_at_key(&self) -> Key {
+        match self {
+            NextLayerType::Delta(at) => Key::from_i128(*at),
+            NextLayerType::Image(at) => Key::from_i128(*at),
+            NextLayerType::Both(at) => Key::from_i128(*at),
+        }
+    }
+}
+
+impl<Iter> RangeSearchCollector<Iter>
+where
+    Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
+{
+    fn new(
+        key_range: Range<Key>,
+        end_lsn: Lsn,
+        delta_coverage: Iter,
+        image_coverage: Iter,
+    ) -> Self {
+        Self {
+            delta_coverage: delta_coverage.peekable(),
+            image_coverage: image_coverage.peekable(),
+            key_range,
+            end_lsn,
+            current_delta: None,
+            current_image: None,
+            result: RangeSearchResult::new(),
+        }
+    }
+
+    /// Run the collector. Collection is implemented via a two-pointer algorithm.
+    /// One pointer tracks the start of the current range and the other tracks
+    /// the beginning of the next range which will overlap with the next change
+    /// in coverage across both image and delta.
+    fn collect(mut self) -> RangeSearchResult {
+        let next_layer_type = self.choose_next_layer_type();
+        let mut current_range_start = match next_layer_type {
+            None => {
+                // No changes for the range
+                self.pad_range(self.key_range.clone());
+                return self.result;
+            }
+            Some(layer_type) if self.key_range.end <= layer_type.next_change_at_key() => {
+                // Changes only after the end of the range
+                self.pad_range(self.key_range.clone());
+                return self.result;
+            }
+            Some(layer_type) => {
+                // Changes for the range exist. Record anything before the first
+                // coverage change as not found.
+                let coverage_start = layer_type.next_change_at_key();
+                let range_before = self.key_range.start..coverage_start;
+                self.pad_range(range_before);
+
+                self.advance(&layer_type);
+                coverage_start
+            }
+        };
+
+        while current_range_start < self.key_range.end {
+            let next_layer_type = self.choose_next_layer_type();
+            match next_layer_type {
+                Some(t) => {
+                    let current_range_end = t.next_change_at_key();
+                    self.add_range(current_range_start..current_range_end);
+                    current_range_start = current_range_end;
+
+                    self.advance(&t);
+                }
+                None => {
+                    self.add_range(current_range_start..self.key_range.end);
+                    current_range_start = self.key_range.end;
+                }
+            }
+        }
+
+        self.result
+    }
+
+    /// Mark a range as not found (i.e. no layers intersect it)
+    fn pad_range(&mut self, key_range: Range<Key>) {
+        if !key_range.is_empty() {
+            self.result.not_found.add_range(key_range);
+        }
+    }
+
+    /// Select the appropriate layer for the given range and update
+    /// the collector.
+    fn add_range(&mut self, covered_range: Range<Key>) {
+        let selected = LayerMap::select_layer(
+            self.current_delta.clone(),
+            self.current_image.clone(),
+            self.end_lsn,
+        );
+
+        match selected {
+            Some(search_result) => self
+                .result
+                .found
+                .entry(search_result)
+                .or_default()
+                .add_range(covered_range),
+            None => self.pad_range(covered_range),
+        }
+    }
+
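// A self-contained toy version of the two-pointer sweep in `collect` above,
// using plain integers instead of keys and layers (hypothetical `sweep`
// helper, for illustration only): walk two sorted lists of coverage change
// points and emit the maximal sub-ranges of `range` on which the selected
// (delta, image) pair stays constant.

fn sweep(deltas: &[i32], images: &[i32], range: (i32, i32)) -> Vec<(i32, i32)> {
    let mut d = deltas.iter().copied().peekable();
    let mut i = images.iter().copied().peekable();
    let mut out = Vec::new();
    let mut start = range.0;
    loop {
        // the next change point across both coverages, like choose_next_layer_type
        let next = match (d.peek(), i.peek()) {
            (None, None) => None,
            (Some(a), None) => Some(*a),
            (None, Some(b)) => Some(*b),
            (Some(a), Some(b)) => Some(*a.min(b)),
        };
        match next {
            Some(p) if p < range.1 => {
                if p > start {
                    out.push((start, p)); // selection is constant on [start, p)
                    start = p;
                }
                // advance whichever sides sit at p (the `Both` case advances two)
                if d.peek() == Some(&p) {
                    d.next();
                }
                if i.peek() == Some(&p) {
                    i.next();
                }
            }
            _ => {
                // no more changes inside the range: close the final sub-range
                out.push((start, range.1));
                return out;
            }
        }
    }
}
+    /// Move to the next coverage change.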
+    fn advance(&mut self, layer_type: &NextLayerType) {
+        match layer_type {
+            NextLayerType::Delta(_) => {
+                let (_, layer) = self.delta_coverage.next().unwrap();
+                self.current_delta = layer;
+            }
+            NextLayerType::Image(_) => {
+                let (_, layer) = self.image_coverage.next().unwrap();
+                self.current_image = layer;
+            }
+            NextLayerType::Both(_) => {
+                let (_, image_layer) = self.image_coverage.next().unwrap();
+                let (_, delta_layer) = self.delta_coverage.next().unwrap();
+
+                self.current_image = image_layer;
+                self.current_delta = delta_layer;
+            }
+        }
+    }
+
+    /// Pick the next coverage change: the one at the lesser key or both if they're aligned.
+    fn choose_next_layer_type(&mut self) -> Option<NextLayerType> {
+        let next_delta_at = self.delta_coverage.peek().map(|(key, _)| key);
+        let next_image_at = self.image_coverage.peek().map(|(key, _)| key);
+
+        match (next_delta_at, next_image_at) {
+            (None, None) => None,
+            (Some(next_delta_at), None) => Some(NextLayerType::Delta(*next_delta_at)),
+            (None, Some(next_image_at)) => Some(NextLayerType::Image(*next_image_at)),
+            (Some(next_delta_at), Some(next_image_at)) if next_image_at < next_delta_at => {
+                Some(NextLayerType::Image(*next_image_at))
+            }
+            (Some(next_delta_at), Some(next_image_at)) if next_delta_at < next_image_at => {
+                Some(NextLayerType::Delta(*next_delta_at))
+            }
+            (Some(next_delta_at), Some(_)) => Some(NextLayerType::Both(*next_delta_at)),
+        }
+    }
+}
+
 impl LayerMap {
     ///
     /// Find the latest layer (by lsn.end) that covers the given
@@ -186,7 +384,18 @@ impl LayerMap {
         let latest_delta = version.delta_coverage.query(key.to_i128());
         let latest_image = version.image_coverage.query(key.to_i128());

-        match (latest_delta, latest_image) {
+        Self::select_layer(latest_delta, latest_image, end_lsn)
+    }
+
+    fn select_layer(
+        delta_layer: Option<Arc<PersistentLayerDesc>>,
+        image_layer: Option<Arc<PersistentLayerDesc>>,
+        end_lsn: Lsn,
+    ) -> Option<SearchResult> {
+        assert!(delta_layer.as_ref().map_or(true, |l| l.is_delta()));
+        assert!(image_layer.as_ref().map_or(true, |l| !l.is_delta()));
+
+        match (delta_layer, image_layer) {
             (None, None) => None,
             (None, Some(image)) => {
                 let lsn_floor = image.get_lsn_range().start;
@@ -223,6 +432,24 @@ impl LayerMap {
         }
     }

+    pub fn range_search(&self, key_range: Range<Key>, end_lsn: Lsn) -> RangeSearchResult {
+        let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) {
+            Some(version) => version,
+            None => {
+                let mut result = RangeSearchResult::new();
+                result.not_found.add_range(key_range);
+                return result;
+            }
+        };
+
+        let raw_range = key_range.start.to_i128()..key_range.end.to_i128();
+        let delta_changes = version.delta_coverage.range_overlaps(&raw_range);
+        let image_changes = version.image_coverage.range_overlaps(&raw_range);
+
+        let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes);
+        collector.collect()
+    }
+
     /// Start a batch of updates, applied on drop
     pub fn batch_update(&mut self) -> BatchedUpdates<'_> {
         BatchedUpdates { layer_map: self }
@@ -237,7 +464,7 @@ impl LayerMap {
     pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
         // TODO: See #3869, resulting #4088, attempted fix and repro #4094

-        if Self::is_l0(&layer_desc) {
+        if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
            self.l0_delta_layers.push(layer_desc.clone().into());
         }

@@ -256,7 +483,7 @@ impl LayerMap {
         self.historic
             .remove(historic_layer_coverage::LayerKey::from(layer_desc));
         let layer_key = layer_desc.key();
-        if Self::is_l0(layer_desc) {
+        if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
            let len_before
= self.l0_delta_layers.len(); let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers); l0_delta_layers.retain(|other| other.key() != layer_key); @@ -321,6 +548,20 @@ impl LayerMap { self.historic.iter() } + /// Get a ref counted pointer for the first in memory layer that matches the provided predicate. + pub fn find_in_memory_layer(&self, mut pred: Pred) -> Option> + where + Pred: FnMut(&Arc) -> bool, + { + if let Some(open) = &self.open_layer { + if pred(open) { + return Some(open.clone()); + } + } + + self.frozen_layers.iter().rfind(|l| pred(l)).cloned() + } + /// /// Divide the whole given range of keys into sub-ranges based on the latest /// image layer that covers each range at the specified lsn (inclusive). @@ -348,7 +589,7 @@ impl LayerMap { let kr = Key::from_i128(current_key)..Key::from_i128(change_key); coverage.push((kr, current_val.take())); current_key = change_key; - current_val = change_val.clone(); + current_val.clone_from(&change_val); } // Add the final interval @@ -358,8 +599,9 @@ impl LayerMap { coverage } - pub fn is_l0(layer: &PersistentLayerDesc) -> bool { - layer.get_key_range() == (Key::MIN..Key::MAX) + /// Check if the key range resembles that of an L0 layer. + pub fn is_l0(key_range: &Range, is_delta_layer: bool) -> bool { + is_delta_layer && key_range == &(Key::MIN..Key::MAX) } /// This function determines which layers are counted in `count_deltas`: @@ -386,7 +628,7 @@ impl LayerMap { /// than just the current partition_range. pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range) -> bool { // Case 1 - if !Self::is_l0(layer) { + if !Self::is_l0(&layer.key_range, layer.is_delta) { return true; } @@ -432,12 +674,12 @@ impl LayerMap { // Loop through the delta coverage and recurse on each part for (change_key, change_val) in version.delta_coverage.range(start..end) { // If there's a relevant delta in this part, add 1 and recurse down - if let Some(val) = current_val { + if let Some(val) = ¤t_val { if val.get_lsn_range().end > lsn.start { let kr = Key::from_i128(current_key)..Key::from_i128(change_key); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key) as usize; + let base_count = Self::is_reimage_worthy(val, key) as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit); max_stacked_deltas = std::cmp::max( @@ -449,17 +691,17 @@ impl LayerMap { } current_key = change_key; - current_val = change_val.clone(); + current_val.clone_from(&change_val); } // Consider the last part - if let Some(val) = current_val { + if let Some(val) = ¤t_val { if val.get_lsn_range().end > lsn.start { let kr = Key::from_i128(current_key)..Key::from_i128(end); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key) as usize; + let base_count = Self::is_reimage_worthy(val, key) as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit); max_stacked_deltas = std::cmp::max( @@ -604,8 +846,8 @@ impl LayerMap { } /// Return all L0 delta layers - pub fn get_level0_deltas(&self) -> Result>> { - Ok(self.l0_delta_layers.to_vec()) + pub fn level0_deltas(&self) -> &Vec> { + &self.l0_delta_layers } /// debugging function to print out the contents of the layer map @@ -630,4 +872,602 @@ impl LayerMap { println!("End dump LayerMap"); Ok(()) } + + /// `read_points` represent the tip of a 
+    /// where we expect to serve reads.
+    ///
+    /// This function is O(N) and should be called infrequently. The caller is responsible for
+    /// looking up and updating the Layer objects for these layer descriptors.
+    pub fn get_visibility(
+        &self,
+        mut read_points: Vec<Lsn>,
+    ) -> (
+        Vec<(Arc<PersistentLayerDesc>, LayerVisibilityHint)>,
+        KeySpace,
+    ) {
+        // This is like a KeySpace, but this type is intended for efficient unions with image layer ranges, whereas
+        // KeySpace is intended to be composed statically and iterated over.
+        struct KeyShadow {
+            // Map of range start to range end
+            inner: RangeSetBlaze<i128>,
+        }
+
+        impl KeyShadow {
+            fn new() -> Self {
+                Self {
+                    inner: Default::default(),
+                }
+            }
+
+            fn contains(&self, range: Range<Key>) -> bool {
+                let range_incl = range.start.to_i128()..=range.end.to_i128() - 1;
+                self.inner.is_superset(&RangeSetBlaze::from_sorted_disjoint(
+                    CheckSortedDisjoint::from([range_incl]),
+                ))
+            }
+
+            /// Add the input range to the keys covered by self.
+            ///
+            /// Return true if inserting this range covered some keys that were previously not covered
+            fn cover(&mut self, insert: Range<Key>) -> bool {
+                let range_incl = insert.start.to_i128()..=insert.end.to_i128() - 1;
+                self.inner.ranges_insert(range_incl)
+            }
+
+            fn reset(&mut self) {
+                self.inner = Default::default();
+            }
+
+            fn to_keyspace(&self) -> KeySpace {
+                let mut accum = KeySpaceAccum::new();
+                for range_incl in self.inner.ranges() {
+                    let range = Range {
+                        start: Key::from_i128(*range_incl.start()),
+                        end: Key::from_i128(range_incl.end() + 1),
+                    };
+                    accum.add_range(range)
+                }
+
+                accum.to_keyspace()
+            }
+        }
+
+        // The 'shadow' will be updated as we sweep through the layers: an image layer subtracts from the shadow,
+        // and a ReadPoint resets it (everything below a read point is treated as visible again).
+        read_points.sort_by_key(|rp| rp.0);
+        let mut shadow = KeyShadow::new();
+
+        // We will interleave all our read points and layers into a sorted collection
+        enum Item {
+            ReadPoint { lsn: Lsn },
+            Layer(Arc<PersistentLayerDesc>),
+        }
+
+        let mut items = Vec::with_capacity(self.historic.len() + read_points.len());
+        items.extend(self.iter_historic_layers().map(Item::Layer));
+        items.extend(
+            read_points
+                .into_iter()
+                .map(|rp| Item::ReadPoint { lsn: rp }),
+        );
+
+        // Ordering: we want to iterate like this:
+        // 1. Highest LSNs first
+        // 2. Consider images before deltas if they end at the same LSNs (images cover deltas)
+        // 3. Consider ReadPoints before image layers if they're at the same LSN (readpoints make that image visible)
+        items.sort_by_key(|item| {
+            std::cmp::Reverse(match item {
+                Item::Layer(layer) => {
+                    if layer.is_delta() {
+                        (Lsn(layer.get_lsn_range().end.0 - 1), 0)
+                    } else {
+                        (layer.image_layer_lsn(), 1)
+                    }
+                }
+                Item::ReadPoint { lsn } => (*lsn, 2),
+            })
+        });
+
+        let mut results = Vec::with_capacity(self.historic.len());
+
+        let mut maybe_covered_deltas: Vec<Arc<PersistentLayerDesc>> = Vec::new();
+
+        for item in items {
+            let (reached_lsn, is_readpoint) = match &item {
+                Item::ReadPoint { lsn } => (lsn, true),
+                Item::Layer(layer) => (&layer.lsn_range.start, false),
+            };
+            maybe_covered_deltas.retain(|d| {
+                if *reached_lsn >= d.lsn_range.start && is_readpoint {
+                    // We encountered a readpoint within the delta layer: it is visible
+
+                    results.push((d.clone(), LayerVisibilityHint::Visible));
+                    false
+                } else if *reached_lsn < d.lsn_range.start {
+                    // We passed the layer's range without encountering a read point: it is not visible
+                    results.push((d.clone(), LayerVisibilityHint::Covered));
+                    false
+                } else {
+                    // We're still in the delta layer: continue iterating
+                    true
+                }
+            });
+
+            match item {
+                Item::ReadPoint { lsn: _lsn } => {
+                    // TODO: propagate the child timeline's shadow from their own run of this function, so that we don't have
+                    // to assume that the whole key range is visible at the branch point.
+                    shadow.reset();
+                }
+                Item::Layer(layer) => {
+                    let visibility = if layer.is_delta() {
+                        if shadow.contains(layer.get_key_range()) {
+                            // If a layer isn't visible based on current state, we must defer deciding whether
+                            // it is truly not visible until we have advanced past the delta's range: we might
+                            // encounter another branch point within this delta layer's LSN range.
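+                            //
+                            // (Illustrative walk-through with made-up LSNs: a delta spanning
+                            // LSNs 60..69 whose key range is already shadowed by an image above
+                            // it stays parked in `maybe_covered_deltas` until the sweep either
+                            // meets a read point inside 60..69, which makes it Visible, or moves
+                            // past LSN 60, which makes it Covered.)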
+                            maybe_covered_deltas.push(layer);
+                            continue;
+                        } else {
+                            LayerVisibilityHint::Visible
+                        }
+                    } else {
+                        let modified = shadow.cover(layer.get_key_range());
+                        if modified {
+                            // An image layer in a region which wasn't fully covered yet: this layer is visible, but layers below it will be covered
+                            LayerVisibilityHint::Visible
+                        } else {
+                            // An image layer in a region that was already covered
+                            LayerVisibilityHint::Covered
+                        }
+                    };
+
+                    results.push((layer, visibility));
+                }
+            }
+        }
+
+        // Drain any remaining maybe_covered deltas
+        results.extend(
+            maybe_covered_deltas
+                .into_iter()
+                .map(|d| (d, LayerVisibilityHint::Covered)),
+        );
+
+        (results, shadow.to_keyspace())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::tenant::{storage_layer::LayerName, IndexPart};
+    use pageserver_api::{
+        key::DBDIR_KEY,
+        keyspace::{KeySpace, KeySpaceRandomAccum},
+    };
+    use std::{collections::HashMap, path::PathBuf};
+    use utils::{
+        id::{TenantId, TimelineId},
+        shard::TenantShardId,
+    };
+
+    use super::*;
+
+    #[derive(Clone)]
+    struct LayerDesc {
+        key_range: Range<Key>,
+        lsn_range: Range<Lsn>,
+        is_delta: bool,
+    }
+
+    fn create_layer_map(layers: Vec<LayerDesc>) -> LayerMap {
+        let mut layer_map = LayerMap::default();
+
+        for layer in layers {
+            layer_map.insert_historic_noflush(PersistentLayerDesc::new_test(
+                layer.key_range,
+                layer.lsn_range,
+                layer.is_delta,
+            ));
+        }
+
+        layer_map.flush_updates();
+        layer_map
+    }
+
+    fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) {
+        assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace());
+        let lhs: HashMap<SearchResult, KeySpace> = lhs
+            .found
+            .into_iter()
+            .map(|(search_result, accum)| (search_result, accum.to_keyspace()))
+            .collect();
+        let rhs: HashMap<SearchResult, KeySpace> = rhs
+            .found
+            .into_iter()
+            .map(|(search_result, accum)| (search_result, accum.to_keyspace()))
+            .collect();
+
+        assert_eq!(lhs, rhs);
+    }
+
+    #[cfg(test)]
+    fn brute_force_range_search(
+        layer_map: &LayerMap,
+        key_range: Range<Key>,
+        end_lsn: Lsn,
+    ) -> RangeSearchResult {
+        let mut range_search_result = RangeSearchResult::new();
+
+        let mut key = key_range.start;
+        while key != key_range.end {
+            let res = layer_map.search(key, end_lsn);
+            match res {
+                Some(res) => {
+                    range_search_result
+                        .found
+                        .entry(res)
+                        .or_default()
+                        .add_key(key);
+                }
+                None => {
+                    range_search_result.not_found.add_key(key);
+                }
+            }
+
+            key = key.next();
+        }
+
+        range_search_result
+    }
+
+    #[test]
+    fn ranged_search_on_empty_layer_map() {
+        let layer_map = LayerMap::default();
+        let range = Key::from_i128(100)..Key::from_i128(200);
+
+        let res = layer_map.range_search(range.clone(), Lsn(100));
+        assert_eq!(
+            res.not_found.to_keyspace(),
+            KeySpace {
+                ranges: vec![range]
+            }
+        );
+    }
+
+    #[test]
+    fn ranged_search() {
+        let layers = vec![
+            LayerDesc {
+                key_range: Key::from_i128(15)..Key::from_i128(50),
+                lsn_range: Lsn(0)..Lsn(5),
+                is_delta: false,
+            },
+            LayerDesc {
+                key_range: Key::from_i128(10)..Key::from_i128(20),
+                lsn_range: Lsn(5)..Lsn(20),
+                is_delta: true,
+            },
+            LayerDesc {
+                key_range: Key::from_i128(15)..Key::from_i128(25),
+                lsn_range: Lsn(20)..Lsn(30),
+                is_delta: true,
+            },
+            LayerDesc {
+                key_range: Key::from_i128(35)..Key::from_i128(40),
+                lsn_range: Lsn(25)..Lsn(35),
+                is_delta: true,
+            },
+            LayerDesc {
+                key_range: Key::from_i128(35)..Key::from_i128(40),
+                lsn_range: Lsn(35)..Lsn(40),
+                is_delta: false,
+            },
+        ];
+
+        let layer_map = create_layer_map(layers.clone());
+        for start in 0..60 {
+            for end in (start + 1)..60 {
+                let range = Key::from_i128(start)..Key::from_i128(end);
+                let result = layer_map.range_search(range.clone(), Lsn(100));
+                let expected = brute_force_range_search(&layer_map, range, Lsn(100));
+
+                assert_range_search_result_eq(result, expected);
+            }
+        }
+    }
+
+    #[test]
+    fn layer_visibility_basic() {
+        // A simple synthetic input, as a smoke test.
+        let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
+        let timeline_id = TimelineId::generate();
+        let mut layer_map = LayerMap::default();
+        let mut updates = layer_map.batch_update();
+
+        const FAKE_LAYER_SIZE: u64 = 1024;
+
+        let inject_delta = |updates: &mut BatchedUpdates<'_>,
+                            key_start: i128,
+                            key_end: i128,
+                            lsn_start: u64,
+                            lsn_end: u64| {
+            let desc = PersistentLayerDesc::new_delta(
+                tenant_shard_id,
+                timeline_id,
+                Range {
+                    start: Key::from_i128(key_start),
+                    end: Key::from_i128(key_end),
+                },
+                Range {
+                    start: Lsn(lsn_start),
+                    end: Lsn(lsn_end),
+                },
+                1024,
+            );
+            updates.insert_historic(desc.clone());
+            desc
+        };
+
+        let inject_image =
+            |updates: &mut BatchedUpdates<'_>, key_start: i128, key_end: i128, lsn: u64| {
+                let desc = PersistentLayerDesc::new_img(
+                    tenant_shard_id,
+                    timeline_id,
+                    Range {
+                        start: Key::from_i128(key_start),
+                        end: Key::from_i128(key_end),
+                    },
+                    Lsn(lsn),
+                    FAKE_LAYER_SIZE,
+                );
+                updates.insert_historic(desc.clone());
+                desc
+            };
+
+        //
+        // Construct our scenario: the following lines go in backward-LSN order, constructing the various scenarios
+        // we expect to handle.  You can follow these examples through in the same order as they would be processed
+        // by the function under test.
+        //
+
+        let mut read_points = vec![Lsn(1000)];
+
+        // A delta ahead of any image layer
+        let ahead_layer = inject_delta(&mut updates, 10, 20, 101, 110);
+
+        // An image layer is visible and covers some layers beneath itself
+        let visible_covering_img = inject_image(&mut updates, 5, 25, 99);
+
+        // A delta layer covered by the image layer: should be covered
+        let covered_delta = inject_delta(&mut updates, 10, 20, 90, 100);
+
+        // A delta layer partially covered by an image layer: should be visible
+        let partially_covered_delta = inject_delta(&mut updates, 1, 7, 90, 100);
+
+        // A delta layer not covered by an image layer: should be visible
+        let not_covered_delta = inject_delta(&mut updates, 1, 4, 90, 100);
+
+        // An image layer covered by the image layer above: should be covered
+        let covered_image = inject_image(&mut updates, 10, 20, 89);
+
+        // An image layer partially covered by an image layer: should be visible
+        let partially_covered_image = inject_image(&mut updates, 1, 7, 89);
+
+        // An image layer not covered by an image layer: should be visible
+        let not_covered_image = inject_image(&mut updates, 1, 4, 89);
+
+        // A read point: this will make subsequent layers below here visible, even if there are
+        // more recent layers covering them.
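+        //
+        // (Concretely, with this test's made-up LSNs: the delta injected just below at
+        // LSNs 70..79 must come out Visible because of this read point, even though the
+        // image layer at LSN 99 above covers its whole key range; the assertions at the
+        // end check exactly that.)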
+        read_points.push(Lsn(80));
+
+        // A delta layer covered by an earlier image layer, but visible to a readpoint below that covering layer
+        let covered_delta_below_read_point = inject_delta(&mut updates, 10, 20, 70, 79);
+
+        // A delta layer whose end LSN is covered, but where a read point is present partway through its LSN range:
+        // the read point should make it visible, even though its end LSN is covered
+        let covering_img_between_read_points = inject_image(&mut updates, 10, 20, 69);
+        let covered_delta_between_read_points = inject_delta(&mut updates, 10, 15, 67, 69);
+        read_points.push(Lsn(65));
+        let covered_delta_intersects_read_point = inject_delta(&mut updates, 15, 20, 60, 69);
+
+        let visible_img_after_last_read_point = inject_image(&mut updates, 10, 20, 65);
+
+        updates.flush();
+
+        let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
+        let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
+
+        assert_eq!(
+            layer_visibilities.get(&ahead_layer),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&visible_covering_img),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&covered_delta),
+            Some(&LayerVisibilityHint::Covered)
+        );
+        assert_eq!(
+            layer_visibilities.get(&partially_covered_delta),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&not_covered_delta),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&covered_image),
+            Some(&LayerVisibilityHint::Covered)
+        );
+        assert_eq!(
+            layer_visibilities.get(&partially_covered_image),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&not_covered_image),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&covered_delta_below_read_point),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&covering_img_between_read_points),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&covered_delta_between_read_points),
+            Some(&LayerVisibilityHint::Covered)
+        );
+        assert_eq!(
+            layer_visibilities.get(&covered_delta_intersects_read_point),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&visible_img_after_last_read_point),
+            Some(&LayerVisibilityHint::Visible)
+        );
+
+        // Shadow should include all the images below the last read point
+        let expected_shadow = KeySpace {
+            ranges: vec![Key::from_i128(10)..Key::from_i128(20)],
+        };
+        assert_eq!(shadow, expected_shadow);
+    }
+
+    fn fixture_path(relative: &str) -> PathBuf {
+        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
+    }
+
+    #[test]
+    fn layer_visibility_realistic() {
+        // Load a large example layermap
+        let index_raw = std::fs::read_to_string(fixture_path(
+            "test_data/indices/mixed_workload/index_part.json",
+        ))
+        .unwrap();
+        let index: IndexPart = serde_json::from_str::<IndexPart>(&index_raw).unwrap();
+
+        let tenant_id = TenantId::generate();
+        let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+        let timeline_id = TimelineId::generate();
+
+        let mut layer_map = LayerMap::default();
+        let mut updates = layer_map.batch_update();
+        for (layer_name, layer_metadata) in index.layer_metadata {
+            let layer_desc = match layer_name {
+                LayerName::Image(layer_name) => PersistentLayerDesc {
+                    key_range: layer_name.key_range.clone(),
+                    lsn_range: layer_name.lsn_as_range(),
+                    tenant_shard_id,
+                    timeline_id,
+                    is_delta: false,
+                    file_size: layer_metadata.file_size,
+                },
+                LayerName::Delta(layer_name) => PersistentLayerDesc {
+                    key_range: layer_name.key_range,
+                    lsn_range: layer_name.lsn_range,
+                    tenant_shard_id,
+                    timeline_id,
+                    is_delta: true,
+                    file_size: layer_metadata.file_size,
+                },
+            };
+            updates.insert_historic(layer_desc);
+        }
+        updates.flush();
+
+        let read_points = vec![index.metadata.disk_consistent_lsn()];
+        let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
+        for (layer_desc, visibility) in &layer_visibilities {
+            tracing::info!("{layer_desc:?}: {visibility:?}");
+            eprintln!("{layer_desc:?}: {visibility:?}");
+        }
+
+        // The shadow should be non-empty, since there were some image layers
+        assert!(!shadow.ranges.is_empty());
+
+        // At least some layers should be marked covered
+        assert!(layer_visibilities
+            .iter()
+            .any(|i| matches!(i.1, LayerVisibilityHint::Covered)));
+
+        let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
+
+        // Brute force validation: a layer should be marked covered if and only if there are image layers above it in LSN order which cover it
+        for (layer_desc, visible) in &layer_visibilities {
+            let mut coverage = KeySpaceRandomAccum::new();
+            let mut covered_by = Vec::new();
+
+            for other_layer in layer_map.iter_historic_layers() {
+                if &other_layer == layer_desc {
+                    continue;
+                }
+                if !other_layer.is_delta()
+                    && other_layer.image_layer_lsn() >= Lsn(layer_desc.get_lsn_range().end.0 - 1)
+                    && other_layer.key_range.start <= layer_desc.key_range.end
+                    && layer_desc.key_range.start <= other_layer.key_range.end
+                {
+                    coverage.add_range(other_layer.get_key_range());
+                    covered_by.push((*other_layer).clone());
+                }
+            }
+            let coverage = coverage.to_keyspace();
+
+            let expect_visible = if coverage.ranges.len() == 1
+                && coverage.contains(&layer_desc.key_range.start)
+                && coverage.contains(&Key::from_i128(layer_desc.key_range.end.to_i128() - 1))
+            {
+                LayerVisibilityHint::Covered
+            } else {
+                LayerVisibilityHint::Visible
+            };
+
+            if expect_visible != *visible {
+                eprintln!(
+                    "Layer {}..{} @ {}..{} (delta={}) is {visible:?}, should be {expect_visible:?}",
+                    layer_desc.key_range.start,
+                    layer_desc.key_range.end,
+                    layer_desc.lsn_range.start,
+                    layer_desc.lsn_range.end,
+                    layer_desc.is_delta()
+                );
+                if expect_visible == LayerVisibilityHint::Covered {
+                    eprintln!("Covered by:");
+                    for other in covered_by {
+                        eprintln!(
+                            "  {}..{} @ {}",
+                            other.get_key_range().start,
+                            other.get_key_range().end,
+                            other.image_layer_lsn()
+                        );
+                    }
+                    if let Some(range) = coverage.ranges.first() {
+                        eprintln!(
+                            "Total coverage from contributing layers: {}..{}",
+                            range.start, range.end
+                        );
+                    } else {
+                        eprintln!(
+                            "Total coverage from contributing layers: {:?}",
+                            coverage.ranges
+                        );
+                    }
+                }
+            }
+            assert_eq!(expect_visible, *visible);
+        }
+
+        // Sanity: the layer that holds latest data for the DBDIR key should always be visible
+        // (just using this key as a key that will always exist for any layermap fixture)
+        let dbdir_layer = layer_map
+            .search(DBDIR_KEY, index.metadata.disk_consistent_lsn())
+            .unwrap();
+        assert!(matches!(
+            layer_visibilities.get(&dbdir_layer.layer).unwrap(),
+            LayerVisibilityHint::Visible
+        ));
+    }
+}
diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
index 347490c1ba..136f68bc36 100644
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -521,6 +521,10 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
 
         Ok(&self.historic_coverage)
     }
+
+    pub(crate) fn len(&self) -> usize {
+        self.layers.len()
+    }
 }
 
 #[test]
diff --git a/pageserver/src/tenant/layer_map/layer_coverage.rs b/pageserver/src/tenant/layer_map/layer_coverage.rs
index 1d9101d3d1..cf0085c071 100644
--- a/pageserver/src/tenant/layer_map/layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/layer_coverage.rs
@@ -129,6 +129,42 @@ impl LayerCoverage {
             .map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone())))
     }
 
+    /// Returns an iterator which includes all coverage changes for layers that intersect
+    /// with the provided range.
+    pub fn range_overlaps(
+        &self,
+        key_range: &Range<i128>,
+    ) -> impl Iterator<Item = (i128, Option<Value>)> + '_
+    where
+        Value: Eq,
+    {
+        let first_change = self.query(key_range.start);
+        match first_change {
+            Some(change) => {
+                // If the start of the range is covered, we have to deal with two cases:
+                // 1. Start of the range is aligned with the start of a layer.
+                // In this case the return of `self.range` will contain the layer which aligns with the start of the key range.
+                // We advance said iterator to avoid duplicating the first change.
+                // 2. Start of the range is not aligned with the start of a layer.
+                let range = key_range.start..key_range.end;
+                let mut range_coverage = self.range(range).peekable();
+                if range_coverage
+                    .peek()
+                    .is_some_and(|c| c.1.as_ref() == Some(&change))
+                {
+                    range_coverage.next();
+                }
+                itertools::Either::Left(
+                    std::iter::once((key_range.start, Some(change))).chain(range_coverage),
+                )
+            }
+            None => {
+                let range = key_range.start..key_range.end;
+                let coverage = self.range(range);
+                itertools::Either::Right(coverage)
+            }
+        }
+    }
 
     /// O(1) clone
     pub fn clone(&self) -> Self {
         Self {
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index 6fb86c65e2..24440d4b35 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -1,42 +1,62 @@
-//! Every image of a certain timeline from [`crate::tenant::Tenant`]
-//! has a metadata that needs to be stored persistently.
+//! Describes the legacy per-timeline metadata, which is hopefully no longer being modified.
 //!
-//! Later, the file gets used in [`remote_timeline_client`] as a part of
-//! external storage import and export operations.
+//! It is stored in `index_part.json` managed by [`remote_timeline_client`]. For many tenants and
+//! their timelines, this struct and its original serialization format are still needed because
+//! they were written a long time ago.
 //!
-//! The module contains all structs and related helper methods related to timeline metadata.
+//! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json
+//! versioning.
+//!
+//! To clean up this module we need to migrate all index_part.json files to a later version.
+//! While doing this, we need to be mindful of s3-based recovery as well, so the old code can only
+//! be deleted once we no longer need to keep the old versions around. After that, we can
+//! remove everything other than [`TimelineMetadataBodyV2`], rename it to `TimelineMetadata`, and
+//! move it to `index.rs`. Before doing all of this, we need to keep the structures for backwards
+//! compatibility.
 //!
 //! [`remote_timeline_client`]: super::remote_timeline_client
+//! [`IndexPart`]: super::remote_timeline_client::index::IndexPart
 
-use std::io::{self};
-
-use anyhow::{ensure, Context};
-use pageserver_api::shard::TenantShardId;
-use serde::{de::Error, Deserialize, Serialize, Serializer};
-use thiserror::Error;
+use anyhow::ensure;
+use serde::{Deserialize, Serialize};
 use utils::bin_ser::SerializeError;
-use utils::crashsafe::path_with_suffix_extension;
 use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
 
-use crate::config::PageServerConf;
-use crate::virtual_file::VirtualFile;
-use crate::TEMP_FILE_SUFFIX;
-
 /// Use special format number to enable backward compatibility.
 const METADATA_FORMAT_VERSION: u16 = 4;
 
 /// Previous supported format versions.
+///
+/// In practice, none of these should remain; all are [`METADATA_FORMAT_VERSION`], but confirming
+/// that requires a scrubber run which is yet to be done.
 const METADATA_OLD_FORMAT_VERSION: u16 = 3;
 
-/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic.
+/// When the file existed on disk we assumed that a write of up to METADATA_MAX_SIZE bytes is atomic.
 ///
 /// This is the same assumption that PostgreSQL makes with the control file,
 /// see PG_CONTROL_MAX_SAFE_SIZE
 const METADATA_MAX_SIZE: usize = 512;
 
-/// Metadata stored on disk for each timeline
+/// Legacy metadata stored as a component of `index_part.json` per timeline.
 ///
-/// The fields correspond to the values we hold in memory, in Timeline.
+/// Do not make new changes to this type or the module. In production, we have two different kinds
+/// of serializations of this type: bincode and json. The bincode version reflects what used to be
+/// stored on disk in earlier versions and does internal crc32 checksumming.
+///
+/// This type should not implement `serde::Serialize` or `serde::Deserialize` because it would be
+/// unclear whether you want the old version ([`TimelineMetadata::from_bytes`]) or the modern one
+/// as it exists in `index_part.json` ([`self::modern_serde`]).
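+///
+/// For the json flavour, the intent is that containing types opt in through the functions in
+/// [`self::modern_serde`] using serde's `with` attribute. A sketch of that wiring (the struct and
+/// field names here are illustrative, not the real `index_part.json` definitions):
+///
+/// ```ignore
+/// #[derive(serde::Serialize, serde::Deserialize)]
+/// struct Carrier {
+///     #[serde(with = "pageserver::tenant::metadata::modern_serde")]
+///     metadata: pageserver::tenant::metadata::TimelineMetadata,
+/// }
+/// ```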
+///
+/// ```compile_fail
+/// #[derive(serde::Serialize)]
+/// struct DoNotDoThis(pageserver::tenant::metadata::TimelineMetadata);
+/// ```
+///
+/// ```compile_fail
+/// #[derive(serde::Deserialize)]
+/// struct NeitherDoThis(pageserver::tenant::metadata::TimelineMetadata);
+/// ```
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct TimelineMetadata {
     hdr: TimelineMetadataHeader,
@@ -49,7 +69,50 @@ struct TimelineMetadataHeader {
     size: u16,           // size of serialized metadata
     format_version: u16, // metadata format version (used for compatibility checks)
 }
-const METADATA_HDR_SIZE: usize = std::mem::size_of::<TimelineMetadataHeader>();
+
+impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader {
+    type Error = Crc32CalculationFailed;
+
+    fn try_from(value: &TimelineMetadataBodyV2) -> Result<Self, Self::Error> {
+        #[derive(Default)]
+        struct Crc32Sink {
+            crc: u32,
+            count: usize,
+        }
+
+        impl std::io::Write for Crc32Sink {
+            fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+                self.crc = crc32c::crc32c_append(self.crc, buf);
+                self.count += buf.len();
+                Ok(buf.len())
+            }
+
+            fn flush(&mut self) -> std::io::Result<()> {
+                Ok(())
+            }
+        }
+
+        // jump through hoops to calculate the crc32 so that TimelineMetadata::ne works
+        // across serialization versions
+        let mut sink = Crc32Sink::default();
+        <TimelineMetadataBodyV2 as BeSer>::ser_into(value, &mut sink)
+            .map_err(Crc32CalculationFailed)?;
+
+        let size = METADATA_HDR_SIZE + sink.count;
+
+        Ok(TimelineMetadataHeader {
+            checksum: sink.crc,
+            size: size as u16,
+            format_version: METADATA_FORMAT_VERSION,
+        })
+    }
+}
+
+#[derive(thiserror::Error, Debug)]
+#[error("re-serializing for crc32 failed")]
+struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError);
+
+const METADATA_HDR_SIZE: usize = size_of::<TimelineMetadataHeader>();
 
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
 struct TimelineMetadataBodyV2 {
@@ -120,6 +183,12 @@ impl TimelineMetadata {
         }
     }
 
+    #[cfg(test)]
+    pub(crate) fn with_recalculated_checksum(mut self) -> anyhow::Result<Self> {
+        self.hdr = TimelineMetadataHeader::try_from(&self.body)?;
+        Ok(self)
+    }
+
     fn upgrade_timeline_metadata(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
         let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?;
 
@@ -216,6 +285,27 @@ impl TimelineMetadata {
         self.body.ancestor_lsn
     }
 
+    /// When reparenting, the `ancestor_lsn` does not change.
+    pub fn reparent(&mut self, timeline: &TimelineId) {
+        assert!(self.body.ancestor_timeline.is_some());
+        // no assertion for redoing this: it's fine, we may have to repeat this multiple times over
+        self.body.ancestor_timeline = Some(*timeline);
+    }
+
+    pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) {
+        if let Some(ancestor) = self.body.ancestor_timeline {
+            assert_eq!(ancestor, branchpoint.0);
+        }
+        if self.body.ancestor_lsn != Lsn(0) {
+            assert_eq!(self.body.ancestor_lsn, branchpoint.1);
+        }
+        self.body.ancestor_timeline = None;
+        self.body.ancestor_lsn = Lsn(0);
+    }
+
     pub fn latest_gc_cutoff_lsn(&self) -> Lsn {
         self.body.latest_gc_cutoff_lsn
     }
@@ -244,65 +334,123 @@ impl TimelineMetadata {
         let bytes = instance.to_bytes().unwrap();
         Self::from_bytes(&bytes).unwrap()
     }
-}
 
-impl<'de> Deserialize<'de> for TimelineMetadata {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        let bytes = Vec::<u8>::deserialize(deserializer)?;
-        Self::from_bytes(bytes.as_slice()).map_err(|e| D::Error::custom(format!("{e}")))
+    pub(crate) fn apply(&mut self, update: &MetadataUpdate) {
+        self.body.disk_consistent_lsn = update.disk_consistent_lsn;
+        self.body.prev_record_lsn = update.prev_record_lsn;
+        self.body.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn;
     }
 }
 
-impl Serialize for TimelineMetadata {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+pub(crate) mod modern_serde {
+    use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader};
+    use serde::{Deserialize, Serialize};
+
+    pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result<TimelineMetadata, D::Error>
     where
-        S: Serializer,
+        D: serde::de::Deserializer<'de>,
     {
-        let bytes = self
-            .to_bytes()
-            .map_err(|e| serde::ser::Error::custom(format!("{e}")))?;
-        bytes.serialize(serializer)
+        // for legacy reasons versions 1-5 had TimelineMetadata serialized as a Vec<u8> field with
+        // BeSer.
+        struct Visitor;
+
+        impl<'d> serde::de::Visitor<'d> for Visitor {
+            type Value = TimelineMetadata;
+
+            fn expecting(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+                f.write_str("BeSer bytes or json structure")
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'d>,
+            {
+                use serde::de::Error;
+                let de = serde::de::value::SeqAccessDeserializer::new(seq);
+                Vec::<u8>::deserialize(de)
+                    .map(|v| TimelineMetadata::from_bytes(&v).map_err(A::Error::custom))?
+            }
+
+            fn visit_map<A>(self, map: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::MapAccess<'d>,
+            {
+                use serde::de::Error;
+
+                let de = serde::de::value::MapAccessDeserializer::new(map);
+                let body = TimelineMetadataBodyV2::deserialize(de)?;
+                let hdr = TimelineMetadataHeader::try_from(&body).map_err(A::Error::custom)?;
+
+                Ok(TimelineMetadata { hdr, body })
+            }
+        }
+
+        deserializer.deserialize_any(Visitor)
+    }
+
+    pub(crate) fn serialize<S>(
+        metadata: &TimelineMetadata,
+        serializer: S,
+    ) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        // header is not needed, upon reading we've upgraded all v1 to v2
+        metadata.body.serialize(serializer)
+    }
+
+    #[test]
+    fn deserializes_bytes_as_well_as_equivalent_body_v2() {
+        #[derive(serde::Deserialize, serde::Serialize)]
+        struct Wrapper(
+            #[serde(deserialize_with = "deserialize", serialize_with = "serialize")]
+            TimelineMetadata,
+        );
+
+        let too_many_bytes = "[216,111,252,208,0,54,0,4,0,0,0,0,1,73,253,144,1,0,0,0,0,1,73,253,24,0,0,0,0,0,0,0,0,0,0,0,0,0,1,73,253,24,0,0,0,0,1,73,253,24,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]";
+
+        let wrapper_from_bytes = serde_json::from_str::<Wrapper>(too_many_bytes).unwrap();
+
+        let serialized = serde_json::to_value(&wrapper_from_bytes).unwrap();
+
+        assert_eq!(
+            serialized,
+            serde_json::json! {{
+                "disk_consistent_lsn": "0/149FD90",
+                "prev_record_lsn": "0/149FD18",
+                "ancestor_timeline": null,
+                "ancestor_lsn": "0/0",
+                "latest_gc_cutoff_lsn": "0/149FD18",
+                "initdb_lsn": "0/149FD18",
+                "pg_version": 15
+            }}
+        );
+
+        let wrapper_from_json = serde_json::value::from_value::<Wrapper>(serialized).unwrap();
+
+        assert_eq!(wrapper_from_bytes.0, wrapper_from_json.0);
     }
 }
 
-/// Save timeline metadata to file
-#[tracing::instrument(skip_all, fields(%tenant_id=tenant_shard_id.tenant_id, %shard_id=tenant_shard_id.shard_slug(), %timeline_id))]
-pub async fn save_metadata(
-    conf: &'static PageServerConf,
-    tenant_shard_id: &TenantShardId,
-    timeline_id: &TimelineId,
-    data: &TimelineMetadata,
-) -> anyhow::Result<()> {
-    let path = conf.metadata_path(tenant_shard_id, timeline_id);
-    let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
-    let metadata_bytes = data.to_bytes().context("serialize metadata")?;
-    VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes)
-        .await
-        .context("write metadata")?;
-    Ok(())
+/// Parts of the metadata which are regularly modified.
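+///
+/// Grouped into one struct so that callers apply them together through
+/// [`TimelineMetadata::apply`] above, rather than mutating the fields piecemeal.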
+pub(crate) struct MetadataUpdate {
+    disk_consistent_lsn: Lsn,
+    prev_record_lsn: Option<Lsn>,
+    latest_gc_cutoff_lsn: Lsn,
+}
 
-#[derive(Error, Debug)]
-pub enum LoadMetadataError {
-    #[error(transparent)]
-    Read(#[from] io::Error),
-
-    #[error(transparent)]
-    Decode(#[from] anyhow::Error),
-}
-
-pub fn load_metadata(
-    conf: &'static PageServerConf,
-    tenant_shard_id: &TenantShardId,
-    timeline_id: &TimelineId,
-) -> Result<TimelineMetadata, LoadMetadataError> {
-    let metadata_path = conf.metadata_path(tenant_shard_id, timeline_id);
-    let metadata_bytes = std::fs::read(metadata_path)?;
-
-    Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
+impl MetadataUpdate {
+    pub(crate) fn new(
+        disk_consistent_lsn: Lsn,
+        prev_record_lsn: Option<Lsn>,
+        latest_gc_cutoff_lsn: Lsn,
+    ) -> Self {
+        Self {
+            disk_consistent_lsn,
+            prev_record_lsn,
+            latest_gc_cutoff_lsn,
+        }
+    }
 }
 
 #[cfg(test)]
@@ -404,59 +552,6 @@ mod tests {
         );
     }
 
-    #[test]
-    fn test_metadata_bincode_serde() {
-        let original_metadata = TimelineMetadata::new(
-            Lsn(0x200),
-            Some(Lsn(0x100)),
-            Some(TIMELINE_ID),
-            Lsn(0),
-            Lsn(0),
-            Lsn(0),
-            // Any version will do here, so use the default
-            crate::DEFAULT_PG_VERSION,
-        );
-        let metadata_bytes = original_metadata
-            .to_bytes()
-            .expect("Cannot create bytes array from metadata");
-
-        let metadata_bincode_be_bytes = original_metadata
-            .ser()
-            .expect("Cannot serialize the metadata");
-
-        // 8 bytes for the length of the vector
-        assert_eq!(metadata_bincode_be_bytes.len(), 8 + metadata_bytes.len());
-
-        let expected_bincode_bytes = {
-            let mut temp = vec![];
-            let len_bytes = metadata_bytes.len().to_be_bytes();
-            temp.extend_from_slice(&len_bytes);
-            temp.extend_from_slice(&metadata_bytes);
-            temp
-        };
-        assert_eq!(metadata_bincode_be_bytes, expected_bincode_bytes);
-
-        let deserialized_metadata = TimelineMetadata::des(&metadata_bincode_be_bytes).unwrap();
-        // Deserialized metadata has the metadata header, which is different from the serialized one.
- // Reference: TimelineMetaData::to_bytes() - let expected_metadata = { - let mut temp_metadata = original_metadata; - let body_bytes = temp_metadata - .body - .ser() - .expect("Cannot serialize the metadata body"); - let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); - let hdr = TimelineMetadataHeader { - size: metadata_size as u16, - format_version: METADATA_FORMAT_VERSION, - checksum: crc32c::crc32c(&body_bytes), - }; - temp_metadata.hdr = hdr; - temp_metadata - }; - assert_eq!(deserialized_metadata, expected_metadata); - } - #[test] fn test_metadata_bincode_serde_ensure_roundtrip() { let original_metadata = TimelineMetadata::new( @@ -470,10 +565,8 @@ mod tests { crate::DEFAULT_PG_VERSION, ); let expected_bytes = vec![ - /* bincode length encoding bytes */ - 0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector /* TimelineMetadataHeader */ - 4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2) + 74, 104, 158, 105, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2) /* TimelineMetadataBodyV2 */ 0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes) 1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes) @@ -482,7 +575,7 @@ mod tests { 0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes) 0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes) 0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes) - 0, 0, 0, 15, // pg_version (4 bytes) + 0, 0, 0, 16, // pg_version (4 bytes) /* padding bytes */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -501,7 +594,7 @@ mod tests { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; - let metadata_ser_bytes = original_metadata.ser().unwrap(); + let metadata_ser_bytes = original_metadata.to_bytes().unwrap(); assert_eq!(metadata_ser_bytes, expected_bytes); let expected_metadata = { @@ -519,7 +612,7 @@ mod tests { temp_metadata.hdr = hdr; temp_metadata }; - let des_metadata = TimelineMetadata::des(&metadata_ser_bytes).unwrap(); + let des_metadata = TimelineMetadata::from_bytes(&metadata_ser_bytes).unwrap(); assert_eq!(des_metadata, expected_metadata); } } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 32535e0134..2104f41531 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2,18 +2,23 @@ //! page server. 
 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
+use futures::StreamExt;
+use itertools::Itertools;
 use pageserver_api::key::Key;
-use pageserver_api::models::ShardParameters;
-use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
+use pageserver_api::models::LocationConfigMode;
+use pageserver_api::shard::{
+    ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId,
+};
+use pageserver_api::upcall_api::ReAttachResponseTenant;
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::cmp::Ordering;
-use std::collections::{BTreeMap, HashMap};
+use std::collections::{BTreeMap, HashMap, HashSet};
 use std::ops::Deref;
 use std::sync::Arc;
-use std::time::{Duration, Instant};
+use std::time::Duration;
+use sysinfo::SystemExt;
 use tokio::fs;
-use utils::timeout::{timeout_cancellable, TimeoutCancellableError};
 
 use anyhow::Context;
 use once_cell::sync::Lazy;
@@ -21,8 +26,7 @@ use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 
-use remote_storage::GenericRemoteStorage;
-use utils::crashsafe;
+use utils::{backoff, completion, crashsafe};
 
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
@@ -30,25 +34,28 @@ use crate::control_plane_client::{
     ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
 };
 use crate::deletion_queue::DeletionQueueClient;
+use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
 use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
-use crate::task_mgr::{self, TaskKind};
+use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::config::{
     AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
-    TenantConfOpt,
 };
-use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
-use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
-use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
+use crate::tenant::storage_layer::inmemory_layer;
+use crate::tenant::timeline::ShutdownMode;
+use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Tenant, TenantState};
+use crate::virtual_file::MaybeFatalIo;
+use crate::{InitializationOrder, TEMP_FILE_SUFFIX};
 
 use utils::crashsafe::path_with_suffix_extension;
 use utils::fs_ext::PathExt;
 use utils::generation::Generation;
 use utils::id::{TenantId, TimelineId};
 
-use super::delete::DeleteTenantError;
+use super::remote_timeline_client::remote_tenant_path;
 use super::secondary::SecondaryTenant;
-use super::TenantSharedResources;
+use super::timeline::detach_ancestor::{self, PreparedTimelineDetach};
+use super::{GlobalShutDown, TenantSharedResources};
 
 /// For a tenant that appears in TenantsMap, it may either be
 /// - `Attached`: has a full Tenant object, is eligible to service
@@ -97,27 +104,70 @@ pub(crate) enum TenantsMap {
     /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
     /// New tenants can be added using [`tenant_map_acquire_slot`].
     Open(BTreeMap<TenantShardId, TenantSlot>),
-    /// The pageserver has entered shutdown mode via [`shutdown_all_tenants`].
+    /// The pageserver has entered shutdown mode via [`TenantManager::shutdown`].
     /// Existing tenants are still accessible, but no new tenants can be created.
     ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
 }
 
-pub(crate) enum TenantsMapRemoveResult {
-    Occupied(TenantSlot),
-    Vacant,
-    InProgress(utils::completion::Barrier),
-}
-
 /// When resolving a TenantId to a shard, we may be looking for the 0th
 /// shard, or we might be looking for whichever shard holds a particular page.
+#[derive(Copy, Clone)]
 pub(crate) enum ShardSelector {
     /// Only return the 0th shard, if it is present.  If a non-0th shard is present,
     /// ignore it.
     Zero,
-    /// Pick the first shard we find for the TenantId
-    First,
    /// Pick the shard that holds this key
    Page(Key),
+    /// The shard ID is known: pick the given shard
+    Known(ShardIndex),
+}
+
+/// A convenience for use with the re_attach ControlPlaneClient function: rather
+/// than the serializable struct, we build this enum that encapsulates
+/// the invariant that attached tenants always have generations.
+///
+/// This represents the subset of a LocationConfig that we receive during re-attach.
+pub(crate) enum TenantStartupMode {
+    Attached((AttachmentMode, Generation)),
+    Secondary,
+}
+
+impl TenantStartupMode {
+    /// Return the generation & mode that should be used when starting
+    /// this tenant.
+    ///
+    /// If this returns None, the re-attach struct is in an invalid state and
+    /// should be ignored in the response.
+    fn from_reattach_tenant(rart: ReAttachResponseTenant) -> Option<Self> {
+        match (rart.mode, rart.gen) {
+            (LocationConfigMode::Detached, _) => None,
+            (LocationConfigMode::Secondary, _) => Some(Self::Secondary),
+            (LocationConfigMode::AttachedMulti, Some(g)) => {
+                Some(Self::Attached((AttachmentMode::Multi, Generation::new(g))))
+            }
+            (LocationConfigMode::AttachedSingle, Some(g)) => {
+                Some(Self::Attached((AttachmentMode::Single, Generation::new(g))))
+            }
+            (LocationConfigMode::AttachedStale, Some(g)) => {
+                Some(Self::Attached((AttachmentMode::Stale, Generation::new(g))))
+            }
+            _ => {
+                tracing::warn!(
+                    "Received invalid re-attach state for tenant {}: {rart:?}",
+                    rart.id
+                );
+                None
+            }
+        }
+    }
+}
+
+/// Result type for looking up a TenantId to a specific shard
+pub(crate) enum ShardResolveResult {
+    NotFound,
+    Found(Arc<Tenant>),
+    // Wait for this barrier, then query again
+    InProgress(utils::completion::Barrier),
 }
 
 impl TenantsMap {
@@ -133,71 +183,7 @@ impl TenantsMap {
         }
     }
 
-    /// A page service client sends a TenantId, and to look up the correct Tenant we must
-    /// resolve this to a fully qualified TenantShardId.
-    fn resolve_attached_shard(
-        &self,
-        tenant_id: &TenantId,
-        selector: ShardSelector,
-    ) -> Option<TenantShardId> {
-        let mut want_shard = None;
-        match self {
-            TenantsMap::Initializing => None,
-            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
-                for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
-                    // Ignore all slots that don't contain an attached tenant
-                    let tenant = match &slot.1 {
-                        TenantSlot::Attached(t) => t,
-                        _ => continue,
-                    };
-
-                    match selector {
-                        ShardSelector::First => return Some(*slot.0),
-                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
-                            return Some(*slot.0)
-                        }
-                        ShardSelector::Page(key) => {
-                            // First slot we see for this tenant, calculate the expected shard number
-                            // for the key: we will use this for checking if this and subsequent
-                            // slots contain the key, rather than recalculating the hash each time.
-                            if want_shard.is_none() {
-                                want_shard = Some(tenant.shard_identity.get_shard_number(&key));
-                            }
-
-                            if Some(tenant.shard_identity.number) == want_shard {
-                                return Some(*slot.0);
-                            }
-                        }
-                        _ => continue,
-                    }
-                }
-
-                // Fall through: we didn't find an acceptable shard
-                None
-            }
-        }
-    }
-
-    /// Only for use from DeleteTenantFlow.  This method directly removes a TenantSlot from the map.
-    ///
-    /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
-    /// slot if the enclosed tenant is shutdown.
-    pub(crate) fn remove(&mut self, tenant_shard_id: TenantShardId) -> TenantsMapRemoveResult {
-        use std::collections::btree_map::Entry;
-        match self {
-            TenantsMap::Initializing => TenantsMapRemoveResult::Vacant,
-            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => match m.entry(tenant_shard_id) {
-                Entry::Occupied(entry) => match entry.get() {
-                    TenantSlot::InProgress(barrier) => {
-                        TenantsMapRemoveResult::InProgress(barrier.clone())
-                    }
-                    _ => TenantsMapRemoveResult::Occupied(entry.remove()),
-                },
-                Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant,
-            },
-        }
-    }
-
+    #[cfg(all(debug_assertions, not(test)))]
     pub(crate) fn len(&self) -> usize {
         match self {
             TenantsMap::Initializing => 0,
@@ -206,17 +192,15 @@ impl TenantsMap {
     }
 }
 
+/// Precursor to deletion of a tenant dir: we do a fast rename to a tmp path, and then
+/// the slower actual deletion in the background.
+///
 /// This is "safe" in that it won't leave behind a partially deleted directory
 /// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting
 /// the contents.
 ///
 /// This is pageserver-specific, as it relies on future processes after a crash to check
 /// for TEMP_FILE_SUFFIX when loading things.
-async fn safe_remove_tenant_dir_all(path: impl AsRef<Utf8Path>) -> std::io::Result<()> {
-    let tmp_path = safe_rename_tenant_dir(path).await?;
-    fs::remove_dir_all(tmp_path).await
-}
-
 async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<Utf8PathBuf> {
     let parent = path
         .as_ref()
@@ -239,12 +223,69 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<Utf8PathBuf>
 static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
     Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));
 
-/// The TenantManager is responsible for storing and mutating the collection of all tenants
-/// that this pageserver process has state for.  Every Tenant and SecondaryTenant instance
-/// lives inside the TenantManager.
+/// Responsible for storing and mutating the collection of all tenants
+/// that this pageserver has state for.
+///
+/// Every Tenant and SecondaryTenant instance lives inside the TenantManager.
 ///
 /// The most important role of the TenantManager is to prevent conflicts: e.g. trying to attach
 /// the same tenant twice concurrently, or trying to configure the same tenant into secondary
@@ -256,11 +297,19 @@ pub struct TenantManager {
     // See https://github.com/neondatabase/neon/issues/5796
     tenants: &'static std::sync::RwLock<TenantsMap>,
     resources: TenantSharedResources,
+
+    // Long-running operations that happen outside of a [`Tenant`] lifetime should respect this token.
+    // This is for edge cases like tenant deletion.  In normal cases (within a Tenant lifetime), tenants
+    // have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or
+    // when the tenant detaches.
+    cancel: CancellationToken,
+
+    background_purges: BackgroundPurges,
 }
 
 fn emergency_generations(
-    tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
-) -> HashMap<TenantShardId, Generation> {
+    tenant_confs: &HashMap<TenantShardId, Result<LocationConf, LoadConfigError>>,
+) -> HashMap<TenantShardId, TenantStartupMode> {
     tenant_confs
         .iter()
         .filter_map(|(tid, lc)| {
@@ -268,22 +317,25 @@ fn emergency_generations(
                 Ok(lc) => lc,
                 Err(_) => return None,
             };
-            let gen = match &lc.mode {
-                LocationMode::Attached(alc) => Some(alc.generation),
-                LocationMode::Secondary(_) => None,
-            };
-
-            gen.map(|g| (*tid, g))
+            Some((
+                *tid,
+                match &lc.mode {
+                    LocationMode::Attached(alc) => {
+                        TenantStartupMode::Attached((alc.attach_mode, alc.generation))
+                    }
+                    LocationMode::Secondary(_) => TenantStartupMode::Secondary,
+                },
+            ))
         })
        .collect()
 }
 
 async fn init_load_generations(
     conf: &'static PageServerConf,
-    tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
+    tenant_confs: &HashMap<TenantShardId, Result<LocationConf, LoadConfigError>>,
     resources: &TenantSharedResources,
     cancel: &CancellationToken,
-) -> anyhow::Result<Option<HashMap<TenantShardId, Generation>>> {
+) -> anyhow::Result<Option<HashMap<TenantShardId, TenantStartupMode>>> {
     let generations = if conf.control_plane_emergency_mode {
         error!(
             "Emergency mode!  Tenants will be attached unsafely using their last known generation"
@@ -292,8 +344,13 @@ async fn init_load_generations(
     } else if let Some(client) = ControlPlaneClient::new(conf, cancel) {
         info!("Calling control plane API to re-attach tenants");
         // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
-        match client.re_attach().await {
-            Ok(tenants) => tenants,
+        match client.re_attach(conf).await {
+            Ok(tenants) => tenants
+                .into_iter()
+                .flat_map(|(id, rart)| {
+                    TenantStartupMode::from_reattach_tenant(rart).map(|tsm| (id, tsm))
+                })
+                .collect(),
             Err(RetryForeverError::ShuttingDown) => {
                 anyhow::bail!("Shut down while waiting for control plane re-attach response")
             }
@@ -307,14 +364,17 @@ async fn init_load_generations(
     // deletion list entries may still be valid.  We provide that by pushing a recovery operation into
    // the queue.  Sequential processing of the queue ensures that recovery is done before any new tenant deletions
    // are processed, even though we don't block on recovery completing here.
-    //
-    // Must only do this if remote storage is enabled, otherwise deletion queue
-    // is not running and channel push will fail.
-    if resources.remote_storage.is_some() {
-        resources
-            .deletion_queue_client
-            .recover(generations.clone())?;
-    }
+    let attached_tenants = generations
+        .iter()
+        .flat_map(|(id, start_mode)| {
+            match start_mode {
+                TenantStartupMode::Attached((_mode, generation)) => Some(generation),
+                TenantStartupMode::Secondary => None,
+            }
+            .map(|gen| (*id, *gen))
+        })
+        .collect();
+    resources.deletion_queue_client.recover(attached_tenants)?;
 
     Ok(Some(generations))
 }
@@ -322,62 +382,32 @@ async fn init_load_generations(
 /// Given a directory discovered in the pageserver's tenants/ directory, attempt
 /// to load a tenant config from it.
 ///
-/// If file is missing, return Ok(None)
+/// If we cleaned up something expected (like an empty dir or a temp dir), return None.
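+///
+/// Unexpected I/O failures while inspecting or cleaning the directory are treated as fatal (see
+/// the `fatal_err` calls below), on the assumption that local disk errors during startup are not
+/// recoverable.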
 fn load_tenant_config(
     conf: &'static PageServerConf,
+    tenant_shard_id: TenantShardId,
     dentry: Utf8DirEntry,
-) -> anyhow::Result<Option<(TenantShardId, anyhow::Result<LocationConf>)>> {
+) -> Option<Result<LocationConf, LoadConfigError>> {
     let tenant_dir_path = dentry.path().to_path_buf();
     if crate::is_temporary(&tenant_dir_path) {
         info!("Found temporary tenant directory, removing: {tenant_dir_path}");
         // No need to use safe_remove_tenant_dir_all because this is already
         // a temporary path
-        if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) {
-            error!(
-                "Failed to remove temporary directory '{}': {:?}",
-                tenant_dir_path, e
-            );
-        }
-        return Ok(None);
+        std::fs::remove_dir_all(&tenant_dir_path).fatal_err("delete temporary tenant dir");
+        return None;
     }
 
     // This case happens if we crash during attachment before writing a config into the dir
     let is_empty = tenant_dir_path
         .is_empty_dir()
-        .with_context(|| format!("Failed to check whether {tenant_dir_path:?} is an empty dir"))?;
+        .fatal_err("Checking for empty tenant dir");
     if is_empty {
         info!("removing empty tenant directory {tenant_dir_path:?}");
-        if let Err(e) = std::fs::remove_dir(&tenant_dir_path) {
-            error!(
-                "Failed to remove empty tenant directory '{}': {e:#}",
-                tenant_dir_path
-            )
-        }
-        return Ok(None);
+        std::fs::remove_dir(&tenant_dir_path).fatal_err("delete empty tenant dir");
+        return None;
     }
 
-    let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
-    if tenant_ignore_mark_file.exists() {
-        info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
-        return Ok(None);
-    }
-
-    let tenant_shard_id = match tenant_dir_path
-        .file_name()
-        .unwrap_or_default()
-        .parse::<TenantShardId>()
-    {
-        Ok(id) => id,
-        Err(_) => {
-            warn!("Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",);
-            return Ok(None);
-        }
-    };
-
-    Ok(Some((
-        tenant_shard_id,
-        Tenant::load_tenant_config(conf, &tenant_shard_id),
-    )))
+    Some(Tenant::load_tenant_config(conf, &tenant_shard_id))
 }
 
 /// Initial stage of load: walk the local tenants directory, clean up any temp files,
@@ -387,32 +417,63 @@ fn load_tenant_config(
 /// seconds even on reasonably fast drives.
 async fn init_load_tenant_configs(
     conf: &'static PageServerConf,
-) -> anyhow::Result<HashMap<TenantShardId, anyhow::Result<LocationConf>>> {
+) -> HashMap<TenantShardId, Result<LocationConf, LoadConfigError>> {
     let tenants_dir = conf.tenants_path();
 
-    let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result<Vec<Utf8DirEntry>> {
-        let dir_entries = tenants_dir
-            .read_dir_utf8()
-            .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
+    let dentries = tokio::task::spawn_blocking(move || -> Vec<Utf8DirEntry> {
+        let context = format!("read tenants dir {tenants_dir}");
+        let dir_entries = tenants_dir.read_dir_utf8().fatal_err(&context);
 
-        Ok(dir_entries.collect::<Result<Vec<_>, std::io::Error>>()?)
+        dir_entries
+            .collect::<Result<Vec<_>, std::io::Error>>()
+            .fatal_err(&context)
     })
-    .await??;
+    .await
+    .expect("Config load task panicked");
 
     let mut configs = HashMap::new();
 
     let mut join_set = JoinSet::new();
     for dentry in dentries {
-        join_set.spawn_blocking(move || load_tenant_config(conf, dentry));
+        let tenant_shard_id = match dentry.file_name().parse::<TenantShardId>() {
+            Ok(id) => id,
+            Err(_) => {
+                warn!(
+                    "Invalid tenant path (garbage in our repo directory?): '{}'",
+                    dentry.file_name()
+                );
+                continue;
+            }
+        };
+
+        join_set.spawn_blocking(move || {
+            (
+                tenant_shard_id,
+                load_tenant_config(conf, tenant_shard_id, dentry),
+            )
+        });
     }
 
     while let Some(r) = join_set.join_next().await {
-        if let Some((tenant_id, tenant_config)) = r??
{ - configs.insert(tenant_id, tenant_config); + let (tenant_shard_id, tenant_config) = r.expect("Panic in config load task"); + if let Some(tenant_config) = tenant_config { + configs.insert(tenant_shard_id, tenant_config); } } - Ok(configs) + configs +} + +#[derive(Debug, thiserror::Error)] +pub(crate) enum DeleteTenantError { + #[error("Tenant map slot error {0}")] + SlotError(#[from] TenantSlotError), + + #[error("Cancelled")] + Cancelled, + + #[error(transparent)] + Other(#[from] anyhow::Error), } /// Initialize repositories with locally available timelines. @@ -421,6 +482,7 @@ async fn init_load_tenant_configs( #[instrument(skip_all)] pub async fn init_tenant_mgr( conf: &'static PageServerConf, + background_purges: BackgroundPurges, resources: TenantSharedResources, init_order: InitializationOrder, cancel: CancellationToken, @@ -429,12 +491,23 @@ pub async fn init_tenant_mgr( let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn); - // Scan local filesystem for attached tenants - let tenant_configs = init_load_tenant_configs(conf).await?; + // Initialize dynamic limits that depend on system resources + let system_memory = + sysinfo::System::new_with_specifics(sysinfo::RefreshKind::new().with_memory()) + .total_memory(); + let max_ephemeral_layer_bytes = + conf.ephemeral_bytes_per_memory_kb as u64 * (system_memory / 1024); + tracing::info!("Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory"); + inmemory_layer::GLOBAL_RESOURCES.max_dirty_bytes.store( + max_ephemeral_layer_bytes, + std::sync::atomic::Ordering::Relaxed, + ); - // Determine which tenants are to be attached - let tenant_generations = - init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; + // Scan local filesystem for attached tenants + let tenant_configs = init_load_tenant_configs(conf).await; + + // Determine which tenants are to be secondary or attached, and in which generation + let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; tracing::info!( "Attaching {} tenants at startup, warming up {} at a time", @@ -443,138 +516,170 @@ pub async fn init_tenant_mgr( ); TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64); - // Construct `Tenant` objects and start them running + // Accumulate futures for writing tenant configs, so that we can execute in parallel + let mut config_write_futs = Vec::new(); + + // Update the location configs according to the re-attach response and persist them to disk + tracing::info!("Updating {} location configs", tenant_configs.len()); for (tenant_shard_id, location_conf) in tenant_configs { let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let mut location_conf = match location_conf { Ok(l) => l, Err(e) => { - warn!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Marking tenant broken, failed to {e:#}"); - - tenants.insert( - tenant_shard_id, - TenantSlot::Attached(Tenant::create_broken_tenant( - conf, - tenant_shard_id, - format!("{}", e), - )), - ); + // This should only happen in the case of a serialization bug or critical local I/O error: we cannot load this tenant + error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to load tenant config, failed to {e:#}"); continue; } }; - let generation = if let Some(generations) = &tenant_generations { + // FIXME: if we were attached, and get demoted to secondary on re-attach, we + // don't have a place to get a config. 
+ // (https://github.com/neondatabase/neon/issues/5377) + const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig = + SecondaryLocationConfig { warm: true }; + + if let Some(tenant_modes) = &tenant_modes { // We have a generation map: treat it as the authority for whether // this tenant is really attached. - if let Some(gen) = generations.get(&tenant_shard_id) { - if let LocationMode::Attached(attached) = &location_conf.mode { - if attached.generation > *gen { + match tenant_modes.get(&tenant_shard_id) { + None => { + info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response"); + + match safe_rename_tenant_dir(&tenant_dir_path).await { + Ok(tmp_path) => { + background_purges.spawn(tmp_path); + } + Err(e) => { + error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Failed to move detached tenant directory '{tenant_dir_path}': {e:?}"); + } + }; + + // We deleted local content: move on to next tenant, don't try and spawn this one. + continue; + } + Some(TenantStartupMode::Secondary) => { + if !matches!(location_conf.mode, LocationMode::Secondary(_)) { + location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF); + } + } + Some(TenantStartupMode::Attached((attach_mode, generation))) => { + let old_gen_higher = match &location_conf.mode { + LocationMode::Attached(AttachedLocationConfig { + generation: old_generation, + attach_mode: _attach_mode, + }) => { + if old_generation > generation { + Some(old_generation) + } else { + None + } + } + _ => None, + }; + if let Some(old_generation) = old_gen_higher { tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary", - attached.generation + "Control plane gave decreasing generation ({generation:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary", + old_generation ); // We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away // local disk content: demote to secondary rather than detaching. - tenants.insert( - tenant_shard_id, - TenantSlot::Secondary(SecondaryTenant::new( - tenant_shard_id, - location_conf.shard, - location_conf.tenant_conf, - &SecondaryLocationConfig { warm: false }, - )), - ); + location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF); + } else { + location_conf.attach_in_generation(*attach_mode, *generation); } } - *gen - } else { - match &location_conf.mode { - LocationMode::Secondary(secondary_config) => { - // We do not require the control plane's permission for secondary mode - // tenants, because they do no remote writes and hence require no - // generation number - info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode"); - tenants.insert( - tenant_shard_id, - TenantSlot::Secondary(SecondaryTenant::new( - tenant_shard_id, - location_conf.shard, - location_conf.tenant_conf, - secondary_config, - )), - ); - } - LocationMode::Attached(_) => { - // TODO: augment re-attach API to enable the control plane to - // instruct us about secondary attachments. That way, instead of throwing - // away local state, we can gracefully fall back to secondary here, if the control - // plane tells us so. 
- // (https://github.com/neondatabase/neon/issues/5377) - info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response"); - if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await { - error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}", - ); - } - } - }; - - continue; } } else { // Legacy mode: no generation information, any tenant present // on local disk may activate info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",); - Generation::none() }; // Presence of a generation number implies attachment: attach the tenant // if it wasn't already, and apply the generation number. - location_conf.attach_in_generation(generation); - Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; + config_write_futs.push(async move { + let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await; + (tenant_shard_id, location_conf, r) + }); + } + // Execute config writes with concurrency, to avoid bottlenecking on local FS write latency + tracing::info!( + "Writing {} location config files...", + config_write_futs.len() + ); + let config_write_results = futures::stream::iter(config_write_futs) + .buffer_unordered(16) + .collect::>() + .await; + + tracing::info!( + "Spawning {} tenant shard locations...", + config_write_results.len() + ); + // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running + for (tenant_shard_id, location_conf, config_write_result) in config_write_results { + // Writing a config to local disk is foundational to startup up tenants: panic if we can't. 
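The `buffer_unordered(16)` call above is the standard `futures` idiom for bounded fan-out: up to 16 config writes are in flight at once, and results arrive in completion order, which is why each future carries its `tenant_shard_id` alongside the write result. A self-contained sketch of the pattern:

```rust
use futures::stream::{self, StreamExt};

/// Drive up to 16 futures at a time; results come back in completion order.
async fn run_bounded<Fut: std::future::Future>(futs: Vec<Fut>) -> Vec<Fut::Output> {
    stream::iter(futs).buffer_unordered(16).collect().await
}

#[tokio::main]
async fn main() {
    let jobs = (0..100u32).map(|id| async move {
        // Stand-in for a config write; the result is tagged with its id
        // because completion order is not submission order.
        (id, format!("wrote config {id}"))
    });
    let results = run_bounded(jobs.collect()).await;
    assert_eq!(results.len(), 100);
}
```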
+ config_write_result.fatal_err("write tenant shard config file"); + + let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let shard_identity = location_conf.shard; - match tenant_spawn( - conf, - tenant_shard_id, - &tenant_dir_path, - resources.clone(), - AttachedTenantConf::try_from(location_conf)?, - shard_identity, - Some(init_order.clone()), - &TENANTS, - SpawnMode::Normal, - &ctx, - ) { - Ok(tenant) => { - tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant)); + let slot = match location_conf.mode { + LocationMode::Attached(attached_conf) => TenantSlot::Attached( + tenant_spawn( + conf, + tenant_shard_id, + &tenant_dir_path, + resources.clone(), + AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), + shard_identity, + Some(init_order.clone()), + SpawnMode::Lazy, + &ctx, + ) + .expect("global shutdown during init_tenant_mgr cannot happen"), + ), + LocationMode::Secondary(secondary_conf) => { + info!( + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + "Starting secondary tenant" + ); + TenantSlot::Secondary(SecondaryTenant::new( + tenant_shard_id, + shard_identity, + location_conf.tenant_conf, + &secondary_conf, + )) } - Err(e) => { - error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}"); - } - } + }; + + METRICS.slot_inserted(&slot); + tenants.insert(tenant_shard_id, slot); } info!("Processed {} local tenants at startup", tenants.len()); let mut tenants_map = TENANTS.write().unwrap(); assert!(matches!(&*tenants_map, &TenantsMap::Initializing)); - METRICS.tenant_slots.set(tenants.len() as u64); + *tenants_map = TenantsMap::Open(tenants); Ok(TenantManager { conf, tenants: &TENANTS, resources, + cancel: CancellationToken::new(), + background_purges, }) } -/// Wrapper for Tenant::spawn that checks invariants before running, and inserts -/// a broken tenant in the map if Tenant::spawn fails. +/// Wrapper for Tenant::spawn that checks invariants before running #[allow(clippy::too_many_arguments)] -pub(crate) fn tenant_spawn( +fn tenant_spawn( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, tenant_path: &Utf8Path, @@ -582,79 +687,42 @@ pub(crate) fn tenant_spawn( location_conf: AttachedTenantConf, shard_identity: ShardIdentity, init_order: Option, - tenants: &'static std::sync::RwLock, mode: SpawnMode, ctx: &RequestContext, -) -> anyhow::Result> { - anyhow::ensure!( - tenant_path.is_dir(), - "Cannot load tenant from path {tenant_path:?}, it either does not exist or not a directory" - ); - anyhow::ensure!( - !crate::is_temporary(tenant_path), - "Cannot load tenant from temporary path {tenant_path:?}" - ); - anyhow::ensure!( - !tenant_path.is_empty_dir().with_context(|| { - format!("Failed to check whether {tenant_path:?} is an empty dir") - })?, - "Cannot load tenant from empty directory {tenant_path:?}" - ); +) -> Result, GlobalShutDown> { + // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed + // path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode + // to avoid impacting prod runtime performance. 
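`fatal_err` (used above on the config-write results) converts an error into a process-fatal event: at this point in startup there is no way to limp along without the tenant's config on disk. The real helper lives elsewhere in the pageserver; a guess at its shape:

```rust
/// Unwrap a Result, aborting the process with context on failure.
trait FatalErr<T> {
    fn fatal_err(self, context: &str) -> T;
}

impl<T, E: std::fmt::Display> FatalErr<T> for Result<T, E> {
    fn fatal_err(self, context: &str) -> T {
        match self {
            Ok(v) => v,
            Err(e) => {
                // A real implementation would log through tracing first.
                eprintln!("fatal error: {context}: {e}");
                std::process::abort();
            }
        }
    }
}

fn main() {
    let parsed: i64 = "123".parse::<i64>().fatal_err("parse example");
    assert_eq!(parsed, 123);
}
```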
+ assert!(!crate::is_temporary(tenant_path)); + debug_assert!(tenant_path.is_dir()); + debug_assert!(conf + .tenant_location_config_path(&tenant_shard_id) + .try_exists() + .unwrap()); - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); - anyhow::ensure!( - !conf.tenant_ignore_mark_file_path(&tenant_shard_id).exists(), - "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}" - ); - - info!( - tenant_id = %tenant_shard_id.tenant_id, - shard_id = %tenant_shard_id.shard_slug(), - generation = ?location_conf.location.generation, - attach_mode = ?location_conf.location.attach_mode, - "Attaching tenant" - ); - let tenant = match Tenant::spawn( + Tenant::spawn( conf, tenant_shard_id, resources, location_conf, shard_identity, init_order, - tenants, mode, ctx, - ) { - Ok(tenant) => tenant, - Err(e) => { - error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}"); - Tenant::create_broken_tenant(conf, tenant_shard_id, format!("{e:#}")) - } - }; - - Ok(tenant) -} - -/// -/// Shut down all tenants. This runs as part of pageserver shutdown. -/// -/// NB: We leave the tenants in the map, so that they remain accessible through -/// the management API until we shut it down. If we removed the shut-down tenants -/// from the tenants map, the management API would return 404 for these tenants, -/// because TenantsMap::get() now returns `None`. -/// That could be easily misinterpreted by control plane, the consumer of the -/// management API. For example, it could attach the tenant on a different pageserver. -/// We would then be in split-brain once this pageserver restarts. -#[instrument(skip_all)] -pub(crate) async fn shutdown_all_tenants() { - shutdown_all_tenants0(&TENANTS).await + ) } async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { - use utils::completion; - let mut join_set = JoinSet::new(); + #[cfg(all(debug_assertions, not(test)))] + { + // Check that our metrics properly tracked the size of the tenants map. This is a convenient location to check, + // as it happens implicitly at the end of tests etc. + let m = tenants.read().unwrap(); + debug_assert_eq!(METRICS.slots_total(), m.len() as u64); + } + // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants. 
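The comment above summarizes the shutdown pattern: under the write lock the map flips to the shutting-down state, and each attached tenant's shutdown is spawned onto a `tokio::task::JoinSet` that is then drained (see the loop that follows), so slow tenants stop in parallel rather than serially. A minimal sketch with a stub `Tenant`:

```rust
use std::sync::Arc;
use tokio::task::JoinSet;

struct Tenant {
    id: u32,
}

impl Tenant {
    async fn shutdown(&self) {
        // Stand-in for flushing, stopping background tasks, etc.
        println!("tenant {} stopped", self.id);
    }
}

#[tokio::main]
async fn main() {
    let tenants: Vec<Arc<Tenant>> = (0..4).map(|id| Arc::new(Tenant { id })).collect();

    let mut join_set = JoinSet::new();
    for t in &tenants {
        let t = Arc::clone(t);
        join_set.spawn(async move { t.shutdown().await });
    }

    // Drain: a panic inside any shutdown task surfaces here as a JoinError.
    while let Some(res) = join_set.join_next().await {
        if let Err(e) = res {
            eprintln!("shutdown task failed: {e}");
        }
    }
}
```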
let (total_in_progress, total_attached) = { let mut m = tenants.write().unwrap(); @@ -675,11 +743,9 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone())); join_set.spawn( async move { - let freeze_and_flush = true; - let res = { let (_guard, shutdown_progress) = completion::channel(); - t.shutdown(shutdown_progress, freeze_and_flush).await + t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await }; if let Err(other_progress) = res { @@ -691,7 +757,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { // going to log too many lines debug!("tenant successfully stopped"); } - .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug())), + .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())), ); total_attached += 1; @@ -780,50 +846,6 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { // caller will log how long we took } -#[derive(Debug, thiserror::Error)] -pub(crate) enum SetNewTenantConfigError { - #[error(transparent)] - GetTenant(#[from] GetTenantError), - #[error(transparent)] - Persist(anyhow::Error), - #[error(transparent)] - Other(anyhow::Error), -} - -pub(crate) async fn set_new_tenant_config( - conf: &'static PageServerConf, - new_tenant_conf: TenantConfOpt, - tenant_id: TenantId, -) -> Result<(), SetNewTenantConfigError> { - // Legacy API: does not support sharding - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - - info!("configuring tenant {tenant_id}"); - let tenant = get_tenant(tenant_shard_id, true)?; - - if tenant.tenant_shard_id().shard_count > ShardCount(0) { - // Note that we use ShardParameters::default below. - return Err(SetNewTenantConfigError::Other(anyhow::anyhow!( - "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants" - ))); - } - - // This is a legacy API that only operates on attached tenants: the preferred - // API to use is the location_config/ endpoint, which lets the caller provide - // the full LocationConf. - let location_conf = LocationConf::attached_single( - new_tenant_conf, - tenant.generation, - &ShardParameters::default(), - ); - - Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf) - .await - .map_err(SetNewTenantConfigError::Persist)?; - tenant.set_new_tenant_config(new_tenant_conf); - Ok(()) -} - #[derive(thiserror::Error, Debug)] pub(crate) enum UpsertLocationError { #[error("Bad config request: {0}")] @@ -838,8 +860,9 @@ pub(crate) enum UpsertLocationError { #[error("Failed to flush: {0}")] Flush(anyhow::Error), + /// This error variant is for unexpected situations (soft assertions) where the system is in an unexpected state. #[error("Internal error: {0}")] - Other(#[from] anyhow::Error), + InternalError(anyhow::Error), } impl TenantManager { @@ -849,32 +872,21 @@ impl TenantManager { self.conf } - /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query. - /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. + /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or currently + /// undergoing a state change (i.e. slot is InProgress). 
+ /// + /// The return Tenant is not guaranteed to be active: check its status after obtaing it, or + /// use [`Tenant::wait_to_become_active`] before using it if you will do I/O on it. pub(crate) fn get_attached_tenant_shard( &self, tenant_shard_id: TenantShardId, - active_only: bool, ) -> Result, GetTenantError> { let locked = self.tenants.read().unwrap(); let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?; match peek_slot { - Some(TenantSlot::Attached(tenant)) => match tenant.current_state() { - TenantState::Broken { - reason, - backtrace: _, - } if active_only => Err(GetTenantError::Broken(reason)), - TenantState::Active => Ok(Arc::clone(tenant)), - _ => { - if active_only { - Err(GetTenantError::NotActive(tenant_shard_id)) - } else { - Ok(Arc::clone(tenant)) - } - } - }, + Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)), Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)), None | Some(TenantSlot::Secondary(_)) => { Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) @@ -898,6 +910,17 @@ impl TenantManager { } } + /// Whether the `TenantManager` is responsible for the tenant shard + pub(crate) fn manages_tenant_shard(&self, tenant_shard_id: TenantShardId) -> bool { + let locked = self.tenants.read().unwrap(); + + let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) + .ok() + .flatten(); + + peek_slot.is_some() + } + #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(crate) async fn upsert_location( &self, @@ -969,7 +992,8 @@ impl TenantManager { match fast_path_taken { Some(FastPathModified::Attached(tenant)) => { Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await?; + .await + .fatal_err("write tenant shard config"); // Transition to AttachedStale means we may well hold a valid generation // still, and have been requested to go stale as part of a migration. If @@ -999,7 +1023,8 @@ impl TenantManager { } Some(FastPathModified::Secondary(_secondary_tenant)) => { Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await?; + .await + .fatal_err("write tenant shard config"); return Ok(None); } @@ -1016,7 +1041,7 @@ impl TenantManager { // not do significant I/O, and shutdowns should be prompt via cancellation tokens. let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any) .map_err(|e| match e { - TenantSlotError::AlreadyExists(_, _) | TenantSlotError::NotFound(_) => { + TenantSlotError::NotFound(_) => { unreachable!("Called with mode Any") } TenantSlotError::InProgress => UpsertLocationError::InProgress, @@ -1043,7 +1068,7 @@ impl TenantManager { }; info!("Shutting down attached tenant"); - match tenant.shutdown(progress, false).await { + match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => {} Err(barrier) => { info!("Shutdown already in progress, waiting for it to complete"); @@ -1054,9 +1079,9 @@ impl TenantManager { // Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then // the caller thinks they're creating but the tenant already existed. We must switch to - // Normal mode so that when starting this Tenant we properly probe remote storage for timelines, + // Eager mode so that when starting this Tenant we properly probe remote storage for timelines, // rather than assuming it to be empty. 
- spawn_mode = SpawnMode::Normal; + spawn_mode = SpawnMode::Eager; } Some(TenantSlot::Secondary(state)) => { info!("Shutting down secondary tenant"); @@ -1065,7 +1090,7 @@ impl TenantManager { Some(TenantSlot::InProgress(_)) => { // This should never happen: acquire_slot should error out // if the contents of a slot were InProgress. - return Err(UpsertLocationError::Other(anyhow::anyhow!( + return Err(UpsertLocationError::InternalError(anyhow::anyhow!( "Acquired an InProgress slot, this is a bug." ))); } @@ -1084,12 +1109,14 @@ impl TenantManager { // Does not need to be fsync'd because local storage is just a cache. tokio::fs::create_dir_all(&timelines_path) .await - .with_context(|| format!("Creating {timelines_path}"))?; + .fatal_err("create timelines/ dir"); // Before activating either secondary or attached mode, persist the // configuration, so that on restart we will re-attach (or re-start // secondary) on the tenant. - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config).await?; + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .fatal_err("write tenant shard config"); let new_slot = match &new_location_config.mode { LocationMode::Secondary(secondary_config) => { @@ -1108,13 +1135,15 @@ impl TenantManager { // from upserts. This enables creating generation-less tenants even though neon_local // always uses generations when calling the location conf API. let attached_conf = if cfg!(feature = "testing") { - let mut conf = AttachedTenantConf::try_from(new_location_config)?; + let mut conf = AttachedTenantConf::try_from(new_location_config) + .map_err(UpsertLocationError::BadRequest)?; if self.conf.control_plane_api.is_none() { conf.location.generation = Generation::none(); } conf } else { - AttachedTenantConf::try_from(new_location_config)? + AttachedTenantConf::try_from(new_location_config) + .map_err(UpsertLocationError::BadRequest)? 
}; let tenant = tenant_spawn( @@ -1125,10 +1154,12 @@ impl TenantManager { attached_conf, shard_identity, None, - self.tenants, spawn_mode, ctx, - )?; + ) + .map_err(|_: GlobalShutDown| { + UpsertLocationError::Unavailable(TenantMapError::ShuttingDown) + })?; TenantSlot::Attached(tenant) } @@ -1142,7 +1173,7 @@ impl TenantManager { match slot_guard.upsert(new_slot) { Err(TenantSlotUpsertError::InternalError(e)) => { - Err(UpsertLocationError::Other(anyhow::anyhow!(e))) + Err(UpsertLocationError::InternalError(anyhow::anyhow!(e))) } Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)), Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => { @@ -1159,7 +1190,7 @@ impl TenantManager { TenantSlot::Attached(tenant) => { let (_guard, progress) = utils::completion::channel(); info!("Shutting down just-spawned tenant, because tenant manager is shut down"); - match tenant.shutdown(progress, false).await { + match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => { info!("Finished shutting down just-spawned tenant"); } @@ -1196,7 +1227,7 @@ impl TenantManager { &self, tenant_shard_id: TenantShardId, drop_cache: bool, - ctx: RequestContext, + ctx: &RequestContext, ) -> anyhow::Result<()> { let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; let Some(old_slot) = slot_guard.get_old_value() else { @@ -1209,7 +1240,7 @@ impl TenantManager { }; let (_guard, progress) = utils::completion::channel(); - match tenant.shutdown(progress, false).await { + match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => { slot_guard.drop_old_value()?; } @@ -1247,9 +1278,8 @@ impl TenantManager { AttachedTenantConf::try_from(config)?, shard_identity, None, - self.tenants, - SpawnMode::Normal, - &ctx, + SpawnMode::Eager, + ctx, )?; slot_guard.upsert(TenantSlot::Attached(tenant))?; @@ -1306,69 +1336,862 @@ impl TenantManager { } } - pub(crate) async fn delete_tenant( + pub(crate) fn get(&self, tenant_shard_id: TenantShardId) -> Option { + let locked = self.tenants.read().unwrap(); + match &*locked { + TenantsMap::Initializing => None, + TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => { + map.get(&tenant_shard_id).cloned() + } + } + } + + async fn delete_tenant_remote( &self, tenant_shard_id: TenantShardId, - activation_timeout: Duration, ) -> Result<(), DeleteTenantError> { - // We acquire a SlotGuard during this function to protect against concurrent - // changes while the ::prepare phase of DeleteTenantFlow executes, but then - // have to return the Tenant to the map while the background deletion runs. - // - // TODO: refactor deletion to happen outside the lifetime of a Tenant. - // Currently, deletion requires a reference to the tenants map in order to - // keep the Tenant in the map until deletion is complete, and then remove - // it at the end. 
- // - // See https://github.com/neondatabase/neon/issues/5080 + let remote_path = remote_tenant_path(&tenant_shard_id); + let mut keys_stream = self.resources.remote_storage.list_streaming( + Some(&remote_path), + remote_storage::ListingMode::NoDelimiter, + None, + &self.cancel, + ); + while let Some(chunk) = keys_stream.next().await { + let keys = match chunk { + Ok(listing) => listing.keys, + Err(remote_storage::DownloadError::Cancelled) => { + return Err(DeleteTenantError::Cancelled) + } + Err(remote_storage::DownloadError::NotFound) => return Ok(()), + Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))), + }; - let slot_guard = - tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?; - - // unwrap is safe because we used MustExist mode when acquiring - let tenant = match slot_guard.get_old_value().as_ref().unwrap() { - TenantSlot::Attached(tenant) => tenant.clone(), - _ => { - // Express "not attached" as equivalent to "not found" - return Err(DeleteTenantError::NotAttached); - } - }; - - match tenant.current_state() { - TenantState::Broken { .. } | TenantState::Stopping { .. } => { - // If a tenant is broken or stopping, DeleteTenantFlow can - // handle it: broken tenants proceed to delete, stopping tenants - // are checked for deletion already in progress. - } - _ => { - tenant - .wait_to_become_active(activation_timeout) - .await - .map_err(|e| match e { - GetActiveTenantError::WillNotBecomeActive(_) => { - DeleteTenantError::InvalidState(tenant.current_state()) - } - GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled, - GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached, - GetActiveTenantError::WaitForActiveTimeout { - latest_state: _latest_state, - wait_time: _wait_time, - } => DeleteTenantError::InvalidState(tenant.current_state()), - })?; + if keys.is_empty() { + tracing::info!("Remote storage already deleted"); + } else { + tracing::info!("Deleting {} keys from remote storage", keys.len()); + let keys = keys.into_iter().map(|o| o.key).collect::>(); + self.resources + .remote_storage + .delete_objects(&keys, &self.cancel) + .await?; } } - let result = DeleteTenantFlow::run( - self.conf, - self.resources.remote_storage.clone(), - &TENANTS, - tenant, + Ok(()) + } + + /// If a tenant is attached, detach it. Then remove its data from remote storage. + /// + /// A tenant is considered deleted once it is gone from remote storage. It is the caller's + /// responsibility to avoid trying to attach the tenant again or use it any way once deletion + /// has started: this operation is not atomic, and must be retried until it succeeds. 
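`delete_tenant_remote` above lists keys as a stream and deletes them batch by batch, so memory stays bounded even for tenants with very many objects, and a prefix that is already empty counts as success. A sketch of that shape over a hypothetical minimal object-store trait (the real interface is `remote_storage`'s, which also threads a cancellation token through):

```rust
use anyhow::Result;

/// Hypothetical, minimal store interface for the sketch.
/// (async fns in traits are stable as of Rust 1.75, static dispatch only.)
trait ObjectStore {
    /// Return up to `max` keys under `prefix`; empty means we are done.
    async fn list_page(&self, prefix: &str, max: usize) -> Result<Vec<String>>;
    async fn delete_objects(&self, keys: &[String]) -> Result<()>;
}

/// Delete everything under a prefix, page by page. Idempotent: running it
/// against an already-empty prefix simply returns Ok.
async fn delete_prefix<S: ObjectStore>(store: &S, prefix: &str) -> Result<()> {
    loop {
        let keys = store.list_page(prefix, 1000).await?;
        if keys.is_empty() {
            return Ok(());
        }
        store.delete_objects(&keys).await?;
    }
}
```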
+ pub(crate) async fn delete_tenant( + &self, + tenant_shard_id: TenantShardId, + ) -> Result<(), DeleteTenantError> { + super::span::debug_assert_current_span_has_tenant_id(); + + async fn delete_local( + conf: &PageServerConf, + background_purges: &BackgroundPurges, + tenant_shard_id: &TenantShardId, + ) -> anyhow::Result<()> { + let local_tenant_directory = conf.tenant_path(tenant_shard_id); + let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory) + .await + .with_context(|| { + format!("local tenant directory {local_tenant_directory:?} rename") + })?; + background_purges.spawn(tmp_dir); + Ok(()) + } + + let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; + match &slot_guard.old_value { + Some(TenantSlot::Attached(tenant)) => { + // Legacy deletion flow: the tenant remains attached, goes to Stopping state, and + // deletion will be resumed across restarts. + let tenant = tenant.clone(); + let (_guard, progress) = utils::completion::channel(); + match tenant.shutdown(progress, ShutdownMode::Hard).await { + Ok(()) => {} + Err(barrier) => { + info!("Shutdown already in progress, waiting for it to complete"); + barrier.wait().await; + } + } + delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?; + } + Some(TenantSlot::Secondary(secondary_tenant)) => { + secondary_tenant.shutdown().await; + + delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?; + } + Some(TenantSlot::InProgress(_)) => unreachable!(), + None => {} + }; + + // Fall through: local state for this tenant is no longer present, proceed with remote delete. + // - We use a retry wrapper here so that common transient S3 errors (e.g. 503, 429) do not result + // in 500 responses to delete requests. + // - We keep the `SlotGuard` during this I/O, so that if a concurrent delete request comes in, it will + // 503/retry, rather than kicking off a wasteful concurrent deletion. + match backoff::retry( + || async move { self.delete_tenant_remote(tenant_shard_id).await }, + |e| match e { + DeleteTenantError::Cancelled => true, + DeleteTenantError::SlotError(_) => { + unreachable!("Remote deletion doesn't touch slots") + } + _ => false, + }, + 1, + 3, + &format!("delete_tenant[tenant_shard_id={tenant_shard_id}]"), + &self.cancel, + ) + .await + { + Some(r) => r, + None => Err(DeleteTenantError::Cancelled), + } + } + + #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))] + pub(crate) async fn shard_split( + &self, + tenant: Arc, + new_shard_count: ShardCount, + new_stripe_size: Option, + ctx: &RequestContext, + ) -> anyhow::Result> { + let tenant_shard_id = *tenant.get_tenant_shard_id(); + let r = self + .do_shard_split(tenant, new_shard_count, new_stripe_size, ctx) + .await; + if r.is_err() { + // Shard splitting might have left the original shard in a partially shut down state (it + // stops the shard's remote timeline client). Reset it to ensure we leave things in + // a working state. + if self.get(tenant_shard_id).is_some() { + tracing::warn!("Resetting after shard split failure"); + if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await { + // Log this error because our return value will still be the original error, not this one. This is + // a severe error: if this happens, we might be leaving behind a tenant that is not fully functional + // (e.g. has uploads disabled). 
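The retry wrapper around `delete_tenant_remote` above treats only cancellation as permanent; transient S3 errors (503, 429) are retried so callers do not see spurious 500s. A generic sketch of such a wrapper (the real `backoff::retry` seen above has a richer signature, including attempt counts, a task description, and a cancellation token):

```rust
use std::time::Duration;

/// Retry a fallible async operation with exponential backoff. `is_permanent`
/// short-circuits errors that should never be retried (e.g. cancellation).
async fn retry<T, E, F, Fut>(
    mut op: F,
    is_permanent: impl Fn(&E) -> bool,
    max_attempts: u32,
) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
{
    let mut attempt = 0u32;
    loop {
        match op().await {
            Ok(v) => return Ok(v),
            Err(e) if is_permanent(&e) || attempt + 1 >= max_attempts => return Err(e),
            Err(_) => {
                attempt += 1;
                // 200ms, 400ms, 800ms, ...: growing delays keep the store happy.
                tokio::time::sleep(Duration::from_millis(100u64 << attempt)).await;
            }
        }
    }
}
```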
We can't do anything else: if reset fails then shutting the tenant down or + // setting it broken probably won't help either. + tracing::error!("Failed to reset: {e}"); + } + } + } + + r + } + + pub(crate) async fn do_shard_split( + &self, + tenant: Arc, + new_shard_count: ShardCount, + new_stripe_size: Option, + ctx: &RequestContext, + ) -> anyhow::Result> { + let tenant_shard_id = *tenant.get_tenant_shard_id(); + + // Validate the incoming request + if new_shard_count.count() <= tenant_shard_id.shard_count.count() { + anyhow::bail!("Requested shard count is not an increase"); + } + let expansion_factor = new_shard_count.count() / tenant_shard_id.shard_count.count(); + if !expansion_factor.is_power_of_two() { + anyhow::bail!("Requested split is not a power of two"); + } + + if let Some(new_stripe_size) = new_stripe_size { + if tenant.get_shard_stripe_size() != new_stripe_size + && tenant_shard_id.shard_count.count() > 1 + { + // This tenant already has multiple shards, it is illegal to try and change its stripe size + anyhow::bail!( + "Shard stripe size may not be modified once tenant has multiple shards" + ); + } + } + + // Plan: identify what the new child shards will be + let child_shards = tenant_shard_id.split(new_shard_count); + tracing::info!( + "Shard {} splits into: {}", + tenant_shard_id.to_index(), + child_shards + .iter() + .map(|id| format!("{}", id.to_index())) + .join(",") + ); + + fail::fail_point!("shard-split-pre-prepare", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + + let parent_shard_identity = tenant.shard_identity; + let parent_tenant_conf = tenant.get_tenant_conf(); + let parent_generation = tenant.generation; + + // Phase 1: Write out child shards' remote index files, in the parent tenant's current generation + if let Err(e) = tenant.split_prepare(&child_shards).await { + // If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might + // have been left in a partially-shut-down state. + tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning"); + return Err(e); + } + + fail::fail_point!("shard-split-post-prepare", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + + self.resources.deletion_queue_client.flush_advisory(); + + // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant + drop(tenant); + let mut parent_slot_guard = + tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; + let parent = match parent_slot_guard.get_old_value() { + Some(TenantSlot::Attached(t)) => t, + Some(TenantSlot::Secondary(_)) => anyhow::bail!("Tenant location in secondary mode"), + Some(TenantSlot::InProgress(_)) => { + // tenant_map_acquire_slot never returns InProgress, if a slot was InProgress + // it would return an error. + unreachable!() + } + None => { + // We don't actually need the parent shard to still be attached to do our work, but it's + // a weird enough situation that the caller probably didn't want us to continue working + // if they had detached the tenant they requested the split on. 
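The validation at the top of `do_shard_split` rejects non-increases and fan-outs that are not a power of two, so each parent shard splits evenly. A compact sketch (the explicit divisibility check here is extra, added for clarity):

```rust
/// Validate a requested shard split; returns the per-parent fan-out factor.
fn validate_split(old_count: u8, new_count: u8) -> Result<u8, String> {
    if new_count <= old_count {
        return Err("requested shard count is not an increase".into());
    }
    if old_count != 0 && new_count % old_count != 0 {
        return Err("new count must be a multiple of the old count".into());
    }
    let expansion = new_count / old_count.max(1);
    if !expansion.is_power_of_two() {
        return Err("requested split is not a power of two".into());
    }
    Ok(expansion)
}

fn main() {
    assert_eq!(validate_split(2, 8), Ok(4)); // each parent fans out into 4
    assert!(validate_split(2, 6).is_err());
    assert!(validate_split(4, 4).is_err());
}
```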
+ anyhow::bail!("Detached parent shard in the middle of split!") + } + }; + fail::fail_point!("shard-split-pre-hardlink", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + // Optimization: hardlink layers from the parent into the children, so that they don't have to + // re-download & duplicate the data referenced in their initial IndexPart + self.shard_split_hardlink(parent, child_shards.clone()) + .await?; + fail::fail_point!("shard-split-post-hardlink", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + + // Take a snapshot of where the parent's WAL ingest had got to: we will wait for + // child shards to reach this point. + let mut target_lsns = HashMap::new(); + for timeline in parent.timelines.lock().unwrap().clone().values() { + target_lsns.insert(timeline.timeline_id, timeline.get_last_record_lsn()); + } + + // TODO: we should have the parent shard stop its WAL ingest here, it's a waste of resources + // and could slow down the children trying to catch up. + + // Phase 3: Spawn the child shards + for child_shard in &child_shards { + let mut child_shard_identity = parent_shard_identity; + if let Some(new_stripe_size) = new_stripe_size { + child_shard_identity.stripe_size = new_stripe_size; + } + child_shard_identity.count = child_shard.shard_count; + child_shard_identity.number = child_shard.shard_number; + + let child_location_conf = LocationConf { + mode: LocationMode::Attached(AttachedLocationConfig { + generation: parent_generation, + attach_mode: AttachmentMode::Single, + }), + shard: child_shard_identity, + tenant_conf: parent_tenant_conf.clone(), + }; + + self.upsert_location( + *child_shard, + child_location_conf, + None, + SpawnMode::Eager, + ctx, + ) + .await?; + } + + fail::fail_point!("shard-split-post-child-conf", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + + // Phase 4: wait for child chards WAL ingest to catch up to target LSN + for child_shard_id in &child_shards { + let child_shard_id = *child_shard_id; + let child_shard = { + let locked = self.tenants.read().unwrap(); + let peek_slot = + tenant_map_peek_slot(&locked, &child_shard_id, TenantSlotPeekMode::Read)?; + peek_slot.and_then(|s| s.get_attached()).cloned() + }; + if let Some(t) = child_shard { + // Wait for the child shard to become active: this should be very quick because it only + // has to download the index_part that we just uploaded when creating it. + if let Err(e) = t.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await { + // This is not fatal: we have durably created the child shard. It just makes the + // split operation less seamless for clients, as we will may detach the parent + // shard before the child shards are fully ready to serve requests. + tracing::warn!("Failed to wait for shard {child_shard_id} to activate: {e}"); + continue; + } + + let timelines = t.timelines.lock().unwrap().clone(); + for timeline in timelines.values() { + let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else { + continue; + }; + + tracing::info!( + "Waiting for child shard {}/{} to reach target lsn {}...", + child_shard_id, + timeline.timeline_id, + target_lsn + ); + + fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + if let Err(e) = timeline + .wait_lsn( + *target_lsn, + crate::tenant::timeline::WaitLsnWaiter::Tenant, + ctx, + ) + .await + { + // Failure here might mean shutdown, in any case this part is an optimization + // and we shouldn't hold up the split operation. 
+ tracing::warn!( + "Failed to wait for timeline {} to reach lsn {target_lsn}: {e}", + timeline.timeline_id + ); + } else { + tracing::info!( + "Child shard {}/{} reached target lsn {}", + child_shard_id, + timeline.timeline_id, + target_lsn + ); + } + } + } + } + + // Phase 5: Shut down the parent shard, and erase it from disk + let (_guard, progress) = completion::channel(); + match parent.shutdown(progress, ShutdownMode::Hard).await { + Ok(()) => {} + Err(other) => { + other.wait().await; + } + } + let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id); + let tmp_path = safe_rename_tenant_dir(&local_tenant_directory) + .await + .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?; + self.background_purges.spawn(tmp_path); + + fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + + parent_slot_guard.drop_old_value()?; + + // Phase 6: Release the InProgress on the parent shard + drop(parent_slot_guard); + + Ok(child_shards) + } + + /// Part of [`Self::shard_split`]: hard link parent shard layers into child shards, as an optimization + /// to avoid the children downloading them again. + /// + /// For each resident layer in the parent shard, we will hard link it into all of the child shards. + async fn shard_split_hardlink( + &self, + parent_shard: &Tenant, + child_shards: Vec, + ) -> anyhow::Result<()> { + debug_assert_current_span_has_tenant_id(); + + let parent_path = self.conf.tenant_path(parent_shard.get_tenant_shard_id()); + let (parent_timelines, parent_layers) = { + let mut parent_layers = Vec::new(); + let timelines = parent_shard.timelines.lock().unwrap().clone(); + let parent_timelines = timelines.keys().cloned().collect::>(); + for timeline in timelines.values() { + tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink"); + let layers = timeline.layers.read().await; + + for layer in layers.likely_resident_layers() { + let relative_path = layer + .local_path() + .strip_prefix(&parent_path) + .context("Removing prefix from parent layer path")?; + parent_layers.push(relative_path.to_owned()); + } + } + debug_assert!( + !parent_layers.is_empty(), + "shutdown cannot empty the layermap" + ); + (parent_timelines, parent_layers) + }; + + let mut child_prefixes = Vec::new(); + let mut create_dirs = Vec::new(); + + for child in child_shards { + let child_prefix = self.conf.tenant_path(&child); + create_dirs.push(child_prefix.clone()); + create_dirs.extend( + parent_timelines + .iter() + .map(|t| self.conf.timeline_path(&child, t)), + ); + + child_prefixes.push(child_prefix); + } + + // Since we will do a large number of small filesystem metadata operations, batch them into + // spawn_blocking calls rather than doing each one as a tokio::fs round-trip. + let span = tracing::Span::current(); + let jh = tokio::task::spawn_blocking(move || -> anyhow::Result { + // Run this synchronous code in the same log context as the outer function that spawned it. 
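The `span.enter()` right below this comment is the idiomatic way to carry tracing context onto the blocking pool: capture `Span::current()` while still in async context, move it into the closure, and re-enter it there so every log line keeps the caller's tenant/shard fields. A standalone sketch of the same trick:

```rust
use tracing::{info, Instrument};

async fn blocking_with_context() -> u64 {
    // Capture the caller's span while still on the async side...
    let span = tracing::Span::current();
    tokio::task::spawn_blocking(move || {
        // ...and re-enter it on the blocking thread, so log lines here carry
        // the same fields (tenant_id, shard_id, ...) as the async caller.
        let _entered = span.enter();
        info!("doing synchronous filesystem work");
        42u64
    })
    .await
    .expect("blocking task panicked")
}

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt::init();
    let n = blocking_with_context()
        .instrument(tracing::info_span!("demo", tenant_id = 7))
        .await;
    assert_eq!(n, 42);
}
```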
+ let _span = span.enter(); + + tracing::info!("Creating {} directories", create_dirs.len()); + for dir in &create_dirs { + if let Err(e) = std::fs::create_dir_all(dir) { + // Ignore AlreadyExists errors, drop out on all other errors + match e.kind() { + std::io::ErrorKind::AlreadyExists => {} + _ => { + return Err(anyhow::anyhow!(e).context(format!("Creating {dir}"))); + } + } + } + } + + for child_prefix in child_prefixes { + tracing::info!( + "Hard-linking {} parent layers into child path {}", + parent_layers.len(), + child_prefix + ); + for relative_layer in &parent_layers { + let parent_path = parent_path.join(relative_layer); + let child_path = child_prefix.join(relative_layer); + if let Err(e) = std::fs::hard_link(&parent_path, &child_path) { + match e.kind() { + std::io::ErrorKind::AlreadyExists => {} + std::io::ErrorKind::NotFound => { + tracing::info!( + "Layer {} not found during hard-linking, evicted during split?", + relative_layer + ); + } + _ => { + return Err(anyhow::anyhow!(e).context(format!( + "Hard linking {relative_layer} into {child_prefix}" + ))) + } + } + } + } + } + + // Durability is not required for correctness, but if we crashed during split and + // then came restarted with empty timeline dirs, it would be very inefficient to + // re-populate from remote storage. + tracing::info!("fsyncing {} directories", create_dirs.len()); + for dir in create_dirs { + if let Err(e) = crashsafe::fsync(&dir) { + // Something removed a newly created timeline dir out from underneath us? Extremely + // unexpected, but not worth panic'ing over as this whole function is just an + // optimization. + tracing::warn!("Failed to fsync directory {dir}: {e}") + } + } + + Ok(parent_layers.len()) + }); + + match jh.await { + Ok(Ok(layer_count)) => { + tracing::info!(count = layer_count, "Hard linked layers into child shards"); + } + Ok(Err(e)) => { + // This is an optimization, so we tolerate failure. + tracing::warn!("Error hard-linking layers, proceeding anyway: {e}") + } + Err(e) => { + // This is something totally unexpected like a panic, so bail out. + anyhow::bail!("Error joining hard linking task: {e}"); + } + } + + Ok(()) + } + + /// + /// Shut down all tenants. This runs as part of pageserver shutdown. + /// + /// NB: We leave the tenants in the map, so that they remain accessible through + /// the management API until we shut it down. If we removed the shut-down tenants + /// from the tenants map, the management API would return 404 for these tenants, + /// because TenantsMap::get() now returns `None`. + /// That could be easily misinterpreted by control plane, the consumer of the + /// management API. For example, it could attach the tenant on a different pageserver. + /// We would then be in split-brain once this pageserver restarts. 
+ #[instrument(skip_all)] + pub(crate) async fn shutdown(&self) { + self.cancel.cancel(); + + shutdown_all_tenants0(self.tenants).await + } + + pub(crate) async fn detach_tenant( + &self, + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + deletion_queue_client: &DeletionQueueClient, + ) -> Result<(), TenantStateError> { + let tmp_path = self + .detach_tenant0(conf, tenant_shard_id, deletion_queue_client) + .await?; + self.background_purges.spawn(tmp_path); + + Ok(()) + } + + async fn detach_tenant0( + &self, + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + deletion_queue_client: &DeletionQueueClient, + ) -> Result { + let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move { + let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean); + safe_rename_tenant_dir(&local_tenant_directory) + .await + .with_context(|| { + format!("local tenant directory {local_tenant_directory:?} rename") + }) + }; + + let removal_result = remove_tenant_from_memory( + self.tenants, + tenant_shard_id, + tenant_dir_rename_operation(tenant_shard_id), ) .await; - // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow - slot_guard.revert(); - result + // Flush pending deletions, so that they have a good chance of passing validation + // before this tenant is potentially re-attached elsewhere. + deletion_queue_client.flush_advisory(); + + removal_result + } + + pub(crate) fn list_tenants( + &self, + ) -> Result, TenantMapListError> { + let tenants = self.tenants.read().unwrap(); + let m = match &*tenants { + TenantsMap::Initializing => return Err(TenantMapListError::Initializing), + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, + }; + Ok(m.iter() + .filter_map(|(id, tenant)| match tenant { + TenantSlot::Attached(tenant) => { + Some((*id, tenant.current_state(), tenant.generation())) + } + TenantSlot::Secondary(_) => None, + TenantSlot::InProgress(_) => None, + }) + .collect()) + } + + /// Completes an earlier prepared timeline detach ancestor. 
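`detach_tenant0` above uses the rename-then-purge idiom: a same-filesystem rename is atomic, so once the tenant directory has moved to a temporary name the detach is durable, and the slow recursive delete can happen lazily in a background purge task. A sketch (the `_deleting` suffix is illustrative, not the real naming scheme of `safe_rename_tenant_dir`):

```rust
use std::path::{Path, PathBuf};

/// Atomically move a tenant directory aside; the caller hands the returned
/// path to a background purge task that does the slow recursive delete.
fn start_detach(tenant_dir: &Path) -> std::io::Result<PathBuf> {
    let mut name = tenant_dir
        .file_name()
        .expect("tenant dir has a file name")
        .to_os_string();
    name.push("_deleting"); // hypothetical suffix, for the sketch only
    let tmp = tenant_dir.with_file_name(name);
    std::fs::rename(tenant_dir, &tmp)?;
    Ok(tmp)
}
```

A crash between the rename and the purge leaves only a stale temporary directory, which startup can sweep without risking live data.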
+ pub(crate) async fn complete_detaching_timeline_ancestor( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + prepared: PreparedTimelineDetach, + mut attempt: detach_ancestor::Attempt, + ctx: &RequestContext, + ) -> Result, detach_ancestor::Error> { + use detach_ancestor::Error; + + let slot_guard = + tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist).map_err( + |e| { + use TenantSlotError::*; + + match e { + MapState(TenantMapError::ShuttingDown) => Error::ShuttingDown, + NotFound(_) | InProgress | MapState(_) => Error::DetachReparent(e.into()), + } + }, + )?; + + let tenant = { + let old_slot = slot_guard + .get_old_value() + .as_ref() + .expect("requested MustExist"); + + let Some(tenant) = old_slot.get_attached() else { + return Err(Error::DetachReparent(anyhow::anyhow!( + "Tenant is not in attached state" + ))); + }; + + if !tenant.is_active() { + return Err(Error::DetachReparent(anyhow::anyhow!( + "Tenant is not active" + ))); + } + + tenant.clone() + }; + + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(Error::NotFound)?; + + let resp = timeline + .detach_from_ancestor_and_reparent(&tenant, prepared, ctx) + .await?; + + let mut slot_guard = slot_guard; + + let tenant = if resp.reset_tenant_required() { + attempt.before_reset_tenant(); + + let (_guard, progress) = utils::completion::channel(); + match tenant.shutdown(progress, ShutdownMode::Hard).await { + Ok(()) => { + slot_guard.drop_old_value().expect("it was just shutdown"); + } + Err(_barrier) => { + slot_guard.revert(); + // this really should not happen, at all, unless a shutdown without acquiring + // tenant slot was already going? regardless, on restart the attempt tracking + // will reset to retryable. + return Err(Error::ShuttingDown); + } + } + + let tenant_path = self.conf.tenant_path(&tenant_shard_id); + let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id) + .map_err(|e| Error::DetachReparent(e.into()))?; + + let shard_identity = config.shard; + let tenant = tenant_spawn( + self.conf, + tenant_shard_id, + &tenant_path, + self.resources.clone(), + AttachedTenantConf::try_from(config).map_err(Error::DetachReparent)?, + shard_identity, + None, + SpawnMode::Eager, + ctx, + ) + .map_err(|_| Error::ShuttingDown)?; + + { + let mut g = tenant.ongoing_timeline_detach.lock().unwrap(); + assert!( + g.is_none(), + "there cannot be any new timeline detach ancestor on newly created tenant" + ); + *g = Some((attempt.timeline_id, attempt.new_barrier())); + } + + // if we bail out here, we will not allow a new attempt, which should be fine. + // pageserver should be shutting down regardless? tenant_reset would help, unless it + // runs into the same problem. + slot_guard + .upsert(TenantSlot::Attached(tenant.clone())) + .map_err(|e| match e { + TenantSlotUpsertError::ShuttingDown(_) => Error::ShuttingDown, + other => Error::DetachReparent(other.into()), + })?; + tenant + } else { + tracing::info!("skipping tenant_reset as no changes made required it"); + tenant + }; + + if let Some(reparented) = resp.completed() { + // finally ask the restarted tenant to complete the detach + // + // rationale for 9999s: we don't really have a timetable here; if retried, the caller + // will get an 503. 
+ tenant + .wait_to_become_active(std::time::Duration::from_secs(9999)) + .await + .map_err(|e| { + use pageserver_api::models::TenantState; + use GetActiveTenantError::{Cancelled, WillNotBecomeActive}; + match e { + Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) => { + Error::ShuttingDown + } + other => Error::Complete(other.into()), + } + })?; + + utils::pausable_failpoint!( + "timeline-detach-ancestor::after_activating_before_finding-pausable" + ); + + let timeline = tenant + .get_timeline(attempt.timeline_id, true) + .map_err(Error::NotFound)?; + + timeline + .complete_detaching_timeline_ancestor(&tenant, attempt, ctx) + .await + .map(|()| reparented) + } else { + // at least the latest versions have now been downloaded and refreshed; be ready to + // retry another time. + Err(Error::FailedToReparentAll) + } + } + + /// A page service client sends a TenantId, and to look up the correct Tenant we must + /// resolve this to a fully qualified TenantShardId. + /// + /// During shard splits: we shall see parent shards in InProgress state and skip them, and + /// instead match on child shards which should appear in Attached state. Very early in a shard + /// split, or in other cases where a shard is InProgress, we will return our own InProgress result + /// to instruct the caller to wait for that to finish before querying again. + pub(crate) fn resolve_attached_shard( + &self, + tenant_id: &TenantId, + selector: ShardSelector, + ) -> ShardResolveResult { + let tenants = self.tenants.read().unwrap(); + let mut want_shard = None; + let mut any_in_progress = None; + + match &*tenants { + TenantsMap::Initializing => ShardResolveResult::NotFound, + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { + for slot in m.range(TenantShardId::tenant_range(*tenant_id)) { + // Ignore all slots that don't contain an attached tenant + let tenant = match &slot.1 { + TenantSlot::Attached(t) => t, + TenantSlot::InProgress(barrier) => { + // We might still find a usable shard, but in case we don't, remember that + // we saw at least one InProgress slot, so that we can distinguish this case + // from a simple NotFound in our return value. + any_in_progress = Some(barrier.clone()); + continue; + } + _ => continue, + }; + + match selector { + ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { + return ShardResolveResult::Found(tenant.clone()) + } + ShardSelector::Page(key) => { + // First slot we see for this tenant, calculate the expected shard number + // for the key: we will use this for checking if this and subsequent + // slots contain the key, rather than recalculating the hash each time. + if want_shard.is_none() { + want_shard = Some(tenant.shard_identity.get_shard_number(&key)); + } + + if Some(tenant.shard_identity.number) == want_shard { + return ShardResolveResult::Found(tenant.clone()); + } + } + ShardSelector::Known(shard) + if tenant.shard_identity.shard_index() == shard => + { + return ShardResolveResult::Found(tenant.clone()); + } + _ => continue, + } + } + + // Fall through: we didn't find a slot that was in Attached state & matched our selector. If + // we found one or more InProgress slot, indicate to caller that they should retry later. Otherwise + // this requested shard simply isn't found. + if let Some(barrier) = any_in_progress { + ShardResolveResult::InProgress(barrier) + } else { + ShardResolveResult::NotFound + } + } + } + } + + /// Calculate the tenant shards' contributions to this pageserver's utilization metrics. 
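`resolve_attached_shard` above scans the tenant's slots, computes the key's owning shard number once, and remembers whether any slot was InProgress so the caller can retry instead of concluding the tenant is gone. A toy model of that control flow (`Option<u8>` stands in for a slot, with `None` modelling InProgress):

```rust
#[derive(Debug, PartialEq)]
enum Resolve {
    Found(u8),  // shard number that owns the key
    InProgress, // a slot was mid-transition: try again shortly
    NotFound,
}

fn resolve(slots: &[Option<u8>], key_owner: u8) -> Resolve {
    let mut saw_in_progress = false;
    for slot in slots {
        match slot {
            Some(shard_number) if *shard_number == key_owner => {
                return Resolve::Found(*shard_number)
            }
            Some(_) => {}
            None => saw_in_progress = true,
        }
    }
    if saw_in_progress {
        Resolve::InProgress
    } else {
        Resolve::NotFound
    }
}

fn main() {
    assert_eq!(resolve(&[Some(0), Some(1)], 1), Resolve::Found(1));
    assert_eq!(resolve(&[Some(0), None], 1), Resolve::InProgress);
    assert_eq!(resolve(&[Some(0)], 1), Resolve::NotFound);
}
```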
The + /// returned values are: + /// - the number of bytes of local disk space this pageserver's shards are requesting, i.e. + /// how much space they would use if not impacted by disk usage eviction. + /// - the number of tenant shards currently on this pageserver, including attached + /// and secondary. + /// + /// This function is quite expensive: callers are expected to cache the result and + /// limit how often they call it. + pub(crate) fn calculate_utilization(&self) -> Result<(u64, u32), TenantMapListError> { + let tenants = self.tenants.read().unwrap(); + let m = match &*tenants { + TenantsMap::Initializing => return Err(TenantMapListError::Initializing), + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, + }; + let shard_count = m.len(); + let mut wanted_bytes = 0; + + for tenant_slot in m.values() { + match tenant_slot { + TenantSlot::InProgress(_barrier) => { + // While a slot is being changed, we can't know how much storage it wants. This + // means this function's output can fluctuate if a lot of changes are going on + // (such as transitions from secondary to attached). + // + // We could wait for the barrier and retry, but it's important that the utilization + // API is responsive, and the data quality impact is not very significant. + continue; + } + TenantSlot::Attached(tenant) => { + wanted_bytes += tenant.local_storage_wanted(); + } + TenantSlot::Secondary(secondary) => { + let progress = secondary.progress.lock().unwrap(); + wanted_bytes += if progress.heatmap_mtime.is_some() { + // If we have heatmap info, then we will 'want' the sum + // of the size of layers in the heatmap: this is how much space + // we would use if not doing any eviction. + progress.bytes_total + } else { + // In the absence of heatmap info, assume that the secondary location simply + // needs as much space as it is currently using. + secondary.resident_size_metric.get() + } + } + } + } + + Ok((wanted_bytes, shard_count as u32)) } } @@ -1381,51 +2204,12 @@ pub(crate) enum GetTenantError { #[error("Tenant {0} is not active")] NotActive(TenantShardId), - /// Broken is logically a subset of NotActive, but a distinct error is useful as - /// NotActive is usually a retryable state for API purposes, whereas Broken - /// is a stuck error state - #[error("Tenant is broken: {0}")] - Broken(String), // Initializing or shutting down: cannot authoritatively say whether we have this tenant #[error("Tenant map is not available: {0}")] MapState(#[from] TenantMapError), } -/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. -/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. -/// -/// This method is cancel-safe. 
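The sum in `calculate_utilization` above treats the three slot kinds differently: attached shards report the disk they would like to keep, secondaries report their heatmap total when one is known and their current resident size otherwise, and InProgress slots are skipped so the endpoint stays responsive. As a toy model:

```rust
enum Slot {
    Attached { wanted_bytes: u64 },
    Secondary { heatmap_total: Option<u64>, resident_bytes: u64 },
    InProgress,
}

fn utilization(slots: &[Slot]) -> (u64, u32) {
    let mut wanted = 0u64;
    for slot in slots {
        wanted += match slot {
            Slot::Attached { wanted_bytes } => *wanted_bytes,
            Slot::Secondary { heatmap_total, resident_bytes } => {
                heatmap_total.unwrap_or(*resident_bytes)
            }
            // Transient: we cannot know what this slot wants, and blocking on
            // its barrier would make the utilization API slow, so skip it.
            Slot::InProgress => 0,
        };
    }
    (wanted, slots.len() as u32)
}

fn main() {
    let slots = [
        Slot::Attached { wanted_bytes: 100 },
        Slot::Secondary { heatmap_total: Some(50), resident_bytes: 10 },
        Slot::Secondary { heatmap_total: None, resident_bytes: 10 },
        Slot::InProgress,
    ];
    assert_eq!(utilization(&slots), (160, 4));
}
```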
-pub(crate) fn get_tenant( - tenant_shard_id: TenantShardId, - active_only: bool, -) -> Result, GetTenantError> { - let locked = TENANTS.read().unwrap(); - - let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?; - - match peek_slot { - Some(TenantSlot::Attached(tenant)) => match tenant.current_state() { - TenantState::Broken { - reason, - backtrace: _, - } if active_only => Err(GetTenantError::Broken(reason)), - TenantState::Active => Ok(Arc::clone(tenant)), - _ => { - if active_only { - Err(GetTenantError::NotActive(tenant_shard_id)) - } else { - Ok(Arc::clone(tenant)) - } - } - }, - Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)), - None | Some(TenantSlot::Secondary(_)) => { - Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) - } - } -} - #[derive(thiserror::Error, Debug)] pub(crate) enum GetActiveTenantError { /// We may time out either while TenantSlot is InProgress, or while the Tenant @@ -1449,105 +2233,15 @@ pub(crate) enum GetActiveTenantError { /// Tenant exists, but is in a state that cannot become active (e.g. Stopping, Broken) #[error("will not become active. Current state: {0}")] WillNotBecomeActive(TenantState), -} -/// Get a [`Tenant`] in its active state. If the tenant_id is currently in [`TenantSlot::InProgress`] -/// state, then wait for up to `timeout`. If the [`Tenant`] is not currently in [`TenantState::Active`], -/// then wait for up to `timeout` (minus however long we waited for the slot). -pub(crate) async fn get_active_tenant_with_timeout( - tenant_id: TenantId, - shard_selector: ShardSelector, - timeout: Duration, - cancel: &CancellationToken, -) -> Result, GetActiveTenantError> { - enum WaitFor { - Barrier(utils::completion::Barrier), - Tenant(Arc), - } + /// Broken is logically a subset of WillNotBecomeActive, but a distinct error is useful as + /// WillNotBecomeActive is a permitted error under some circumstances, whereas broken should + /// never happen. + #[error("Tenant is broken: {0}")] + Broken(String), - let wait_start = Instant::now(); - let deadline = wait_start + timeout; - - let (wait_for, tenant_shard_id) = { - let locked = TENANTS.read().unwrap(); - - // Resolve TenantId to TenantShardId - let tenant_shard_id = locked - .resolve_attached_shard(&tenant_id, shard_selector) - .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound( - tenant_id, - )))?; - - let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) - .map_err(GetTenantError::MapState)?; - match peek_slot { - Some(TenantSlot::Attached(tenant)) => { - match tenant.current_state() { - TenantState::Active => { - // Fast path: we don't need to do any async waiting. 
- return Ok(tenant.clone()); - } - _ => { - tenant.activate_now(); - (WaitFor::Tenant(tenant.clone()), tenant_shard_id) - } - } - } - Some(TenantSlot::Secondary(_)) => { - return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive( - tenant_shard_id, - ))) - } - Some(TenantSlot::InProgress(barrier)) => { - (WaitFor::Barrier(barrier.clone()), tenant_shard_id) - } - None => { - return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound( - tenant_id, - ))) - } - } - }; - - let tenant = match wait_for { - WaitFor::Barrier(barrier) => { - tracing::debug!("Waiting for tenant InProgress state to pass..."); - timeout_cancellable( - deadline.duration_since(Instant::now()), - cancel, - barrier.wait(), - ) - .await - .map_err(|e| match e { - TimeoutCancellableError::Timeout => GetActiveTenantError::WaitForActiveTimeout { - latest_state: None, - wait_time: wait_start.elapsed(), - }, - TimeoutCancellableError::Cancelled => GetActiveTenantError::Cancelled, - })?; - { - let locked = TENANTS.read().unwrap(); - let peek_slot = - tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) - .map_err(GetTenantError::MapState)?; - match peek_slot { - Some(TenantSlot::Attached(tenant)) => tenant.clone(), - _ => { - return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive( - tenant_shard_id, - ))) - } - } - } - } - WaitFor::Tenant(tenant) => tenant, - }; - - tracing::debug!("Waiting for tenant to enter active state..."); - tenant - .wait_to_become_active(deadline.duration_since(Instant::now())) - .await?; - Ok(tenant) + #[error("reconnect to switch tenant id")] + SwitchedTenant, } #[derive(Debug, thiserror::Error)] @@ -1571,200 +2265,12 @@ pub(crate) enum TenantStateError { Other(#[from] anyhow::Error), } -pub(crate) async fn detach_tenant( - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - detach_ignored: bool, - deletion_queue_client: &DeletionQueueClient, -) -> Result<(), TenantStateError> { - let tmp_path = detach_tenant0( - conf, - &TENANTS, - tenant_shard_id, - detach_ignored, - deletion_queue_client, - ) - .await?; - // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. - // After a tenant is detached, there are no more task_mgr tasks for that tenant_id. - let task_tenant_id = None; - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::MgmtRequest, - task_tenant_id, - None, - "tenant_files_delete", - false, - async move { - fs::remove_dir_all(tmp_path.as_path()) - .await - .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) - }, - ); - Ok(()) -} - -async fn detach_tenant0( - conf: &'static PageServerConf, - tenants: &std::sync::RwLock, - tenant_shard_id: TenantShardId, - detach_ignored: bool, - deletion_queue_client: &DeletionQueueClient, -) -> Result { - let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move { - let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean); - safe_rename_tenant_dir(&local_tenant_directory) - .await - .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename")) - }; - - let removal_result = remove_tenant_from_memory( - tenants, - tenant_shard_id, - tenant_dir_rename_operation(tenant_shard_id), - ) - .await; - - // Flush pending deletions, so that they have a good chance of passing validation - // before this tenant is potentially re-attached elsewhere. 
- deletion_queue_client.flush_advisory(); - - // Ignored tenants are not present in memory and will bail the removal from memory operation. - // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then. - if detach_ignored - && matches!( - removal_result, - Err(TenantStateError::SlotError(TenantSlotError::NotFound(_))) - ) - { - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); - if tenant_ignore_mark.exists() { - info!("Detaching an ignored tenant"); - let tmp_path = tenant_dir_rename_operation(tenant_shard_id) - .await - .with_context(|| { - format!("Ignored tenant {tenant_shard_id} local directory rename") - })?; - return Ok(tmp_path); - } - } - - removal_result -} - -pub(crate) async fn load_tenant( - conf: &'static PageServerConf, - tenant_id: TenantId, - generation: Generation, - broker_client: storage_broker::BrokerClientChannel, - remote_storage: Option, - deletion_queue_client: DeletionQueueClient, - ctx: &RequestContext, -) -> Result<(), TenantMapInsertError> { - // This is a legacy API (replaced by `/location_conf`). It does not support sharding - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - - let slot_guard = - tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?; - let tenant_path = conf.tenant_path(&tenant_shard_id); - - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); - if tenant_ignore_mark.exists() { - std::fs::remove_file(&tenant_ignore_mark).with_context(|| { - format!( - "Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading" - ) - })?; - } - - let resources = TenantSharedResources { - broker_client, - remote_storage, - deletion_queue_client, - }; - - let mut location_conf = - Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?; - location_conf.attach_in_generation(generation); - - Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; - - let shard_identity = location_conf.shard; - let new_tenant = tenant_spawn( - conf, - tenant_shard_id, - &tenant_path, - resources, - AttachedTenantConf::try_from(location_conf)?, - shard_identity, - None, - &TENANTS, - SpawnMode::Normal, - ctx, - ) - .with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?; - - slot_guard.upsert(TenantSlot::Attached(new_tenant))?; - Ok(()) -} - -pub(crate) async fn ignore_tenant( - conf: &'static PageServerConf, - tenant_id: TenantId, -) -> Result<(), TenantStateError> { - ignore_tenant0(conf, &TENANTS, tenant_id).await -} - -async fn ignore_tenant0( - conf: &'static PageServerConf, - tenants: &std::sync::RwLock, - tenant_id: TenantId, -) -> Result<(), TenantStateError> { - // This is a legacy API (replaced by `/location_conf`). 
It does not support sharding
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
-    remove_tenant_from_memory(tenants, tenant_shard_id, async {
-        let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
-        fs::File::create(&ignore_mark_file)
-            .await
-            .context("Failed to create ignore mark file")
-            .and_then(|_| {
-                crashsafe::fsync_file_and_parent(&ignore_mark_file)
-                    .context("Failed to fsync ignore mark file")
-            })
-            .with_context(|| format!("Failed to create ignore mark for tenant {tenant_shard_id}"))?;
-        Ok(())
-    })
-    .await
-}
-
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum TenantMapListError {
     #[error("tenant map is still initializing")]
     Initializing,
 }

-///
-/// Get list of tenants, for the mgmt API
-///
-pub(crate) async fn list_tenants(
-) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
-    let tenants = TENANTS.read().unwrap();
-    let m = match &*tenants {
-        TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
-        TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
-    };
-    Ok(m.iter()
-        .filter_map(|(id, tenant)| match tenant {
-            TenantSlot::Attached(tenant) => {
-                Some((*id, tenant.current_state(), tenant.generation()))
-            }
-            TenantSlot::Secondary(_) => None,
-            TenantSlot::InProgress(_) => None,
-        })
-        .collect())
-}
-
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum TenantMapInsertError {
     #[error(transparent)]
@@ -1783,10 +2289,6 @@ pub(crate) enum TenantSlotError {
     #[error("Tenant {0} not found")]
     NotFound(TenantShardId),

-    /// When acquiring a slot with the expectation that the tenant does not already exist.
-    #[error("tenant {0} already exists, state: {1:?}")]
-    AlreadyExists(TenantShardId, TenantState),
-
     // Tried to read a slot that is currently being mutated by another administrative
     // operation.
     #[error("tenant has a state change in progress, try again later")]
@@ -1845,8 +2347,9 @@ pub enum TenantMapError {
     ShuttingDown,
 }

-/// Guards a particular tenant_id's content in the TenantsMap. While this
-/// structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
+/// Guards a particular tenant_id's content in the TenantsMap.
+///
+/// While this structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
 /// for this tenant, which acts as a marker for any operations targeting
 /// this tenant to retry later, or wait for the InProgress state to end.
 ///
@@ -1890,6 +2393,9 @@ impl SlotGuard {
     /// Get any value that was present in the slot before we acquired ownership
     /// of it: in state transitions, this will be the old state.
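To make the `InProgress` marker concrete: the guard swaps a placeholder into the map on acquisition, and if it is dropped without an explicit upsert, it restores whatever was there before. A self-contained sketch of that RAII pattern, with hypothetical `Slot`/`MapGuard` types rather than the pageserver's actual implementation:

```rust
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

#[derive(Clone)]
enum Slot {
    Attached(String),
    InProgress,
}

struct MapGuard {
    map: Arc<Mutex<HashMap<u64, Slot>>>,
    key: u64,
    old_value: Option<Slot>,
    upserted: bool,
}

impl MapGuard {
    /// Replace the InProgress marker with the new value; the guard is now used up.
    fn upsert(mut self, new_value: Slot) {
        self.map.lock().unwrap().insert(self.key, new_value);
        self.upserted = true;
    }
}

impl Drop for MapGuard {
    fn drop(&mut self) {
        if self.upserted {
            return;
        }
        // Aborted state change: put back whatever was there, or clear the marker.
        let mut map = self.map.lock().unwrap();
        match self.old_value.take() {
            Some(old) => map.insert(self.key, old),
            None => map.remove(&self.key),
        };
    }
}

fn acquire(map: &Arc<Mutex<HashMap<u64, Slot>>>, key: u64) -> MapGuard {
    // Any concurrent reader now sees InProgress and knows to retry or wait.
    let old_value = map.lock().unwrap().insert(key, Slot::InProgress);
    MapGuard { map: Arc::clone(map), key, old_value, upserted: false }
}
```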
+ /// + // FIXME: get_ prefix + // FIXME: this should be .as_ref() -- unsure why no clippy fn get_old_value(&self) -> &Option { &self.old_value } @@ -1931,10 +2437,13 @@ impl SlotGuard { TenantsMap::Open(m) => m, }; + METRICS.slot_inserted(&new_value); + let replaced = m.insert(self.tenant_shard_id, new_value); self.upserted = true; - - METRICS.tenant_slots.set(m.len() as u64); + if let Some(replaced) = replaced.as_ref() { + METRICS.slot_removed(replaced); + } replaced }; @@ -2044,9 +2553,13 @@ impl Drop for SlotGuard { } if self.old_value_is_shutdown() { + METRICS.slot_removed(entry.get()); entry.remove(); } else { - entry.insert(self.old_value.take().unwrap()); + let inserting = self.old_value.take().unwrap(); + METRICS.slot_inserted(&inserting); + let replaced = entry.insert(inserting); + METRICS.slot_removed(&replaced); } } Entry::Vacant(_) => { @@ -2057,8 +2570,6 @@ impl Drop for SlotGuard { ); } } - - METRICS.tenant_slots.set(m.len() as u64); } } @@ -2097,8 +2608,6 @@ enum TenantSlotAcquireMode { Any, /// Return an error if trying to acquire a slot and it doesn't already exist MustExist, - /// Return an error if trying to acquire a slot and it already exists - MustNotExist, } fn tenant_map_acquire_slot( @@ -2117,7 +2626,7 @@ fn tenant_map_acquire_slot_impl( METRICS.tenant_slot_writes.inc(); let mut locked = tenants.write().unwrap(); - let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug()); + let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()); let _guard = span.enter(); let m = match &mut *locked { @@ -2138,7 +2647,9 @@ fn tenant_map_acquire_slot_impl( } _ => { let (completion, barrier) = utils::completion::channel(); - v.insert(TenantSlot::InProgress(barrier)); + let inserting = TenantSlot::InProgress(barrier); + METRICS.slot_inserted(&inserting); + v.insert(inserting); tracing::debug!("Vacant, inserted InProgress"); Ok(SlotGuard::new(*tenant_shard_id, None, completion)) } @@ -2150,31 +2661,13 @@ fn tenant_map_acquire_slot_impl( tracing::debug!("Occupied, failing for InProgress"); Err(TenantSlotError::InProgress) } - (slot, MustNotExist) => match slot { - TenantSlot::Attached(tenant) => { - tracing::debug!("Attached && MustNotExist, return AlreadyExists"); - Err(TenantSlotError::AlreadyExists( - *tenant_shard_id, - tenant.current_state(), - )) - } - _ => { - // FIXME: the AlreadyExists error assumes that we have a Tenant - // to get the state from - tracing::debug!("Occupied & MustNotExist, return AlreadyExists"); - Err(TenantSlotError::AlreadyExists( - *tenant_shard_id, - TenantState::Broken { - reason: "Present but not attached".to_string(), - backtrace: "".to_string(), - }, - )) - } - }, _ => { // Happy case: the slot was not in any state that violated our mode let (completion, barrier) = utils::completion::channel(); - let old_value = o.insert(TenantSlot::InProgress(barrier)); + let in_progress = TenantSlot::InProgress(barrier); + METRICS.slot_inserted(&in_progress); + let old_value = o.insert(in_progress); + METRICS.slot_removed(&old_value); tracing::debug!("Occupied, replaced with InProgress"); Ok(SlotGuard::new( *tenant_shard_id, @@ -2199,8 +2692,6 @@ async fn remove_tenant_from_memory( where F: std::future::Future>, { - use utils::completion; - let mut slot_guard = tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?; @@ -2212,11 +2703,11 @@ where let attached_tenant = match 
slot_guard.get_old_value() { Some(TenantSlot::Attached(tenant)) => { // whenever we remove a tenant from memory, we don't want to flush and wait for upload - let freeze_and_flush = false; + let shutdown_mode = ShutdownMode::Hard; // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so // that we can continue safely to cleanup. - match tenant.shutdown(progress, freeze_and_flush).await { + match tenant.shutdown(progress, shutdown_mode).await { Ok(()) => {} Err(_other) => { // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to @@ -2269,91 +2760,86 @@ use { utils::http::error::ApiError, }; +#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))] pub(crate) async fn immediate_gc( tenant_shard_id: TenantShardId, timeline_id: TimelineId, gc_req: TimelineGcRequest, cancel: CancellationToken, ctx: &RequestContext, -) -> Result>, ApiError> { - let guard = TENANTS.read().unwrap(); - - let tenant = guard - .get(&tenant_shard_id) - .map(Arc::clone) - .with_context(|| format!("tenant {tenant_shard_id}")) - .map_err(|e| ApiError::NotFound(e.into()))?; +) -> Result { + let tenant = { + let guard = TENANTS.read().unwrap(); + guard + .get(&tenant_shard_id) + .cloned() + .with_context(|| format!("tenant {tenant_shard_id}")) + .map_err(|e| ApiError::NotFound(e.into()))? + }; let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); // Use tenant's pitr setting let pitr = tenant.get_pitr_interval(); + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + // Run in task_mgr to avoid race with tenant_detach operation - let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); - let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); - // TODO: spawning is redundant now, need to hold the gate - task_mgr::spawn( - &tokio::runtime::Handle::current(), - TaskKind::GarbageCollector, - Some(tenant_shard_id), - Some(timeline_id), - &format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"), - false, - async move { - fail::fail_point!("immediate_gc_task_pre"); + let ctx: RequestContext = + ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); - #[allow(unused_mut)] - let mut result = tenant - .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) - .instrument(info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id)) - .await; - // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it - // better once the types support it. + let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?; - #[cfg(feature = "testing")] - { - if let Ok(result) = result.as_mut() { - // why not futures unordered? it seems it needs very much the same task structure - // but would only run on single task. 
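The removed comment above weighs `FuturesUnordered` against `JoinSet`; the deciding factor is that `JoinSet` runs each drop-wait as its own task instead of polling them all on the current one. A runnable sketch of the same wait-for-all shape, using stand-in futures instead of real `wait_drop` handles:

```rust
use std::time::Duration;

#[tokio::main]
async fn main() {
    let mut js = tokio::task::JoinSet::new();
    // Stand-ins for `layer.wait_drop()` futures; each is spawned as a task.
    for i in 0..4u64 {
        js.spawn(async move {
            tokio::time::sleep(Duration::from_millis(10 * i)).await;
            i
        });
    }
    println!("waiting for {} tasks", js.len());
    // Drain completions in whatever order they finish.
    while let Some(res) = js.join_next().await {
        let i = res.expect("task should not panic");
        println!("task {i} done");
    }
}
```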
- let mut js = tokio::task::JoinSet::new(); - for layer in std::mem::take(&mut result.doomed_layers) { - js.spawn(layer.wait_drop()); - } - tracing::info!(total = js.len(), "starting to wait for the gc'd layers to be dropped"); - while let Some(res) = js.join_next().await { - res.expect("wait_drop should not panic"); - } - } + fail::fail_point!("immediate_gc_task_pre"); - let timeline = tenant.get_timeline(timeline_id, false).ok(); - let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref()); + #[allow(unused_mut)] + let mut result = tenant + .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) + .await; + // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it + // better once the types support it. - if let Some(rtc) = rtc { - // layer drops schedule actions on remote timeline client to actually do the - // deletions; don't care just exit fast about the shutdown error - drop(rtc.wait_completion().await); - } + #[cfg(feature = "testing")] + { + // we need to synchronize with drop completion for python tests without polling for + // log messages + if let Ok(result) = result.as_mut() { + let mut js = tokio::task::JoinSet::new(); + for layer in std::mem::take(&mut result.doomed_layers) { + js.spawn(layer.wait_drop()); } - - match task_done.send(result) { - Ok(_) => (), - Err(result) => error!("failed to send gc result: {result:?}"), + tracing::info!( + total = js.len(), + "starting to wait for the gc'd layers to be dropped" + ); + while let Some(res) = js.join_next().await { + res.expect("wait_drop should not panic"); } - Ok(()) } - ); - // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task - drop(guard); + let timeline = tenant.get_timeline(timeline_id, false).ok(); + let rtc = timeline.as_ref().map(|x| &x.remote_client); - Ok(wait_task_done) + if let Some(rtc) = rtc { + // layer drops schedule actions on remote timeline client to actually do the + // deletions; don't care about the shutdown error, just exit fast + drop(rtc.wait_completion().await); + } + } + + result.map_err(|e| match e { + GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown, + GcError::TimelineNotFound => { + ApiError::NotFound(anyhow::anyhow!("Timeline not found").into()) + } + other => ApiError::InternalServerError(anyhow::anyhow!(other)), + }) } #[cfg(test)] mod tests { use std::collections::BTreeMap; use std::sync::Arc; - use tracing::{info_span, Instrument}; + use tracing::Instrument; use crate::tenant::mgr::TenantSlot; @@ -2364,17 +2850,18 @@ mod tests { // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully // wait for it to complete before proceeding. 
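One detail worth calling out from the `immediate_gc` rewrite above, before the test body continues: the `TENANTS` read guard is confined to a block so that only the cloned `Arc` crosses later `.await` points. A minimal sketch of that scoping pattern, with a hypothetical map type rather than the real `TenantsMap`:

```rust
use std::collections::HashMap;
use std::sync::{Arc, RwLock};

async fn do_work(map: Arc<RwLock<HashMap<u64, Arc<String>>>>, key: u64) -> Option<usize> {
    let value = {
        // Hold the std RwLock guard only inside this block: a std guard is
        // not Send and must not be held across an .await point.
        let guard = map.read().unwrap();
        guard.get(&key).cloned()?
    };
    // The lock is released here; awaiting is now safe.
    tokio::task::yield_now().await;
    Some(value.len())
}
```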
- let (t, _ctx) = TenantHarness::create("shutdown_awaits_in_progress_tenant") - .unwrap() - .load() - .await; + let h = TenantHarness::create("shutdown_awaits_in_progress_tenant") + .await + .unwrap(); + let (t, _ctx) = h.load().await; // harness loads it to active, which is forced and nothing is running on the tenant let id = t.tenant_shard_id(); // tenant harness configures the logging and we cannot escape it - let _e = info_span!("testing", tenant_id = %id).entered(); + let span = h.span(); + let _e = span.enter(); let tenants = BTreeMap::from([(id, TenantSlot::Attached(t.clone()))]); let tenants = Arc::new(std::sync::RwLock::new(TenantsMap::Open(tenants))); @@ -2395,7 +2882,7 @@ mod tests { }; super::remove_tenant_from_memory(&tenants, id, cleanup).await } - .instrument(info_span!("foobar", tenant_id = %id)) + .instrument(h.span()) }); // now the long cleanup should be in place, with the stopping state diff --git a/pageserver/src/tenant/par_fsync.rs b/pageserver/src/tenant/par_fsync.rs deleted file mode 100644 index 3acb0fb431..0000000000 --- a/pageserver/src/tenant/par_fsync.rs +++ /dev/null @@ -1,84 +0,0 @@ -use std::{ - io, - sync::atomic::{AtomicUsize, Ordering}, -}; - -use camino::{Utf8Path, Utf8PathBuf}; - -fn fsync_path(path: &Utf8Path) -> io::Result<()> { - // TODO use VirtualFile::fsync_all once we fully go async. - let file = std::fs::File::open(path)?; - file.sync_all() -} - -fn parallel_worker(paths: &[Utf8PathBuf], next_path_idx: &AtomicUsize) -> io::Result<()> { - while let Some(path) = paths.get(next_path_idx.fetch_add(1, Ordering::Relaxed)) { - fsync_path(path)?; - } - - Ok(()) -} - -fn fsync_in_thread_pool(paths: &[Utf8PathBuf]) -> io::Result<()> { - // TODO: remove this function in favor of `par_fsync_async` once we asyncify everything. - - /// Use at most this number of threads. - /// Increasing this limit will - /// - use more memory - /// - increase the cost of spawn/join latency - const MAX_NUM_THREADS: usize = 64; - let num_threads = paths.len().min(MAX_NUM_THREADS); - let next_path_idx = AtomicUsize::new(0); - - std::thread::scope(|s| -> io::Result<()> { - let mut handles = vec![]; - // Spawn `num_threads - 1`, as the current thread is also a worker. - for _ in 1..num_threads { - handles.push(s.spawn(|| parallel_worker(paths, &next_path_idx))); - } - - parallel_worker(paths, &next_path_idx)?; - - for handle in handles { - handle.join().unwrap()?; - } - - Ok(()) - }) -} - -/// Parallel fsync all files. Can be used in non-async context as it is using rayon thread pool. -pub fn par_fsync(paths: &[Utf8PathBuf]) -> io::Result<()> { - if paths.len() == 1 { - fsync_path(&paths[0])?; - return Ok(()); - } - - fsync_in_thread_pool(paths) -} - -/// Parallel fsync asynchronously. 
-pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> {
-    const MAX_CONCURRENT_FSYNC: usize = 64;
-    let mut next = paths.iter().peekable();
-    let mut js = tokio::task::JoinSet::new();
-    loop {
-        while js.len() < MAX_CONCURRENT_FSYNC && next.peek().is_some() {
-            let next = next.next().expect("just peeked");
-            let next = next.to_owned();
-            js.spawn_blocking(move || fsync_path(&next));
-        }
-
-        // now the joinset has been filled up, wait for next to complete
-        if let Some(res) = js.join_next().await {
-            res??;
-        } else {
-            // last item had already completed
-            assert!(
-                next.peek().is_none(),
-                "joinset emptied, we shouldn't have more work"
-            );
-            return Ok(());
-        }
-    }
-}
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 80ff5c9a2d..1f9ae40af5 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -91,8 +91,7 @@
 //!
 //! The *actual* remote state lags behind the *desired* remote state while
 //! there are in-flight operations.
-//! We keep track of the desired remote state in
-//! [`UploadQueueInitialized::latest_files`] and [`UploadQueueInitialized::latest_metadata`].
+//! We keep track of the desired remote state in [`UploadQueueInitialized::dirty`].
 //! It is initialized based on the [`IndexPart`] that was passed during init
 //! and updated with every `schedule_*` function call.
 //! All this is necessary to compute the future [`IndexPart`]s
@@ -115,8 +114,7 @@
 //!
 //! # Completion
 //!
-//! Once an operation has completed, we update
-//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately,
+//! Once an operation has completed, we update [`UploadQueueInitialized::clean`] immediately,
 //! and submit a request through the DeletionQueue to update
 //! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has
 //! validated that our generation is not stale.
It is this visible value @@ -189,6 +187,7 @@ use camino::Utf8Path; use chrono::{NaiveDateTime, Utc}; pub(crate) use download::download_initdb_tar_zst; +use pageserver_api::models::{AuxFilePolicy, TimelineArchivalState}; use pageserver_api::shard::{ShardIndex, TenantShardId}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; @@ -196,20 +195,23 @@ pub(crate) use upload::upload_initdb_dir; use utils::backoff::{ self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, }; -use utils::timeout::{timeout_cancellable, TimeoutCancellableError}; +use utils::pausable_failpoint; use std::collections::{HashMap, VecDeque}; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; use std::time::Duration; -use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath}; +use remote_storage::{ + DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel, +}; use std::ops::DerefMut; use tracing::{debug, error, info, instrument, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; -use crate::deletion_queue::DeletionQueueClient; +use crate::context::RequestContext; +use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; use crate::metrics::{ MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES, @@ -217,8 +219,9 @@ use crate::metrics::{ }; use crate::task_mgr::shutdown_token; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::remote_timeline_client::download::download_retry; use crate::tenant::storage_layer::AsLayerDesc; -use crate::tenant::upload_queue::Delete; +use crate::tenant::upload_queue::{Delete, UploadQueueStoppedDeletable}; use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::{ config::PageServerConf, @@ -236,11 +239,14 @@ use utils::id::{TenantId, TimelineId}; use self::index::IndexPart; -use super::storage_layer::{Layer, LayerFileName, ResidentLayer}; -use super::upload_queue::SetDeletedFlagProgress; +use super::metadata::MetadataUpdate; +use super::storage_layer::{Layer, LayerName, ResidentLayer}; +use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; use super::Generation; -pub(crate) use download::{is_temp_download_file, list_remote_timelines}; +pub(crate) use download::{ + download_index_part, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines, +}; pub(crate) use index::LayerFileMetadata; // Occasional network issues and such can cause remote operations to fail, and @@ -262,20 +268,15 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst"; /// Default buffer size when interfacing with [`tokio::fs::File`]. pub(crate) const BUFFER_SIZE: usize = 32 * 1024; +/// Doing non-essential flushes of deletion queue is subject to this timeout, after +/// which we warn and skip. +const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10); + pub enum MaybeDeletedIndexPart { IndexPart(IndexPart), Deleted(IndexPart), } -/// Errors that can arise when calling [`RemoteTimelineClient::stop`]. -#[derive(Debug, thiserror::Error)] -pub enum StopError { - /// Returned if the upload queue was never initialized. - /// See [`RemoteTimelineClient::init_upload_queue`] and [`RemoteTimelineClient::init_upload_queue_for_empty_remote`]. 
-    #[error("queue is not initialized")]
-    QueueUninitialized,
-}
-
 #[derive(Debug, thiserror::Error)]
 pub enum PersistIndexPartWithDeletedFlagError {
     #[error("another task is already setting the deleted_flag, started at {0:?}")]
@@ -286,6 +287,14 @@ pub enum PersistIndexPartWithDeletedFlagError {
     Other(#[from] anyhow::Error),
 }

+#[derive(Debug, thiserror::Error)]
+pub enum WaitCompletionError {
+    #[error(transparent)]
+    NotInitialized(NotInitialized),
+    #[error("wait_completion aborted because upload queue was stopped")]
+    UploadQueueShutDownOrStopped,
+}
+
 /// A client for accessing a timeline's data in remote storage.
 ///
 /// This takes care of managing the number of connections, and balancing them
@@ -316,7 +325,7 @@ pub struct RemoteTimelineClient {

     upload_queue: Mutex<UploadQueue>,

-    metrics: Arc<RemoteTimelineClientMetrics>,
+    pub(crate) metrics: Arc<RemoteTimelineClientMetrics>,

     storage_impl: GenericRemoteStorage,

@@ -325,45 +334,6 @@ pub struct RemoteTimelineClient {
     cancel: CancellationToken,
 }

-/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows. It is not
-/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
-const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
-const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
-
-/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow.
-///
-/// This is a convenience for the various upload functions. In future
-/// the anyhow::Error result should be replaced with a more structured type that
-/// enables callers to avoid handling shutdown as an error.
-async fn upload_cancellable<F>(cancel: &CancellationToken, future: F) -> anyhow::Result<()>
-where
-    F: std::future::Future<Output = anyhow::Result<()>>,
-{
-    match timeout_cancellable(UPLOAD_TIMEOUT, cancel, future).await {
-        Ok(Ok(())) => Ok(()),
-        Ok(Err(e)) => Err(e),
-        Err(TimeoutCancellableError::Timeout) => Err(anyhow::anyhow!("Timeout")),
-        Err(TimeoutCancellableError::Cancelled) => Err(anyhow::anyhow!("Shutting down")),
-    }
-}
-/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to DownloadError.
-async fn download_cancellable<F, T>(
-    cancel: &CancellationToken,
-    future: F,
-) -> Result<T, DownloadError>
-where
-    F: std::future::Future<Output = Result<T, DownloadError>>,
-{
-    match timeout_cancellable(DOWNLOAD_TIMEOUT, cancel, future).await {
-        Ok(Ok(r)) => Ok(r),
-        Ok(Err(e)) => Err(e),
-        Err(TimeoutCancellableError::Timeout) => {
-            Err(DownloadError::Other(anyhow::anyhow!("Timed out")))
-        }
-        Err(TimeoutCancellableError::Cancelled) => Err(DownloadError::Cancelled),
-    }
-}
-
 impl RemoteTimelineClient {
     ///
     /// Create a remote storage client for given timeline
@@ -439,15 +409,10 @@ impl RemoteTimelineClient {
             "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
         ))?;

-        {
-            let mut upload_queue = self.upload_queue.lock().unwrap();
-            upload_queue.initialize_with_current_remote_index_part(index_part)?;
-            self.update_remote_physical_size_gauge(Some(index_part));
-        }
-        // also locks upload queue, without dropping the guard above it will be a deadlock
-        self.stop().expect("initialized line above");
-
         let mut upload_queue = self.upload_queue.lock().unwrap();
+        upload_queue.initialize_with_current_remote_index_part(index_part)?;
+        self.update_remote_physical_size_gauge(Some(index_part));
+        self.stop_impl(&mut upload_queue);

         upload_queue
             .stopped_mut()
@@ -457,11 +422,13 @@ impl RemoteTimelineClient {
         Ok(())
     }

+    /// Returns `None` if nothing is yet uploaded, `Some(disk_consistent_lsn)` otherwise.
pub fn remote_consistent_lsn_projected(&self) -> Option { match &mut *self.upload_queue.lock().unwrap() { UploadQueue::Uninitialized => None, UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(), - UploadQueue::Stopped(q) => q + UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None, + UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => q .upload_queue_for_deletion .get_last_remote_consistent_lsn_projected(), } @@ -471,29 +438,51 @@ impl RemoteTimelineClient { match &mut *self.upload_queue.lock().unwrap() { UploadQueue::Uninitialized => None, UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()), - UploadQueue::Stopped(q) => Some( + UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None, + UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => Some( q.upload_queue_for_deletion .get_last_remote_consistent_lsn_visible(), ), } } + /// Returns true if this timeline was previously detached at this Lsn and the remote timeline + /// client is currently initialized. + pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool { + self.upload_queue + .lock() + .unwrap() + .initialized_mut() + .map(|uq| uq.clean.0.lineage.is_previous_ancestor_lsn(lsn)) + .unwrap_or(false) + } + + /// Returns whether the timeline is archived. + /// Return None if the remote index_part hasn't been downloaded yet. + pub(crate) fn is_archived(&self) -> Option { + self.upload_queue + .lock() + .unwrap() + .initialized_mut() + .map(|q| q.clean.0.archived_at.is_some()) + .ok() + } + fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) { let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part { current_remote_index_part .layer_metadata .values() - // If we don't have the file size for the layer, don't account for it in the metric. .map(|ilmd| ilmd.file_size) .sum() } else { 0 }; - self.metrics.remote_physical_size_set(size); + self.metrics.remote_physical_size_gauge.set(size); } pub fn get_remote_physical_size(&self) -> u64 { - self.metrics.remote_physical_size_get() + self.metrics.remote_physical_size_gauge.get() } // @@ -506,7 +495,7 @@ impl RemoteTimelineClient { /// Download index file pub async fn download_index_file( &self, - cancel: CancellationToken, + cancel: &CancellationToken, ) -> Result { let _unfinished_gauge_guard = self.metrics.call_begin( &RemoteOpFileKind::Index, @@ -516,7 +505,7 @@ impl RemoteTimelineClient { }, ); - let index_part = download::download_index_part( + let (index_part, _index_generation) = download::download_index_part( &self.storage_impl, &self.tenant_shard_id, &self.timeline_id, @@ -544,10 +533,12 @@ impl RemoteTimelineClient { /// On success, returns the size of the downloaded file. pub async fn download_layer_file( &self, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, layer_metadata: &LayerFileMetadata, + local_path: &Utf8Path, cancel: &CancellationToken, - ) -> anyhow::Result { + ctx: &RequestContext, + ) -> Result { let downloaded_size = { let _unfinished_gauge_guard = self.metrics.call_begin( &RemoteOpFileKind::Layer, @@ -563,7 +554,9 @@ impl RemoteTimelineClient { self.timeline_id, layer_file_name, layer_metadata, + local_path, cancel, + ctx, ) .measure_remote_op( RemoteOpFileKind::Layer, @@ -583,9 +576,10 @@ impl RemoteTimelineClient { // Upload operations. // - /// /// Launch an index-file upload operation in the background, with - /// updated metadata. + /// fully updated metadata. 
+    ///
+    /// This should only be used to upload initial metadata to remote storage.
     ///
     /// The upload will be added to the queue immediately, but it
     /// won't be performed until all previously scheduled layer file
     /// uploads have completed successfully.
     ///
     /// If there were any changes to the list of files, i.e. if any
     /// layer file uploads were scheduled, since the last index file
     /// upload, those will be included too.
-    pub fn schedule_index_upload_for_metadata_update(
+    pub fn schedule_index_upload_for_full_metadata_update(
         self: &Arc<Self>,
         metadata: &TimelineMetadata,
     ) -> anyhow::Result<()> {
@@ -606,13 +600,87 @@ impl RemoteTimelineClient {

         // As documented in the struct definition, it's ok for latest_metadata to be
         // ahead of what's _actually_ on the remote during index upload.
-        upload_queue.latest_metadata = metadata.clone();
+        upload_queue.dirty.metadata = metadata.clone();

-        self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
+        self.schedule_index_upload(upload_queue)?;

         Ok(())
     }

+    /// Launch an index-file upload operation in the background, with only parts of the metadata
+    /// updated.
+    ///
+    /// This is the regular way of updating metadata on layer flushes or Gc.
+    ///
+    /// Using this lighter update mechanism allows for reparenting and detaching without changes to
+    /// `index_part.json`, while being more clear on what values update regularly.
+    pub(crate) fn schedule_index_upload_for_metadata_update(
+        self: &Arc<Self>,
+        update: &MetadataUpdate,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+
+        upload_queue.dirty.metadata.apply(update);
+
+        self.schedule_index_upload(upload_queue)?;
+
+        Ok(())
+    }
+
+    /// Launch an index-file upload operation in the background, with only the `aux_file_policy` flag updated.
+    pub(crate) fn schedule_index_upload_for_aux_file_policy_update(
+        self: &Arc<Self>,
+        last_aux_file_policy: Option<AuxFilePolicy>,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+        upload_queue.dirty.last_aux_file_policy = last_aux_file_policy;
+        self.schedule_index_upload(upload_queue)?;
+        Ok(())
+    }
+
+    /// Launch an index-file upload operation in the background, with only the `archived_at` field updated.
+    ///
+    /// Returns whether it is required to wait for the queue to be empty to ensure that the change is uploaded:
+    /// either the change is already sitting in the queue but not yet committed, or it has not entered the
+    /// queue yet.
+    pub(crate) fn schedule_index_upload_for_timeline_archival_state(
+        self: &Arc<Self>,
+        state: TimelineArchivalState,
+    ) -> anyhow::Result<bool> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+
+        /// Returns Some(_) if a change is needed, and Some(true) if it's a
+        /// change needed to set archived_at.
+ fn need_change( + archived_at: &Option, + state: TimelineArchivalState, + ) -> Option { + match (archived_at, state) { + (Some(_), TimelineArchivalState::Archived) + | (None, TimelineArchivalState::Unarchived) => { + // Nothing to do + tracing::info!("intended state matches present state"); + None + } + (None, TimelineArchivalState::Archived) => Some(true), + (Some(_), TimelineArchivalState::Unarchived) => Some(false), + } + } + let need_upload_scheduled = need_change(&upload_queue.dirty.archived_at, state); + + if let Some(archived_at_set) = need_upload_scheduled { + let intended_archived_at = archived_at_set.then(|| Utc::now().naive_utc()); + upload_queue.dirty.archived_at = intended_archived_at; + self.schedule_index_upload(upload_queue)?; + } + + let need_wait = need_change(&upload_queue.clean.0.archived_at, state).is_some(); + Ok(need_wait) + } + /// /// Launch an index-file upload operation in the background, if necessary. /// @@ -623,12 +691,12 @@ impl RemoteTimelineClient { /// /// Like schedule_index_upload_for_metadata_update(), this merely adds /// the upload to the upload queue and returns quickly. - pub fn schedule_index_upload_for_file_changes(self: &Arc) -> anyhow::Result<()> { + pub fn schedule_index_upload_for_file_changes(self: &Arc) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone()); + self.schedule_index_upload(upload_queue)?; } Ok(()) @@ -638,37 +706,231 @@ impl RemoteTimelineClient { fn schedule_index_upload( self: &Arc, upload_queue: &mut UploadQueueInitialized, - metadata: TimelineMetadata, - ) { + ) -> Result<(), NotInitialized> { + let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn(); + // fix up the duplicated field + upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn; + + // make sure it serializes before doing it in perform_upload_task so that it doesn't + // look like a retryable error + let void = std::io::sink(); + serde_json::to_writer(void, &upload_queue.dirty).expect("serialize index_part.json"); + + let index_part = &upload_queue.dirty; + info!( - "scheduling metadata upload with {} files ({} changed)", - upload_queue.latest_files.len(), + "scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)", + index_part.layer_metadata.len(), upload_queue.latest_files_changes_since_metadata_upload_scheduled, ); - let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); - - let index_part = IndexPart::new( - upload_queue.latest_files.clone(), - disk_consistent_lsn, - metadata, - ); - let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn); - self.calls_unfinished_metric_begin(&op); + let op = UploadOp::UploadMetadata { + uploaded: Box::new(index_part.clone()), + }; + self.metric_begin(&op); upload_queue.queued_operations.push_back(op); upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0; // Launch the task immediately, if possible self.launch_queued_tasks(upload_queue); + Ok(()) } + /// Reparent this timeline to a new parent. /// - /// Launch an upload operation in the background. + /// A retryable step of timeline ancestor detach. 
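Before the reparenting helper below, note the trick in `schedule_index_upload`: serializing the dirty index into `std::io::sink()` up front turns a serialization bug into an immediate, deterministic panic instead of an upload error that would be retried forever. A standalone sketch of that check, with a hypothetical `IndexDoc` standing in for `IndexPart`:

```rust
use serde::Serialize;

#[derive(Serialize, Clone)]
struct IndexDoc {
    disk_consistent_lsn: u64,
    layers: Vec<String>,
}

fn schedule_upload(doc: &IndexDoc) {
    // Serialize into a sink first: if this fails, it fails here and now,
    // rather than looking like a retryable remote-storage error inside the
    // background upload task.
    serde_json::to_writer(std::io::sink(), doc).expect("serialize index doc");
    // ... enqueue the actual upload of `doc` ...
}
```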
+ pub(crate) async fn schedule_reparenting_and_wait( + self: &Arc, + new_parent: &TimelineId, + ) -> anyhow::Result<()> { + let receiver = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + let Some(prev) = upload_queue.dirty.metadata.ancestor_timeline() else { + return Err(anyhow::anyhow!( + "cannot reparent without a current ancestor" + )); + }; + + let uploaded = &upload_queue.clean.0.metadata; + + if uploaded.ancestor_timeline().is_none() && !uploaded.ancestor_lsn().is_valid() { + // nothing to do + None + } else { + upload_queue.dirty.metadata.reparent(new_parent); + upload_queue.dirty.lineage.record_previous_ancestor(&prev); + + self.schedule_index_upload(upload_queue)?; + + Some(self.schedule_barrier0(upload_queue)) + } + }; + + if let Some(receiver) = receiver { + Self::wait_completion0(receiver).await?; + } + Ok(()) + } + + /// Schedules uploading a new version of `index_part.json` with the given layers added, + /// detaching from ancestor and waits for it to complete. /// + /// This is used with `Timeline::detach_ancestor` functionality. + pub(crate) async fn schedule_adding_existing_layers_to_index_detach_and_wait( + self: &Arc, + layers: &[Layer], + adopted: (TimelineId, Lsn), + ) -> anyhow::Result<()> { + let barrier = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + if upload_queue.clean.0.lineage.detached_previous_ancestor() == Some(adopted) { + None + } else { + upload_queue.dirty.metadata.detach_from_ancestor(&adopted); + upload_queue.dirty.lineage.record_detaching(&adopted); + + for layer in layers { + let prev = upload_queue + .dirty + .layer_metadata + .insert(layer.layer_desc().layer_name(), layer.metadata()); + assert!(prev.is_none(), "copied layer existed already {layer}"); + } + + self.schedule_index_upload(upload_queue)?; + + Some(self.schedule_barrier0(upload_queue)) + } + }; + + if let Some(barrier) = barrier { + Self::wait_completion0(barrier).await?; + } + Ok(()) + } + + /// Adds a gc blocking reason for this timeline if one does not exist already. + /// + /// A retryable step of timeline detach ancestor. + /// + /// Returns a future which waits until the completion of the upload. + pub(crate) fn schedule_insert_gc_block_reason( + self: &Arc, + reason: index::GcBlockingReason, + ) -> Result>, NotInitialized> + { + let maybe_barrier = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + if let index::GcBlockingReason::DetachAncestor = reason { + if upload_queue.dirty.metadata.ancestor_timeline().is_none() { + drop(guard); + panic!("cannot start detach ancestor if there is nothing to detach from"); + } + } + + let wanted = |x: Option<&index::GcBlocking>| x.is_some_and(|x| x.blocked_by(reason)); + + let current = upload_queue.dirty.gc_blocking.as_ref(); + let uploaded = upload_queue.clean.0.gc_blocking.as_ref(); + + match (current, uploaded) { + (x, y) if wanted(x) && wanted(y) => None, + (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)), + // Usual case: !wanted(x) && !wanted(y) + // + // Unusual: !wanted(x) && wanted(y) which means we have two processes waiting to + // turn on and off some reason. 
+ (x, y) => { + if !wanted(x) && wanted(y) { + // this could be avoided by having external in-memory synchronization, like + // timeline detach ancestor + warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason"); + } + + // at this point, the metadata must always show that there is a parent + upload_queue.dirty.gc_blocking = current + .map(|x| x.with_reason(reason)) + .or_else(|| Some(index::GcBlocking::started_now_for(reason))); + self.schedule_index_upload(upload_queue)?; + Some(self.schedule_barrier0(upload_queue)) + } + } + }; + + Ok(async move { + if let Some(barrier) = maybe_barrier { + Self::wait_completion0(barrier).await?; + } + Ok(()) + }) + } + + /// Removes a gc blocking reason for this timeline if one exists. + /// + /// A retryable step of timeline detach ancestor. + /// + /// Returns a future which waits until the completion of the upload. + pub(crate) fn schedule_remove_gc_block_reason( + self: &Arc, + reason: index::GcBlockingReason, + ) -> Result>, NotInitialized> + { + let maybe_barrier = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + if let index::GcBlockingReason::DetachAncestor = reason { + if !upload_queue.clean.0.lineage.is_detached_from_ancestor() { + drop(guard); + panic!("cannot complete timeline_ancestor_detach while not detached"); + } + } + + let wanted = |x: Option<&index::GcBlocking>| { + x.is_none() || x.is_some_and(|b| !b.blocked_by(reason)) + }; + + let current = upload_queue.dirty.gc_blocking.as_ref(); + let uploaded = upload_queue.clean.0.gc_blocking.as_ref(); + + match (current, uploaded) { + (x, y) if wanted(x) && wanted(y) => None, + (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)), + (x, y) => { + if !wanted(x) && wanted(y) { + warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)"); + } + + upload_queue.dirty.gc_blocking = + current.as_ref().and_then(|x| x.without_reason(reason)); + assert!(wanted(upload_queue.dirty.gc_blocking.as_ref())); + // FIXME: bogus ? + self.schedule_index_upload(upload_queue)?; + Some(self.schedule_barrier0(upload_queue)) + } + } + }; + + Ok(async move { + if let Some(barrier) = maybe_barrier { + Self::wait_completion0(barrier).await?; + } + Ok(()) + }) + } + + /// Launch an upload operation in the background; the file is added to be included in next + /// `index_part.json` upload. pub(crate) fn schedule_layer_file_upload( self: &Arc, layer: ResidentLayer, - ) -> anyhow::Result<()> { + ) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -685,16 +947,19 @@ impl RemoteTimelineClient { let metadata = layer.metadata(); upload_queue - .latest_files - .insert(layer.layer_desc().filename(), metadata.clone()); + .dirty + .layer_metadata + .insert(layer.layer_desc().layer_name(), metadata.clone()); upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; info!( - "scheduled layer file upload {layer} gen={:?} shard={:?}", - metadata.generation, metadata.shard + gen=?metadata.generation, + shard=?metadata.shard, + "scheduled layer file upload {layer}", ); + let op = UploadOp::UploadLayer(layer, metadata); - self.calls_unfinished_metric_begin(&op); + self.metric_begin(&op); upload_queue.queued_operations.push_back(op); } @@ -708,13 +973,13 @@ impl RemoteTimelineClient { /// successfully. 
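The archival and gc-blocking schedulers above share one shape: compare the desired state against both the dirty and the clean index to decide, separately, whether to schedule an upload and whether the caller must wait. A distilled sketch of that double comparison, with hypothetical names:

```rust
/// Desired archival state, mirroring the dirty/clean comparison above.
#[derive(Clone, Copy)]
enum Desired {
    Archived,
    Unarchived,
}

/// Returns Some(true) to set the flag, Some(false) to clear it, None if no change is needed.
fn need_change(current_flag: Option<u64>, desired: Desired) -> Option<bool> {
    match (current_flag, desired) {
        (Some(_), Desired::Archived) | (None, Desired::Unarchived) => None,
        (None, Desired::Archived) => Some(true),
        (Some(_), Desired::Unarchived) => Some(false),
    }
}

fn plan(dirty_flag: Option<u64>, clean_flag: Option<u64>, desired: Desired) -> (bool, bool) {
    // Schedule an upload only if the *dirty* (desired-future) state differs...
    let schedule = need_change(dirty_flag, desired).is_some();
    // ...but make the caller wait whenever the *clean* (acknowledged-remote)
    // state still differs, even if the change is already queued.
    let wait = need_change(clean_flag, desired).is_some();
    (schedule, wait)
}
```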
pub fn schedule_layer_file_deletion( self: &Arc, - names: &[LayerFileName], + names: &[LayerName], ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - let with_metadata = - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned()); + let with_metadata = self + .schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned())?; self.schedule_deletion_of_unlinked0(upload_queue, with_metadata); @@ -728,7 +993,10 @@ impl RemoteTimelineClient { /// /// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`] /// is invoked on them. - pub(crate) fn schedule_gc_update(self: &Arc, gc_layers: &[Layer]) -> anyhow::Result<()> { + pub(crate) fn schedule_gc_update( + self: &Arc, + gc_layers: &[Layer], + ) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -736,9 +1004,9 @@ impl RemoteTimelineClient { // the layer files as "dangling". this is fine, at worst case we create work for the // scrubber. - let names = gc_layers.iter().map(|x| x.layer_desc().filename()); + let names = gc_layers.iter().map(|x| x.layer_desc().layer_name()); - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?; self.launch_queued_tasks(upload_queue); @@ -751,21 +1019,17 @@ impl RemoteTimelineClient { self: &Arc, upload_queue: &mut UploadQueueInitialized, names: I, - ) -> Vec<(LayerFileName, LayerFileMetadata)> + ) -> Result, NotInitialized> where - I: IntoIterator, + I: IntoIterator, { - // Deleting layers doesn't affect the values stored in TimelineMetadata, - // so we don't need update it. Just serialize it. - let metadata = upload_queue.latest_metadata.clone(); - // Decorate our list of names with each name's metadata, dropping // names that are unexpectedly missing from our metadata. This metadata // is later used when physically deleting layers, to construct key paths. let with_metadata: Vec<_> = names .into_iter() .filter_map(|name| { - let meta = upload_queue.latest_files.remove(&name); + let meta = upload_queue.dirty.layer_metadata.remove(&name); if let Some(meta) = meta { upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; @@ -797,17 +1061,17 @@ impl RemoteTimelineClient { // index_part update, because that needs to be uploaded before we can actually delete the // files. if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue, metadata); + self.schedule_index_upload(upload_queue)?; } - with_metadata + Ok(with_metadata) } /// Schedules deletion for layer files which have previously been unlinked from the /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`]. pub(crate) fn schedule_deletion_of_unlinked( self: &Arc, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -820,7 +1084,7 @@ impl RemoteTimelineClient { fn schedule_deletion_of_unlinked0( self: &Arc, upload_queue: &mut UploadQueueInitialized, - mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>, + mut with_metadata: Vec<(LayerName, LayerFileMetadata)>, ) { // Filter out any layers which were not created by this tenant shard. 
These are // layers that originate from some ancestor shard after a split, and may still @@ -863,10 +1127,14 @@ impl RemoteTimelineClient { } // schedule the actual deletions + if with_metadata.is_empty() { + // avoid scheduling the op & bumping the metric + return; + } let op = UploadOp::Delete(Delete { layers: with_metadata, }); - self.calls_unfinished_metric_begin(&op); + self.metric_begin(&op); upload_queue.queued_operations.push_back(op); } @@ -877,7 +1145,7 @@ impl RemoteTimelineClient { self: &Arc, compacted_from: &[Layer], compacted_to: &[ResidentLayer], - ) -> anyhow::Result<()> { + ) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -885,24 +1153,32 @@ impl RemoteTimelineClient { self.schedule_layer_file_upload0(upload_queue, layer.clone()); } - let names = compacted_from.iter().map(|x| x.layer_desc().filename()); + let names = compacted_from.iter().map(|x| x.layer_desc().layer_name()); - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?; self.launch_queued_tasks(upload_queue); Ok(()) } /// Wait for all previously scheduled uploads/deletions to complete - pub(crate) async fn wait_completion(self: &Arc) -> anyhow::Result<()> { - let mut receiver = { + pub(crate) async fn wait_completion(self: &Arc) -> Result<(), WaitCompletionError> { + let receiver = { let mut guard = self.upload_queue.lock().unwrap(); - let upload_queue = guard.initialized_mut()?; + let upload_queue = guard + .initialized_mut() + .map_err(WaitCompletionError::NotInitialized)?; self.schedule_barrier0(upload_queue) }; + Self::wait_completion0(receiver).await + } + + async fn wait_completion0( + mut receiver: tokio::sync::watch::Receiver<()>, + ) -> Result<(), WaitCompletionError> { if receiver.changed().await.is_err() { - anyhow::bail!("wait_completion aborted because upload queue was stopped"); + return Err(WaitCompletionError::UploadQueueShutDownOrStopped); } Ok(()) @@ -934,7 +1210,7 @@ impl RemoteTimelineClient { /// Wait for all previously scheduled operations to complete, and then stop. /// /// Not cancellation safe - pub(crate) async fn shutdown(self: &Arc) -> Result<(), StopError> { + pub(crate) async fn shutdown(self: &Arc) { // On cancellation the queue is left in ackward state of refusing new operations but // proper stop is yet to be called. On cancel the original or some later task must call // `stop` or `shutdown`. @@ -945,8 +1221,12 @@ impl RemoteTimelineClient { let fut = { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = match &mut *guard { - UploadQueue::Stopped(_) => return Ok(()), - UploadQueue::Uninitialized => return Err(StopError::QueueUninitialized), + UploadQueue::Stopped(_) => return, + UploadQueue::Uninitialized => { + // transition into Stopped state + self.stop_impl(&mut guard); + return; + } UploadQueue::Initialized(ref mut init) => init, }; @@ -978,7 +1258,7 @@ impl RemoteTimelineClient { } } - self.stop() + self.stop(); } /// Set the deleted_at field in the remote index file. 
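`wait_completion0` above leans on a `tokio::sync::watch` property worth spelling out: completing the barrier notifies receivers via `send`, while dropping the sender (as a stopped queue does) wakes them with an error. A runnable sketch of that barrier shape:

```rust
use tokio::sync::watch;

#[tokio::main]
async fn main() {
    // Barrier: the queue holds the sender, waiters hold receivers.
    let (tx, mut rx) = watch::channel(());

    let waiter = tokio::spawn(async move {
        match rx.changed().await {
            Ok(()) => println!("barrier passed: queue processed our marker"),
            Err(_) => println!("queue was stopped before the barrier was reached"),
        }
    });

    // Completing the barrier op notifies waiters; dropping `tx` instead
    // (as a stopped queue would) makes `changed()` return Err.
    tx.send(()).expect("receiver still alive");

    waiter.await.unwrap();
}
```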
@@ -1012,8 +1292,7 @@ impl RemoteTimelineClient { let deleted_at = Utc::now().naive_utc(); stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at); - let mut index_part = IndexPart::try_from(&stopped.upload_queue_for_deletion) - .context("IndexPart serialize")?; + let mut index_part = stopped.upload_queue_for_deletion.dirty.clone(); index_part.deleted_at = Some(deleted_at); index_part }; @@ -1046,9 +1325,11 @@ impl RemoteTimelineClient { // when executed as part of tenant deletion this happens in the background 2, "persist_index_part_with_deleted_flag", - backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")), + &self.cancel, ) - .await?; + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x)?; // all good, disarm the guard and mark as success ScopeGuard::into_inner(undo_deleted_at); @@ -1068,6 +1349,11 @@ impl RemoteTimelineClient { Ok(()) } + pub(crate) fn is_deleting(&self) -> bool { + let mut locked = self.upload_queue.lock().unwrap(); + locked.stopped_mut().is_ok() + } + pub(crate) async fn preserve_initdb_archive( self: &Arc, tenant_id: &TenantId, @@ -1079,17 +1365,126 @@ impl RemoteTimelineClient { upload::preserve_initdb_archive(&self.storage_impl, tenant_id, timeline_id, cancel) .await }, - |_e| false, + TimeoutOrCancel::caused_by_cancel, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "preserve_initdb_tar_zst", - backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled!")), + &cancel.clone(), ) .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) .context("backing up initdb archive")?; Ok(()) } + /// Uploads the given layer **without** adding it to be part of a future `index_part.json` upload. + /// + /// This is not normally needed. + pub(crate) async fn upload_layer_file( + self: &Arc, + uploaded: &ResidentLayer, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + self.tenant_shard_id.to_index(), + &uploaded.layer_desc().layer_name(), + uploaded.metadata().generation, + ); + + backoff::retry( + || async { + upload::upload_timeline_layer( + &self.storage_impl, + uploaded.local_path(), + &remote_path, + uploaded.metadata().file_size, + cancel, + ) + .await + }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "upload a layer without adding it to latest files", + cancel, + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("upload a layer without adding it to latest files") + } + + /// Copies the `adopted` remote existing layer to the remote path of `adopted_as`. The layer is + /// not added to be part of a future `index_part.json` upload. 
+    pub(crate) async fn copy_timeline_layer(
+        self: &Arc<Self>,
+        adopted: &Layer,
+        adopted_as: &Layer,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
+        let source_remote_path = remote_layer_path(
+            &self.tenant_shard_id.tenant_id,
+            &adopted
+                .get_timeline_id()
+                .expect("Source timeline should be alive"),
+            self.tenant_shard_id.to_index(),
+            &adopted.layer_desc().layer_name(),
+            adopted.metadata().generation,
+        );
+
+        let target_remote_path = remote_layer_path(
+            &self.tenant_shard_id.tenant_id,
+            &self.timeline_id,
+            self.tenant_shard_id.to_index(),
+            &adopted_as.layer_desc().layer_name(),
+            adopted_as.metadata().generation,
+        );
+
+        backoff::retry(
+            || async {
+                upload::copy_timeline_layer(
+                    &self.storage_impl,
+                    &source_remote_path,
+                    &target_remote_path,
+                    cancel,
+                )
+                .await
+            },
+            TimeoutOrCancel::caused_by_cancel,
+            FAILED_UPLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "copy timeline layer",
+            cancel,
+        )
+        .await
+        .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
+        .and_then(|x| x)
+        .context("remote copy timeline layer")
+    }
+
+    async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> {
+        match tokio::time::timeout(
+            DELETION_QUEUE_FLUSH_TIMEOUT,
+            self.deletion_queue_client.flush_immediate(),
+        )
+        .await
+        {
+            Ok(result) => result,
+            Err(_timeout) => {
+                // Flushing remote deletions is not mandatory: we flush here to make the system easier to test, and
+                // to ensure that _usually_ objects are really gone after a DELETE is acked.  However, in case of deletion
+                // queue issues (https://github.com/neondatabase/neon/issues/6440), we don't want to wait indefinitely here.
+                tracing::warn!(
+                    "Timed out waiting for deletion queue flush, acking deletion anyway"
+                );
+                Ok(())
+            }
+        }
+    }
+
     /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfully set.
     /// The function deletes layer files one by one, then lists the prefix to see if we leaked something,
     /// deletes leaked files if any, and proceeds with deletion of the index file at the end.
@@ -1108,8 +1503,21 @@
         stopped
             .upload_queue_for_deletion
-            .latest_files
+            .dirty
+            .layer_metadata
             .drain()
+            .filter(|(_file_name, meta)| {
+                // Filter out layers that belonged to an ancestor shard. Since we are deleting the whole timeline from
+                // all shards anyway, we _could_ delete these, but
+                // - it creates a potential race if other shards are still
+                //   using the layers while this shard deletes them.
+                // - it means that if we rolled back the shard split, the ancestor shards would be in a state where
+                //   these timelines are present but corrupt (their index exists but some layers don't)
+                //
+                // These layers will eventually be cleaned up by the scrubber when it does physical GC.
+                meta.shard.shard_number == self.tenant_shard_id.shard_number
+                    && meta.shard.shard_count == self.tenant_shard_id.shard_count
+            })
             .map(|(file_name, meta)| {
                 remote_layer_path(
                     &self.tenant_shard_id.tenant_id,
@@ -1137,35 +1545,41 @@ impl RemoteTimelineClient {
         // and the retry arrives at a different pageserver, there won't be any traces of it on remote storage
         let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);

-        // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
+        // Execute all pending deletions, so that when we proceed to do a listing below, we aren't
         // taking the burden of listing all the layers that we already know we should delete.
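The `flush_deletion_queue` helper in the hunk above deliberately converts a flush timeout into a warning rather than an error. The same warn-and-continue shape, reduced to plain `tokio` with a stand-in future instead of the real flush:

```rust
use std::time::Duration;

async fn flush_with_deadline<F>(flush: F, deadline: Duration) -> Result<(), String>
where
    F: std::future::Future<Output = Result<(), String>>,
{
    match tokio::time::timeout(deadline, flush).await {
        Ok(result) => result,
        Err(_elapsed) => {
            // Non-essential flush: log and carry on rather than blocking the caller.
            eprintln!("timed out after {deadline:?} waiting for flush, continuing anyway");
            Ok(())
        }
    }
}

#[tokio::main]
async fn main() {
    // A flush that takes too long is converted into a warning + Ok(()).
    let slow = async {
        tokio::time::sleep(Duration::from_secs(5)).await;
        Ok(())
    };
    assert!(flush_with_deadline(slow, Duration::from_millis(50)).await.is_ok());
}
```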
-        self.deletion_queue_client.flush_immediate().await?;
+        self.flush_deletion_queue().await?;

-        let remaining = backoff::retry(
+        let cancel = shutdown_token();
+
+        let remaining = download_retry(
             || async {
                 self.storage_impl
-                    .list_files(Some(&timeline_storage_path))
+                    .list(
+                        Some(&timeline_storage_path),
+                        ListingMode::NoDelimiter,
+                        None,
+                        &cancel,
+                    )
                     .await
             },
-            |_e| false,
-            FAILED_DOWNLOAD_WARN_THRESHOLD,
-            FAILED_REMOTE_OP_RETRIES,
-            "list_prefixes",
-            backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
+            "list remaining files",
+            &cancel,
         )
         .await
-        .context("list prefixes")?;
+        .context("list remaining files")?
+        .keys;

         // We will delete the current index_part object last, since it acts as a deletion
         // marker via its deleted_at attribute
         let latest_index = remaining
             .iter()
-            .filter(|p| {
-                p.object_name()
+            .filter(|o| {
+                o.key
+                    .object_name()
                     .map(|n| n.starts_with(IndexPart::FILE_NAME))
                     .unwrap_or(false)
             })
-            .filter_map(|path| parse_remote_index_path(path.clone()).map(|gen| (path, gen)))
+            .filter_map(|o| parse_remote_index_path(o.key.clone()).map(|gen| (o.key.clone(), gen)))
             .max_by_key(|i| i.1)
             .map(|i| i.0.clone())
             .unwrap_or(
@@ -1176,14 +1590,12 @@ impl RemoteTimelineClient {

         let remaining_layers: Vec<RemotePath> = remaining
             .into_iter()
-            .filter(|p| {
-                if p == &latest_index {
-                    return false;
+            .filter_map(|o| {
+                if o.key == latest_index || o.key.object_name() == Some(INITDB_PRESERVED_PATH) {
+                    None
+                } else {
+                    Some(o.key)
                 }
-                if p.object_name() == Some(INITDB_PRESERVED_PATH) {
-                    return false;
-                }
-                true
             })
             .inspect(|path| {
                 if let Some(name) = path.object_name() {
@@ -1214,7 +1626,7 @@ impl RemoteTimelineClient {

         // Timeline deletion is rare and we have probably emitted a reasonable number of objects: wait
         // for a flush to a persistent deletion list so that we may be sure deletion will occur.
-        self.deletion_queue_client.flush_immediate().await?;
+        self.flush_deletion_queue().await?;

         fail::fail_point!("timeline-delete-after-index-delete", |_| {
             Err(anyhow::anyhow!(
@@ -1236,11 +1648,11 @@ impl RemoteTimelineClient {
         while let Some(next_op) = upload_queue.queued_operations.front() {
             // Can we run this task now?
             let can_run_now = match next_op {
-                UploadOp::UploadLayer(_, _) => {
+                UploadOp::UploadLayer(..) => {
                     // Can always be scheduled.
                     true
                 }
-                UploadOp::UploadMetadata(_, _) => {
+                UploadOp::UploadMetadata { .. } => {
                     // These can only be performed after all the preceding operations
                     // have finished.
                     upload_queue.inprogress_tasks.is_empty()
@@ -1282,7 +1694,7 @@ impl RemoteTimelineClient {
                 UploadOp::UploadLayer(_, _) => {
                     upload_queue.num_inprogress_layer_uploads += 1;
                 }
-                UploadOp::UploadMetadata(_, _) => {
+                UploadOp::UploadMetadata { .. } => {
                     upload_queue.num_inprogress_metadata_uploads += 1;
                 }
                 UploadOp::Delete(_) => {
@@ -1316,10 +1728,9 @@ impl RemoteTimelineClient {
         task_mgr::spawn(
             &self.runtime,
             TaskKind::RemoteUploadTask,
-            Some(self.tenant_shard_id),
+            self.tenant_shard_id,
             Some(self.timeline_id),
             "remote upload",
-            false,
             async move {
                 self_rc.perform_upload_task(task).await;
                 Ok(())
@@ -1343,6 +1754,7 @@ impl RemoteTimelineClient {
     /// queue.
     ///
     async fn perform_upload_task(self: &Arc<Self>, task: Arc<UploadTask>) {
+        let cancel = shutdown_token();
         // Loop to retry until it completes.
         loop {
             // If we're requested to shut down, close up shop and exit.
@@ -1354,26 +1766,33 @@ impl RemoteTimelineClient {
             // the Future, but we're not 100% sure if the remote storage library
             // is cancellation safe, so we don't dare to do that.
Hopefully, the // upload finishes or times out soon enough. - if task_mgr::is_shutdown_requested() { + if cancel.is_cancelled() { info!("upload task cancelled by shutdown request"); - match self.stop() { - Ok(()) => {} - Err(StopError::QueueUninitialized) => { - unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back") - } - } + self.stop(); return; } let upload_result: anyhow::Result<()> = match &task.op { UploadOp::UploadLayer(ref layer, ref layer_metadata) => { - let path = layer.local_path(); + let local_path = layer.local_path(); + + // We should only be uploading layers created by this `Tenant`'s lifetime, so + // the metadata in the upload should always match our current generation. + assert_eq!(layer_metadata.generation, self.generation); + + let remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + layer_metadata.shard, + &layer.layer_desc().layer_name(), + layer_metadata.generation, + ); + upload::upload_timeline_layer( - self.conf, &self.storage_impl, - path, - layer_metadata, - self.generation, + local_path, + &remote_path, + layer_metadata.file_size, &self.cancel, ) .measure_remote_op( @@ -1383,22 +1802,13 @@ impl RemoteTimelineClient { ) .await } - UploadOp::UploadMetadata(ref index_part, _lsn) => { - let mention_having_future_layers = if cfg!(feature = "testing") { - index_part - .layer_metadata - .keys() - .any(|x| x.is_in_future(*_lsn)) - } else { - false - }; - + UploadOp::UploadMetadata { ref uploaded } => { let res = upload::upload_index_part( &self.storage_impl, &self.tenant_shard_id, &self.timeline_id, self.generation, - index_part, + uploaded, &self.cancel, ) .measure_remote_op( @@ -1408,10 +1818,21 @@ impl RemoteTimelineClient { ) .await; if res.is_ok() { - self.update_remote_physical_size_gauge(Some(index_part)); + self.update_remote_physical_size_gauge(Some(uploaded)); + let mention_having_future_layers = if cfg!(feature = "testing") { + uploaded + .layer_metadata + .keys() + .any(|x| x.is_in_future(uploaded.metadata.disk_consistent_lsn())) + } else { + false + }; if mention_having_future_layers { // find rationale near crate::tenant::timeline::init::cleanup_future_layer - tracing::info!(disk_consistent_lsn=%_lsn, "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup"); + tracing::info!( + disk_consistent_lsn = %uploaded.metadata.disk_consistent_lsn(), + "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup" + ); } } res @@ -1440,6 +1861,10 @@ impl RemoteTimelineClient { Ok(()) => { break; } + Err(e) if TimeoutOrCancel::caused_by_cancel(&e) => { + // loop around to do the proper stopping + continue; + } Err(e) => { let retries = task.retries.fetch_add(1, Ordering::SeqCst); @@ -1465,7 +1890,7 @@ impl RemoteTimelineClient { retries, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, - &shutdown_token(), + &cancel, ) .await; } @@ -1508,11 +1933,23 @@ impl RemoteTimelineClient { upload_queue.num_inprogress_layer_uploads -= 1; None } - UploadOp::UploadMetadata(_, lsn) => { + UploadOp::UploadMetadata { ref uploaded } => { upload_queue.num_inprogress_metadata_uploads -= 1; - // XXX monotonicity check? - upload_queue.projected_remote_consistent_lsn = Some(lsn); + // the task id is reused as a monotonicity check for storing the "clean" + // IndexPart. 
+                    let last_updater = upload_queue.clean.1;
+                    let is_later = last_updater.is_some_and(|task_id| task_id < task.task_id);
+                    let monotone = is_later || last_updater.is_none();
+
+                    assert!(monotone, "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", task.task_id);
+
+                    // not taking ownership is wasteful
+                    upload_queue.clean.0.clone_from(uploaded);
+                    upload_queue.clean.1 = Some(task.task_id);
+
+                    let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
+
                     if self.generation.is_none() {
                         // Legacy mode: skip validating generation
                         upload_queue.visible_remote_consistent_lsn.store(lsn);
@@ -1548,10 +1985,10 @@ impl RemoteTimelineClient {
                 .await;
         }
 
-        self.calls_unfinished_metric_end(&task.op);
+        self.metric_end(&task.op);
     }
 
-    fn calls_unfinished_metric_impl(
+    fn metric_impl(
         &self,
         op: &UploadOp,
     ) -> Option<(
@@ -1564,9 +2001,9 @@ impl RemoteTimelineClient {
             UploadOp::UploadLayer(_, m) => (
                 RemoteOpFileKind::Layer,
                 RemoteOpKind::Upload,
-                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()),
+                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
             ),
-            UploadOp::UploadMetadata(_, _) => (
+            UploadOp::UploadMetadata { .. } => (
                 RemoteOpFileKind::Index,
                 RemoteOpKind::Upload,
                 DontTrackSize {
@@ -1588,17 +2025,17 @@ impl RemoteTimelineClient {
         Some(res)
     }
 
-    fn calls_unfinished_metric_begin(&self, op: &UploadOp) {
-        let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
+    fn metric_begin(&self, op: &UploadOp) {
+        let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
             Some(x) => x,
             None => return,
         };
         let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes);
-        guard.will_decrement_manually(); // in unfinished_ops_metric_end()
+        guard.will_decrement_manually(); // in metric_end(), see right below
     }
 
-    fn calls_unfinished_metric_end(&self, op: &UploadOp) {
-        let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
+    fn metric_end(&self, op: &UploadOp) {
+        let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
             Some(x) => x,
             None => return,
         };
@@ -1610,19 +2047,25 @@ impl RemoteTimelineClient {
     /// Use [`RemoteTimelineClient::shutdown`] for graceful stop.
     ///
     /// In-progress operations will still be running after this function returns.
-    /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
+    /// Use `task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(timeline_id))`
     /// to wait for them to complete, after calling this function.
-    pub(crate) fn stop(&self) -> Result<(), StopError> {
+    pub(crate) fn stop(&self) {
         // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
        // into stopped state, thereby dropping all of the queued *ops* which haven't become *tasks* yet.
        // The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
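// --- editorial sketch (hypothetical queue type; not part of the patch) ---
// The "first caller wins" shutdown implemented below, in miniature: whichever
// caller takes the lock first swaps the Stopped state in and tears down the old
// queue; every later caller simply observes Stopped and returns.
use std::sync::Mutex;

enum Queue {
    Initialized(Vec<String>), // queued ops, simplified
    Stopped,
}

fn stop(queue: &Mutex<Queue>) {
    let mut guard = queue.lock().unwrap();
    if matches!(&*guard, Queue::Stopped) {
        return; // another caller already shut the queue down
    }
    let old = std::mem::replace(&mut *guard, Queue::Stopped);
    if let Queue::Initialized(ops) = old {
        drop(ops); // dropping the queued ops is what unblocks any waiters
    }
}
// --- end sketch ---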
         let mut guard = self.upload_queue.lock().unwrap();
-        match &mut *guard {
-            UploadQueue::Uninitialized => Err(StopError::QueueUninitialized),
+        self.stop_impl(&mut guard);
+    }
+
+    fn stop_impl(&self, guard: &mut std::sync::MutexGuard<UploadQueue>) {
+        match &mut **guard {
+            UploadQueue::Uninitialized => {
+                info!("UploadQueue is in state Uninitialized, nothing to do");
+                **guard = UploadQueue::Stopped(UploadQueueStopped::Uninitialized);
+            }
             UploadQueue::Stopped(_) => {
                 // nothing to do
                 info!("another concurrent task already shut down the queue");
-                Ok(())
             }
             UploadQueue::Initialized(initialized) => {
                 info!("shutting down upload queue");
@@ -1636,10 +2079,9 @@ impl RemoteTimelineClient {
                 // Deletion is not really perf sensitive so there shouldn't be any problems with cloning a fraction of it.
                 let upload_queue_for_deletion = UploadQueueInitialized {
                     task_counter: 0,
-                    latest_files: initialized.latest_files.clone(),
+                    dirty: initialized.dirty.clone(),
+                    clean: initialized.clean.clone(),
                     latest_files_changes_since_metadata_upload_scheduled: 0,
-                    latest_metadata: initialized.latest_metadata.clone(),
-                    projected_remote_consistent_lsn: None,
                     visible_remote_consistent_lsn: initialized
                         .visible_remote_consistent_lsn
                         .clone(),
@@ -1655,11 +2097,13 @@ impl RemoteTimelineClient {
                 };
 
                 let upload_queue = std::mem::replace(
-                    &mut *guard,
-                    UploadQueue::Stopped(UploadQueueStopped {
-                        upload_queue_for_deletion,
-                        deleted_at: SetDeletedFlagProgress::NotRunning,
-                    }),
+                    &mut **guard,
+                    UploadQueue::Stopped(UploadQueueStopped::Deletable(
+                        UploadQueueStoppedDeletable {
+                            upload_queue_for_deletion,
+                            deleted_at: SetDeletedFlagProgress::NotRunning,
+                        },
+                    )),
                 );
                 if let UploadQueue::Initialized(qi) = upload_queue {
                     qi
@@ -1683,42 +2127,56 @@ impl RemoteTimelineClient {
                 // Tear down queued ops
                 for op in qi.queued_operations.into_iter() {
-                    self.calls_unfinished_metric_end(&op);
+                    self.metric_end(&op);
                     // Dropping UploadOp::Barrier() here will make wait_completion() return with an Err()
                     // which is exactly what we want to happen.
                     drop(op);
                 }
-
-                // We're done.
-                drop(guard);
-                Ok(())
             }
         }
     }
 
-    pub(crate) fn get_layers_metadata(
+    /// Returns an accessor which will hold the UploadQueue mutex for accessing the upload queue
+    /// externally to RemoteTimelineClient.
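// --- editorial sketch (hypothetical types; not part of the patch) ---
// The accessor pattern defined next, in miniature: returning a struct that owns
// the MutexGuard lets callers read internal state while the lock provably stays
// held, without exposing the guarded queue type itself.
use std::sync::{Mutex, MutexGuard};

struct Inner {
    latest: String,
}

struct Accessor<'a> {
    inner: MutexGuard<'a, Inner>,
}

impl<'a> Accessor<'a> {
    fn latest(&self) -> &str {
        &self.inner.latest
    }
}

fn access(m: &Mutex<Inner>) -> Accessor<'_> {
    Accessor {
        inner: m.lock().unwrap(),
    }
}
// --- end sketch; the real accessor and its doc continue below ---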
+ pub(crate) fn initialized_upload_queue( &self, - layers: Vec, - ) -> anyhow::Result>> { - let q = self.upload_queue.lock().unwrap(); - let q = match &*q { - UploadQueue::Stopped(_) | UploadQueue::Uninitialized => { - anyhow::bail!("queue is in state {}", q.as_str()) - } - UploadQueue::Initialized(inner) => inner, - }; - - let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned()); - - Ok(decorated.collect()) + ) -> Result, NotInitialized> { + let mut inner = self.upload_queue.lock().unwrap(); + inner.initialized_mut()?; + Ok(UploadQueueAccessor { inner }) } } +pub(crate) struct UploadQueueAccessor<'a> { + inner: std::sync::MutexGuard<'a, UploadQueue>, +} + +impl<'a> UploadQueueAccessor<'a> { + pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart { + match &*self.inner { + UploadQueue::Initialized(x) => &x.clean.0, + UploadQueue::Uninitialized | UploadQueue::Stopped(_) => { + unreachable!("checked before constructing") + } + } + } +} + +pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath { + let path = format!("tenants/{tenant_shard_id}"); + RemotePath::from_string(&path).expect("Failed to construct path") +} + pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}"); RemotePath::from_string(&path).expect("Failed to construct path") } +fn remote_timelines_path_unsharded(tenant_id: &TenantId) -> RemotePath { + let path = format!("tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}"); + RemotePath::from_string(&path).expect("Failed to construct path") +} + pub fn remote_timeline_path( tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, @@ -1726,6 +2184,8 @@ pub fn remote_timeline_path( remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string())) } +/// Obtains the path of the given Layer in the remote +/// /// Note that the shard component of a remote layer path is _not_ always the same /// as in the TenantShardId of the caller: tenants may reference layers from a different /// ShardIndex. Use the ShardIndex from the layer's metadata. @@ -1733,14 +2193,14 @@ pub fn remote_layer_path( tenant_id: &TenantId, timeline_id: &TimelineId, shard: ShardIndex, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, generation: Generation, ) -> RemotePath { // Generation-aware key format let path = format!( "tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}", shard.get_suffix(), - layer_file_name.file_name(), + layer_file_name, generation.get_suffix() ); @@ -1801,29 +2261,6 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option { } } -/// Files on the remote storage are stored with paths, relative to the workdir. -/// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path. -/// -/// Errors if the path provided does not start from pageserver's workdir. 
-pub fn remote_path( - conf: &PageServerConf, - local_path: &Utf8Path, - generation: Generation, -) -> anyhow::Result { - let stripped = local_path - .strip_prefix(&conf.workdir) - .context("Failed to strip workdir prefix")?; - - let suffixed = format!("{0}{1}", stripped, generation.get_suffix()); - - RemotePath::new(Utf8Path::new(&suffixed)).with_context(|| { - format!( - "to resolve remote part of path {:?} for base {:?}", - local_path, conf.workdir - ) - }) -} - #[cfg(test)] mod tests { use super::*; @@ -1831,14 +2268,13 @@ mod tests { context::RequestContext, tenant::{ harness::{TenantHarness, TIMELINE_ID}, - storage_layer::Layer, - Generation, Tenant, Timeline, + storage_layer::layer::local_layer_path, + Tenant, Timeline, }, DEFAULT_PG_VERSION, }; use std::collections::HashSet; - use utils::lsn::Lsn; pub(super) fn dummy_contents(name: &str) -> Vec { format!("contents for {name}").into() @@ -1861,8 +2297,8 @@ mod tests { TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap() } - fn assert_file_list(a: &HashSet, b: &[&str]) { - let mut avec: Vec = a.iter().map(|x| x.file_name()).collect(); + fn assert_file_list(a: &HashSet, b: &[&str]) { + let mut avec: Vec = a.iter().map(|x| x.to_string()).collect(); avec.sort(); let mut bvec = b.to_vec(); @@ -1899,7 +2335,7 @@ mod tests { impl TestSetup { async fn new(test_name: &str) -> anyhow::Result { let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}"))); - let harness = TenantHarness::create(test_name)?; + let harness = TenantHarness::create(test_name).await?; let (tenant, ctx) = harness.load().await; let timeline = tenant @@ -1939,6 +2375,7 @@ mod tests { tracing::info_span!( "test", tenant_id = %self.harness.tenant_shard_id.tenant_id, + shard_id = %self.harness.tenant_shard_id.shard_slug(), timeline_id = %TIMELINE_ID ) } @@ -1972,11 +2409,11 @@ mod tests { tenant_ctx: _tenant_ctx, } = test_setup; - let client = timeline.remote_client.as_ref().unwrap(); + let client = &timeline.remote_client; // Download back the index.json, and check that the list of files is correct let initial_index_part = match client - .download_index_file(CancellationToken::new()) + .download_index_file(&CancellationToken::new()) .await .unwrap() { @@ -1987,7 +2424,7 @@ mod tests { .layer_metadata .keys() .map(|f| f.to_owned()) - .collect::>(); + .collect::>(); let initial_layer = { assert!(initial_layers.len() == 1); initial_layers.into_iter().next().unwrap() @@ -2013,12 +2450,21 @@ mod tests { ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz")) ] .into_iter() - .map(|(name, contents): (LayerFileName, Vec)| { - std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap(); + .map(|(name, contents): (LayerName, Vec)| { + + let local_path = local_layer_path( + harness.conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &name, + &generation, + ); + std::fs::write(&local_path, &contents).unwrap(); Layer::for_resident( harness.conf, &timeline, + local_path, name, LayerFileMetadata::new(contents.len() as u64, generation, shard), ) @@ -2049,7 +2495,7 @@ mod tests { // Schedule upload of index. 
Check that it is queued let metadata = dummy_metadata(Lsn(0x20)); client - .schedule_index_upload_for_metadata_update(&metadata) + .schedule_index_upload_for_full_metadata_update(&metadata) .unwrap(); { let mut guard = client.upload_queue.lock().unwrap(); @@ -2070,7 +2516,7 @@ mod tests { // Download back the index.json, and check that the list of files is correct let index_part = match client - .download_index_file(CancellationToken::new()) + .download_index_file(&CancellationToken::new()) .await .unwrap() { @@ -2085,9 +2531,9 @@ mod tests { .map(|f| f.to_owned()) .collect(), &[ - &initial_layer.file_name(), - &layers[0].layer_desc().filename().file_name(), - &layers[1].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[0].layer_desc().layer_name().to_string(), + &layers[1].layer_desc().layer_name().to_string(), ], ); assert_eq!(index_part.metadata, metadata); @@ -2101,7 +2547,7 @@ mod tests { // keep using schedule_layer_file_deletion because we don't have a way to wait for the // spawn_blocking started by the drop. client - .schedule_layer_file_deletion(&[layers[0].layer_desc().filename()]) + .schedule_layer_file_deletion(&[layers[0].layer_desc().layer_name()]) .unwrap(); { let mut guard = client.upload_queue.lock().unwrap(); @@ -2119,9 +2565,9 @@ mod tests { } assert_remote_files( &[ - &initial_layer.file_name(), - &layers[0].layer_desc().filename().file_name(), - &layers[1].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[0].layer_desc().layer_name().to_string(), + &layers[1].layer_desc().layer_name().to_string(), "index_part.json", ], &remote_timeline_dir, @@ -2134,9 +2580,9 @@ mod tests { assert_remote_files( &[ - &initial_layer.file_name(), - &layers[1].layer_desc().filename().file_name(), - &layers[2].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[1].layer_desc().layer_name().to_string(), + &layers[2].layer_desc().layer_name().to_string(), "index_part.json", ], &remote_timeline_dir, @@ -2154,20 +2600,23 @@ mod tests { timeline, .. 
} = TestSetup::new("metrics").await.unwrap(); - let client = timeline.remote_client.as_ref().unwrap(); - let timeline_path = harness.timeline_path(&TIMELINE_ID); + let client = &timeline.remote_client; - let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let local_path = local_layer_path( + harness.conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &layer_file_name_1, + &harness.generation, + ); let content_1 = dummy_contents("foo"); - std::fs::write( - timeline_path.join(layer_file_name_1.file_name()), - &content_1, - ) - .unwrap(); + std::fs::write(&local_path, &content_1).unwrap(); let layer_file_1 = Layer::for_resident( harness.conf, &timeline, + local_path, layer_file_name_1.clone(), LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard), ); @@ -2236,12 +2685,7 @@ mod tests { async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart { // An empty IndexPart, just sufficient to ensure deserialization will succeed - let example_metadata = TimelineMetadata::example(); - let example_index_part = IndexPart::new( - HashMap::new(), - example_metadata.disk_consistent_lsn(), - example_metadata, - ); + let example_index_part = IndexPart::example(); let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap(); @@ -2272,7 +2716,7 @@ mod tests { let client = test_state.build_client(get_generation); let download_r = client - .download_index_file(CancellationToken::new()) + .download_index_file(&CancellationToken::new()) .await .expect("download should always succeed"); assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_))); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 4309c683e2..9fbe2f0da5 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -5,35 +5,38 @@ use std::collections::HashSet; use std::future::Future; +use std::str::FromStr; use anyhow::{anyhow, Context}; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::shard::TenantShardId; use tokio::fs::{self, File, OpenOptions}; use tokio::io::{AsyncSeekExt, AsyncWriteExt}; +use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::warn; -use utils::timeout::timeout_cancellable; -use utils::{backoff, crashsafe}; +use utils::backoff; use crate::config::PageServerConf; -use crate::tenant::remote_timeline_client::{ - download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT, -}; -use crate::tenant::storage_layer::LayerFileName; -use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::context::RequestContext; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; +use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; -use crate::virtual_file::on_fatal_io_error; +#[cfg_attr(target_os = "macos", allow(unused_imports))] +use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; +use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; -use 
remote_storage::{DownloadError, GenericRemoteStorage, ListingMode}; +use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath}; use utils::crashsafe::path_with_suffix_extension; -use utils::id::TimelineId; +use utils::id::{TenantId, TimelineId}; +use utils::pausable_failpoint; use super::index::{IndexPart, LayerFileMetadata}; use super::{ parse_remote_index_path, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, - INITDB_PATH, + remote_initdb_preserved_archive_path, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, INITDB_PATH, }; /// @@ -41,20 +44,21 @@ use super::{ /// in the metadata. (In the future, we might do more cross-checks, like CRC validation) /// /// Returns the size of the downloaded file. +#[allow(clippy::too_many_arguments)] pub async fn download_layer_file<'a>( conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, tenant_shard_id: TenantShardId, timeline_id: TimelineId, - layer_file_name: &'a LayerFileName, + layer_file_name: &'a LayerName, layer_metadata: &'a LayerFileMetadata, + local_path: &Utf8Path, cancel: &CancellationToken, + ctx: &RequestContext, ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); - let local_path = conf - .timeline_path(&tenant_shard_id, &timeline_id) - .join(layer_file_name.file_name()); + let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id); let remote_path = remote_layer_path( &tenant_shard_id.tenant_id, @@ -74,105 +78,22 @@ pub async fn download_layer_file<'a>( // For more context about durable_rename check this email from postgres mailing list: // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com // If pageserver crashes the temp file will be deleted on startup and re-downloaded. - let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); + let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION); - let cancel_inner = cancel.clone(); - let (mut destination_file, bytes_amount) = download_retry( - || async { - let destination_file = tokio::fs::File::create(&temp_file_path) - .await - .with_context(|| format!("create a destination file for layer '{temp_file_path}'")) - .map_err(DownloadError::Other)?; - - // Cancellation safety: it is safe to cancel this future, because it isn't writing to a local - // file: the write to local file doesn't start until after the request header is returned - // and we start draining the body stream below - let download = download_cancellable(&cancel_inner, storage.download(&remote_path)) - .await - .with_context(|| { - format!( - "open a download stream for layer with remote storage path '{remote_path:?}'" - ) - }) - .map_err(DownloadError::Other)?; - - let mut destination_file = - tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file); - - let mut reader = tokio_util::io::StreamReader::new(download.download_stream); - - // Cancellation safety: it is safe to cancel this future because it is writing into a temporary file, - // and we will unlink the temporary file if there is an error. This unlink is important because we - // are in a retry loop, and we wouldn't want to leave behind a rogue write I/O to a file that - // we will imminiently try and write to again. 
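// --- editorial sketch (hypothetical helper; not part of the patch) ---
// The cleanup contract described above is kept by the new download_object()
// further below: on error, remove the temp file so a retry never sees a
// half-written artifact, tolerating the file already being gone.
async fn cleanup_temp_file(path: &camino::Utf8Path) {
    if let Err(e) = tokio::fs::remove_file(path).await {
        if e.kind() != std::io::ErrorKind::NotFound {
            // the real code treats this as a fatal I/O error
            panic!("failed to remove temporary file {path}: {e}");
        }
    }
}
// --- end sketch ---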
-            let bytes_amount: u64 = match timeout_cancellable(
-                DOWNLOAD_TIMEOUT,
-                &cancel_inner,
-                tokio::io::copy_buf(&mut reader, &mut destination_file),
-            )
-            .await
-            .with_context(|| {
-                format!(
-                    "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
-                )
-            })
-            .map_err(DownloadError::Other)?
-            {
-                Ok(b) => Ok(b),
-                Err(e) => {
-                    // Remove incomplete files: on restart Timeline would do this anyway, but we must
-                    // do it here for the retry case.
-                    if let Err(e) = tokio::fs::remove_file(&temp_file_path).await {
-                        on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
-                    }
-                    Err(e)
-                }
-            }
-            .with_context(|| {
-                format!(
-                    "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
-                )
-            })
-            .map_err(DownloadError::Other)?;
-
-            let destination_file = destination_file.into_inner();
-
-            Ok((destination_file, bytes_amount))
-        },
+    let bytes_amount = download_retry(
+        || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await },
         &format!("download {remote_path:?}"),
         cancel,
     )
     .await?;
 
-    // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
-    // A file will not be closed immediately when it goes out of scope if there are any IO operations
-    // that have not yet completed. To ensure that a file is closed immediately when it is dropped,
-    // you should call flush before dropping it.
-    //
-    // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
-    // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
-    // But for additional safety lets check/wait for any pending operations.
-    destination_file
-        .flush()
-        .await
-        .with_context(|| format!("flush source file at {temp_file_path}"))
-        .map_err(DownloadError::Other)?;
-
-    let expected = layer_metadata.file_size();
+    let expected = layer_metadata.file_size;
     if expected != bytes_amount {
         return Err(DownloadError::Other(anyhow!(
             "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}",
         )));
     }
 
-    // not using sync_data because it can lose file size update
-    destination_file
-        .sync_all()
-        .await
-        .with_context(|| format!("failed to fsync source file at {temp_file_path}"))
-        .map_err(DownloadError::Other)?;
-    drop(destination_file);
-
     fail::fail_point!("remote-storage-download-pre-rename", |_| {
         Err(DownloadError::Other(anyhow!(
             "remote-storage-download-pre-rename failpoint triggered"
@@ -184,19 +105,159 @@ pub async fn download_layer_file<'a>(
         .with_context(|| format!("rename download layer file to {local_path}"))
         .map_err(DownloadError::Other)?;
 
-    crashsafe::fsync_async(&local_path)
-        .await
-        .with_context(|| format!("fsync layer file {local_path}"))
-        .map_err(DownloadError::Other)?;
+    // We use fatal_err() below because after the rename above,
+    // the in-memory state of the filesystem already has the layer file in its final place,
+    // and subsequent pageserver code could think it's durable while it really isn't.
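// --- editorial sketch (std-only; the patch itself goes through VirtualFile and
// the configured io engine) ---
// Why the timeline directory is fsynced after the rename: the rename is visible
// in memory immediately, but only durable once the parent directory entry is
// synced (on unix, a directory can be opened and fsynced like a file).
use std::fs::{self, File};
use std::io;
use std::path::Path;

fn durable_rename(tmp: &Path, dst: &Path, parent_dir: &Path) -> io::Result<()> {
    fs::rename(tmp, dst)?; // visible to readers now, but not yet crash-safe
    File::open(parent_dir)?.sync_all()?; // make the new directory entry durable
    Ok(())
}
// --- end sketch ---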
+    let work = {
+        let ctx = ctx.detached_child(ctx.task_kind(), ctx.download_behavior());
+        async move {
+            let timeline_dir = VirtualFile::open(&timeline_path, &ctx)
+                .await
+                .fatal_err("VirtualFile::open for timeline dir fsync");
+            timeline_dir
+                .sync_all()
+                .await
+                .fatal_err("VirtualFile::sync_all timeline dir");
+        }
+    };
+    crate::virtual_file::io_engine::get()
+        .spawn_blocking_and_block_on_if_std(work)
+        .await;
 
     tracing::debug!("download complete: {local_path}");
 
     Ok(bytes_amount)
 }
 
+/// Download the object `src_path` in the remote `storage` to local path `dst_path`.
+///
+/// If Ok() is returned, the download succeeded and the inode & data have been made durable.
+/// (Note that the directory entry for the inode is not made durable.)
+/// The file size in bytes is returned.
+///
+/// If Err() is returned, there was some error. The file at `dst_path` has been unlinked.
+/// The unlinking has _not_ been made durable.
+async fn download_object<'a>(
+    storage: &'a GenericRemoteStorage,
+    src_path: &RemotePath,
+    dst_path: &Utf8PathBuf,
+    cancel: &CancellationToken,
+    #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext,
+) -> Result<u64, DownloadError> {
+    let res = match crate::virtual_file::io_engine::get() {
+        crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"),
+        crate::virtual_file::io_engine::IoEngine::StdFs => {
+            async {
+                let destination_file = tokio::fs::File::create(dst_path)
+                    .await
+                    .with_context(|| format!("create a destination file for layer '{dst_path}'"))
+                    .map_err(DownloadError::Other)?;
+
+                let download = storage.download(src_path, cancel).await?;
+
+                pausable_failpoint!("before-downloading-layer-stream-pausable");
+
+                let mut buf_writer =
+                    tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
+
+                let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
+
+                let bytes_amount = tokio::io::copy_buf(&mut reader, &mut buf_writer).await?;
+                buf_writer.flush().await?;
+
+                let mut destination_file = buf_writer.into_inner();
+
+                // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
+                // A file will not be closed immediately when it goes out of scope if there are any IO operations
+                // that have not yet completed. To ensure that a file is closed immediately when it is dropped,
+                // you should call flush before dropping it.
+                //
+                // From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because
+                // we assume that the `destination_file` is fully written, i.e. there are no pending .write(...).await operations.
+                // But for additional safety let's check/wait for any pending operations.
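// --- editorial sketch (tokio; illustrative path handling; not part of the patch) ---
// The flush-then-fsync discipline from the comment above, in isolation: flush
// pushes buffered bytes into the File, and sync_all (rather than sync_data)
// also persists the file size.
use tokio::io::AsyncWriteExt;

async fn write_durably(path: &str, data: &[u8]) -> std::io::Result<()> {
    let file = tokio::fs::File::create(path).await?;
    let mut writer = tokio::io::BufWriter::new(file);
    writer.write_all(data).await?;
    writer.flush().await?; // drain the BufWriter into the File
    writer.into_inner().sync_all().await?; // fsync data and metadata (incl. size)
    Ok(())
}
// --- end sketch ---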
+ destination_file + .flush() + .await + .with_context(|| format!("flush source file at {dst_path}")) + .map_err(DownloadError::Other)?; + + // not using sync_data because it can lose file size update + destination_file + .sync_all() + .await + .with_context(|| format!("failed to fsync source file at {dst_path}")) + .map_err(DownloadError::Other)?; + + Ok(bytes_amount) + } + .await + } + #[cfg(target_os = "linux")] + crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { + use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer}; + use bytes::BytesMut; + async { + let destination_file = VirtualFile::create(dst_path, ctx) + .await + .with_context(|| format!("create a destination file for layer '{dst_path}'")) + .map_err(DownloadError::Other)?; + + let mut download = storage.download(src_path, cancel).await?; + + pausable_failpoint!("before-downloading-layer-stream-pausable"); + + // TODO: use vectored write (writev) once supported by tokio-epoll-uring. + // There's chunks_vectored() on the stream. + let (bytes_amount, destination_file) = async { + let size_tracking = size_tracking_writer::Writer::new(destination_file); + let mut buffered = owned_buffers_io::write::BufferedWriter::::new( + size_tracking, + BytesMut::with_capacity(super::BUFFER_SIZE), + ); + while let Some(res) = + futures::StreamExt::next(&mut download.download_stream).await + { + let chunk = match res { + Ok(chunk) => chunk, + Err(e) => return Err(e), + }; + buffered.write_buffered(chunk.slice_len(), ctx).await?; + } + let size_tracking = buffered.flush_and_into_inner(ctx).await?; + Ok(size_tracking.into_inner()) + } + .await?; + + // not using sync_data because it can lose file size update + destination_file + .sync_all() + .await + .with_context(|| format!("failed to fsync source file at {dst_path}")) + .map_err(DownloadError::Other)?; + + Ok(bytes_amount) + } + .await + } + }; + + // in case the download failed, clean up + match res { + Ok(bytes_amount) => Ok(bytes_amount), + Err(e) => { + if let Err(e) = tokio::fs::remove_file(dst_path).await { + if e.kind() != std::io::ErrorKind::NotFound { + on_fatal_io_error(&e, &format!("Removing temporary file {dst_path}")); + } + } + Err(e) + } + } +} + const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; -pub fn is_temp_download_file(path: &Utf8Path) -> bool { +pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool { let extension = path.extension(); match extension { Some(TEMP_DOWNLOAD_EXTENSION) => true, @@ -205,53 +266,68 @@ pub fn is_temp_download_file(path: &Utf8Path) -> bool { } } -/// List timelines of given tenant in remote storage +async fn list_identifiers( + storage: &GenericRemoteStorage, + prefix: RemotePath, + cancel: CancellationToken, +) -> anyhow::Result<(HashSet, HashSet)> +where + T: FromStr + Eq + std::hash::Hash, +{ + let listing = download_retry_forever( + || storage.list(Some(&prefix), ListingMode::WithDelimiter, None, &cancel), + &format!("list identifiers in prefix {prefix}"), + &cancel, + ) + .await?; + + let mut parsed_ids = HashSet::new(); + let mut other_prefixes = HashSet::new(); + + for id_remote_storage_key in listing.prefixes { + let object_name = id_remote_storage_key.object_name().ok_or_else(|| { + anyhow::anyhow!("failed to get object name for key {id_remote_storage_key}") + })?; + + match object_name.parse::() { + Ok(t) => parsed_ids.insert(t), + Err(_) => other_prefixes.insert(object_name.to_string()), + }; + } + + for object in listing.keys { + let object_name = object + .key + .object_name() + 
.ok_or_else(|| anyhow::anyhow!("object name for key {}", object.key))?; + other_prefixes.insert(object_name.to_string()); + } + + Ok((parsed_ids, other_prefixes)) +} + +/// List shards of given tenant in remote storage +pub(crate) async fn list_remote_tenant_shards( + storage: &GenericRemoteStorage, + tenant_id: TenantId, + cancel: CancellationToken, +) -> anyhow::Result<(HashSet, HashSet)> { + let remote_path = remote_tenant_path(&TenantShardId::unsharded(tenant_id)); + list_identifiers::(storage, remote_path, cancel).await +} + +/// List timelines of given tenant shard in remote storage pub async fn list_remote_timelines( storage: &GenericRemoteStorage, tenant_shard_id: TenantShardId, cancel: CancellationToken, ) -> anyhow::Result<(HashSet, HashSet)> { - let remote_path = remote_timelines_path(&tenant_shard_id); - fail::fail_point!("storage-sync-list-remote-timelines", |_| { anyhow::bail!("storage-sync-list-remote-timelines"); }); - let cancel_inner = cancel.clone(); - let listing = download_retry_forever( - || { - download_cancellable( - &cancel_inner, - storage.list(Some(&remote_path), ListingMode::WithDelimiter), - ) - }, - &format!("list timelines for {tenant_shard_id}"), - cancel, - ) - .await?; - - let mut timeline_ids = HashSet::new(); - let mut other_prefixes = HashSet::new(); - - for timeline_remote_storage_key in listing.prefixes { - let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { - anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_shard_id}") - })?; - - match object_name.parse::() { - Ok(t) => timeline_ids.insert(t), - Err(_) => other_prefixes.insert(object_name.to_string()), - }; - } - - for key in listing.keys { - let object_name = key - .object_name() - .ok_or_else(|| anyhow::anyhow!("object name for key {key}"))?; - other_prefixes.insert(object_name.to_string()); - } - - Ok((timeline_ids, other_prefixes)) + let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash(); + list_identifiers::(storage, remote_path, cancel).await } async fn do_download_index_part( @@ -259,29 +335,22 @@ async fn do_download_index_part( tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, index_generation: Generation, - cancel: CancellationToken, -) -> Result { - use futures::stream::StreamExt; - + cancel: &CancellationToken, +) -> Result<(IndexPart, Generation), DownloadError> { let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); - let cancel_inner = cancel.clone(); let index_part_bytes = download_retry_forever( || async { - // Cancellation: if is safe to cancel this future because we're just downloading into - // a memory buffer, not touching local disk. 
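// --- editorial sketch (no remote storage involved; hypothetical helper) ---
// The parsing split used by list_identifiers above, reduced to plain strings:
// names that parse as the wanted id type are collected, everything else is
// reported back as an "other" prefix.
use std::collections::HashSet;
use std::str::FromStr;

fn split_identifiers<T>(names: &[&str]) -> (HashSet<T>, HashSet<String>)
where
    T: FromStr + Eq + std::hash::Hash,
{
    let mut parsed = HashSet::new();
    let mut other = HashSet::new();
    for name in names {
        match name.parse::<T>() {
            Ok(id) => {
                parsed.insert(id);
            }
            Err(_) => {
                other.insert(name.to_string());
            }
        }
    }
    (parsed, other)
}
// e.g. split_identifiers::<u64>(&["42", "temp"]) yields ({42}, {"temp"})
// --- end sketch ---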
- let index_part_download = - download_cancellable(&cancel_inner, storage.download(&remote_path)).await?; + let download = storage.download(&remote_path, cancel).await?; - let mut index_part_bytes = Vec::new(); - let mut stream = std::pin::pin!(index_part_download.download_stream); - while let Some(chunk) = stream.next().await { - let chunk = chunk - .with_context(|| format!("download index part at {remote_path:?}")) - .map_err(DownloadError::Other)?; - index_part_bytes.extend_from_slice(&chunk[..]); - } - Ok(index_part_bytes) + let mut bytes = Vec::new(); + + let stream = download.download_stream; + let mut stream = StreamReader::new(stream); + + tokio::io::copy_buf(&mut stream, &mut bytes).await?; + + Ok(bytes) }, &format!("download {remote_path:?}"), cancel, @@ -289,10 +358,10 @@ async fn do_download_index_part( .await?; let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) - .with_context(|| format!("download index part file at {remote_path:?}")) + .with_context(|| format!("deserialize index part file at {remote_path:?}")) .map_err(DownloadError::Other)?; - Ok(index_part) + Ok((index_part, index_generation)) } /// index_part.json objects are suffixed with a generation number, so we cannot @@ -301,13 +370,13 @@ async fn do_download_index_part( /// In this function we probe for the most recent index in a generation <= our current generation. /// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md #[tracing::instrument(skip_all, fields(generation=?my_generation))] -pub(super) async fn download_index_part( +pub(crate) async fn download_index_part( storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, my_generation: Generation, - cancel: CancellationToken, -) -> Result { + cancel: &CancellationToken, +) -> Result<(IndexPart, Generation), DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); if my_generation.is_none() { @@ -326,14 +395,8 @@ pub(super) async fn download_index_part( // index in our generation. // // This is an optimization to avoid doing the listing for the general case below. - let res = do_download_index_part( - storage, - tenant_shard_id, - timeline_id, - my_generation, - cancel.clone(), - ) - .await; + let res = + do_download_index_part(storage, tenant_shard_id, timeline_id, my_generation, cancel).await; match res { Ok(index_part) => { tracing::debug!( @@ -358,7 +421,7 @@ pub(super) async fn download_index_part( tenant_shard_id, timeline_id, my_generation.previous(), - cancel.clone(), + cancel, ) .await; match res { @@ -380,22 +443,24 @@ pub(super) async fn download_index_part( // objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent // to constructing a full index path with no generation, because the generation is a suffix. let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); - let indices = backoff::retry( - || async { storage.list_files(Some(&index_prefix)).await }, - |_| false, - FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "listing index_part files", - backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), + + let indices = download_retry( + || async { + storage + .list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel) + .await + }, + "list index_part files", + cancel, ) - .await - .map_err(DownloadError::Other)?; + .await? + .keys; // General case logic for which index to use: the latest index whose generation // is <= our own. 
See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md let max_previous_generation = indices .into_iter() - .filter_map(parse_remote_index_path) + .filter_map(|o| parse_remote_index_path(o.key)) .filter(|g| g <= &my_generation) .max(); @@ -446,8 +511,6 @@ pub(crate) async fn download_initdb_tar_zst( "{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}" )); - let cancel_inner = cancel.clone(); - let file = download_retry( || async { let file = OpenOptions::new() @@ -460,26 +523,17 @@ pub(crate) async fn download_initdb_tar_zst( .with_context(|| format!("tempfile creation {temp_path}")) .map_err(DownloadError::Other)?; - let download = match download_cancellable(&cancel_inner, storage.download(&remote_path)) - .await - { + let download = match storage.download(&remote_path, cancel).await { Ok(dl) => dl, Err(DownloadError::NotFound) => { - download_cancellable(&cancel_inner, storage.download(&remote_preserved_path)) - .await? + storage.download(&remote_preserved_path, cancel).await? } Err(other) => Err(other)?, }; let mut download = tokio_util::io::StreamReader::new(download.download_stream); - let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file); + let mut writer = tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, file); - // TODO: this consumption of the response body should be subject to timeout + cancellation, but - // not without thinking carefully about how to recover safely from cancelling a write to - // local storage (e.g. by writing into a temp file as we do in download_layer) - tokio::io::copy_buf(&mut download, &mut writer) - .await - .with_context(|| format!("download initdb.tar.zst at {remote_path:?}")) - .map_err(DownloadError::Other)?; + tokio::io::copy_buf(&mut download, &mut writer).await?; let mut file = writer.into_inner(); @@ -494,7 +548,7 @@ pub(crate) async fn download_initdb_tar_zst( cancel, ) .await - .map_err(|e| { + .inspect_err(|_e| { // Do a best-effort attempt at deleting the temporary file upon encountering an error. // We don't have async here nor do we want to pile on any extra errors. if let Err(e) = std::fs::remove_file(&temp_path) { @@ -502,7 +556,6 @@ pub(crate) async fn download_initdb_tar_zst( warn!("error deleting temporary file {temp_path}: {e}"); } } - e })?; Ok((temp_path, file)) @@ -510,12 +563,12 @@ pub(crate) async fn download_initdb_tar_zst( /// Helper function to handle retries for a download operation. /// -/// Remote operations can fail due to rate limits (IAM, S3), spurious network +/// Remote operations can fail due to rate limits (S3), spurious network /// problems, or other external reasons. Retry FAILED_DOWNLOAD_RETRIES times, /// with backoff. 
/// /// (See similar logic for uploads in `perform_upload_task`) -async fn download_retry( +pub(super) async fn download_retry( op: O, description: &str, cancel: &CancellationToken, @@ -526,19 +579,21 @@ where { backoff::retry( op, - |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound), + DownloadError::is_permanent, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, description, - backoff::Cancel::new(cancel.clone(), || DownloadError::Cancelled), + cancel, ) .await + .ok_or_else(|| DownloadError::Cancelled) + .and_then(|x| x) } async fn download_retry_forever( op: O, description: &str, - cancel: CancellationToken, + cancel: &CancellationToken, ) -> Result where O: FnMut() -> F, @@ -546,11 +601,13 @@ where { backoff::retry( op, - |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound), + DownloadError::is_permanent, FAILED_DOWNLOAD_WARN_THRESHOLD, u32::MAX, description, - backoff::Cancel::new(cancel, || DownloadError::Cancelled), + cancel, ) .await + .ok_or_else(|| DownloadError::Cancelled) + .and_then(|x| x) } diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 0abfdeef02..c51ff54919 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -1,61 +1,22 @@ //! In-memory index to track the tenant files on the remote storage. +//! //! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about //! remote timeline layers and its metadata. use std::collections::HashMap; use chrono::NaiveDateTime; +use pageserver_api::models::AuxFilePolicy; use serde::{Deserialize, Serialize}; -use utils::bin_ser::SerializeError; +use utils::id::TimelineId; use crate::tenant::metadata::TimelineMetadata; -use crate::tenant::storage_layer::LayerFileName; -use crate::tenant::upload_queue::UploadQueueInitialized; +use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; use pageserver_api::shard::ShardIndex; use utils::lsn::Lsn; -/// Metadata gathered for each of the layer files. -/// -/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which -/// might have less or more metadata depending if upgrading or rolling back an upgrade. -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] -//#[cfg_attr(test, derive(Default))] -pub struct LayerFileMetadata { - file_size: u64, - - pub(crate) generation: Generation, - - pub(crate) shard: ShardIndex, -} - -impl From<&'_ IndexLayerMetadata> for LayerFileMetadata { - fn from(other: &IndexLayerMetadata) -> Self { - LayerFileMetadata { - file_size: other.file_size, - generation: other.generation, - shard: other.shard, - } - } -} - -impl LayerFileMetadata { - pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self { - LayerFileMetadata { - file_size, - generation, - shard, - } - } - - pub fn file_size(&self) -> u64 { - self.file_size - } -} - -// TODO seems like another part of the remote storage file format -// compatibility issue, see https://github.com/neondatabase/neon/issues/3072 /// In-memory representation of an `index_part.json` file /// /// Contains the data about all files in the timeline, present remotely and its metadata. 
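// --- editorial sketch (assumed backoff::retry contract; simplified error type;
// not part of the patch) ---
// The Option-flattening at the end of the retry helpers that close download.rs
// above: under the new signature, retry() returning None means it gave up
// because of cancellation, which the caller folds back into the error type.
#[derive(Debug)]
#[allow(dead_code)]
enum RetryError {
    Cancelled,
    Other(String),
}

fn flatten(outcome: Option<Result<u64, RetryError>>) -> Result<u64, RetryError> {
    outcome
        .ok_or(RetryError::Cancelled) // None: the retry loop was cancelled
        .and_then(|res| res) // Some: collapse the inner attempt result
}
// --- end sketch ---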
@@ -72,19 +33,46 @@ pub struct IndexPart {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub deleted_at: Option<NaiveDateTime>,
 
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub archived_at: Option<NaiveDateTime>,
+
     /// Per layer file name metadata, which can be present for a present or missing layer file.
     ///
     /// Older versions of `IndexPart` will not have this property or have only a part of metadata
     /// that latest version stores.
-    pub layer_metadata: HashMap<LayerFileName, IndexLayerMetadata>,
+    pub layer_metadata: HashMap<LayerName, LayerFileMetadata>,
 
-    // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
-    // It's duplicated for convenience when reading the serialized structure, but is
-    // private because internally we would read from metadata instead.
-    disk_consistent_lsn: Lsn,
+    /// Because of the trouble of eyeballing the legacy "metadata" field, we copied the
+    /// "disk_consistent_lsn" out. After version 7 this is no longer needed, but the name cannot be
+    /// reused.
+    pub(super) disk_consistent_lsn: Lsn,
 
-    #[serde(rename = "metadata_bytes")]
+    // TODO: rename as "metadata" next week, keep the alias = "metadata_bytes", bump version.
+    // Adding the "alias = metadata" was forgotten in #7693, so we have to use "rename = metadata_bytes"
+    // for backwards compatibility.
+    #[serde(
+        rename = "metadata_bytes",
+        alias = "metadata",
+        with = "crate::tenant::metadata::modern_serde"
+    )]
     pub metadata: TimelineMetadata,
+
+    #[serde(default)]
+    pub(crate) lineage: Lineage,
+
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub(crate) gc_blocking: Option<GcBlocking>,
+
+    /// Describes the kind of aux files stored in the timeline.
+    ///
+    /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
+    /// A V1 setting after V2 files have been committed is not accepted.
+    ///
+    /// None means no aux files have been written to the storage before the point
+    /// when this flag is introduced.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
 }
 
 impl IndexPart {
@@ -97,40 +85,39 @@ impl IndexPart {
     /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
     ///   is always generated from the keys of `layer_metadata`)
     /// - 4: timeline_layers is fully removed.
-    const LATEST_VERSION: usize = 4;
+    /// - 5: lineage was added
+    /// - 6: last_aux_file_policy is added.
+    /// - 7: metadata_bytes is no longer written, but still read
+    /// - 8: added `archived_at`
+    /// - 9: +gc_blocking
+    const LATEST_VERSION: usize = 9;
 
     // Versions we may see when reading from a bucket.
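// --- editorial sketch (toy struct; assumes serde with the derive feature;
// not part of the patch) ---
// The compatibility style IndexPart relies on: new fields default when absent so
// old files still deserialize, are skipped when empty so old readers never see
// them, and renamed fields keep an alias for reading the historical spelling.
// The version constants follow right below.
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
struct VersionedExample {
    version: usize,
    #[serde(rename = "metadata_bytes", alias = "metadata")]
    metadata: Vec<u8>,
    #[serde(skip_serializing_if = "Option::is_none", default)]
    archived_at: Option<String>,
}
// --- end sketch ---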
-    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9];
 
     pub const FILE_NAME: &'static str = "index_part.json";
 
-    pub fn new(
-        layers_and_metadata: HashMap<LayerFileName, LayerFileMetadata>,
-        disk_consistent_lsn: Lsn,
-        metadata: TimelineMetadata,
-    ) -> Self {
-        // Transform LayerFileMetadata into IndexLayerMetadata
-        let layer_metadata = layers_and_metadata
-            .into_iter()
-            .map(|(k, v)| (k, IndexLayerMetadata::from(v)))
-            .collect();
-
-        Self {
+    pub(crate) fn empty(metadata: TimelineMetadata) -> Self {
+        IndexPart {
             version: Self::LATEST_VERSION,
-            layer_metadata,
-            disk_consistent_lsn,
+            layer_metadata: Default::default(),
+            disk_consistent_lsn: metadata.disk_consistent_lsn(),
             metadata,
             deleted_at: None,
+            archived_at: None,
+            lineage: Default::default(),
+            gc_blocking: None,
+            last_aux_file_policy: None,
         }
     }
 
-    pub fn get_version(&self) -> usize {
+    pub fn version(&self) -> usize {
         self.version
     }
 
     /// If you want this under normal operations, read it from self.metadata:
     /// this method is just for the scrubber to use when validating an index.
-    pub fn get_disk_consistent_lsn(&self) -> Lsn {
+    pub fn duplicated_disk_consistent_lsn(&self) -> Lsn {
         self.disk_consistent_lsn
     }
 
@@ -141,26 +128,23 @@ impl IndexPart {
     pub fn to_s3_bytes(&self) -> serde_json::Result<Vec<u8>> {
         serde_json::to_vec(self)
     }
-}
 
-impl TryFrom<&UploadQueueInitialized> for IndexPart {
-    type Error = SerializeError;
+    #[cfg(test)]
+    pub(crate) fn example() -> Self {
+        Self::empty(TimelineMetadata::example())
+    }
 
-    fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
-        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
-        let metadata = upload_queue.latest_metadata.clone();
-
-        Ok(Self::new(
-            upload_queue.latest_files.clone(),
-            disk_consistent_lsn,
-            metadata,
-        ))
+    pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
+        self.last_aux_file_policy
     }
 }
 
-/// Serialized form of [`LayerFileMetadata`].
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
-pub struct IndexLayerMetadata {
+/// Metadata gathered for each of the layer files.
+///
+/// Fields have to be `Option`s because remote [`IndexPart`]s can be from different versions, which
+/// might have less or more metadata, depending on whether we are upgrading or rolling back an upgrade.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
+pub struct LayerFileMetadata {
     pub file_size: u64,
 
     #[serde(default = "Generation::none")]
@@ -172,12 +156,188 @@ pub struct IndexLayerMetadata {
     pub shard: ShardIndex,
 }
 
-impl From<LayerFileMetadata> for IndexLayerMetadata {
-    fn from(other: LayerFileMetadata) -> Self {
-        IndexLayerMetadata {
-            file_size: other.file_size,
-            generation: other.generation,
-            shard: other.shard,
+impl LayerFileMetadata {
+    pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
+        LayerFileMetadata {
+            file_size,
+            generation,
+            shard,
+        }
+    }
+}
+
+/// Limited history of earlier ancestors.
+///
+/// A timeline can have more than 1 earlier ancestor, in the rare case that it was repeatedly
+/// reparented by having a later timeline be detached from its ancestor.
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
+pub(crate) struct Lineage {
+    /// Has the `reparenting_history` been truncated to [`Lineage::REMEMBER_AT_MOST`].
+    #[serde(skip_serializing_if = "is_false", default)]
+    reparenting_history_truncated: bool,
+
+    /// Earlier ancestors, truncated if [`Self::reparenting_history_truncated`] is set.
+    ///
+    /// These are stored in case we want to support WAL based DR on the timeline. There can be many
+    /// of these and at most one [`Self::original_ancestor`]. There cannot be more reparentings
+    /// after [`Self::original_ancestor`] has been set.
+    #[serde(skip_serializing_if = "Vec::is_empty", default)]
+    reparenting_history: Vec<TimelineId>,
+
+    /// The ancestor this timeline has been detached from, and when.
+    ///
+    /// If you are adding support for detaching from a hierarchy, consider changing the ancestry
+    /// into a `Vec<(TimelineId, Lsn)>` to be a path instead.
+    // FIXME: this is insufficient even for a path of two timelines for future wal recovery
+    // purposes:
+    //
+    // assuming an "old main" which has received most of the WAL, and has a branch "new main",
+    // starting a bit before "old main" last_record_lsn. the current version works fine,
+    // because we will know to replay wal and branch at the recorded Lsn to do wal recovery.
+    //
+    // then assuming "new main" would similarly receive a branch right before its last_record_lsn,
+    // "new new main". the current implementation would just store ("new main", ancestor_lsn, _)
+    // here. however, we cannot recover from WAL using only that information, we would need the
+    // whole ancestry here:
+    //
+    // ```json
+    // [
+    //     ["old main", ancestor_lsn("new main"), _],
+    //     ["new main", ancestor_lsn("new new main"), _]
+    // ]
+    // ```
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>,
+}
+
+fn is_false(b: &bool) -> bool {
+    !b
+}
+
+impl Lineage {
+    const REMEMBER_AT_MOST: usize = 100;
+
+    pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) -> bool {
+        if self.reparenting_history.last() == Some(old_ancestor) {
+            // do not re-record it
+            false
+        } else {
+            #[cfg(feature = "testing")]
+            {
+                let existing = self
+                    .reparenting_history
+                    .iter()
+                    .position(|x| x == old_ancestor);
+                assert_eq!(
+                    existing, None,
+                    "we cannot reparent onto and off and onto the same timeline twice"
+                );
+            }
+            let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;
+
+            self.reparenting_history_truncated |= drop_oldest;
+            if drop_oldest {
+                self.reparenting_history.remove(0);
+            }
+            self.reparenting_history.push(*old_ancestor);
+            true
+        }
+    }
+
+    /// Returns true if anything changed.
+    pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) -> bool {
+        if let Some((id, lsn, _)) = self.original_ancestor {
+            assert_eq!(
+                &(id, lsn),
+                branchpoint,
+                "detaching attempt has to be for the same ancestor we are already detached from"
+            );
+            false
+        } else {
+            self.original_ancestor =
+                Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
+            true
+        }
+    }
+
+    /// The queried lsn is most likely the basebackup lsn, and this answers the question "is it
+    /// allowed to start a read/write primary at this lsn".
+    ///
+    /// Returns true if the Lsn was previously our branch point.
+    pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
+        self.original_ancestor
+            .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
+    }
+
+    /// Returns true if the timeline originally had an ancestor, and no longer has one.
+ pub(crate) fn is_detached_from_ancestor(&self) -> bool { + self.original_ancestor.is_some() + } + + /// Returns original ancestor timeline id and lsn that this timeline has been detached from. + pub(crate) fn detached_previous_ancestor(&self) -> Option<(TimelineId, Lsn)> { + self.original_ancestor.map(|(id, lsn, _)| (id, lsn)) + } + + pub(crate) fn is_reparented(&self) -> bool { + !self.reparenting_history.is_empty() + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub(crate) struct GcBlocking { + pub(crate) started_at: NaiveDateTime, + pub(crate) reasons: enumset::EnumSet, +} + +#[derive(Debug, enumset::EnumSetType, serde::Serialize, serde::Deserialize)] +#[enumset(serialize_repr = "list")] +pub(crate) enum GcBlockingReason { + Manual, + DetachAncestor, +} + +impl GcBlocking { + pub(super) fn started_now_for(reason: GcBlockingReason) -> Self { + GcBlocking { + started_at: chrono::Utc::now().naive_utc(), + reasons: enumset::EnumSet::only(reason), + } + } + + /// Returns true if the given reason is one of the reasons why the gc is blocked. + pub(crate) fn blocked_by(&self, reason: GcBlockingReason) -> bool { + self.reasons.contains(reason) + } + + /// Returns a version of self with the given reason. + pub(super) fn with_reason(&self, reason: GcBlockingReason) -> Self { + assert!(!self.blocked_by(reason)); + let mut reasons = self.reasons; + reasons.insert(reason); + + Self { + started_at: self.started_at, + reasons, + } + } + + /// Returns a version of self without the given reason. Assumption is that if + /// there are no more reasons, we can unblock the gc by returning `None`. + pub(super) fn without_reason(&self, reason: GcBlockingReason) -> Option { + assert!(self.blocked_by(reason)); + + if self.reasons.len() == 1 { + None + } else { + let mut reasons = self.reasons; + assert!(reasons.remove(reason)); + assert!(!reasons.is_empty()); + + Some(Self { + started_at: self.started_at, + reasons, + }) } } } @@ -185,6 +345,8 @@ impl From for IndexLayerMetadata { #[cfg(test)] mod tests { use super::*; + use std::str::FromStr; + use utils::id::TimelineId; #[test] fn v1_indexpart_is_parsed() { @@ -203,12 +365,12 @@ mod tests { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? version: 1, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. 
file_size: 9007199254741001, @@ -219,6 +381,10 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + archived_at: None, + lineage: Lineage::default(), + gc_blocking: None, + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -243,12 +409,12 @@ mod tests { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? version: 1, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. 
file_size: 9007199254741001, @@ -259,6 +425,10 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + archived_at: None, + lineage: Lineage::default(), + gc_blocking: None, + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -284,12 +454,12 @@ mod tests { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? version: 2, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. 
file_size: 9007199254741001, @@ -299,8 +469,11 @@ mod tests { ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), - deleted_at: Some(chrono::NaiveDateTime::parse_from_str( - "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()) + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, + lineage: Lineage::default(), + gc_blocking: None, + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -345,6 +518,10 @@ mod tests { ]) .unwrap(), deleted_at: None, + archived_at: None, + lineage: Lineage::default(), + gc_blocking: None, + last_aux_file_policy: None, }; let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -368,12 +545,12 @@ mod tests { let expected = IndexPart { version: 4, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. 
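// Sketch (not part of the patch; field names are illustrative, not the real
// IndexPart definition): the backward compatibility these fixture tests pin
// down comes from serde defaulting absent fields, which is why an old
// index_part.json still deserializes into the newest struct shape with
// archived_at, lineage, gc_blocking and last_aux_file_policy as defaults.
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct MiniIndexPart {
    version: u32,
    #[serde(default)]
    archived_at: Option<String>, // absent in old versions -> None
    #[serde(default)]
    reparenting_history: Vec<String>, // absent in old versions -> empty
}

fn main() {
    let old = r#"{ "version": 1 }"#;
    let part: MiniIndexPart = serde_json::from_str(old).unwrap();
    assert_eq!(part.version, 1);
    assert!(part.archived_at.is_none());
    assert!(part.reparenting_history.is_empty());
}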
file_size: 9007199254741001, @@ -383,11 +560,290 @@ mod tests { ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), - deleted_at: Some(chrono::NaiveDateTime::parse_from_str( - "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, + lineage: Lineage::default(), + gc_blocking: None, + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } + + #[test] + fn v5_indexpart_is_parsed() { + let example = r#"{ + "version":5, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499":{"file_size":23289856,"generation":1}, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619":{"file_size":1015808,"generation":1}}, + "disk_consistent_lsn":"0/15A7618", + "metadata_bytes":[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0], + "lineage":{ + "original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"], + "reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"] + } + }"#; + + let expected = IndexPart { + version: 5, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), 
LayerFileMetadata { + file_size: 23289856, + generation: Generation::new(1), + shard: ShardIndex::unsharded(), + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), LayerFileMetadata { + file_size: 1015808, + generation: Generation::new(1), + shard: ShardIndex::unsharded(), + }) + ]), + disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(), + metadata: TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), + deleted_at: None, + archived_at: None, + lineage: Lineage { + reparenting_history_truncated: false, + reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], + original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), + }, + gc_blocking: None, + last_aux_file_policy: None, + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v6_indexpart_is_parsed() { + let example = r#"{ + "version":6, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + 
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0], + "deleted_at": "2023-07-31T09:00:00.123", + "lineage":{ + "original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"], + "reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"] + }, + "last_aux_file_policy": "V2" + }"#; + + let expected = IndexPart { + version: 6, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + // serde_json should always parse this but this might be a double with jq for + // example. 
+ file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, + lineage: Lineage { + reparenting_history_truncated: false, + reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], + original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), + }, + gc_blocking: None, + last_aux_file_policy: Some(AuxFilePolicy::V2), + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v7_indexpart_is_parsed() { + let example = r#"{ + "version": 7, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "deleted_at": "2023-07-31T09:00:00.123" + }"#; + + let expected = IndexPart { + version: 7, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + 
Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: None, + lineage: Default::default(), + gc_blocking: None, + last_aux_file_policy: Default::default(), + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v8_indexpart_is_parsed() { + let example = r#"{ + "version": 8, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "deleted_at": "2023-07-31T09:00:00.123", + "archived_at": "2023-04-29T09:00:00.123" + }"#; + + let expected = IndexPart { + version: 8, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")), + lineage: Default::default(), + gc_blocking: None, + last_aux_file_policy: Default::default(), + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v9_indexpart_is_parsed() { + let example = r#"{ + "version": 9, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "gc_blocking": { + "started_at": "2024-07-19T09:00:00.123", + "reasons": ["DetachAncestor"] + } + }"#; + + let expected = IndexPart { + version: 9, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + 
file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: None, + lineage: Default::default(), + gc_blocking: Some(GcBlocking { + started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"), + reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), + }), + last_aux_file_policy: Default::default(), + archived_at: None, + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + fn parse_naive_datetime(s: &str) -> NaiveDateTime { + chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap() + } } diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 58d95f75c2..c4dd184610 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -1,36 +1,34 @@ //! Helper functions to upload files to remote storage with a RemoteStorage use anyhow::{bail, Context}; +use bytes::Bytes; use camino::Utf8Path; use fail::fail_point; use pageserver_api::shard::TenantShardId; use std::io::{ErrorKind, SeekFrom}; +use std::time::SystemTime; use tokio::fs::{self, File}; use tokio::io::AsyncSeekExt; use tokio_util::sync::CancellationToken; +use utils::{backoff, pausable_failpoint}; +use super::index::IndexPart; use super::Generation; -use crate::{ - config::PageServerConf, - tenant::remote_timeline_client::{ - index::IndexPart, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, remote_path, upload_cancellable, - }, +use crate::tenant::remote_timeline_client::{ + remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path, }; -use remote_storage::GenericRemoteStorage; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; use utils::id::{TenantId, TimelineId}; -use super::index::LayerFileMetadata; - use tracing::info; /// Serializes and uploads the given index part data to the remote storage. 
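// Sketch (not part of the patch; the stream's error type is an assumption,
// the real bound comes from the remote_storage crate): the hunk below feeds
// the serialized index part to `upload_storage_object` as a one-shot stream,
// since the whole body is already in memory.
use bytes::Bytes;
use futures::Stream;

fn one_shot_stream(buf: Vec<u8>) -> impl Stream<Item = std::io::Result<Bytes>> {
    // A single ready chunk; nothing to stream incrementally.
    futures::stream::once(futures::future::ready(Ok::<_, std::io::Error>(Bytes::from(buf))))
}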
-pub(super) async fn upload_index_part<'a>( +pub(crate) async fn upload_index_part<'a>( storage: &'a GenericRemoteStorage, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, generation: Generation, - index_part: &'a IndexPart, + index_part: &IndexPart, cancel: &CancellationToken, ) -> anyhow::Result<()> { tracing::trace!("uploading new index part"); @@ -40,23 +38,22 @@ pub(super) async fn upload_index_part<'a>( }); pausable_failpoint!("before-upload-index-pausable"); - let index_part_bytes = index_part - .to_s3_bytes() - .context("serialize index part file into bytes")?; - let index_part_size = index_part_bytes.len(); - let index_part_bytes = bytes::Bytes::from(index_part_bytes); + // FIXME: this error comes too late + let serialized = index_part.to_s3_bytes()?; + let serialized = Bytes::from(serialized); + + let index_part_size = serialized.len(); let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation); - upload_cancellable( - cancel, - storage.upload_storage_object( - futures::stream::once(futures::future::ready(Ok(index_part_bytes))), + storage + .upload_storage_object( + futures::stream::once(futures::future::ready(Ok(serialized))), index_part_size, &remote_path, - ), - ) - .await - .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'")) + cancel, + ) + .await + .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'")) } /// Attempts to upload given layer files. @@ -64,11 +61,10 @@ pub(super) async fn upload_index_part<'a>( /// /// On an error, bumps the retries count and reschedules the entire task. pub(super) async fn upload_timeline_layer<'a>( - conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, - source_path: &'a Utf8Path, - known_metadata: &'a LayerFileMetadata, - generation: Generation, + local_path: &'a Utf8Path, + remote_path: &'a RemotePath, + metadata_size: u64, cancel: &CancellationToken, ) -> anyhow::Result<()> { fail_point!("before-upload-layer", |_| { @@ -77,8 +73,7 @@ pub(super) async fn upload_timeline_layer<'a>( pausable_failpoint!("before-upload-layer-pausable"); - let storage_path = remote_path(conf, source_path, generation)?; - let source_file_res = fs::File::open(&source_path).await; + let source_file_res = fs::File::open(&local_path).await; let source_file = match source_file_res { Ok(source_file) => source_file, Err(e) if e.kind() == ErrorKind::NotFound => { @@ -89,35 +84,49 @@ pub(super) async fn upload_timeline_layer<'a>( // it has been written to disk yet. // // This is tested against `test_compaction_delete_before_upload` - info!(path = %source_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."); + info!(path = %local_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."); return Ok(()); } - Err(e) => { - Err(e).with_context(|| format!("open a source file for layer {source_path:?}"))? - } + Err(e) => Err(e).with_context(|| format!("open a source file for layer {local_path:?}"))?, }; let fs_size = source_file .metadata() .await - .with_context(|| format!("get the source file metadata for layer {source_path:?}"))? + .with_context(|| format!("get the source file metadata for layer {local_path:?}"))? 
         .len();
-    let metadata_size = known_metadata.file_size();
 
     if metadata_size != fs_size {
-        bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
+        bail!("File {local_path:?} has its current FS size {fs_size} different from the initially determined {metadata_size}");
     }
 
     let fs_size = usize::try_from(fs_size)
-        .with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?;
+        .with_context(|| format!("convert {local_path:?} size {fs_size} to usize"))?;
 
     let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
 
-    upload_cancellable(cancel, storage.upload(reader, fs_size, &storage_path, None))
+    storage
+        .upload(reader, fs_size, remote_path, None, cancel)
         .await
-        .with_context(|| format!("upload layer from local path '{source_path}'"))?;
+        .with_context(|| format!("upload layer from local path '{local_path}'"))
+}
 
-    Ok(())
+pub(super) async fn copy_timeline_layer(
+    storage: &GenericRemoteStorage,
+    source_path: &RemotePath,
+    target_path: &RemotePath,
+    cancel: &CancellationToken,
+) -> anyhow::Result<()> {
+    fail_point!("before-copy-layer", |_| {
+        bail!("failpoint before-copy-layer")
+    });
+
+    pausable_failpoint!("before-copy-layer-pausable");
+
+    storage
+        .copy_object(source_path, target_path, cancel)
+        .await
+        .with_context(|| format!("copy layer {source_path} to {target_path}"))
 }
 
 /// Uploads the given `initdb` data to the remote storage.
@@ -137,12 +146,10 @@ pub(crate) async fn upload_initdb_dir(
     let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);
 
     let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
-    upload_cancellable(
-        cancel,
-        storage.upload_storage_object(file, size as usize, &remote_path),
-    )
-    .await
-    .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
+    storage
+        .upload_storage_object(file, size as usize, &remote_path, cancel)
+        .await
+        .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
 }
 
 pub(crate) async fn preserve_initdb_archive(
@@ -153,7 +160,52 @@ pub(crate) async fn preserve_initdb_archive(
 ) -> anyhow::Result<()> {
     let source_path = remote_initdb_archive_path(tenant_id, timeline_id);
     let dest_path = remote_initdb_preserved_archive_path(tenant_id, timeline_id);
-    upload_cancellable(cancel, storage.copy_object(&source_path, &dest_path))
+    storage
+        .copy_object(&source_path, &dest_path, cancel)
         .await
         .with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'"))
 }
+
+pub(crate) async fn time_travel_recover_tenant(
+    storage: &GenericRemoteStorage,
+    tenant_shard_id: &TenantShardId,
+    timestamp: SystemTime,
+    done_if_after: SystemTime,
+    cancel: &CancellationToken,
+) -> Result<(), TimeTravelError> {
+    let warn_after = 3;
+    let max_attempts = 10;
+    let mut prefixes = Vec::with_capacity(2);
+    if tenant_shard_id.is_shard_zero() {
+        // Also recover the unsharded prefix for a shard of zero:
+        // - if the tenant is totally unsharded, the unsharded prefix contains all the data
+        // - if the tenant is sharded, we still want to recover the initdb data, but we only
+        //   want to do it once, so let's do it on the 0 shard
+        let timelines_path_unsharded =
+            super::remote_timelines_path_unsharded(&tenant_shard_id.tenant_id);
+        prefixes.push(timelines_path_unsharded);
+    }
+    if !tenant_shard_id.is_unsharded() {
+        // If the tenant is sharded, we need to recover the sharded prefix
+        let timelines_path = super::remote_timelines_path(tenant_shard_id);
+        prefixes.push(timelines_path);
+    }
+    for prefix in &prefixes {
+        backoff::retry(
+            || async {
+                storage
+                    .time_travel_recover(Some(prefix), timestamp, done_if_after, cancel)
+                    .await
+            },
+            |e| !matches!(e, TimeTravelError::Other(_)),
+            warn_after,
+            max_attempts,
+            "time travel recovery of tenant prefix",
+            cancel,
+        )
+        .await
+        .ok_or_else(|| TimeTravelError::Cancelled)
+        .and_then(|x| x)?;
+    }
+    Ok(())
+}
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index d00d901be6..1331c07d05 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -6,10 +6,10 @@ mod scheduler;
 use std::{sync::Arc, time::SystemTime};
 
 use crate::{
-    config::PageServerConf,
+    context::RequestContext,
     disk_usage_eviction_task::DiskUsageEvictionInfo,
+    metrics::SECONDARY_HEATMAP_TOTAL_SIZE,
     task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-    virtual_file::MaybeFatalIo,
 };
 
 use self::{
@@ -21,18 +21,21 @@ use super::{
     config::{SecondaryLocationConfig, TenantConfOpt},
     mgr::TenantManager,
     span::debug_assert_current_span_has_tenant_id,
-    storage_layer::LayerFileName,
+    storage_layer::LayerName,
 };
+use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE;
+use metrics::UIntGauge;
 use pageserver_api::{
     models,
     shard::{ShardIdentity, TenantShardId},
 };
 use remote_storage::GenericRemoteStorage;
+use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::instrument;
-use utils::{completion::Barrier, fs_ext, id::TimelineId, sync::gate::Gate};
+use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate};
 
 enum DownloadCommand {
     Download(TenantShardId),
@@ -95,7 +98,26 @@ pub(crate) struct SecondaryTenant {
     shard_identity: ShardIdentity,
     tenant_conf: std::sync::Mutex<TenantConfOpt>,
 
+    // Internal state used by the Downloader.
     detail: std::sync::Mutex<SecondaryDetail>,
+
+    // Public state indicating overall progress of downloads relative to the last heatmap seen
+    pub(crate) progress: std::sync::Mutex<models::SecondaryProgress>,
+
+    // Sum of layer sizes on local disk
+    pub(super) resident_size_metric: UIntGauge,
+
+    // Sum of layer sizes in the most recently downloaded heatmap
+    pub(super) heatmap_total_size_metric: UIntGauge,
+}
+
+impl Drop for SecondaryTenant {
+    fn drop(&mut self) {
+        let tenant_id = self.tenant_shard_id.tenant_id.to_string();
+        let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
+        let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
+        let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
+    }
+}
 
 impl SecondaryTenant {
@@ -105,6 +127,16 @@ impl SecondaryTenant {
         tenant_conf: TenantConfOpt,
         config: &SecondaryLocationConfig,
     ) -> Arc<Self> {
+        let tenant_id = tenant_shard_id.tenant_id.to_string();
+        let shard_id = format!("{}", tenant_shard_id.shard_slug());
+        let resident_size_metric = SECONDARY_RESIDENT_PHYSICAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id])
+            .unwrap();
+
+        let heatmap_total_size_metric = SECONDARY_HEATMAP_TOTAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id])
+            .unwrap();
+
         Arc::new(Self {
             tenant_shard_id,
             // todo: shall we make this a descendent of the
             // on shutdown we walk the tenants and fire their
             // individual cancellations?
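// Sketch (not part of the patch; simplified from the shape of utils::backoff):
// the `backoff::retry` call in `time_travel_recover_tenant` above retries
// transient errors with a growing delay and gives up early on permanent ones;
// in the real helper, `None` means the operation was cancelled.
use std::future::Future;
use std::time::Duration;

async fn retry_op<T, E, F, Fut>(
    mut op: F,
    is_permanent: impl Fn(&E) -> bool,
    max_attempts: u32,
) -> Option<Result<T, E>>
where
    F: FnMut() -> Fut,
    Fut: Future<Output = Result<T, E>>,
{
    let mut last = None;
    for attempt in 0..max_attempts {
        match op().await {
            Ok(v) => return Some(Ok(v)),
            Err(e) if is_permanent(&e) => return Some(Err(e)),
            Err(e) => last = Some(Err(e)),
        }
        // Exponential backoff between transient failures, capped.
        tokio::time::sleep(Duration::from_millis(100u64 << attempt.min(6))).await;
    }
    last // retries exhausted: surface the last transient error
}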
cancel: CancellationToken::new(), - gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")), + gate: Gate::default(), shard_identity, tenant_conf: std::sync::Mutex::new(tenant_conf), detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())), + + progress: std::sync::Mutex::default(), + + resident_size_metric, + heatmap_total_size_metric, }) } + pub(crate) fn tenant_shard_id(&self) -> TenantShardId { + self.tenant_shard_id + } + pub(crate) async fn shutdown(&self) { self.cancel.cancel(); @@ -133,7 +174,7 @@ impl SecondaryTenant { } pub(crate) fn set_tenant_conf(&self, config: &TenantConfOpt) { - *(self.tenant_conf.lock().unwrap()) = *config; + *(self.tenant_conf.lock().unwrap()) = config.clone(); } /// For API access: generate a LocationConfig equivalent to the one that would be used to @@ -144,13 +185,13 @@ impl SecondaryTenant { let conf = models::LocationConfigSecondary { warm: conf.warm }; - let tenant_conf = *self.tenant_conf.lock().unwrap(); + let tenant_conf = self.tenant_conf.lock().unwrap().clone(); models::LocationConfig { mode: models::LocationConfigMode::Secondary, generation: None, secondary_conf: Some(conf), shard_number: self.tenant_shard_id.shard_number.0, - shard_count: self.tenant_shard_id.shard_count.0, + shard_count: self.tenant_shard_id.shard_count.literal(), shard_stripe_size: self.shard_identity.stripe_size.0, tenant_conf: tenant_conf.into(), } @@ -160,20 +201,16 @@ impl SecondaryTenant { &self.tenant_shard_id } - pub(crate) fn get_layers_for_eviction(self: &Arc) -> DiskUsageEvictionInfo { + pub(crate) fn get_layers_for_eviction(self: &Arc) -> (DiskUsageEvictionInfo, usize) { self.detail.lock().unwrap().get_layers_for_eviction(self) } + /// Cancellation safe, but on cancellation the eviction will go through #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))] - pub(crate) async fn evict_layer( - &self, - conf: &PageServerConf, - timeline_id: TimelineId, - name: LayerFileName, - ) { + pub(crate) async fn evict_layer(self: &Arc, timeline_id: TimelineId, name: LayerName) { debug_assert_current_span_has_tenant_id(); - let _guard = match self.gate.enter() { + let guard = match self.gate.enter() { Ok(g) => g, Err(_) => { tracing::debug!("Dropping layer evictions, secondary tenant shutting down",); @@ -182,47 +219,50 @@ impl SecondaryTenant { }; let now = SystemTime::now(); + tracing::info!("Evicting secondary layer"); - let path = conf - .timeline_path(&self.tenant_shard_id, &timeline_id) - .join(name.file_name()); + let this = self.clone(); - // We tolerate ENOENT, because between planning eviction and executing - // it, the secondary downloader could have seen an updated heatmap that - // resulted in a layer being deleted. - // Other local I/O errors are process-fatal: these should never happen. - tokio::fs::remove_file(path) - .await - .or_else(fs_ext::ignore_not_found) - .fatal_err("Deleting layer during eviction"); + // spawn it to be cancellation safe + tokio::task::spawn_blocking(move || { + let _guard = guard; - // Update the timeline's state. This does not have to be synchronized with - // the download process, because: - // - If downloader is racing with us to remove a file (e.g. because it is - // removed from heatmap), then our mutual .remove() operations will both - // succeed. 
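// Sketch (not part of the patch; names are illustrative): the hunk below makes
// eviction cancellation-safe by moving file removal and the state update into
// spawn_blocking, with the gate guard owned by the blocking task, so dropping
// the evict_layer future can no longer abandon the work halfway.
struct GateGuard; // stand-in for utils::sync::gate::GateGuard

async fn evict_blocking(guard: GateGuard, remove: impl FnOnce() + Send + 'static) {
    tokio::task::spawn_blocking(move || {
        let _guard = guard; // hold the tenant gate open for the whole deletion
        remove(); // delete the file and update in-memory state together
    })
    .await
    .expect("eviction closure must not panic");
}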
- // - If downloader is racing with us to download the object (this would require - // multiple eviction iterations to race with multiple download iterations), then - // if we remove it from the state, the worst that happens is the downloader - // downloads it again before re-inserting, or we delete the file but it remains - // in the state map (in which case it will be downloaded if this secondary - // tenant transitions to attached and tries to access it) - // - // The important assumption here is that the secondary timeline state does not - // have to 100% match what is on disk, because it's a best-effort warming - // of the cache. - let mut detail = self.detail.lock().unwrap(); - if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) { - timeline_detail.on_disk_layers.remove(&name); - timeline_detail.evicted_at.insert(name, now); - } + // Update the timeline's state. This does not have to be synchronized with + // the download process, because: + // - If downloader is racing with us to remove a file (e.g. because it is + // removed from heatmap), then our mutual .remove() operations will both + // succeed. + // - If downloader is racing with us to download the object (this would require + // multiple eviction iterations to race with multiple download iterations), then + // if we remove it from the state, the worst that happens is the downloader + // downloads it again before re-inserting, or we delete the file but it remains + // in the state map (in which case it will be downloaded if this secondary + // tenant transitions to attached and tries to access it) + // + // The important assumption here is that the secondary timeline state does not + // have to 100% match what is on disk, because it's a best-effort warming + // of the cache. + let mut detail = this.detail.lock().unwrap(); + if let Some(removed) = + detail.evict_layer(name, &timeline_id, now, &this.resident_size_metric) + { + // We might race with removal of the same layer during downloads, so finding the layer we + // were trying to remove is optional. Only issue the disk I/O to remove it if we found it. + removed.remove_blocking(); + } + }) + .await + .expect("secondary eviction should not have panicked"); } } /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads, -/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests, -/// where we want to immediately upload/download for a particular tenant. In normal operation -/// uploads & downloads are autonomous and not driven by this interface. +/// and heatmap uploads. This is not a hot data path: it's used for: +/// - Live migrations, where we want to ensure a migration destination has the freshest possible +/// content before trying to cut over. +/// - Tests, where we want to immediately upload/download for a particular tenant. +/// +/// In normal operations, outside of migrations, uploads & downloads are autonomous and not driven by this interface. pub struct SecondaryController { upload_req_tx: tokio::sync::mpsc::Sender>, download_req_tx: tokio::sync::mpsc::Sender>, @@ -264,15 +304,50 @@ impl SecondaryController { } } +pub struct GlobalTasks { + cancel: CancellationToken, + uploader: JoinHandle<()>, + downloader: JoinHandle<()>, +} + +impl GlobalTasks { + /// Caller is responsible for requesting shutdown via the cancellation token that was + /// passed to [`spawn_tasks`]. + /// + /// # Panics + /// + /// This method panics if that token is not cancelled. 
+ /// This is low-risk because we're calling this during process shutdown, so, a panic + /// will be informative but not cause undue downtime. + pub async fn wait(self) { + let Self { + cancel, + uploader, + downloader, + } = self; + assert!( + cancel.is_cancelled(), + "must cancel cancellation token, otherwise the tasks will not shut down" + ); + + let (uploader, downloader) = futures::future::join(uploader, downloader).await; + uploader.expect( + "unreachable: exit_on_panic_or_error would catch the panic and exit the process", + ); + downloader.expect( + "unreachable: exit_on_panic_or_error would catch the panic and exit the process", + ); + } +} + pub fn spawn_tasks( tenant_manager: Arc, remote_storage: GenericRemoteStorage, background_jobs_can_start: Barrier, cancel: CancellationToken, -) -> SecondaryController { +) -> (SecondaryController, GlobalTasks) { let mgr_clone = tenant_manager.clone(); let storage_clone = remote_storage.clone(); - let cancel_clone = cancel.clone(); let bg_jobs_clone = background_jobs_can_start.clone(); let (download_req_tx, download_req_rx) = @@ -280,13 +355,9 @@ pub fn spawn_tasks( let (upload_req_tx, upload_req_rx) = tokio::sync::mpsc::channel::>(16); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - TaskKind::SecondaryDownloads, - None, - None, + let cancel_clone = cancel.clone(); + let downloader = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "secondary tenant downloads", - false, async move { downloader_task( mgr_clone, @@ -294,48 +365,41 @@ pub fn spawn_tasks( download_req_rx, bg_jobs_clone, cancel_clone, + RequestContext::new( + TaskKind::SecondaryDownloads, + crate::context::DownloadBehavior::Download, + ), ) .await; - - Ok(()) + anyhow::Ok(()) }, - ); + )); - task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), - TaskKind::SecondaryUploads, - None, - None, + let cancel_clone = cancel.clone(); + let uploader = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "heatmap uploads", - false, async move { heatmap_uploader_task( tenant_manager, remote_storage, upload_req_rx, background_jobs_can_start, - cancel, + cancel_clone, ) .await; - - Ok(()) + anyhow::Ok(()) }, - ); + )); - SecondaryController { - download_req_tx, - upload_req_tx, - } -} - -/// For running with remote storage disabled: a SecondaryController that is connected to nothing. 
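// Sketch (not part of the patch): `GlobalTasks::wait` above asserts that the
// cancellation token is already cancelled before joining, encoding the
// shutdown order "cancel first, then join". The caller side looks like this:
use tokio_util::sync::CancellationToken;

async fn shut_down_secondary(
    cancel: CancellationToken,
    tasks_wait: impl std::future::Future<Output = ()>,
) {
    cancel.cancel(); // request shutdown; uploader/downloader loops observe this
    tasks_wait.await; // joining now terminates instead of blocking forever
}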
-pub fn null_controller() -> SecondaryController { - let (download_req_tx, _download_req_rx) = - tokio::sync::mpsc::channel::>(16); - let (upload_req_tx, _upload_req_rx) = - tokio::sync::mpsc::channel::>(16); - SecondaryController { - upload_req_tx, - download_req_tx, - } + ( + SecondaryController { + upload_req_tx, + download_req_tx, + }, + GlobalTasks { + cancel, + uploader, + downloader, + }, + ) } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 702c0b1ec1..90e1c01dbd 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -8,6 +8,7 @@ use std::{ use crate::{ config::PageServerConf, + context::RequestContext, disk_usage_eviction_task::{ finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, }, @@ -15,20 +16,25 @@ use crate::{ tenant::{ config::SecondaryLocationConfig, debug_assert_current_span_has_tenant_and_timeline_id, + ephemeral_file::is_ephemeral_file, remote_timeline_client::{ - index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, + index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, }, span::debug_assert_current_span_has_tenant_id, - storage_layer::LayerFileName, + storage_layer::{layer::local_layer_path, LayerName, LayerVisibilityHint}, tasks::{warn_when_period_overrun, BackgroundLoopKind}, }, virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, - METADATA_FILE_NAME, TEMP_FILE_SUFFIX, + TEMP_FILE_SUFFIX, }; use super::{ heatmap::HeatMapLayer, - scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs}, + scheduler::{ + self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult, + TenantBackgroundJobs, + }, SecondaryTenant, }; @@ -37,16 +43,19 @@ use crate::tenant::{ remote_timeline_client::{download::download_layer_file, remote_heatmap_path}, }; +use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; +use metrics::UIntGauge; +use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; -use rand::Rng; -use remote_storage::{DownloadError, GenericRemoteStorage}; +use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; use tokio_util::sync::CancellationToken; -use tracing::{info_span, instrument, Instrument}; +use tracing::{info_span, instrument, warn, Instrument}; use utils::{ - backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId, + backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, + id::TimelineId, pausable_failpoint, serde_system_time, }; use super::{ @@ -54,14 +63,10 @@ use super::{ CommandRequest, DownloadCommand, }; -/// For each tenant, how long must have passed since the last download_tenant call before -/// calling it again. This is approximately the time by which local data is allowed -/// to fall behind remote data. -/// -/// TODO: this should just be a default, and the actual period should be controlled -/// via the heatmap itself -/// `` -const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000); +/// For each tenant, default period for how long must have passed since the last download_tenant call before +/// calling it again. This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after first +/// download, if the uploader populated it. 
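// Sketch (not part of the patch; an assumed shape, not the real signature of
// the scheduler's period_jitter): the default interval below is stretched by a
// small random factor when rescheduling, so tenants do not all wake in lockstep.
use rand::Rng;
use std::time::Duration;

fn jittered(period: Duration, pct: u64) -> Duration {
    let extra_ms = period.as_millis() as u64 * pct / 100;
    period + Duration::from_millis(rand::thread_rng().gen_range(0..=extra_ms))
}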
+const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000); pub(super) async fn downloader_task( tenant_manager: Arc, @@ -69,30 +74,34 @@ pub(super) async fn downloader_task( command_queue: tokio::sync::mpsc::Receiver>, background_jobs_can_start: Barrier, cancel: CancellationToken, + root_ctx: RequestContext, ) { let concurrency = tenant_manager.get_conf().secondary_download_concurrency; let generator = SecondaryDownloader { tenant_manager, remote_storage, + root_ctx, }; let mut scheduler = Scheduler::new(generator, concurrency); scheduler .run(command_queue, background_jobs_can_start, cancel) - .instrument(info_span!("secondary_downloads")) + .instrument(info_span!("secondary_download_scheduler")) .await } struct SecondaryDownloader { tenant_manager: Arc, remote_storage: GenericRemoteStorage, + root_ctx: RequestContext, } #[derive(Debug, Clone)] pub(super) struct OnDiskState { metadata: LayerFileMetadata, access_time: SystemTime, + local_path: Utf8PathBuf, } impl OnDiskState { @@ -100,23 +109,96 @@ impl OnDiskState { _conf: &'static PageServerConf, _tenant_shard_id: &TenantShardId, _imeline_id: &TimelineId, - _ame: LayerFileName, + _ame: LayerName, metadata: LayerFileMetadata, access_time: SystemTime, + local_path: Utf8PathBuf, ) -> Self { Self { metadata, access_time, + local_path, } } + + // This is infallible, because all errors are either acceptable (ENOENT), or totally + // unexpected (fatal). + pub(super) fn remove_blocking(&self) { + // We tolerate ENOENT, because between planning eviction and executing + // it, the secondary downloader could have seen an updated heatmap that + // resulted in a layer being deleted. + // Other local I/O errors are process-fatal: these should never happen. + std::fs::remove_file(&self.local_path) + .or_else(fs_ext::ignore_not_found) + .fatal_err("Deleting secondary layer") + } + + pub(crate) fn file_size(&self) -> u64 { + self.metadata.file_size + } } #[derive(Debug, Clone, Default)] pub(super) struct SecondaryDetailTimeline { - pub(super) on_disk_layers: HashMap, + on_disk_layers: HashMap, /// We remember when layers were evicted, to prevent re-downloading them. 
- pub(super) evicted_at: HashMap, + pub(super) evicted_at: HashMap, +} + +impl SecondaryDetailTimeline { + pub(super) fn remove_layer( + &mut self, + name: &LayerName, + resident_metric: &UIntGauge, + ) -> Option { + let removed = self.on_disk_layers.remove(name); + if let Some(removed) = &removed { + resident_metric.sub(removed.file_size()); + } + removed + } + + /// `local_path` + fn touch_layer( + &mut self, + conf: &'static PageServerConf, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + touched: &HeatMapLayer, + resident_metric: &UIntGauge, + local_path: F, + ) where + F: FnOnce() -> Utf8PathBuf, + { + use std::collections::hash_map::Entry; + match self.on_disk_layers.entry(touched.name.clone()) { + Entry::Occupied(mut v) => { + v.get_mut().access_time = touched.access_time; + } + Entry::Vacant(e) => { + e.insert(OnDiskState::new( + conf, + tenant_shard_id, + timeline_id, + touched.name.clone(), + touched.metadata.clone(), + touched.access_time, + local_path(), + )); + resident_metric.add(touched.metadata.file_size); + } + } + } +} + +// Aspects of a heatmap that we remember after downloading it +#[derive(Clone, Debug)] +struct DownloadSummary { + etag: Etag, + #[allow(unused)] + mtime: SystemTime, + upload_period: Duration, } /// This state is written by the secondary downloader, it is opaque @@ -125,9 +207,9 @@ pub(super) struct SecondaryDetailTimeline { pub(super) struct SecondaryDetail { pub(super) config: SecondaryLocationConfig, - last_download: Option, + last_download: Option, next_download: Option, - pub(super) timelines: HashMap, + timelines: HashMap, } /// Helper for logging SystemTime @@ -136,6 +218,20 @@ fn strftime(t: &'_ SystemTime) -> DelayedFormat> { datetime.format("%d/%m/%Y %T") } +/// Information returned from download function when it detects the heatmap has changed +struct HeatMapModified { + etag: Etag, + last_modified: SystemTime, + bytes: Vec, +} + +enum HeatMapDownload { + // The heatmap's etag has changed: return the new etag, mtime and the body bytes + Modified(HeatMapModified), + // The heatmap's etag is unchanged + Unmodified, +} + impl SecondaryDetail { pub(super) fn new(config: SecondaryLocationConfig) -> Self { Self { @@ -146,14 +242,47 @@ impl SecondaryDetail { } } + pub(super) fn evict_layer( + &mut self, + name: LayerName, + timeline_id: &TimelineId, + now: SystemTime, + resident_metric: &UIntGauge, + ) -> Option { + let timeline = self.timelines.get_mut(timeline_id)?; + let removed = timeline.remove_layer(&name, resident_metric); + if removed.is_some() { + timeline.evicted_at.insert(name, now); + } + removed + } + + pub(super) fn remove_timeline( + &mut self, + timeline_id: &TimelineId, + resident_metric: &UIntGauge, + ) { + let removed = self.timelines.remove(timeline_id); + if let Some(removed) = removed { + resident_metric.sub( + removed + .on_disk_layers + .values() + .map(|l| l.metadata.file_size) + .sum(), + ); + } + } + + /// Additionally returns the total number of layers, used for more stable relative access time + /// based eviction. 
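// Sketch (not part of the patch; a plain u64 stands in for the prometheus
// gauge, and layer sizes are assumed immutable): `touch_layer`/`remove_layer`
// above keep the resident-size gauge in lockstep with the layer map; the
// testing build later asserts exactly this invariant.
use std::collections::HashMap;

#[derive(Default)]
struct Resident {
    sizes: HashMap<String, u64>,
    gauge: u64, // invariant: always equals sizes.values().sum()
}

impl Resident {
    fn touch(&mut self, name: &str, size: u64) {
        // Only a newly tracked layer moves the gauge; re-touching is a no-op here.
        if !self.sizes.contains_key(name) {
            self.sizes.insert(name.to_string(), size);
            self.gauge += size;
        }
    }

    fn remove(&mut self, name: &str) {
        if let Some(size) = self.sizes.remove(name) {
            self.gauge -= size;
        }
    }
}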
pub(super) fn get_layers_for_eviction( &self, parent: &Arc, - ) -> DiskUsageEvictionInfo { - let mut result = DiskUsageEvictionInfo { - max_layer_size: None, - resident_layers: Vec::new(), - }; + ) -> (DiskUsageEvictionInfo, usize) { + let mut result = DiskUsageEvictionInfo::default(); + let mut total_layers = 0; + for (timeline_id, timeline_detail) in &self.timelines { result .resident_layers @@ -167,8 +296,15 @@ impl SecondaryDetail { }), last_activity_ts: ods.access_time, relative_last_activity: finite_f32::FiniteF32::ZERO, + // Secondary location layers are presumed visible, because Covered layers + // are excluded from the heatmap + visibility: LayerVisibilityHint::Visible, } })); + + // total might be missing currently downloading layers, but as a lower than actual + // value it is good enough approximation. + total_layers += timeline_detail.on_disk_layers.len() + timeline_detail.evicted_at.len(); } result.max_layer_size = result .resident_layers @@ -183,15 +319,14 @@ impl SecondaryDetail { result.resident_layers.len() ); - result + (result, total_layers) } } struct PendingDownload { secondary_state: Arc, - last_download: Option, + last_download: Option, target_time: Option, - period: Option, } impl scheduler::PendingJob for PendingDownload { @@ -213,6 +348,7 @@ impl scheduler::RunningJob for RunningDownload { struct CompleteDownload { secondary_state: Arc, completed_at: Instant, + result: Result<(), UpdateError>, } impl scheduler::Completion for CompleteDownload { @@ -237,14 +373,33 @@ impl JobGenerator { + // Start downloading again as soon as we can. This will involve waiting for the scheduler's + // scheduling interval. This slightly reduces the peak download speed of tenants that hit their + // deadline and keep restarting, but that also helps give other tenants a chance to execute rather + // that letting one big tenant dominate for a long time. + detail.next_download = Some(Instant::now()); + } + _ => { + let period = detail + .last_download + .as_ref() + .map(|d| d.upload_period) + .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL); + + // We advance next_download irrespective of errors: we don't want error cases to result in + // expensive busy-polling. + detail.next_download = Some(Instant::now() + period_jitter(period, 5)); + } + } } async fn schedule(&mut self) -> SchedulingResult { @@ -275,23 +430,20 @@ impl JobGenerator next_download { Some(PendingDownload { secondary_state: secondary_tenant, last_download, target_time: Some(next_download), - period: Some(DOWNLOAD_FRESHEN_INTERVAL), }) } else { None @@ -312,14 +464,11 @@ impl JobGenerator { tracing::info!("No heatmap found for tenant. 
This is fine if it is new."); @@ -357,7 +507,7 @@ impl JobGenerator { - tracing::debug!("Shut down while downloading"); + tracing::info!("Shut down while downloading"); }, Err(UpdateError::Deserialize(e)) => { tracing::error!("Corrupt content while downloading tenant: {e}"); @@ -365,6 +515,9 @@ impl JobGenerator { tracing::error!("Error while downloading tenant: {e}"); }, + Err(UpdateError::Restart) => { + tracing::info!("Download reached deadline & will restart to update heatmap") + } Ok(()) => {} }; @@ -372,26 +525,22 @@ impl JobGenerator { /// Errors that may be encountered while updating a tenant #[derive(thiserror::Error, Debug)] enum UpdateError { + /// This is not a true failure, but it's how a download indicates that it would like to be restarted by + /// the scheduler, to pick up the latest heatmap + #[error("Reached deadline, restarting downloads")] + Restart, + #[error("No remote data found")] NoData, #[error("Insufficient local storage space")] @@ -435,8 +589,14 @@ impl From for UpdateError { fn from(value: std::io::Error) -> Self { if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) { UpdateError::NoSpace + } else if value + .get_ref() + .and_then(|x| x.downcast_ref::()) + .is_some() + { + UpdateError::from(DownloadError::from(value)) } else { - // An I/O error from e.g. tokio::io::copy is most likely a remote storage issue + // An I/O error from e.g. tokio::io::copy_buf is most likely a remote storage issue UpdateError::Other(anyhow::anyhow!(value)) } } @@ -455,22 +615,42 @@ impl<'a> TenantDownloader<'a> { } } - async fn download(&self) -> Result<(), UpdateError> { + async fn download(&self, ctx: &RequestContext) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_id(); // For the duration of a download, we must hold the SecondaryTenant::gate, to ensure // cover our access to local storage. 
let Ok(_guard) = self.secondary_state.gate.enter() else { // Shutting down - return Ok(()); + return Err(UpdateError::Cancelled); }; let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + + // We will use the etag from last successful download to make the download conditional on changes + let last_download = self + .secondary_state + .detail + .lock() + .unwrap() + .last_download + .clone(); + // Download the tenant's heatmap - let heatmap_bytes = tokio::select!( - bytes = self.download_heatmap() => {bytes?}, + let HeatMapModified { + last_modified: heatmap_mtime, + etag: heatmap_etag, + bytes: heatmap_bytes, + } = match tokio::select!( + bytes = self.download_heatmap(last_download.as_ref().map(|d| &d.etag)) => {bytes?}, _ = self.secondary_state.cancel.cancelled() => return Ok(()) - ); + ) { + HeatMapDownload::Unmodified => { + tracing::info!("Heatmap unchanged since last successful download"); + return Ok(()); + } + HeatMapDownload::Modified(m) => m, + }; let heatmap = serde_json::from_slice::(&heatmap_bytes)?; @@ -481,25 +661,98 @@ impl<'a> TenantDownloader<'a> { let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}"); let heatmap_path_bg = heatmap_path.clone(); - tokio::task::spawn_blocking(move || { - tokio::runtime::Handle::current().block_on(async move { - VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await - }) - }) - .await - .expect("Blocking task is never aborted") - .maybe_fatal_err(&context_msg)?; + VirtualFile::crashsafe_overwrite(heatmap_path_bg, temp_path, heatmap_bytes) + .await + .maybe_fatal_err(&context_msg)?; - tracing::debug!("Wrote local heatmap to {}", heatmap_path); + tracing::debug!( + "Wrote local heatmap to {}, with {} timelines", + heatmap_path, + heatmap.timelines.len() + ); + + // Get or initialize the local disk state for the timelines we will update + let mut timeline_states = HashMap::new(); + for timeline in &heatmap.timelines { + let timeline_state = self + .secondary_state + .detail + .lock() + .unwrap() + .timelines + .get(&timeline.timeline_id) + .cloned(); + + let timeline_state = match timeline_state { + Some(t) => t, + None => { + // We have no existing state: need to scan local disk for layers first. + let timeline_state = init_timeline_state( + self.conf, + tenant_shard_id, + timeline, + &self.secondary_state.resident_size_metric, + ) + .await; + + // Re-acquire detail lock now that we're done with async load from local FS + self.secondary_state + .detail + .lock() + .unwrap() + .timelines + .insert(timeline.timeline_id, timeline_state.clone()); + timeline_state + } + }; + + timeline_states.insert(timeline.timeline_id, timeline_state); + } + + // Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general + // principle that deletions should be done before writes wherever possible, and so that we can use this + // phase to initialize our SecondaryProgress. + { + *self.secondary_state.progress.lock().unwrap() = + self.prepare_timelines(&heatmap, heatmap_mtime).await?; + } + + // Calculate a deadline for downloads: if downloading takes longer than this, it is useful to drop out and start again, + // so that we are always using reasonably a fresh heatmap. Otherwise, if we had really huge content to download, we might + // spend 10s of minutes downloading layers we don't need. 
+        // (see https://github.com/neondatabase/neon/issues/8182)
+        let deadline = {
+            let period = self
+                .secondary_state
+                .detail
+                .lock()
+                .unwrap()
+                .last_download
+                .as_ref()
+                .map(|d| d.upload_period)
+                .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);
+
+            // Use double the period: we are not promising to complete within the period, this is just a heuristic
+            // to keep using a "reasonably fresh" heatmap.
+            Instant::now() + period * 2
+        };
 
         // Download the layers in the heatmap
         for timeline in heatmap.timelines {
+            let timeline_state = timeline_states
+                .remove(&timeline.timeline_id)
+                .expect("Just populated above");
+
             if self.secondary_state.cancel.is_cancelled() {
+                tracing::debug!(
+                    "Cancelled before downloading timeline {}",
+                    timeline.timeline_id
+                );
                 return Ok(());
             }
 
             let timeline_id = timeline.timeline_id;
-            self.download_timeline(timeline)
+            self.download_timeline(timeline, timeline_state, deadline, ctx)
                 .instrument(tracing::info_span!(
                     "secondary_download_timeline",
                     tenant_id=%tenant_shard_id.tenant_id,
@@ -509,117 +762,283 @@ impl<'a> TenantDownloader<'a> {
                 .await?;
         }
 
+        // Metrics consistency check in testing builds
+        if cfg!(feature = "testing") {
+            let detail = self.secondary_state.detail.lock().unwrap();
+            let resident_size = detail
+                .timelines
+                .values()
+                .map(|tl| {
+                    tl.on_disk_layers
+                        .values()
+                        .map(|v| v.metadata.file_size)
+                        .sum::<u64>()
+                })
+                .sum::<u64>();
+            assert_eq!(
+                resident_size,
+                self.secondary_state.resident_size_metric.get()
+            );
+        }
+
+        // Only update last_etag after a full successful download: this way we will not skip
+        // the next download, even if the heatmap's actual etag is unchanged.
+        self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary {
+            etag: heatmap_etag,
+            mtime: heatmap_mtime,
+            upload_period: heatmap
+                .upload_period_ms
+                .map(|ms| Duration::from_millis(ms as u64))
+                .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL),
+        });
+
+        // Robustness: we should have updated progress properly, but in case we didn't, make sure
+        // we don't leave the tenant in a state where we claim to have successfully downloaded
+        // everything, but our progress is incomplete. The invariant here should be that if
+        // we have set `last_download` to this heatmap's etag, then the next time we see that
+        // etag we can safely do no work (i.e. we must be complete).
+        let mut progress = self.secondary_state.progress.lock().unwrap();
+        debug_assert!(progress.layers_downloaded == progress.layers_total);
+        debug_assert!(progress.bytes_downloaded == progress.bytes_total);
+        if progress.layers_downloaded != progress.layers_total
+            || progress.bytes_downloaded != progress.bytes_total
+        {
+            tracing::warn!("Correcting drift in progress stats ({progress:?})");
+            progress.layers_downloaded = progress.layers_total;
+            progress.bytes_downloaded = progress.bytes_total;
+        }
+
         Ok(())
     }
 
-    async fn download_heatmap(&self) -> Result<Vec<u8>, UpdateError> {
+    /// Do any fast local cleanup that comes before the much slower process of downloading
+    /// layers from remote storage. In the process, initialize the SecondaryProgress object
+    /// that will later be updated incrementally as we download layers.
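The deadline computed at the top of this hunk, in isolation. `DEFAULT_DOWNLOAD_INTERVAL` here is a hypothetical stand-in for the crate's constant of the same name; the value is assumed, not taken from the patch:

```rust
use std::time::{Duration, Instant};

// Hypothetical stand-in for the crate's DEFAULT_DOWNLOAD_INTERVAL constant.
const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_secs(60);

/// Allow up to twice the uploader's advertised period before abandoning the
/// pass, so the downloader never works from a badly stale heatmap.
fn download_deadline(upload_period: Option<Duration>) -> Instant {
    let period = upload_period.unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);
    Instant::now() + period * 2
}
```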
+ async fn prepare_timelines( + &self, + heatmap: &HeatMapTenant, + heatmap_mtime: SystemTime, + ) -> Result { + let heatmap_stats = heatmap.get_stats(); + // We will construct a progress object, and then populate its initial "downloaded" numbers + // while iterating through local layer state in [`Self::prepare_timelines`] + let mut progress = SecondaryProgress { + layers_total: heatmap_stats.layers, + bytes_total: heatmap_stats.bytes, + heatmap_mtime: Some(serde_system_time::SystemTime(heatmap_mtime)), + layers_downloaded: 0, + bytes_downloaded: 0, + }; + + // Also expose heatmap bytes_total as a metric + self.secondary_state + .heatmap_total_size_metric + .set(heatmap_stats.bytes); + + // Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock + let mut delete_layers = Vec::new(); + let mut delete_timelines = Vec::new(); + { + let mut detail = self.secondary_state.detail.lock().unwrap(); + for (timeline_id, timeline_state) in &mut detail.timelines { + let Some(heatmap_timeline_index) = heatmap + .timelines + .iter() + .position(|t| t.timeline_id == *timeline_id) + else { + // This timeline is no longer referenced in the heatmap: delete it locally + delete_timelines.push(*timeline_id); + continue; + }; + + let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap(); + + let layers_in_heatmap = heatmap_timeline + .layers + .iter() + .map(|l| (&l.name, l.metadata.generation)) + .collect::>(); + let layers_on_disk = timeline_state + .on_disk_layers + .iter() + .map(|l| (l.0, l.1.metadata.generation)) + .collect::>(); + + let mut layer_count = layers_on_disk.len(); + let mut layer_byte_count: u64 = timeline_state + .on_disk_layers + .values() + .map(|l| l.metadata.file_size) + .sum(); + + // Remove on-disk layers that are no longer present in heatmap + for (layer_file_name, generation) in layers_on_disk.difference(&layers_in_heatmap) { + layer_count -= 1; + layer_byte_count -= timeline_state + .on_disk_layers + .get(layer_file_name) + .unwrap() + .metadata + .file_size; + + let local_path = local_layer_path( + self.conf, + self.secondary_state.get_tenant_shard_id(), + timeline_id, + layer_file_name, + generation, + ); + + delete_layers.push((*timeline_id, (*layer_file_name).clone(), local_path)); + } + + progress.bytes_downloaded += layer_byte_count; + progress.layers_downloaded += layer_count; + } + + for delete_timeline in &delete_timelines { + // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal + // from disk fails that will be a fatal error. 
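The `prepare_timelines` cleanup above is a set difference: anything on disk but absent from the heatmap is slated for deletion, and whatever survives is credited to the initial progress statistics. A reduced sketch, using `(String, u32)` as a simplified stand-in for the patch's `(LayerName, Generation)` keys:

```rust
use std::collections::HashSet;

/// Return the (name, generation) pairs that are resident locally but no
/// longer referenced by the heatmap; the caller deletes these and counts
/// the remainder as already-downloaded progress.
fn plan_deletions(
    on_disk: &HashSet<(String, u32)>,
    in_heatmap: &HashSet<(String, u32)>,
) -> Vec<(String, u32)> {
    on_disk.difference(in_heatmap).cloned().collect()
}
```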
+ detail.remove_timeline(delete_timeline, &self.secondary_state.resident_size_metric); + } + } + + // Execute accumulated deletions + for (timeline_id, layer_name, local_path) in delete_layers { + tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",); + + tokio::fs::remove_file(&local_path) + .await + .or_else(fs_ext::ignore_not_found) + .maybe_fatal_err("Removing secondary layer")?; + + // Update in-memory housekeeping to reflect the absence of the deleted layer + let mut detail = self.secondary_state.detail.lock().unwrap(); + let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else { + continue; + }; + timeline_state.remove_layer(&layer_name, &self.secondary_state.resident_size_metric); + } + + for timeline_id in delete_timelines { + let timeline_path = self + .conf + .timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id); + tracing::info!(timeline_id=%timeline_id, + "Timeline no longer in heatmap, removing from secondary location" + ); + tokio::fs::remove_dir_all(&timeline_path) + .await + .or_else(fs_ext::ignore_not_found) + .maybe_fatal_err("Removing secondary timeline")?; + } + + Ok(progress) + } + + /// Returns downloaded bytes if the etag differs from `prev_etag`, or None if the object + /// still matches `prev_etag`. + async fn download_heatmap( + &self, + prev_etag: Option<&Etag>, + ) -> Result { debug_assert_current_span_has_tenant_id(); let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); - // TODO: make download conditional on ETag having changed since last download + // TODO: pull up etag check into the request, to do a conditional GET rather than + // issuing a GET and then maybe ignoring the response body // (https://github.com/neondatabase/neon/issues/6199) tracing::debug!("Downloading heatmap for secondary tenant",); let heatmap_path = remote_heatmap_path(tenant_shard_id); + let cancel = &self.secondary_state.cancel; - let heatmap_bytes = backoff::retry( + backoff::retry( || async { let download = self .remote_storage - .download(&heatmap_path) + .download(&heatmap_path, cancel) .await .map_err(UpdateError::from)?; - let mut heatmap_bytes = Vec::new(); - let mut body = tokio_util::io::StreamReader::new(download.download_stream); - let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?; - Ok(heatmap_bytes) + + SECONDARY_MODE.download_heatmap.inc(); + + if Some(&download.etag) == prev_etag { + Ok(HeatMapDownload::Unmodified) + } else { + let mut heatmap_bytes = Vec::new(); + let mut body = tokio_util::io::StreamReader::new(download.download_stream); + let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?; + Ok(HeatMapDownload::Modified(HeatMapModified { + etag: download.etag, + last_modified: download.last_modified, + bytes: heatmap_bytes, + })) + } }, |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled), FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "download heatmap", - backoff::Cancel::new(self.secondary_state.cancel.clone(), || { - UpdateError::Cancelled - }), + cancel, ) - .await?; - - SECONDARY_MODE.download_heatmap.inc(); - - Ok(heatmap_bytes) + .await + .ok_or_else(|| UpdateError::Cancelled) + .and_then(|x| x) } - async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> { - debug_assert_current_span_has_tenant_and_timeline_id(); - let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); - let timeline_path = self - .conf - .timeline_path(tenant_shard_id, 
&timeline.timeline_id); - + /// Download heatmap layers that are not present on local disk, or update their + /// access time if they are already present. + async fn download_timeline_layers( + &self, + tenant_shard_id: &TenantShardId, + timeline: HeatMapTimeline, + timeline_state: SecondaryDetailTimeline, + deadline: Instant, + ctx: &RequestContext, + ) -> (Result<(), UpdateError>, Vec) { // Accumulate updates to the state let mut touched = Vec::new(); - // Clone a view of what layers already exist on disk - let timeline_state = self - .secondary_state - .detail - .lock() - .unwrap() - .timelines - .get(&timeline.timeline_id) - .cloned(); - - let timeline_state = match timeline_state { - Some(t) => t, - None => { - // We have no existing state: need to scan local disk for layers first. - let timeline_state = - init_timeline_state(self.conf, tenant_shard_id, &timeline).await; - - // Re-acquire detail lock now that we're done with async load from local FS - self.secondary_state - .detail - .lock() - .unwrap() - .timelines - .insert(timeline.timeline_id, timeline_state.clone()); - timeline_state - } - }; - - let layers_in_heatmap = timeline - .layers - .iter() - .map(|l| &l.name) - .collect::>(); - let layers_on_disk = timeline_state - .on_disk_layers - .iter() - .map(|l| l.0) - .collect::>(); - - // Remove on-disk layers that are no longer present in heatmap - for layer in layers_on_disk.difference(&layers_in_heatmap) { - let local_path = timeline_path.join(layer.to_string()); - tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",); - tokio::fs::remove_file(&local_path) - .await - .or_else(fs_ext::ignore_not_found) - .maybe_fatal_err("Removing secondary layer")?; - } - - // Download heatmap layers that are not present on local disk, or update their - // access time if they are already present. for layer in timeline.layers { if self.secondary_state.cancel.is_cancelled() { - return Ok(()); + tracing::debug!("Cancelled -- dropping out of layer loop"); + return (Err(UpdateError::Cancelled), touched); + } + + if Instant::now() > deadline { + // We've been running downloads for a while, restart to download latest heatmap. + return (Err(UpdateError::Restart), touched); } // Existing on-disk layers: just update their access time. if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) { tracing::debug!("Layer {} is already on disk", layer.name); - if on_disk.metadata != LayerFileMetadata::from(&layer.metadata) - || on_disk.access_time != layer.access_time - { + + if cfg!(debug_assertions) { + // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think + // are already present on disk are really there. + match tokio::fs::metadata(&on_disk.local_path).await { + Ok(meta) => { + tracing::debug!( + "Layer {} present at {}, size {}", + layer.name, + on_disk.local_path, + meta.len(), + ); + } + Err(e) => { + tracing::warn!( + "Layer {} not found at {} ({})", + layer.name, + on_disk.local_path, + e + ); + debug_assert!(false); + } + } + } + + if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time { // We already have this layer on disk. Update its access time. 
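The `cfg!(debug_assertions)` block above double-checks that a layer the in-memory state believes is resident really exists on disk. A self-contained sketch of that debug-build-only verification, with a hypothetical `verify_resident` helper:

```rust
use std::path::Path;

/// In debug builds, confirm the layer file exists before trusting the
/// in-memory bookkeeping; release builds skip the extra stat() entirely.
async fn verify_resident(local_path: &Path) {
    if cfg!(debug_assertions) {
        match tokio::fs::metadata(local_path).await {
            Ok(meta) => {
                tracing::debug!("{} present, size {}", local_path.display(), meta.len());
            }
            Err(e) => {
                tracing::warn!("{} not found: {e}", local_path.display());
                debug_assert!(false, "tracked layer missing from disk");
            }
        }
    }
}
```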
tracing::debug!( "Access time updated for layer {}: {} -> {}", @@ -651,86 +1070,172 @@ impl<'a> TenantDownloader<'a> { strftime(&layer.access_time), strftime(evicted_at) ); + self.skip_layer(layer); continue; } } - // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally - let downloaded_bytes = match download_layer_file( - self.conf, - self.remote_storage, - *tenant_shard_id, - timeline.timeline_id, - &layer.name, - &LayerFileMetadata::from(&layer.metadata), - &self.secondary_state.cancel, - ) - .await + match self + .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx) + .await { - Ok(bytes) => bytes, - Err(e) => { - if let DownloadError::NotFound = e { - // A heatmap might be out of date and refer to a layer that doesn't exist any more. - // This is harmless: continue to download the next layer. It is expected during compaction - // GC. - tracing::debug!( - "Skipped downloading missing layer {}, raced with compaction/gc?", - layer.name - ); - continue; - } else { - return Err(e.into()); - } + Ok(Some(layer)) => touched.push(layer), + Ok(None) => { + // Not an error but we didn't download it: remote layer is missing. Don't add it to the list of + // things to consider touched. + } + Err(e) => { + return (Err(e), touched); } - }; - - if downloaded_bytes != layer.metadata.file_size { - let local_path = timeline_path.join(layer.name.to_string()); - - tracing::warn!( - "Downloaded layer {} with unexpected size {} != {}. Removing download.", - layer.name, - downloaded_bytes, - layer.metadata.file_size - ); - - tokio::fs::remove_file(&local_path) - .await - .or_else(fs_ext::ignore_not_found)?; } - - SECONDARY_MODE.download_layer.inc(); - touched.push(layer) } - // Write updates to state to record layers we just downloaded or touched. 
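The `(Result<(), UpdateError>, Vec<HeatMapLayer>)` return shape used by `download_timeline_layers` above lets a failed pass still report which layers it touched, so the caller can persist partial progress before propagating the error. A tiny generic version of the same shape (hypothetical names):

```rust
/// Run `step` over every item; on the first failure, return the error
/// together with everything completed so far.
fn run_all<T>(
    items: Vec<T>,
    mut step: impl FnMut(&T) -> Result<(), String>,
) -> (Result<(), String>, Vec<T>) {
    let mut done = Vec::new();
    for item in items {
        if let Err(e) = step(&item) {
            // Bail out, but hand back the partial progress.
            return (Err(e), done);
        }
        done.push(item);
    }
    (Ok(()), done)
}
```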
+ (Ok(()), touched) + } + + async fn download_timeline( + &self, + timeline: HeatMapTimeline, + timeline_state: SecondaryDetailTimeline, + deadline: Instant, + ctx: &RequestContext, + ) -> Result<(), UpdateError> { + debug_assert_current_span_has_tenant_and_timeline_id(); + let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + let timeline_id = timeline.timeline_id; + + tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); + + let (result, touched) = self + .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx) + .await; + + // Write updates to state to record layers we just downloaded or touched, irrespective of whether the overall result was successful { let mut detail = self.secondary_state.detail.lock().unwrap(); - let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default(); + let timeline_detail = detail.timelines.entry(timeline_id).or_default(); tracing::info!("Wrote timeline_detail for {} touched layers", touched.len()); - - for t in touched { - use std::collections::hash_map::Entry; - match timeline_detail.on_disk_layers.entry(t.name.clone()) { - Entry::Occupied(mut v) => { - v.get_mut().access_time = t.access_time; - } - Entry::Vacant(e) => { - e.insert(OnDiskState::new( + touched.into_iter().for_each(|t| { + timeline_detail.touch_layer( + self.conf, + tenant_shard_id, + &timeline_id, + &t, + &self.secondary_state.resident_size_metric, + || { + local_layer_path( self.conf, tenant_shard_id, - &timeline.timeline_id, - t.name, - LayerFileMetadata::from(&t.metadata), - t.access_time, - )); - } - } - } + &timeline_id, + &t.name, + &t.metadata.generation, + ) + }, + ) + }); } - Ok(()) + result + } + + /// Call this during timeline download if a layer will _not_ be downloaded, to update progress statistics + fn skip_layer(&self, layer: HeatMapLayer) { + let mut progress = self.secondary_state.progress.lock().unwrap(); + progress.layers_total = progress.layers_total.saturating_sub(1); + progress.bytes_total = progress + .bytes_total + .saturating_sub(layer.metadata.file_size); + } + + async fn download_layer( + &self, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + layer: HeatMapLayer, + ctx: &RequestContext, + ) -> Result, UpdateError> { + // Failpoints for simulating slow remote storage + failpoint_support::sleep_millis_async!( + "secondary-layer-download-sleep", + &self.secondary_state.cancel + ); + + pausable_failpoint!("secondary-layer-download-pausable"); + + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + timeline_id, + &layer.name, + &layer.metadata.generation, + ); + + // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally + tracing::info!( + "Starting download of layer {}, size {}", + layer.name, + layer.metadata.file_size + ); + let downloaded_bytes = download_layer_file( + self.conf, + self.remote_storage, + *tenant_shard_id, + *timeline_id, + &layer.name, + &layer.metadata, + &local_path, + &self.secondary_state.cancel, + ctx, + ) + .await; + + let downloaded_bytes = match downloaded_bytes { + Ok(bytes) => bytes, + Err(DownloadError::NotFound) => { + // A heatmap might be out of date and refer to a layer that doesn't exist any more. + // This is harmless: continue to download the next layer. It is expected during compaction + // GC. 
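The `skip_layer` helper above shrinks the planned totals when a download is skipped, instead of counting it as completed work; saturating arithmetic guarantees the counters can never underflow. A sketch with a hypothetical `Progress` struct, plus a usage check:

```rust
#[derive(Debug, Default, PartialEq)]
struct Progress {
    layers_total: usize,
    bytes_total: u64,
}

/// A skipped layer is removed from the plan rather than marked downloaded.
fn skip(progress: &mut Progress, file_size: u64) {
    progress.layers_total = progress.layers_total.saturating_sub(1);
    progress.bytes_total = progress.bytes_total.saturating_sub(file_size);
}

fn main() {
    let mut p = Progress { layers_total: 3, bytes_total: 3000 };
    skip(&mut p, 1000);
    assert_eq!(p, Progress { layers_total: 2, bytes_total: 2000 });
}
```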
+ tracing::debug!( + "Skipped downloading missing layer {}, raced with compaction/gc?", + layer.name + ); + self.skip_layer(layer); + + return Ok(None); + } + Err(e) => return Err(e.into()), + }; + + if downloaded_bytes != layer.metadata.file_size { + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + timeline_id, + &layer.name, + &layer.metadata.generation, + ); + + tracing::warn!( + "Downloaded layer {} with unexpected size {} != {}. Removing download.", + layer.name, + downloaded_bytes, + layer.metadata.file_size + ); + + tokio::fs::remove_file(&local_path) + .await + .or_else(fs_ext::ignore_not_found)?; + } else { + tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes); + let mut progress = self.secondary_state.progress.lock().unwrap(); + progress.bytes_downloaded += downloaded_bytes; + progress.layers_downloaded += 1; + } + + SECONDARY_MODE.download_layer.inc(); + + Ok(Some(layer)) } } @@ -739,6 +1244,7 @@ async fn init_timeline_state( conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, heatmap: &HeatMapTimeline, + resident_metric: &UIntGauge, ) -> SecondaryDetailTimeline { let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id); let mut detail = SecondaryDetailTimeline::default(); @@ -763,7 +1269,7 @@ async fn init_timeline_state( // As we iterate through layers found on disk, we will look up their metadata from this map. // Layers not present in metadata will be discarded. - let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> = + let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = heatmap.layers.iter().map(|l| (&l.name, l)).collect(); while let Some(dentry) = dir @@ -771,19 +1277,32 @@ async fn init_timeline_state( .await .fatal_err(&format!("Listing {timeline_path}")) { - let dentry_file_name = dentry.file_name(); - let file_name = dentry_file_name.to_string_lossy(); - let local_meta = dentry.metadata().await.fatal_err(&format!( - "Read metadata on {}", - dentry.path().to_string_lossy() - )); + let Ok(file_path) = Utf8PathBuf::from_path_buf(dentry.path()) else { + tracing::warn!("Malformed filename at {}", dentry.path().to_string_lossy()); + continue; + }; + let local_meta = dentry + .metadata() + .await + .fatal_err(&format!("Read metadata on {}", file_path)); - // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant. - if file_name == METADATA_FILE_NAME { + let file_name = file_path.file_name().expect("created it from the dentry"); + if crate::is_temporary(&file_path) + || is_temp_download_file(&file_path) + || is_ephemeral_file(file_name) + { + // Temporary files are frequently left behind from restarting during downloads + tracing::info!("Cleaning up temporary file {file_path}"); + if let Err(e) = tokio::fs::remove_file(&file_path) + .await + .or_else(fs_ext::ignore_not_found) + { + tracing::error!("Failed to remove temporary file {file_path}: {e}"); + } continue; } - match LayerFileName::from_str(&file_name) { + match LayerName::from_str(file_name) { Ok(name) => { let remote_meta = heatmap_metadata.get(&name); match remote_meta { @@ -801,16 +1320,13 @@ async fn init_timeline_state( } else { // We expect the access time to be initialized immediately afterwards, when // the latest heatmap is applied to the state. 
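The startup scan above (in `init_timeline_state`) deletes leftover temporary files and keeps only entries that parse as layer names via `LayerName::from_str`. A heavily simplified sketch of that filter; the temp-file suffix and `LayerNameLike` parser are hypothetical stand-ins, not the real formats:

```rust
use std::str::FromStr;

/// Crude stand-in for `LayerName`: real names encode key and LSN ranges.
struct LayerNameLike(String);

impl FromStr for LayerNameLike {
    type Err = ();
    fn from_str(s: &str) -> Result<Self, ()> {
        if s.contains("__") { Ok(LayerNameLike(s.to_string())) } else { Err(()) }
    }
}

/// Returns Some(name) for entries worth tracking; temp files yield None so
/// the caller can delete them, and unparsable names are simply skipped.
fn classify_file(name: &str) -> Option<String> {
    const TEMP_SUFFIX: &str = ".___temp"; // hypothetical suffix
    if name.ends_with(TEMP_SUFFIX) {
        return None; // caller removes the file
    }
    name.parse::<LayerNameLike>().ok().map(|l| l.0)
}
```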
- detail.on_disk_layers.insert( - name.clone(), - OnDiskState::new( - conf, - tenant_shard_id, - &heatmap.timeline_id, - name, - LayerFileMetadata::from(&remote_meta.metadata), - remote_meta.access_time, - ), + detail.touch_layer( + conf, + tenant_shard_id, + &heatmap.timeline_id, + remote_meta, + resident_metric, + || file_path, ); } } diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 99aaaeb8c8..4a8e66d38a 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -1,8 +1,6 @@ use std::time::SystemTime; -use crate::tenant::{ - remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName, -}; +use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr, TimestampSeconds}; @@ -17,22 +15,30 @@ pub(super) struct HeatMapTenant { pub(super) generation: Generation, pub(super) timelines: Vec, + + /// Uploaders provide their own upload period in the heatmap, as a hint to downloaders + /// of how frequently it is worthwhile to check for updates. + /// + /// This is optional for backward compat, and because we sometimes might upload + /// a heatmap explicitly via API for a tenant that has no periodic upload configured. + #[serde(default)] + pub(super) upload_period_ms: Option, } #[serde_as] #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapTimeline { #[serde_as(as = "DisplayFromStr")] - pub(super) timeline_id: TimelineId, + pub(crate) timeline_id: TimelineId, - pub(super) layers: Vec, + pub(crate) layers: Vec, } #[serde_as] #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapLayer { - pub(super) name: LayerFileName, - pub(super) metadata: IndexLayerMetadata, + pub(crate) name: LayerName, + pub(crate) metadata: LayerFileMetadata, #[serde_as(as = "TimestampSeconds")] pub(super) access_time: SystemTime, @@ -42,8 +48,8 @@ pub(crate) struct HeatMapLayer { impl HeatMapLayer { pub(crate) fn new( - name: LayerFileName, - metadata: IndexLayerMetadata, + name: LayerName, + metadata: LayerFileMetadata, access_time: SystemTime, ) -> Self { Self { @@ -62,3 +68,42 @@ impl HeatMapTimeline { } } } + +pub(crate) struct HeatMapStats { + pub(crate) bytes: u64, + pub(crate) layers: usize, +} + +impl HeatMapTenant { + pub(crate) fn get_stats(&self) -> HeatMapStats { + let mut stats = HeatMapStats { + bytes: 0, + layers: 0, + }; + for timeline in &self.timelines { + for layer in &timeline.layers { + stats.layers += 1; + stats.bytes += layer.metadata.file_size; + } + } + + stats + } + + pub(crate) fn strip_atimes(self) -> Self { + Self { + timelines: self + .timelines + .into_iter() + .map(|mut tl| { + for layer in &mut tl.layers { + layer.access_time = SystemTime::UNIX_EPOCH; + } + tl + }) + .collect(), + generation: self.generation, + upload_period_ms: self.upload_period_ms, + } + } +} diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index df865658a4..0aad5bf392 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -9,6 +9,7 @@ use crate::{ metrics::SECONDARY_MODE, tenant::{ config::AttachmentMode, + mgr::GetTenantError, mgr::TenantManager, remote_timeline_client::remote_heatmap_path, span::debug_assert_current_span_has_tenant_id, @@ -18,21 +19,21 @@ use crate::{ }; use futures::Future; -use md5; use 
pageserver_api::shard::TenantShardId; -use rand::Rng; -use remote_storage::GenericRemoteStorage; +use remote_storage::{GenericRemoteStorage, TimeoutOrCancel}; use super::{ - scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs}, - CommandRequest, + heatmap::HeatMapTenant, + scheduler::{ + self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult, + TenantBackgroundJobs, + }, + CommandRequest, UploadCommand, }; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, Instrument}; use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop}; -use super::{heatmap::HeatMapTenant, UploadCommand}; - pub(super) async fn heatmap_uploader_task( tenant_manager: Arc, remote_storage: GenericRemoteStorage, @@ -52,7 +53,7 @@ pub(super) async fn heatmap_uploader_task( scheduler .run(command_queue, background_jobs_can_start, cancel) - .instrument(info_span!("heatmap_uploader")) + .instrument(info_span!("heatmap_upload_scheduler")) .await } @@ -79,7 +80,7 @@ impl RunningJob for WriteInProgress { struct UploadPending { tenant: Arc, - last_digest: Option, + last_upload: Option, target_time: Option, period: Option, } @@ -93,7 +94,7 @@ impl scheduler::PendingJob for UploadPending { struct WriteComplete { tenant_shard_id: TenantShardId, completed_at: Instant, - digest: Option, + uploaded: Option, next_upload: Option, } @@ -114,10 +115,7 @@ struct UploaderTenantState { tenant: Weak, /// Digest of the serialized heatmap that we last successfully uploaded - /// - /// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag, - /// which is also an md5sum. - last_digest: Option, + last_upload_state: Option, /// When the last upload attempt completed (may have been successful or failed) last_upload: Option, @@ -182,15 +180,11 @@ impl JobGenerator let state = self .tenants .entry(*tenant.get_tenant_shard_id()) - .or_insert_with(|| { - let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period); - - UploaderTenantState { - tenant: Arc::downgrade(&tenant), - last_upload: None, - next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)), - last_digest: None, - } + .or_insert_with(|| UploaderTenantState { + tenant: Arc::downgrade(&tenant), + last_upload: None, + next_upload: Some(now.checked_add(period_warmup(period)).unwrap_or(now)), + last_upload_state: None, }); // Decline to do the upload if insufficient time has passed @@ -198,10 +192,10 @@ impl JobGenerator return; } - let last_digest = state.last_digest; + let last_upload = state.last_upload_state.clone(); result.jobs.push(UploadPending { tenant, - last_digest, + last_upload, target_time: state.next_upload, period: Some(period), }); @@ -221,7 +215,7 @@ impl JobGenerator ) { let UploadPending { tenant, - last_digest, + last_upload, target_time, period, } = job; @@ -234,16 +228,16 @@ impl JobGenerator let _completion = completion; let started_at = Instant::now(); - let digest = match upload_tenant_heatmap(remote_storage, &tenant, last_digest).await { - Ok(UploadHeatmapOutcome::Uploaded(digest)) => { + let uploaded = match upload_tenant_heatmap(remote_storage, &tenant, last_upload.clone()).await { + Ok(UploadHeatmapOutcome::Uploaded(uploaded)) => { let duration = Instant::now().duration_since(started_at); SECONDARY_MODE .upload_heatmap_duration .observe(duration.as_secs_f64()); SECONDARY_MODE.upload_heatmap.inc(); - Some(digest) + Some(uploaded) } - Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => 
last_digest, + Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_upload, Err(UploadHeatmapError::Upload(e)) => { tracing::warn!( "Failed to upload heatmap for tenant {}: {e:#}", @@ -254,11 +248,11 @@ impl JobGenerator .upload_heatmap_duration .observe(duration.as_secs_f64()); SECONDARY_MODE.upload_heatmap_errors.inc(); - last_digest + last_upload } Err(UploadHeatmapError::Cancelled) => { tracing::info!("Cancelled heatmap upload, shutting down"); - last_digest + last_upload } }; @@ -275,12 +269,12 @@ impl JobGenerator let next_upload = tenant .get_heatmap_period() - .and_then(|period| now.checked_add(period)); + .and_then(|period| now.checked_add(period_jitter(period, 5))); WriteComplete { tenant_shard_id: *tenant.get_tenant_shard_id(), completed_at: now, - digest, + uploaded, next_upload, } }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())))) @@ -294,12 +288,15 @@ impl JobGenerator "Starting heatmap write on command"); let tenant = self .tenant_manager - .get_attached_tenant_shard(*tenant_shard_id, true) + .get_attached_tenant_shard(*tenant_shard_id) .map_err(|e| anyhow::anyhow!(e))?; + if !tenant.is_active() { + return Err(GetTenantError::NotActive(*tenant_shard_id).into()); + } Ok(UploadPending { // Ignore our state for last digest: this forces an upload even if nothing has changed - last_digest: None, + last_upload: None, tenant, target_time: None, period: None, @@ -312,7 +309,7 @@ impl JobGenerator let WriteComplete { tenant_shard_id, completed_at, - digest, + uploaded, next_upload, } = completion; use std::collections::hash_map::Entry; @@ -322,7 +319,7 @@ impl JobGenerator } Entry::Occupied(mut entry) => { entry.get_mut().last_upload = Some(completed_at); - entry.get_mut().last_digest = digest; + entry.get_mut().last_upload_state = uploaded; entry.get_mut().next_upload = next_upload } } @@ -331,7 +328,7 @@ impl JobGenerator enum UploadHeatmapOutcome { /// We successfully wrote to remote storage, with this digest. - Uploaded(md5::Digest), + Uploaded(LastUploadState), /// We did not upload because the heatmap digest was unchanged since the last upload NoChange, /// We skipped the upload for some reason, such as tenant/timeline not ready @@ -347,20 +344,32 @@ enum UploadHeatmapError { Upload(#[from] anyhow::Error), } +/// Digests describing the heatmap we most recently uploaded successfully. +/// +/// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag, +/// which is also an md5sum. +#[derive(Clone)] +struct LastUploadState { + // Digest of json-encoded HeatMapTenant + uploaded_digest: md5::Digest, + + // Digest without atimes set. + layers_only_digest: md5::Digest, +} + /// The inner upload operation. This will skip if `last_digest` is Some and matches the digest /// of the object we would have uploaded. async fn upload_tenant_heatmap( remote_storage: GenericRemoteStorage, tenant: &Arc, - last_digest: Option, + last_upload: Option, ) -> Result { debug_assert_current_span_has_tenant_id(); let generation = tenant.get_generation(); + debug_assert!(!generation.is_none()); if generation.is_none() { - // We do not expect this: generations were implemented before heatmap uploads. 
However, - // handle it so that we don't have to make the generation in the heatmap an Option<> - // (Generation::none is not serializable) + // We do not expect this: None generations should only appear in historic layer metadata, not in running Tenants tracing::warn!("Skipping heatmap upload for tenant with generation==None"); return Ok(UploadHeatmapOutcome::Skipped); } @@ -368,20 +377,16 @@ async fn upload_tenant_heatmap( let mut heatmap = HeatMapTenant { timelines: Vec::new(), generation, + upload_period_ms: tenant.get_heatmap_period().map(|p| p.as_millis()), }; let timelines = tenant.timelines.lock().unwrap().clone(); - let tenant_cancel = tenant.cancel.clone(); - // Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise // when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind // in remote storage. - let _guard = match tenant.gate.enter() { - Ok(g) => g, - Err(_) => { - tracing::info!("Skipping heatmap upload for tenant which is shutting down"); - return Err(UploadHeatmapError::Cancelled); - } + let Ok(_guard) = tenant.gate.enter() else { + tracing::info!("Skipping heatmap upload for tenant which is shutting down"); + return Err(UploadHeatmapError::Cancelled); }; for (timeline_id, timeline) in timelines { @@ -401,36 +406,54 @@ async fn upload_tenant_heatmap( // Serialize the heatmap let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?; - let size = bytes.len(); // Drop out early if nothing changed since our last upload let digest = md5::compute(&bytes); - if Some(digest) == last_digest { + if Some(&digest) == last_upload.as_ref().map(|d| &d.uploaded_digest) { return Ok(UploadHeatmapOutcome::NoChange); } + // Calculate a digest that omits atimes, so that we can distinguish actual changes in + // layers from changes only in atimes. + let heatmap_size_bytes = heatmap.get_stats().bytes; + let layers_only_bytes = + serde_json::to_vec(&heatmap.strip_atimes()).map_err(|e| anyhow::anyhow!(e))?; + let layers_only_digest = md5::compute(&layers_only_bytes); + if heatmap_size_bytes < tenant.get_checkpoint_distance() { + // For small tenants, skip upload if only atimes changed. This avoids doing frequent + // uploads from long-idle tenants whose atimes are just incremented by periodic + // size calculations. + if Some(&layers_only_digest) == last_upload.as_ref().map(|d| &d.layers_only_digest) { + return Ok(UploadHeatmapOutcome::NoChange); + } + } + + let bytes = bytes::Bytes::from(bytes); + let size = bytes.len(); + let path = remote_heatmap_path(tenant.get_tenant_shard_id()); - // Write the heatmap. 
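The two-digest scheme introduced above keeps one md5 over the full serialized heatmap and one over a copy with access times zeroed, so atime-only churn on small tenants can be told apart from a real change in the layer set. A miniature sketch under a hypothetical `Layer` type:

```rust
use serde::Serialize;

#[derive(Serialize, Clone)]
struct Layer {
    name: String,
    access_time_secs: u64,
}

/// Compute (full digest, layers-only digest). If only the second one matches
/// the previous upload, the change was purely access-time churn.
fn digests(layers: &[Layer]) -> anyhow::Result<(md5::Digest, md5::Digest)> {
    let full = md5::compute(serde_json::to_vec(layers)?);
    let mut stripped = layers.to_vec();
    for layer in &mut stripped {
        layer.access_time_secs = 0; // mirror of strip_atimes()
    }
    let layers_only = md5::compute(serde_json::to_vec(&stripped)?);
    Ok((full, layers_only))
}
```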
+    let cancel = &tenant.cancel;
+
     tracing::debug!("Uploading {size} byte heatmap to {path}");
     if let Err(e) = backoff::retry(
         || async {
-            let bytes = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from(
-                bytes.clone(),
-            ))));
+            let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone())));
             remote_storage
-                .upload_storage_object(bytes, size, &path)
+                .upload_storage_object(bytes, size, &path, cancel)
                 .await
         },
-        |_| false,
+        TimeoutOrCancel::caused_by_cancel,
         3,
         u32::MAX,
         "Uploading heatmap",
-        backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")),
+        cancel,
     )
     .await
+    .ok_or_else(|| anyhow::anyhow!("Shutting down"))
+    .and_then(|x| x)
     {
-        if tenant_cancel.is_cancelled() {
+        if cancel.is_cancelled() {
             return Err(UploadHeatmapError::Cancelled);
         } else {
             return Err(e.into());
@@ -439,5 +462,8 @@
     tracing::info!("Successfully uploaded {size} byte heatmap to {path}");
 
-    Ok(UploadHeatmapOutcome::Uploaded(digest))
+    Ok(UploadHeatmapOutcome::Uploaded(LastUploadState {
+        uploaded_digest: digest,
+        layers_only_digest,
+    }))
 }
diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs
index 58bdb54161..28cf2125df 100644
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -1,4 +1,5 @@
 use futures::Future;
+use rand::Rng;
 use std::{
     collections::HashMap,
     marker::PhantomData,
@@ -19,6 +20,26 @@ use super::{CommandRequest, CommandResponse};
 const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10);
 const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1);
 
+/// Jitter a Duration by an integer percentage. Returned values are uniform
+/// in the range 100-pct..100+pct (i.e. a 5% jitter is 5% either way: a ~10% range)
+pub(super) fn period_jitter(d: Duration, pct: u32) -> Duration {
+    if d == Duration::ZERO {
+        d
+    } else {
+        rand::thread_rng().gen_range((d * (100 - pct)) / 100..(d * (100 + pct)) / 100)
+    }
+}
+
+/// When a periodic task first starts, it should wait for some time in the range 0..period, so
+/// that starting many such tasks at the same time spreads them across the time range.
+pub(super) fn period_warmup(period: Duration) -> Duration {
+    if period == Duration::ZERO {
+        period
+    } else {
+        rand::thread_rng().gen_range(Duration::ZERO..period)
+    }
+}
+
 /// Scheduling helper for background work across many tenants.
 ///
 /// Systems that need to run background work across many tenants may use this type
@@ -158,6 +179,13 @@ where
             // Schedule some work, if concurrency limit permits it
             self.spawn_pending();
 
+            // This message is printed every scheduling iteration as proof of liveness when looking at logs
+            tracing::info!(
+                "Status: {} tasks running, {} pending",
+                self.running.len(),
+                self.pending.len()
+            );
+
             // Between scheduling iterations, we will:
             // - Drain any complete tasks and spawn pending tasks
             // - Handle incoming administrative commands
@@ -237,7 +265,11 @@ where
 
         self.tasks.spawn(fut);
 
-        self.running.insert(tenant_shard_id, in_progress);
+        let replaced = self.running.insert(tenant_shard_id, in_progress);
+        debug_assert!(replaced.is_none());
+        if replaced.is_some() {
+            tracing::warn!(%tenant_shard_id, "Unexpectedly spawned a task when one was already running")
+        }
     }
 
     /// For all pending tenants that are eligible for execution, spawn their task.
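Assuming the `period_warmup` and `period_jitter` helpers introduced above, a periodic job might pick its next delay like this (a usage sketch, not code from the patch):

```rust
use std::time::Duration;

fn next_delay(period: Duration, first_run: bool) -> Duration {
    if first_run {
        // Spread initial runs uniformly across 0..period so a process
        // restart does not fire every tenant's job at once.
        period_warmup(period)
    } else {
        // +/-5% keeps tenants from staying phase-locked afterwards.
        period_jitter(period, 5)
    }
}
```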
@@ -247,7 +279,9 @@ where while !self.pending.is_empty() && self.running.len() < self.concurrency { // unwrap: loop condition includes !is_empty() let pending = self.pending.pop_front().unwrap(); - self.do_spawn(pending); + if !self.running.contains_key(pending.get_tenant_shard_id()) { + self.do_spawn(pending); + } } } @@ -300,6 +334,11 @@ where let tenant_shard_id = job.get_tenant_shard_id(); let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) { + tracing::info!( + tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug(), + "Command already running, waiting for it" + ); barrier } else { let running = self.spawn_now(job); diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index e0b1652d98..41d558d3f6 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -3,7 +3,7 @@ use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use anyhow::{bail, Context}; +use tenant_size_model::svg::SvgBranchKind; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; @@ -11,7 +11,7 @@ use tokio_util::sync::CancellationToken; use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; -use super::{LogicalSizeCalculationCause, Tenant}; +use super::{GcError, LogicalSizeCalculationCause, Tenant}; use crate::tenant::Timeline; use utils::id::TimelineId; use utils::lsn::Lsn; @@ -43,6 +43,40 @@ pub struct SegmentMeta { pub kind: LsnKind, } +#[derive(thiserror::Error, Debug)] +pub(crate) enum CalculateSyntheticSizeError { + /// Something went wrong internally to the calculation of logical size at a particular branch point + #[error("Failed to calculated logical size on timeline {timeline_id} at {lsn}: {error}")] + LogicalSize { + timeline_id: TimelineId, + lsn: Lsn, + error: CalculateLogicalSizeError, + }, + + /// Something went wrong internally when calculating GC parameters at start of size calculation + #[error(transparent)] + GcInfo(GcError), + + /// Totally unexpected errors, like panics joining a task + #[error(transparent)] + Fatal(anyhow::Error), + + /// Tenant shut down while calculating size + #[error("Cancelled")] + Cancelled, +} + +impl From for CalculateSyntheticSizeError { + fn from(value: GcError) -> Self { + match value { + GcError::TenantCancelled | GcError::TimelineCancelled => { + CalculateSyntheticSizeError::Cancelled + } + other => CalculateSyntheticSizeError::GcInfo(other), + } + } +} + impl SegmentMeta { fn size_needed(&self) -> bool { match self.kind { @@ -54,6 +88,9 @@ impl SegmentMeta { LsnKind::BranchPoint => true, LsnKind::GcCutOff => true, LsnKind::BranchEnd => false, + LsnKind::LeasePoint => true, + LsnKind::LeaseStart => false, + LsnKind::LeaseEnd => false, } } } @@ -70,6 +107,21 @@ pub enum LsnKind { GcCutOff, /// Last record LSN BranchEnd, + /// A LSN lease is granted here. + LeasePoint, + /// A lease starts from here. + LeaseStart, + /// Last record LSN for the lease (should have the same LSN as the previous [`LsnKind::LeaseStart`]). + LeaseEnd, +} + +impl From for SvgBranchKind { + fn from(kind: LsnKind) -> Self { + match kind { + LsnKind::LeasePoint | LsnKind::LeaseStart | LsnKind::LeaseEnd => SvgBranchKind::Lease, + _ => SvgBranchKind::Timeline, + } + } } /// Collect all relevant LSNs to the inputs. 
These will only be helpful in the serialized form as @@ -83,19 +135,20 @@ pub struct TimelineInputs { ancestor_lsn: Lsn, last_record: Lsn, latest_gc_cutoff: Lsn, - horizon_cutoff: Lsn, - pitr_cutoff: Lsn, /// Cutoff point based on GC settings - next_gc_cutoff: Lsn, + next_pitr_cutoff: Lsn, /// Cutoff point calculated from the user-supplied 'max_retention_period' retention_param_cutoff: Option, + + /// Lease points on the timeline + lease_points: Vec, } /// Gathers the inputs for the tenant sizing model. /// -/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which +/// Tenant size does not consider the latest state, but only the state until next_pitr_cutoff, which /// is updated on-demand, during the start of this calculation and separate from the /// [`TimelineInputs::latest_gc_cutoff`]. /// @@ -103,11 +156,8 @@ pub struct TimelineInputs { /// /// ```text /// 0-----|---------|----|------------| · · · · · |·> lsn -/// initdb_lsn branchpoints* next_gc_cutoff latest +/// initdb_lsn branchpoints* next_pitr_cutoff latest /// ``` -/// -/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the -/// tenant size will be zero. pub(super) async fn gather_inputs( tenant: &Tenant, limit: &Arc, @@ -116,12 +166,9 @@ pub(super) async fn gather_inputs( cause: LogicalSizeCalculationCause, cancel: &CancellationToken, ctx: &RequestContext, -) -> anyhow::Result { - // refresh is needed to update gc related pitr_cutoff and horizon_cutoff - tenant - .refresh_gc_info(cancel, ctx) - .await - .context("Failed to refresh gc_info before gathering inputs")?; +) -> Result { + // refresh is needed to update [`timeline::GcCutoffs`] + tenant.refresh_gc_info(cancel, ctx).await?; // Collect information about all the timelines let mut timelines = tenant.list_timelines(); @@ -183,20 +230,33 @@ pub(super) async fn gather_inputs( // new gc run, which we have no control over. however differently from `Timeline::gc` // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not // actually removing files. - let mut next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff); + // + // We only consider [`timeline::GcCutoffs::time`], and not [`timeline::GcCutoffs::space`], because from + // a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather + // than our internal space cutoff. This means that if someone drops a database and waits for their + // PITR interval, they will see synthetic size decrease, even if we are still storing data inside + // the space cutoff. + let mut next_pitr_cutoff = gc_info.cutoffs.time; // If the caller provided a shorter retention period, use that instead of the GC cutoff. let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period)); - if next_gc_cutoff < param_cutoff { - next_gc_cutoff = param_cutoff; + if next_pitr_cutoff < param_cutoff { + next_pitr_cutoff = param_cutoff; } Some(param_cutoff) } else { None }; - // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we + let lease_points = gc_info + .leases + .keys() + .filter(|&&lsn| lsn > ancestor_lsn) + .copied() + .collect::>(); + + // next_pitr_cutoff in parent branch are not of interest (right now at least), nor do we // want to query any logical size before initdb_lsn. 
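The cutoff selection above starts from the GC-derived PITR cutoff and tightens it if the caller supplied a shorter `max_retention_period`. A sketch with LSNs as plain `u64` for brevity (the real code uses the `Lsn` newtype):

```rust
/// Returns (effective cutoff, parameter cutoff if one was supplied).
fn effective_cutoff(
    next_pitr_cutoff: u64,
    last_record_lsn: u64,
    max_retention_period: Option<u64>,
) -> (u64, Option<u64>) {
    match max_retention_period {
        Some(max) => {
            let param_cutoff = last_record_lsn.saturating_sub(max);
            // The tighter (higher) of the two cutoffs wins.
            (next_pitr_cutoff.max(param_cutoff), Some(param_cutoff))
        }
        None => (next_pitr_cutoff, None),
    }
}
```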
     let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn);
@@ -204,12 +264,16 @@
         let mut lsns: Vec<(Lsn, LsnKind)> = gc_info
             .retain_lsns
             .iter()
-            .filter(|&&lsn| lsn > ancestor_lsn)
+            .filter(|(lsn, _child_id)| lsn > &ancestor_lsn)
             .copied()
             // this assumes there are no other retain_lsns than the branchpoints
-            .map(|lsn| (lsn, LsnKind::BranchPoint))
+            .map(|(lsn, _child_id)| (lsn, LsnKind::BranchPoint))
             .collect::<Vec<_>>();
 
+        lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
+
+        drop(gc_info);
+
         // Add branch points we collected earlier, just in case there were any that were
         // not present in retain_lsns. We will remove any duplicates below.
         if let Some(this_branchpoints) = branchpoints.get(&timeline_id) {
@@ -220,10 +284,10 @@
             )
         }
 
-        // Add a point for the GC cutoff
-        let branch_start_needed = next_gc_cutoff <= branch_start_lsn;
+        // Add a point for the PITR cutoff
+        let branch_start_needed = next_pitr_cutoff <= branch_start_lsn;
         if !branch_start_needed {
-            lsns.push((next_gc_cutoff, LsnKind::GcCutOff));
+            lsns.push((next_pitr_cutoff, LsnKind::GcCutOff));
         }
 
         lsns.sort_unstable();
@@ -256,17 +320,56 @@
             if kind == LsnKind::BranchPoint {
                 branchpoint_segments.insert((timeline_id, lsn), segments.len());
             }
+
             segments.push(SegmentMeta {
                 segment: Segment {
                     parent: Some(parent),
                     lsn: lsn.0,
                     size: None,
-                    needed: lsn > next_gc_cutoff,
+                    needed: lsn > next_pitr_cutoff,
                 },
                 timeline_id: timeline.timeline_id,
                 kind,
             });
-            parent += 1;
+
+            parent = segments.len() - 1;
+
+            if kind == LsnKind::LeasePoint {
+                // Needs `LeaseStart` and `LeaseEnd` as well to model a lease as a read-only branch that never writes data
+                // (i.e. its LSN has not advanced from ancestor_lsn), and therefore the three segments have the same LSN
+                // value. Without the other two segments, the calculation code would not count the leased LSN as a point
+                // to be retained.
+                // Did not use `BranchStart` or `BranchEnd` so we can differentiate branches and leases during debugging.
+                //
+                // Alt Design: rewrite the entire calculation code to be independent of timeline id. Both leases and
+                // branch points can be given a synthetic id so we can unite them.
+                let mut lease_parent = parent;
+
+                // Start of a lease.
+                segments.push(SegmentMeta {
+                    segment: Segment {
+                        parent: Some(lease_parent),
+                        lsn: lsn.0,
+                        size: None, // Filled in later, if necessary
+                        needed: lsn > next_pitr_cutoff, // only needed if the point is within retention.
+                    },
+                    timeline_id: timeline.timeline_id,
+                    kind: LsnKind::LeaseStart,
+                });
+                lease_parent += 1;
+
+                // End of the lease.
+ segments.push(SegmentMeta { + segment: Segment { + parent: Some(lease_parent), + lsn: lsn.0, + size: None, // Filled in later, if necessary + needed: true, // everything at the lease LSN must be readable => is needed + }, + timeline_id: timeline.timeline_id, + kind: LsnKind::LeaseEnd, + }); + } } // Current end of the timeline @@ -288,10 +391,9 @@ pub(super) async fn gather_inputs( last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), - horizon_cutoff: gc_info.horizon_cutoff, - pitr_cutoff: gc_info.pitr_cutoff, - next_gc_cutoff, + next_pitr_cutoff, retention_param_cutoff, + lease_points, }); } @@ -317,6 +419,12 @@ pub(super) async fn gather_inputs( ) .await?; + if tenant.cancel.is_cancelled() { + // If we're shutting down, return an error rather than a sparse result that might include some + // timelines from before we started shutting down + return Err(CalculateSyntheticSizeError::Cancelled); + } + Ok(ModelInputs { segments, timeline_inputs, @@ -325,9 +433,8 @@ pub(super) async fn gather_inputs( /// Augment 'segments' with logical sizes /// -/// this will probably conflict with on-demand downloaded layers, or at least force them all -/// to be downloaded -/// +/// This will leave segments' sizes as None if the Timeline associated with the segment is deleted concurrently +/// (i.e. we cannot read its logical size at a particular LSN). async fn fill_logical_sizes( timelines: &[Arc], segments: &mut [SegmentMeta], @@ -335,7 +442,7 @@ async fn fill_logical_sizes( logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, cause: LogicalSizeCalculationCause, ctx: &RequestContext, -) -> anyhow::Result<()> { +) -> Result<(), CalculateSyntheticSizeError> { let timeline_hash: HashMap> = HashMap::from_iter( timelines .iter() @@ -377,7 +484,7 @@ async fn fill_logical_sizes( } // Perform the size lookups - let mut have_any_error = false; + let mut have_any_error = None; while let Some(res) = joinset.join_next().await { // each of these come with Result, JoinError> // because of spawn + spawn_blocking @@ -388,21 +495,36 @@ async fn fill_logical_sizes( Err(join_error) => { // cannot really do anything, as this panic is likely a bug error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}"); - have_any_error = true; + + have_any_error = Some(CalculateSyntheticSizeError::Fatal( + anyhow::anyhow!(join_error) + .context("task that calls spawn_ondemand_logical_size_calculation"), + )); } Ok(Err(recv_result_error)) => { // cannot really do anything, as this panic is likely a bug error!("failed to receive logical size query result: {recv_result_error:#}"); - have_any_error = true; + have_any_error = Some(CalculateSyntheticSizeError::Fatal( + anyhow::anyhow!(recv_result_error) + .context("Receiving logical size query result"), + )); } Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => { - if !matches!(error, CalculateLogicalSizeError::Cancelled) { + if matches!(error, CalculateLogicalSizeError::Cancelled) { + // Skip this: it's okay if one timeline among many is shutting down while we + // calculate inputs for the overall tenant. 
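The lease modelling just above turns each lease into a chain of three segments at the same LSN, with only the final `LeaseEnd` unconditionally `needed`, so the sizing model retains everything readable at that LSN. A reduced sketch of that shape with hypothetical `Seg`/`Kind` types:

```rust
#[derive(Debug, PartialEq)]
enum Kind {
    LeasePoint,
    LeaseStart,
    LeaseEnd,
}

struct Seg {
    parent: Option<usize>,
    lsn: u64,
    needed: bool,
    kind: Kind,
}

/// Push the three-segment lease chain: point -> start -> end, all at `lsn`.
fn push_lease(segments: &mut Vec<Seg>, parent: usize, lsn: u64, cutoff: u64) {
    segments.push(Seg { parent: Some(parent), lsn, needed: lsn > cutoff, kind: Kind::LeasePoint });
    let start_parent = segments.len() - 1;
    segments.push(Seg { parent: Some(start_parent), lsn, needed: lsn > cutoff, kind: Kind::LeaseStart });
    let end_parent = segments.len() - 1;
    // Everything at the leased LSN must stay readable, so the end is always needed.
    segments.push(Seg { parent: Some(end_parent), lsn, needed: true, kind: Kind::LeaseEnd });
}
```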
+ continue; + } else { warn!( timeline_id=%timeline.timeline_id, "failed to calculate logical size at {lsn}: {error:#}" ); + have_any_error = Some(CalculateSyntheticSizeError::LogicalSize { + timeline_id: timeline.timeline_id, + lsn, + error, + }); } - have_any_error = true; } Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => { debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated"); @@ -416,10 +538,10 @@ async fn fill_logical_sizes( // prune any keys not needed anymore; we record every used key and added key. logical_size_cache.retain(|key, _| sizes_needed.contains_key(key)); - if have_any_error { + if let Some(error) = have_any_error { // we cannot complete this round, because we are missing data. // we have however cached all we were able to request calculation on. - anyhow::bail!("failed to calculate some logical_sizes"); + return Err(error); } // Insert the looked up sizes to the Segments @@ -433,33 +555,28 @@ async fn fill_logical_sizes( if let Some(Some(size)) = sizes_needed.get(&(timeline_id, lsn)) { seg.segment.size = Some(*size); - } else { - bail!("could not find size at {} in timeline {}", lsn, timeline_id); } } Ok(()) } impl ModelInputs { - pub fn calculate_model(&self) -> anyhow::Result { + pub fn calculate_model(&self) -> tenant_size_model::StorageModel { // Convert SegmentMetas into plain Segments - let storage = StorageModel { + StorageModel { segments: self .segments .iter() .map(|seg| seg.segment.clone()) .collect(), - }; - - Ok(storage) + } } // calculate total project size - pub fn calculate(&self) -> anyhow::Result { - let storage = self.calculate_model()?; + pub fn calculate(&self) -> u64 { + let storage = self.calculate_model(); let sizes = storage.calculate(); - - Ok(sizes.total_size) + sizes.total_size } } @@ -616,37 +733,34 @@ fn verify_size_for_multiple_branches() { "ancestor_lsn": "0/18D3D98", "last_record": "0/2230CD0", "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/2210CD0", - "pitr_cutoff": "0/2210CD0", - "next_gc_cutoff": "0/2210CD0", - "retention_param_cutoff": null + "next_pitr_cutoff": "0/2210CD0", + "retention_param_cutoff": null, + "lease_points": [] }, { "timeline_id": "454626700469f0a9914949b9d018e876", "ancestor_lsn": "0/176D998", "last_record": "0/1837770", "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/1817770", - "pitr_cutoff": "0/1817770", - "next_gc_cutoff": "0/1817770", - "retention_param_cutoff": null + "next_pitr_cutoff": "0/1817770", + "retention_param_cutoff": null, + "lease_points": [] }, { "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", "ancestor_lsn": "0/0", "last_record": "0/18D3D98", "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/18B3D98", - "pitr_cutoff": "0/18B3D98", - "next_gc_cutoff": "0/18B3D98", - "retention_param_cutoff": null + "next_pitr_cutoff": "0/18B3D98", + "retention_param_cutoff": null, + "lease_points": [] } ] } "#; let inputs: ModelInputs = serde_json::from_str(doc).unwrap(); - assert_eq!(inputs.calculate().unwrap(), 37_851_408); + assert_eq!(inputs.calculate(), 37_851_408); } #[test] @@ -691,17 +805,16 @@ fn verify_size_for_one_branch() { "ancestor_lsn": "0/0", "last_record": "47/280A5860", "latest_gc_cutoff": "47/240A5860", - "horizon_cutoff": "47/240A5860", - "pitr_cutoff": "47/240A5860", - "next_gc_cutoff": "47/240A5860", - "retention_param_cutoff": "0/0" + "next_pitr_cutoff": "47/240A5860", + "retention_param_cutoff": "0/0", + "lease_points": [] } ] }"#; let model: ModelInputs = serde_json::from_str(doc).unwrap(); - let res = 
model.calculate_model().unwrap().calculate(); + let res = model.calculate_model().calculate(); println!("calculated synthetic size: {}", res.total_size); println!("result: {:?}", serde_json::to_string(&res.segments)); diff --git a/pageserver/src/tenant/span.rs b/pageserver/src/tenant/span.rs deleted file mode 100644 index 04e92f4096..0000000000 --- a/pageserver/src/tenant/span.rs +++ /dev/null @@ -1,17 +0,0 @@ -#[cfg(debug_assertions)] -use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor}; - -#[cfg(not(debug_assertions))] -pub(crate) fn debug_assert_current_span_has_tenant_id() {} - -#[cfg(debug_assertions)] -pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy> = - once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id"])); - -#[cfg(debug_assertions)] -#[track_caller] -pub(crate) fn debug_assert_current_span_has_tenant_id() { - if let Err(missing) = check_fields_present!([&*TENANT_ID_EXTRACTOR]) { - panic!("missing extractors: {missing:?}") - } -} diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 6e9a4932d8..dac6b2f893 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -1,39 +1,43 @@ //! Common traits and structs for layers pub mod delta_layer; -mod filename; pub mod image_layer; -mod inmemory_layer; +pub mod inmemory_layer; pub(crate) mod layer; mod layer_desc; +mod layer_name; +pub mod merge_iterator; + +pub mod split_writer; use crate::context::{AccessStatsBehavior, RequestContext}; -use crate::task_mgr::TaskKind; +use crate::repository::Value; use crate::walrecord::NeonWalRecord; use bytes::Bytes; -use enum_map::EnumMap; -use enumset::EnumSet; -use once_cell::sync::Lazy; -use pageserver_api::models::{ - LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus, -}; +use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; +use std::cmp::{Ordering, Reverse}; +use std::collections::hash_map::Entry; +use std::collections::{BinaryHeap, HashMap}; use std::ops::Range; -use std::sync::Mutex; +use std::sync::Arc; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use tracing::warn; -use utils::history_buffer::HistoryBufferWithDropCounter; -use utils::rate_limit::RateLimit; -use utils::{id::TimelineId, lsn::Lsn}; +use utils::lsn::Lsn; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; -pub use filename::{DeltaFileName, ImageFileName, LayerFileName}; pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; +pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName}; pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; +use self::inmemory_layer::InMemoryLayerFileId; + +use super::timeline::GetVectoredError; +use super::PageReconstructError; + pub fn range_overlaps(a: &Range, b: &Range) -> bool where T: PartialOrd, @@ -61,115 +65,456 @@ where /// the same ValueReconstructState struct in the next 'get_value_reconstruct_data' /// call, to collect more records. 
/// -#[derive(Debug)] -pub struct ValueReconstructState { - pub records: Vec<(Lsn, NeonWalRecord)>, - pub img: Option<(Lsn, Bytes)>, +#[derive(Debug, Default)] +pub(crate) struct ValueReconstructState { + pub(crate) records: Vec<(Lsn, NeonWalRecord)>, + pub(crate) img: Option<(Lsn, Bytes)>, } -/// Return value from [`Layer::get_value_reconstruct_data`] -#[derive(Clone, Copy, Debug)] -pub enum ValueReconstructResult { - /// Got all the data needed to reconstruct the requested page +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub(crate) enum ValueReconstructSituation { Complete, - /// This layer didn't contain all the required data, the caller should look up - /// the predecessor layer at the returned LSN and collect more data from there. + #[default] Continue, - - /// This layer didn't contain data needed to reconstruct the page version at - /// the returned LSN. This is usually considered an error, but might be OK - /// in some circumstances. - Missing, } -#[derive(Debug)] -pub struct LayerAccessStats(Mutex); - -/// This struct holds two instances of [`LayerAccessStatsInner`]. -/// Accesses are recorded to both instances. -/// The `for_scraping_api`instance can be reset from the management API via [`LayerAccessStatsReset`]. -/// The `for_eviction_policy` is never reset. +/// Reconstruct data accumulated for a single key during a vectored get #[derive(Debug, Default, Clone)] -struct LayerAccessStatsLocked { - for_scraping_api: LayerAccessStatsInner, - for_eviction_policy: LayerAccessStatsInner, +pub(crate) struct VectoredValueReconstructState { + pub(crate) records: Vec<(Lsn, NeonWalRecord)>, + pub(crate) img: Option<(Lsn, Bytes)>, + + situation: ValueReconstructSituation, } -impl LayerAccessStatsLocked { - fn iter_mut(&mut self) -> impl Iterator { - [&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter() +impl VectoredValueReconstructState { + fn get_cached_lsn(&self) -> Option { + self.img.as_ref().map(|img| img.0) } } -#[derive(Debug, Default, Clone)] -struct LayerAccessStatsInner { - first_access: Option, - count_by_access_kind: EnumMap, - task_kind_flag: EnumSet, - last_accesses: HistoryBufferWithDropCounter, - last_residence_changes: HistoryBufferWithDropCounter, -} +impl From for ValueReconstructState { + fn from(mut state: VectoredValueReconstructState) -> Self { + // walredo expects the records to be descending in terms of Lsn + state.records.sort_by_key(|(lsn, _)| Reverse(*lsn)); -#[derive(Debug, Clone, Copy)] -pub(crate) struct LayerAccessStatFullDetails { - pub(crate) when: SystemTime, - pub(crate) task_kind: TaskKind, - pub(crate) access_kind: LayerAccessKind, -} - -#[derive(Clone, Copy, strum_macros::EnumString)] -pub enum LayerAccessStatsReset { - NoReset, - JustTaskKindFlags, - AllStats, -} - -fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 { - ts.duration_since(UNIX_EPOCH) - .expect("better to die in this unlikely case than report false stats") - .as_millis() - .try_into() - .expect("64 bits is enough for few more years") -} - -impl LayerAccessStatFullDetails { - fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails { - let Self { - when, - task_kind, - access_kind, - } = self; - pageserver_api::models::LayerAccessStatFullDetails { - when_millis_since_epoch: system_time_to_millis_since_epoch(when), - task_kind: task_kind.into(), // into static str, powered by strum_macros - access_kind: *access_kind, + ValueReconstructState { + records: state.records, + img: state.img, } } } -impl LayerAccessStats { - /// Create 
an empty stats object.
-    ///
-    /// The caller is responsible for recording a residence event
-    /// using [`record_residence_event`] before calling `latest_activity`.
-    /// If they don't, [`latest_activity`] will return `None`.
-    ///
-    /// [`record_residence_event`]: Self::record_residence_event
-    /// [`latest_activity`]: Self::latest_activity
-    pub(crate) fn empty_will_record_residence_event_later() -> Self {
-        LayerAccessStats(Mutex::default())
+/// Bag of data accumulated during a vectored get.
+pub(crate) struct ValuesReconstructState {
+    /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
+    /// should not expect to get anything from this hashmap.
+    pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
+    /// The keys which have already been retrieved
+    keys_done: KeySpaceRandomAccum,
+
+    /// The keys covered by the image layers
+    keys_with_image_coverage: Option<Range<Key>>,
+
+    // Statistics that are still accessible as a caller of `get_vectored_impl`.
+    layers_visited: u32,
+    delta_layers_visited: u32,
+}
+
+impl ValuesReconstructState {
+    pub(crate) fn new() -> Self {
+        Self {
+            keys: HashMap::new(),
+            keys_done: KeySpaceRandomAccum::new(),
+            keys_with_image_coverage: None,
+            layers_visited: 0,
+            delta_layers_visited: 0,
+        }
+    }
    }

-    /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
+    /// Associate a key with the error which it encountered and mark it as done
+    pub(crate) fn on_key_error(&mut self, key: Key, err: PageReconstructError) {
+        let previous = self.keys.insert(key, Err(err));
+        if let Some(Ok(state)) = previous {
+            if state.situation == ValueReconstructSituation::Continue {
+                self.keys_done.add_key(key);
+            }
+        }
+    }
+
+    pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) {
+        self.layers_visited += 1;
+        if let ReadableLayer::PersistentLayer(layer) = layer {
+            if layer.layer_desc().is_delta() {
+                self.delta_layers_visited += 1;
+            }
+        }
+    }
+
+    pub(crate) fn get_delta_layers_visited(&self) -> u32 {
+        self.delta_layers_visited
+    }
+
+    pub(crate) fn get_layers_visited(&self) -> u32 {
+        self.layers_visited
+    }
+
+    /// This function is called after reading a keyspace from a layer.
+    /// It checks if the read path has now moved past the cached Lsn for any keys.
    ///
-    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
+    /// Implementation note: We intentionally iterate over the keys for which we've
+    /// already collected some reconstruct data. This avoids scaling complexity with
+    /// the size of the search space.
+    pub(crate) fn on_lsn_advanced(&mut self, keyspace: &KeySpace, advanced_to: Lsn) {
+        for (key, value) in self.keys.iter_mut() {
+            if !keyspace.contains(key) {
+                continue;
+            }
+
+            if let Ok(state) = value {
+                if state.situation != ValueReconstructSituation::Complete
+                    && state.get_cached_lsn() >= Some(advanced_to)
+                {
+                    state.situation = ValueReconstructSituation::Complete;
+                    self.keys_done.add_key(*key);
+                }
+            }
+        }
+    }
+
+    /// On hitting an image layer, we can mark all keys in this range as done, because
+    /// if the image layer does not contain a key, it is deleted/never added.
+    pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range<Key>) {
+        let prev_val = self.keys_with_image_coverage.replace(key_range.clone());
+        assert_eq!(
+            prev_val, None,
+            "should consume the keyspace before the next iteration"
+        );
+    }
+
+    /// Update the state collected for a given key.
+    /// Returns true if this was the last value needed for the key and false otherwise.
/// - /// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad - /// [`record_residence_event`]: Self::record_residence_event - pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self { - let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default())); - new.record_residence_event(status, LayerResidenceEventReason::LayerLoad); - new + /// If the key is done after the update, mark it as such. + pub(crate) fn update_key( + &mut self, + key: &Key, + lsn: Lsn, + value: Value, + ) -> ValueReconstructSituation { + let state = self + .keys + .entry(*key) + .or_insert(Ok(VectoredValueReconstructState::default())); + + if let Ok(state) = state { + let key_done = match state.situation { + ValueReconstructSituation::Complete => unreachable!(), + ValueReconstructSituation::Continue => match value { + Value::Image(img) => { + state.img = Some((lsn, img)); + true + } + Value::WalRecord(rec) => { + debug_assert!( + Some(lsn) > state.get_cached_lsn(), + "Attempt to collect a record below cached LSN for walredo: {} < {}", + lsn, + state + .get_cached_lsn() + .expect("Assertion can only fire if a cached lsn is present") + ); + + let will_init = rec.will_init(); + state.records.push((lsn, rec)); + will_init + } + }, + }; + + if key_done && state.situation == ValueReconstructSituation::Continue { + state.situation = ValueReconstructSituation::Complete; + self.keys_done.add_key(*key); + } + + state.situation + } else { + ValueReconstructSituation::Complete + } + } + + /// Returns the Lsn at which this key is cached if one exists. + /// The read path should go no further than this Lsn for the given key. + pub(crate) fn get_cached_lsn(&self, key: &Key) -> Option { + self.keys + .get(key) + .and_then(|k| k.as_ref().ok()) + .and_then(|state| state.get_cached_lsn()) + } + + /// Returns the key space describing the keys that have + /// been marked as completed since the last call to this function. + /// Returns individual keys done, and the image layer coverage. + pub(crate) fn consume_done_keys(&mut self) -> (KeySpace, Option>) { + ( + self.keys_done.consume_keyspace(), + self.keys_with_image_coverage.take(), + ) + } +} + +impl Default for ValuesReconstructState { + fn default() -> Self { + Self::new() + } +} + +/// A key that uniquely identifies a layer in a timeline +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +pub(crate) enum LayerId { + PersitentLayerId(PersistentLayerKey), + InMemoryLayerId(InMemoryLayerFileId), +} + +/// Layer wrapper for the read path. Note that it is valid +/// to use these layers even after external operations have +/// been performed on them (compaction, freeze, etc.). +#[derive(Debug)] +pub(crate) enum ReadableLayer { + PersistentLayer(Layer), + InMemoryLayer(Arc), +} + +/// A partial description of a read to be done. +#[derive(Debug, Clone)] +struct ReadDesc { + /// An id used to resolve the readable layer within the fringe + layer_id: LayerId, + /// Lsn range for the read, used for selecting the next read + lsn_range: Range, +} + +/// Data structure which maintains a fringe of layers for the +/// read path. The fringe is the set of layers which intersects +/// the current keyspace that the search is descending on. +/// Each layer tracks the keyspace that intersects it. +/// +/// The fringe must appear sorted by Lsn. Hence, it uses +/// a two layer indexing scheme. 
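The `LayerFringe` introduced below leans on `BinaryHeap` being a max-heap: ordering `ReadDesc` by `lsn_range.end` (see its `Ord` impl further down) means `pop` always yields the planned read with the highest end LSN, so the search descends through history. A minimal standalone sketch of that ordering, with the descriptor simplified to a single integer:

use std::cmp::Ordering;
use std::collections::BinaryHeap;

#[derive(Debug, PartialEq, Eq)]
struct Desc(u64); // stand-in for ReadDesc, keyed by lsn_range.end

impl Ord for Desc {
    fn cmp(&self, other: &Self) -> Ordering {
        self.0.cmp(&other.0)
    }
}

impl PartialOrd for Desc {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    // A max-heap pops the highest end LSN first: descending traversal.
    let mut heap = BinaryHeap::from([Desc(10), Desc(40), Desc(25)]);
    assert_eq!(heap.pop(), Some(Desc(40)));
    assert_eq!(heap.pop(), Some(Desc(25)));
    assert_eq!(heap.pop(), Some(Desc(10)));
}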
+#[derive(Debug)] +pub(crate) struct LayerFringe { + planned_reads_by_lsn: BinaryHeap, + layers: HashMap, +} + +#[derive(Debug)] +struct LayerKeyspace { + layer: ReadableLayer, + target_keyspace: KeySpaceRandomAccum, +} + +impl LayerFringe { + pub(crate) fn new() -> Self { + LayerFringe { + planned_reads_by_lsn: BinaryHeap::new(), + layers: HashMap::new(), + } + } + + pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range)> { + let read_desc = match self.planned_reads_by_lsn.pop() { + Some(desc) => desc, + None => return None, + }; + + let removed = self.layers.remove_entry(&read_desc.layer_id); + + match removed { + Some(( + _, + LayerKeyspace { + layer, + mut target_keyspace, + }, + )) => Some(( + layer, + target_keyspace.consume_keyspace(), + read_desc.lsn_range, + )), + None => unreachable!("fringe internals are always consistent"), + } + } + + pub(crate) fn update( + &mut self, + layer: ReadableLayer, + keyspace: KeySpace, + lsn_range: Range, + ) { + let layer_id = layer.id(); + let entry = self.layers.entry(layer_id.clone()); + match entry { + Entry::Occupied(mut entry) => { + entry.get_mut().target_keyspace.add_keyspace(keyspace); + } + Entry::Vacant(entry) => { + self.planned_reads_by_lsn.push(ReadDesc { + lsn_range, + layer_id: layer_id.clone(), + }); + let mut accum = KeySpaceRandomAccum::new(); + accum.add_keyspace(keyspace); + entry.insert(LayerKeyspace { + layer, + target_keyspace: accum, + }); + } + } + } +} + +impl Default for LayerFringe { + fn default() -> Self { + Self::new() + } +} + +impl Ord for ReadDesc { + fn cmp(&self, other: &Self) -> Ordering { + let ord = self.lsn_range.end.cmp(&other.lsn_range.end); + if ord == std::cmp::Ordering::Equal { + self.lsn_range.start.cmp(&other.lsn_range.start).reverse() + } else { + ord + } + } +} + +impl PartialOrd for ReadDesc { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for ReadDesc { + fn eq(&self, other: &Self) -> bool { + self.lsn_range == other.lsn_range + } +} + +impl Eq for ReadDesc {} + +impl ReadableLayer { + pub(crate) fn id(&self) -> LayerId { + match self { + Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()), + Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()), + } + } + + pub(crate) async fn get_values_reconstruct_data( + &self, + keyspace: KeySpace, + lsn_range: Range, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + match self { + ReadableLayer::PersistentLayer(layer) => { + layer + .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx) + .await + } + ReadableLayer::InMemoryLayer(layer) => { + layer + .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx) + .await + } + } + } +} + +/// Layers contain a hint indicating whether they are likely to be used for reads. +/// +/// This is a hint rather than an authoritative value, so that we do not have to update it synchronously +/// when changing the visibility of layers (for example when creating a branch that makes some previously +/// covered layers visible). It should be used for cache management but not for correctness-critical checks. 
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum LayerVisibilityHint {
+    /// A Visible layer might be read while serving a read, because there is not an image layer between it
+    /// and a readable LSN (the tip of the branch or a child's branch point)
+    Visible,
+    /// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates
+    /// a branch or ephemeral endpoint at an LSN below the layer that covers this.
+    Covered,
+}
+
+pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64);
+
+#[derive(Clone, Copy, strum_macros::EnumString)]
+pub(crate) enum LayerAccessStatsReset {
+    NoReset,
+    AllStats,
+}
+
+impl Default for LayerAccessStats {
+    fn default() -> Self {
+        // Default value is to assume resident since creation time, and visible.
+        let (_mask, mut value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, SystemTime::now());
+        value |= 0x1 << Self::VISIBILITY_SHIFT;
+
+        Self(std::sync::atomic::AtomicU64::new(value))
+    }
+}
+
+// Efficient store of two very-low-resolution timestamps and some bits. Used for storing last access time and
+// last residence change time.
+impl LayerAccessStats {
+    // How many high bits to drop from a u32 timestamp?
+    // - Only storing up to a u32 timestamp will work fine until 2038 (if this code is still in use
+    //   after that, this software has been very successful!)
+    // - Dropping the top bit is implicitly safe because unix timestamps are meant to be
+    //   stored in an i32, so they never used it.
+    // - Dropping the next two bits is safe because this code is only running on systems in
+    //   years >= 2024, and these bits have been 1 since 2021
+    //
+    // Therefore we may store only 29 bits for a timestamp with one second resolution. We do
+    // this truncation to make space for some flags in the high bits of our u64.
+    const TS_DROP_HIGH_BITS: u32 = u32::count_ones(Self::TS_ONES) + 1;
+    const TS_MASK: u32 = 0x1f_ff_ff_ff;
+    const TS_ONES: u32 = 0x60_00_00_00;
+
+    const ATIME_SHIFT: u32 = 0;
+    const RTIME_SHIFT: u32 = 32 - Self::TS_DROP_HIGH_BITS;
+    const VISIBILITY_SHIFT: u32 = 64 - 2 * Self::TS_DROP_HIGH_BITS;
+
+    fn write_bits(&self, mask: u64, value: u64) -> u64 {
+        self.0
+            .fetch_update(
+                // TODO: decide what orderings are correct
+                std::sync::atomic::Ordering::Relaxed,
+                std::sync::atomic::Ordering::Relaxed,
+                |v| Some((v & !mask) | (value & mask)),
+            )
+            .expect("Inner function is infallible")
+    }
+
+    fn to_low_res_timestamp(shift: u32, time: SystemTime) -> (u64, u64) {
+        // Mask off the high bits of the timestamp (see TS_DROP_HIGH_BITS above);
+        // the stored value keeps its one-second resolution.
+        let timestamp = time.duration_since(UNIX_EPOCH).unwrap().as_secs() & (Self::TS_MASK as u64);
+
+        ((Self::TS_MASK as u64) << shift, timestamp << shift)
+    }
+
+    fn read_low_res_timestamp(&self, shift: u32) -> Option<SystemTime> {
+        let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
+
+        let ts_bits = (read & ((Self::TS_MASK as u64) << shift)) >> shift;
+        if ts_bits == 0 {
+            None
+        } else {
+            Some(UNIX_EPOCH + Duration::from_secs(ts_bits | (Self::TS_ONES as u64)))
+        }
    }

    /// Record a change in layer residency.
@@ -185,128 +530,130 @@ impl LayerAccessStats {
    /// - Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map.
    /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock.
    /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event.
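A worked example of the packing defined above, with an illustrative timestamp (the constants are duplicated here rather than taken from the private ones on LayerAccessStats): with TS_DROP_HIGH_BITS = 3, the access time occupies bits 0..=28, the residence time bits 29..=57, and the visibility hint bit 58.

const TS_MASK: u64 = 0x1f_ff_ff_ff; // low 29 bits
const TS_ONES: u64 = 0x60_00_00_00; // bits 29 and 30, set in every unix time since 2021
const RTIME_SHIFT: u32 = 29;
const VISIBILITY_SHIFT: u32 = 58;

fn main() {
    let now: u64 = 0x6684_2600; // a mid-2024 unix timestamp, in seconds
    let stored = now & TS_MASK; // drop bits 29..=31, leaving 0x0684_2600
    let word = (stored << RTIME_SHIFT)    // residence time: bits 29..=57
        | stored                          // access time: bits 0..=28
        | (1u64 << VISIBILITY_SHIFT);     // visibility hint: bit 58
    // Reads reverse the truncation by OR-ing the constant high bits back in:
    let atime = (word & TS_MASK) | TS_ONES;
    assert_eq!(atime, now);
    assert_eq!((word >> VISIBILITY_SHIFT) & 1, 1); // LayerVisibilityHint::Visible
}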
- /// - pub(crate) fn record_residence_event( - &self, - status: LayerResidenceStatus, - reason: LayerResidenceEventReason, - ) { - let mut locked = self.0.lock().unwrap(); - locked.iter_mut().for_each(|inner| { - inner - .last_residence_changes - .write(LayerResidenceEvent::new(status, reason)) - }); + pub(crate) fn record_residence_event_at(&self, now: SystemTime) { + let (mask, value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, now); + self.write_bits(mask, value); } - fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) { + pub(crate) fn record_residence_event(&self) { + self.record_residence_event_at(SystemTime::now()) + } + + fn record_access_at(&self, now: SystemTime) -> bool { + let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now); + + // A layer which is accessed must be visible. + mask |= 0x1 << Self::VISIBILITY_SHIFT; + value |= 0x1 << Self::VISIBILITY_SHIFT; + + let old_bits = self.write_bits(mask, value); + !matches!( + self.decode_visibility(old_bits), + LayerVisibilityHint::Visible + ) + } + + /// Returns true if we modified the layer's visibility to set it to Visible implicitly + /// as a result of this access + pub(crate) fn record_access(&self, ctx: &RequestContext) -> bool { if ctx.access_stats_behavior() == AccessStatsBehavior::Skip { - return; + return false; } - let this_access = LayerAccessStatFullDetails { - when: SystemTime::now(), - task_kind: ctx.task_kind(), - access_kind, - }; - - let mut locked = self.0.lock().unwrap(); - locked.iter_mut().for_each(|inner| { - inner.first_access.get_or_insert(this_access); - inner.count_by_access_kind[access_kind] += 1; - inner.task_kind_flag |= ctx.task_kind(); - inner.last_accesses.write(this_access); - }) + self.record_access_at(SystemTime::now()) } fn as_api_model( &self, reset: LayerAccessStatsReset, ) -> pageserver_api::models::LayerAccessStats { - let mut locked = self.0.lock().unwrap(); - let inner = &mut locked.for_scraping_api; - let LayerAccessStatsInner { - first_access, - count_by_access_kind, - task_kind_flag, - last_accesses, - last_residence_changes, - } = inner; let ret = pageserver_api::models::LayerAccessStats { - access_count_by_access_kind: count_by_access_kind - .iter() - .map(|(kind, count)| (kind, *count)) - .collect(), - task_kind_access_flag: task_kind_flag - .iter() - .map(|task_kind| task_kind.into()) // into static str, powered by strum_macros - .collect(), - first: first_access.as_ref().map(|a| a.as_api_model()), - accesses_history: last_accesses.map(|m| m.as_api_model()), - residence_events_history: last_residence_changes.clone(), + access_time: self + .read_low_res_timestamp(Self::ATIME_SHIFT) + .unwrap_or(UNIX_EPOCH), + residence_time: self + .read_low_res_timestamp(Self::RTIME_SHIFT) + .unwrap_or(UNIX_EPOCH), + visible: matches!(self.visibility(), LayerVisibilityHint::Visible), }; match reset { - LayerAccessStatsReset::NoReset => (), - LayerAccessStatsReset::JustTaskKindFlags => { - inner.task_kind_flag.clear(); - } + LayerAccessStatsReset::NoReset => {} LayerAccessStatsReset::AllStats => { - *inner = LayerAccessStatsInner::default(); + self.write_bits((Self::TS_MASK as u64) << Self::ATIME_SHIFT, 0x0); + self.write_bits((Self::TS_MASK as u64) << Self::RTIME_SHIFT, 0x0); } } ret } - /// Get the latest access timestamp, falling back to latest residence event. - /// - /// This function can only return `None` if there has not yet been a call to the - /// [`record_residence_event`] method. 
That would generally be considered an - /// implementation error. This function logs a rate-limited warning in that case. - /// - /// TODO: use type system to avoid the need for `fallback`. - /// The approach in - /// could be used to enforce that a residence event is recorded - /// before a layer is added to the layer map. We could also have - /// a layer wrapper type that holds the LayerAccessStats, and ensure - /// that that type can only be produced by inserting into the layer map. - /// - /// [`record_residence_event`]: Self::record_residence_event - pub(crate) fn latest_activity(&self) -> Option { - let locked = self.0.lock().unwrap(); - let inner = &locked.for_eviction_policy; - match inner.last_accesses.recent() { - Some(a) => Some(a.when), - None => match inner.last_residence_changes.recent() { - Some(e) => Some(e.timestamp), - None => { - static WARN_RATE_LIMIT: Lazy> = - Lazy::new(|| Mutex::new((0, RateLimit::new(Duration::from_secs(10))))); - let mut guard = WARN_RATE_LIMIT.lock().unwrap(); - guard.0 += 1; - let occurences = guard.0; - guard.1.call(move || { - warn!(parent: None, occurences, "latest_activity not available, this is an implementation bug, using fallback value"); - }); - None - } - }, + /// Get the latest access timestamp, falling back to latest residence event. The latest residence event + /// will be this Layer's construction time, if its residence hasn't changed since then. + pub(crate) fn latest_activity(&self) -> SystemTime { + if let Some(t) = self.read_low_res_timestamp(Self::ATIME_SHIFT) { + t + } else { + self.read_low_res_timestamp(Self::RTIME_SHIFT) + .expect("Residence time is set on construction") } } + + /// Whether this layer has been accessed (excluding in [`AccessStatsBehavior::Skip`]). + /// + /// This indicates whether the layer has been used for some purpose that would motivate + /// us to keep it on disk, such as for serving a getpage request. + fn accessed(&self) -> bool { + // Consider it accessed if the most recent access is more recent than + // the most recent change in residence status. + match ( + self.read_low_res_timestamp(Self::ATIME_SHIFT), + self.read_low_res_timestamp(Self::RTIME_SHIFT), + ) { + (None, _) => false, + (Some(_), None) => true, + (Some(a), Some(r)) => a >= r, + } + } + + /// Helper for extracting the visibility hint from the literal value of our inner u64 + fn decode_visibility(&self, bits: u64) -> LayerVisibilityHint { + match (bits >> Self::VISIBILITY_SHIFT) & 0x1 { + 1 => LayerVisibilityHint::Visible, + 0 => LayerVisibilityHint::Covered, + _ => unreachable!(), + } + } + + /// Returns the old value which has been replaced + pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) -> LayerVisibilityHint { + let value = match visibility { + LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT, + LayerVisibilityHint::Covered => 0x0, + }; + + let old_bits = self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value); + self.decode_visibility(old_bits) + } + + pub(crate) fn visibility(&self) -> LayerVisibilityHint { + let read = self.0.load(std::sync::atomic::Ordering::Relaxed); + self.decode_visibility(read) + } } /// Get a layer descriptor from a layer. -pub trait AsLayerDesc { +pub(crate) trait AsLayerDesc { /// Get the layer descriptor. 
fn layer_desc(&self) -> &PersistentLayerDesc; } pub mod tests { use pageserver_api::shard::TenantShardId; + use utils::id::TimelineId; use super::*; - impl From for PersistentLayerDesc { - fn from(value: DeltaFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: DeltaLayerName) -> Self { PersistentLayerDesc::new_delta( TenantShardId::from([0; 18]), TimelineId::from_array([0; 16]), @@ -317,8 +664,8 @@ pub mod tests { } } - impl From for PersistentLayerDesc { - fn from(value: ImageFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: ImageLayerName) -> Self { PersistentLayerDesc::new_img( TenantShardId::from([0; 18]), TimelineId::from_array([0; 16]), @@ -329,11 +676,11 @@ pub mod tests { } } - impl From for PersistentLayerDesc { - fn from(value: LayerFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: LayerName) -> Self { match value { - LayerFileName::Delta(d) => Self::from(d), - LayerFileName::Image(i) => Self::from(i), + LayerName::Delta(d) => Self::from(d), + LayerName::Image(i) => Self::from(i), } } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 4ded6d6a8d..34f1b15138 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -20,8 +20,8 @@ //! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051 //! ``` //! -//! Every delta file consists of three parts: "summary", "index", and -//! "values". The summary is a fixed size header at the beginning of the file, +//! Every delta file consists of three parts: "summary", "values", and +//! "index". The summary is a fixed size header at the beginning of the file, //! and it contains basic information about the layer, and offsets to the other //! parts. The "index" is a B-tree, mapping from Key and LSN to an offset in the //! "values" part. The actual page images and WAL records are stored in the @@ -29,28 +29,44 @@ //! 
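To restate the layout described above in code, here is a hedged sketch (SummarySketch and sections are hypothetical names; only index_start_blk mirrors a real Summary field) of how the three sections partition the file, assuming the pageserver's 8 KiB block size:

use std::ops::Range;

const PAGE_SZ: u64 = 8192;

struct SummarySketch {
    index_start_blk: u32, // block number where the B-tree index begins
}

// (summary, values, index) byte ranges within a delta layer file.
fn sections(s: &SummarySketch, file_len: u64) -> (Range<u64>, Range<u64>, Range<u64>) {
    let index_start = s.index_start_blk as u64 * PAGE_SZ;
    (
        0..PAGE_SZ,            // fixed-size header in block 0
        PAGE_SZ..index_start,  // page images and WAL records, back to back
        index_start..file_len, // B-tree mapping (Key, Lsn) to a blob offset
    )
}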
use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; -use crate::page_cache::PAGE_SZ; +use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader}; -use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; -use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; -use crate::tenant::Timeline; -use crate::virtual_file::VirtualFile; +use crate::tenant::disk_btree::{ + DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, +}; +use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT; +use crate::tenant::timeline::GetVectoredError; +use crate::tenant::vectored_blob_io::{ + BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, + VectoredReadCoalesceMode, VectoredReadPlanner, +}; +use crate::tenant::PageReconstructError; +use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; +use crate::virtual_file::{self, VirtualFile}; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{anyhow, bail, ensure, Context, Result}; +use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; -use pageserver_api::models::LayerAccessKind; +use futures::StreamExt; +use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; +use std::collections::VecDeque; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; use std::os::unix::fs::FileExt; +use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; +use tokio_epoll_uring::IoBuf; use tracing::*; use utils::{ @@ -59,7 +75,7 @@ use utils::{ lsn::Lsn, }; -use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer}; +use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState}; /// /// Header stored in the beginning of the file @@ -120,10 +136,11 @@ impl Summary { // Flag indicating that this version initialize the page const WILL_INIT: u64 = 1; -/// Struct representing reference to BLOB in layers. Reference contains BLOB -/// offset, and for WAL records it also contains `will_init` flag. The flag -/// helps to determine the range of records that needs to be applied, without -/// reading/deserializing records themselves. +/// Struct representing reference to BLOB in layers. +/// +/// Reference contains BLOB offset, and for WAL records it also contains +/// `will_init` flag. The flag helps to determine the range of records +/// that needs to be applied, without reading/deserializing records themselves. #[derive(Debug, Serialize, Deserialize, Copy, Clone)] pub struct BlobRef(pub u64); @@ -184,7 +201,6 @@ impl DeltaKey { pub struct DeltaLayer { path: Utf8PathBuf, pub desc: PersistentLayerDesc, - access_stats: LayerAccessStats, inner: OnceCell>, } @@ -208,8 +224,25 @@ pub struct DeltaLayerInner { index_start_blk: u32, index_root_blk: u32, - /// Reader object for reading blocks from the file. 
- file: FileBlockReader, + file: VirtualFile, + file_id: FileId, + + layer_key_range: Range, + layer_lsn_range: Range, + + max_vectored_read_bytes: Option, +} + +impl DeltaLayerInner { + pub(crate) fn layer_dbg_info(&self) -> String { + format!( + "delta {}..{} {}..{}", + self.key_range().start, + self.key_range().end, + self.lsn_range().start, + self.lsn_range().end + ) + } } impl std::fmt::Debug for DeltaLayerInner { @@ -242,7 +275,7 @@ impl DeltaLayer { return Ok(()); } - let inner = self.load(LayerAccessKind::Dump, ctx).await?; + let inner = self.load(ctx).await?; inner.dump(ctx).await } @@ -275,12 +308,7 @@ impl DeltaLayer { /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. /// - async fn load( - &self, - access_kind: LayerAccessKind, - ctx: &RequestContext, - ) -> Result<&Arc> { - self.access_stats.record_access(access_kind, ctx); + async fn load(&self, ctx: &RequestContext) -> Result<&Arc> { // Quick exit if already loaded self.inner .get_or_try_init(|| self.load_inner(ctx)) @@ -288,21 +316,19 @@ impl DeltaLayer { .with_context(|| format!("Failed to load delta layer {}", self.path())) } - async fn load_inner(&self, ctx: &RequestContext) -> Result> { + async fn load_inner(&self, ctx: &RequestContext) -> anyhow::Result> { let path = self.path(); - let loaded = DeltaLayerInner::load(&path, None, ctx) - .await - .and_then(|res| res)?; + let loaded = DeltaLayerInner::load(&path, None, None, ctx).await?; // not production code - let actual_filename = path.file_name().unwrap().to_owned(); - let expected_filename = self.layer_desc().filename().file_name(); + let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); + let expected_layer_name = self.layer_desc().layer_name(); - if actual_filename != expected_filename { + if actual_layer_name != expected_layer_name { println!("warning: filename does not match what is expected from in-file summary"); - println!("actual: {:?}", actual_filename); - println!("expected: {:?}", expected_filename); + println!("actual: {:?}", actual_layer_name.to_string()); + println!("expected: {:?}", expected_layer_name.to_string()); } Ok(Arc::new(loaded)) @@ -333,7 +359,6 @@ impl DeltaLayer { summary.lsn_range, metadata.len(), ), - access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: OnceCell::new(), }) } @@ -356,7 +381,6 @@ impl DeltaLayer { /// 3. Call `finish`. /// struct DeltaLayerWriterInner { - conf: &'static PageServerConf, pub path: Utf8PathBuf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, @@ -367,6 +391,9 @@ struct DeltaLayerWriterInner { tree: DiskBtreeBuilder, blob_writer: BlobWriter, + + // Number of key-lsns in the layer. + num_keys: usize, } impl DeltaLayerWriterInner { @@ -379,6 +406,7 @@ impl DeltaLayerWriterInner { tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, + ctx: &RequestContext, ) -> anyhow::Result { // Create the file initially with a temporary filename. We don't know // the end key yet, so we cannot form the final filename yet. 
We will @@ -389,7 +417,7 @@ impl DeltaLayerWriterInner { let path = DeltaLayer::temp_path_for(conf, &tenant_shard_id, &timeline_id, key_start, &lsn_range); - let mut file = VirtualFile::create(&path).await?; + let mut file = VirtualFile::create(&path, ctx).await?; // make room for the header block file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?; let blob_writer = BlobWriter::new(file, PAGE_SZ as u64); @@ -399,7 +427,6 @@ impl DeltaLayerWriterInner { let tree_builder = DiskBtreeBuilder::new(block_buf); Ok(Self { - conf, path, timeline_id, tenant_shard_id, @@ -407,6 +434,7 @@ impl DeltaLayerWriterInner { lsn_range, tree: tree_builder, blob_writer, + num_keys: 0, }) } @@ -415,28 +443,61 @@ impl DeltaLayerWriterInner { /// /// The values must be appended in key, lsn order. /// - async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { - self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init()) - .await - } - - async fn put_value_bytes( + async fn put_value( &mut self, key: Key, lsn: Lsn, - val: &[u8], - will_init: bool, + val: Value, + ctx: &RequestContext, ) -> anyhow::Result<()> { - assert!(self.lsn_range.start <= lsn); + let (_, res) = self + .put_value_bytes( + key, + lsn, + Value::ser(&val)?.slice_len(), + val.will_init(), + ctx, + ) + .await; + res + } - let off = self.blob_writer.write_blob(val).await?; + async fn put_value_bytes( + &mut self, + key: Key, + lsn: Lsn, + val: FullSlice, + will_init: bool, + ctx: &RequestContext, + ) -> (FullSlice, anyhow::Result<()>) + where + Buf: IoBuf + Send, + { + assert!( + self.lsn_range.start <= lsn, + "lsn_start={}, lsn={}", + self.lsn_range.start, + lsn + ); + // We don't want to use compression in delta layer creation + let compression = ImageCompressionAlgorithm::Disabled; + let (val, res) = self + .blob_writer + .write_blob_maybe_compressed(val, ctx, compression) + .await; + let off = match res { + Ok((off, _)) => off, + Err(e) => return (val, Err(anyhow::anyhow!(e))), + }; let blob_ref = BlobRef::new(off, will_init); let delta_key = DeltaKey::from_key_lsn(&key, lsn); - self.tree.append(&delta_key.0, blob_ref.0)?; + let res = self.tree.append(&delta_key.0, blob_ref.0); - Ok(()) + self.num_keys += 1; + + (val, res.map_err(|e| anyhow::anyhow!(e))) } fn size(&self) -> u64 { @@ -446,18 +507,39 @@ impl DeltaLayerWriterInner { /// /// Finish writing the delta layer. 
/// - async fn finish(self, key_end: Key, timeline: &Arc) -> anyhow::Result { + async fn finish( + self, + key_end: Key, + ctx: &RequestContext, + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { + let temp_path = self.path.clone(); + let result = self.finish0(key_end, ctx).await; + if result.is_err() { + tracing::info!(%temp_path, "cleaning up temporary file after error during writing"); + if let Err(e) = std::fs::remove_file(&temp_path) { + tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing"); + } + } + result + } + + async fn finish0( + self, + key_end: Key, + ctx: &RequestContext, + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; - let mut file = self.blob_writer.into_inner().await?; + let mut file = self.blob_writer.into_inner(ctx).await?; // Write out the index let (index_root_blk, block_buf) = self.tree.finish()?; file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64)) .await?; for buf in block_buf.blocks { - file.write_all(buf.as_ref()).await?; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; + res?; } assert!(self.lsn_range.start < self.lsn_range.end); // Fill in the summary on blk 0 @@ -472,17 +554,12 @@ impl DeltaLayerWriterInner { index_root_blk, }; - let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + let mut buf = Vec::with_capacity(PAGE_SZ); + // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; - if buf.spilled() { - // This is bad as we only have one free block for the summary - warn!( - "Used more than one page size for summary buffer: {}", - buf.len() - ); - } file.seek(SeekFrom::Start(0)).await?; - file.write_all(&buf).await?; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; + res?; let metadata = file .metadata() @@ -492,7 +569,6 @@ impl DeltaLayerWriterInner { // 5GB limit for objects without multipart upload (which we don't want to use) // Make it a little bit below to account for differing GB units // https://docs.aws.amazon.com/AmazonS3/latest/userguide/upload-objects.html - const S3_UPLOAD_LIMIT: u64 = 4_500_000_000; ensure!( metadata.len() <= S3_UPLOAD_LIMIT, "Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!", @@ -515,11 +591,9 @@ impl DeltaLayerWriterInner { // fsync the file file.sync_all().await?; - let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?; + trace!("created delta layer {}", self.path); - trace!("created delta layer {}", layer.local_path()); - - Ok(layer) + Ok((desc, self.path)) } } @@ -559,6 +633,7 @@ impl DeltaLayerWriter { tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, + ctx: &RequestContext, ) -> anyhow::Result { Ok(Self { inner: Some( @@ -568,6 +643,7 @@ impl DeltaLayerWriter { tenant_shard_id, key_start, lsn_range, + ctx, ) .await?, ), @@ -579,21 +655,35 @@ impl DeltaLayerWriter { /// /// The values must be appended in key, lsn order. 
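The ordering contract stated above can be illustrated with a toy writer (a hypothetical type; the real put_value additionally threads a RequestContext and a serialized Value, and ownership of the write buffer):

// Toy stand-in for DeltaLayerWriter's ordering contract.
struct ToyWriter {
    last: Option<(u64, u64)>, // (key, lsn) of the previous append
}

impl ToyWriter {
    fn put_value(&mut self, key: u64, lsn: u64) -> anyhow::Result<()> {
        if let Some(prev) = self.last {
            // Tuple comparison: ordered by key first, then by lsn.
            anyhow::ensure!(
                (key, lsn) >= prev,
                "values must be appended in key, lsn order"
            );
        }
        self.last = Some((key, lsn));
        Ok(())
    }
}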
/// - pub async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { - self.inner.as_mut().unwrap().put_value(key, lsn, val).await - } - - pub async fn put_value_bytes( + pub async fn put_value( &mut self, key: Key, lsn: Lsn, - val: &[u8], - will_init: bool, + val: Value, + ctx: &RequestContext, ) -> anyhow::Result<()> { self.inner .as_mut() .unwrap() - .put_value_bytes(key, lsn, val, will_init) + .put_value(key, lsn, val, ctx) + .await + } + + pub async fn put_value_bytes( + &mut self, + key: Key, + lsn: Lsn, + val: FullSlice, + will_init: bool, + ctx: &RequestContext, + ) -> (FullSlice, anyhow::Result<()>) + where + Buf: IoBuf + Send, + { + self.inner + .as_mut() + .unwrap() + .put_value_bytes(key, lsn, val, will_init, ctx) .await } @@ -607,9 +697,18 @@ impl DeltaLayerWriter { pub(crate) async fn finish( mut self, key_end: Key, - timeline: &Arc, - ) -> anyhow::Result { - self.inner.take().unwrap().finish(key_end, timeline).await + ctx: &RequestContext, + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { + self.inner.take().unwrap().finish(key_end, ctx).await + } + + pub(crate) fn num_keys(&self) -> usize { + self.inner.as_ref().unwrap().num_keys + } + + pub(crate) fn estimated_size(&self) -> u64 { + let inner = self.inner.as_ref().unwrap(); + inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64 } } @@ -647,57 +746,60 @@ impl DeltaLayer { where F: Fn(Summary) -> Summary, { - let file = VirtualFile::open_with_options( + let mut file = VirtualFile::open_with_options( path, - &*std::fs::OpenOptions::new().read(true).write(true), + virtual_file::OpenOptions::new().read(true).write(true), + ctx, ) .await .with_context(|| format!("Failed to open file '{}'", path))?; - let file = FileBlockReader::new(file); - let summary_blk = file.read_blk(0, ctx).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?; - let mut file = file.file; if actual_summary.magic != DELTA_FILE_MAGIC { return Err(RewriteSummaryError::MagicMismatch); } let new_summary = rewrite(actual_summary); - let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + let mut buf = Vec::with_capacity(PAGE_SZ); + // TODO: could use smallvec here, but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; - if buf.spilled() { - // The code in DeltaLayerWriterInner just warn!()s for this. - // It should probably error out as well. 
- return Err(RewriteSummaryError::Other(anyhow::anyhow!( - "Used more than one page size for summary buffer: {}", - buf.len() - ))); - } file.seek(SeekFrom::Start(0)).await?; - file.write_all(&buf).await?; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; + res?; Ok(()) } } impl DeltaLayerInner { - /// Returns nested result following Result, Critical>: - /// - inner has the success or transient failure - /// - outer has the permanent failure + pub(crate) fn key_range(&self) -> &Range { + &self.layer_key_range + } + + pub(crate) fn lsn_range(&self) -> &Range { + &self.layer_lsn_range + } + pub(super) async fn load( path: &Utf8Path, summary: Option, + max_vectored_read_bytes: Option, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - let file = match VirtualFile::open(path).await { - Ok(file) => file, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), - }; - let file = FileBlockReader::new(file); + ) -> anyhow::Result { + let file = VirtualFile::open(path, ctx) + .await + .context("open layer file")?; - let summary_blk = match file.read_blk(0, ctx).await { - Ok(blk) => blk, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), - }; + let file_id = page_cache::next_file_id(); + + let block_reader = FileBlockReader::new(&file, file_id); + + let summary_blk = block_reader + .read_blk(0, ctx) + .await + .context("read first block")?; // TODO: this should be an assertion instead; see ImageLayerInner::load let actual_summary = @@ -707,6 +809,9 @@ impl DeltaLayerInner { // production code path expected_summary.index_start_blk = actual_summary.index_start_blk; expected_summary.index_root_blk = actual_summary.index_root_blk; + // mask out the timeline_id, but still require the layers to be from the same tenant + expected_summary.timeline_id = actual_summary.timeline_id; + if actual_summary != expected_summary { bail!( "in-file summary does not match expected summary. actual = {:?} expected = {:?}", @@ -716,99 +821,236 @@ impl DeltaLayerInner { } } - Ok(Ok(DeltaLayerInner { + Ok(DeltaLayerInner { file, + file_id, index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, - })) + max_vectored_read_bytes, + layer_key_range: actual_summary.key_range, + layer_lsn_range: actual_summary.lsn_range, + }) } - pub(super) async fn get_value_reconstruct_data( + // Look up the keys in the provided keyspace and update + // the reconstruct state with whatever is found. + // + // If the key is cached, go no further than the cached Lsn. + // + // Currently, the index is visited for each range, but this + // can be further optimised to visit the index only once. + pub(super) async fn get_values_reconstruct_data( &self, - key: Key, + keyspace: KeySpace, lsn_range: Range, - reconstruct_state: &mut ValueReconstructState, + reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, - ) -> anyhow::Result { - let mut need_image = true; - // Scan the page versions backwards, starting from `lsn`. 
- let file = &self.file; - let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + ) -> Result<(), GetVectoredError> { + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, - file, + block_reader, ); - let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1)); - let mut offsets: Vec<(Lsn, u64)> = Vec::new(); + let planner = VectoredReadPlanner::new( + self.max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(), + ); - tree_reader - .visit( - &search_key.0, - VisitDirection::Backwards, - |key, value| { - let blob_ref = BlobRef(value); - if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] { - return false; - } - let entry_lsn = DeltaKey::extract_lsn_from_buf(key); - if entry_lsn < lsn_range.start { - return false; - } - offsets.push((entry_lsn, blob_ref.pos())); + let data_end_offset = self.index_start_offset(); - !blob_ref.will_init() - }, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::DeltaLayerBtreeNode) - .build(), - ) - .await?; + let reads = Self::plan_reads( + &keyspace, + lsn_range.clone(), + data_end_offset, + index_reader, + planner, + reconstruct_state, + ctx, + ) + .await + .map_err(GetVectoredError::Other)?; - let ctx = &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::DeltaLayerValue) + self.do_reads_and_update_state(reads, reconstruct_state, ctx) + .await; + + reconstruct_state.on_lsn_advanced(&keyspace, lsn_range.start); + + Ok(()) + } + + async fn plan_reads( + keyspace: &KeySpace, + lsn_range: Range, + data_end_offset: u64, + index_reader: DiskBtreeReader, + mut planner: VectoredReadPlanner, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> anyhow::Result> + where + Reader: BlockReader + Clone, + { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::DeltaLayerBtreeNode) .build(); - // Ok, 'offsets' now contains the offsets of all the entries we need to read - let cursor = file.block_cursor(); - let mut buf = Vec::new(); - for (entry_lsn, pos) in offsets { - cursor - .read_blob_into_buf(pos, &mut buf, ctx) - .await - .with_context(|| { - format!("Failed to read blob from virtual file {}", file.file.path) - })?; - let val = Value::des(&buf).with_context(|| { - format!( - "Failed to deserialize file blob from virtual file {}", - file.file.path - ) - })?; - match val { - Value::Image(img) => { - reconstruct_state.img = Some((entry_lsn, img)); - need_image = false; - break; - } - Value::WalRecord(rec) => { - let will_init = rec.will_init(); - reconstruct_state.records.push((entry_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; + for range in keyspace.ranges.iter() { + let mut range_end_handled = false; + + let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start); + let index_stream = index_reader.clone().into_stream(&start_key.0, &ctx); + let mut index_stream = std::pin::pin!(index_stream); + + while let Some(index_entry) = index_stream.next().await { + let (raw_key, value) = index_entry?; + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + let lsn = DeltaKey::extract_lsn_from_buf(&raw_key); + let blob_ref = BlobRef(value); + + // Lsns are not monotonically increasing across keys, so we don't assert on them. 
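+                // The index stream was positioned at `start_key` above and the
+                // B-tree yields entries in ascending key order, so every key we
+                // see here sorts at or after the start of the current range.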
+ assert!(key >= range.start); + + let outside_lsn_range = !lsn_range.contains(&lsn); + let below_cached_lsn = reconstruct_state.get_cached_lsn(&key) >= Some(lsn); + + let flag = { + if outside_lsn_range || below_cached_lsn { + BlobFlag::Ignore + } else if blob_ref.will_init() { + BlobFlag::ReplaceAll + } else { + // Usual path: add blob to the read + BlobFlag::None } + }; + + if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) { + planner.handle_range_end(blob_ref.pos()); + range_end_handled = true; + break; + } else { + planner.handle(key, lsn, blob_ref.pos(), flag); } } + + if !range_end_handled { + tracing::debug!("Handling range end fallback at {}", data_end_offset); + planner.handle_range_end(data_end_offset); + } } - // If an older page image is needed to reconstruct the page, let the - // caller know. - if need_image { - Ok(ValueReconstructResult::Continue) - } else { - Ok(ValueReconstructResult::Complete) + Ok(planner.finish()) + } + + fn get_min_read_buffer_size( + planned_reads: &[VectoredRead], + read_size_soft_max: usize, + ) -> usize { + let Some(largest_read) = planned_reads.iter().max_by_key(|read| read.size()) else { + return read_size_soft_max; + }; + + let largest_read_size = largest_read.size(); + if largest_read_size > read_size_soft_max { + // If the read is oversized, it should only contain one key. + let offenders = largest_read + .blobs_at + .as_slice() + .iter() + .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn)) + .join(", "); + tracing::warn!( + "Oversized vectored read ({} > {}) for keys {}", + largest_read_size, + read_size_soft_max, + offenders + ); + } + + largest_read_size + } + + async fn do_reads_and_update_state( + &self, + reads: Vec, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) { + let vectored_blob_reader = VectoredBlobReader::new(&self.file); + let mut ignore_key_with_err = None; + + let max_vectored_read_bytes = self + .max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(); + let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes); + let mut buf = Some(BytesMut::with_capacity(buf_size)); + + // Note that reads are processed in reverse order (from highest key+lsn). + // This is the order that `ReconstructState` requires such that it can + // track when a key is done. + for read in reads.into_iter().rev() { + let res = vectored_blob_reader + .read_blobs(&read, buf.take().expect("Should have a buffer"), ctx) + .await; + + let blobs_buf = match res { + Ok(blobs_buf) => blobs_buf, + Err(err) => { + let kind = err.kind(); + for (_, blob_meta) in read.blobs_at.as_slice() { + reconstruct_state.on_key_error( + blob_meta.key, + PageReconstructError::Other(anyhow!( + "Failed to read blobs from virtual file {}: {}", + self.file.path, + kind + )), + ); + } + + // We have "lost" the buffer since the lower level IO api + // doesn't return the buffer on error. Allocate a new one. 
+ buf = Some(BytesMut::with_capacity(buf_size)); + + continue; + } + }; + + for meta in blobs_buf.blobs.iter().rev() { + if Some(meta.meta.key) == ignore_key_with_err { + continue; + } + + let value = Value::des(&blobs_buf.buf[meta.start..meta.end]); + let value = match value { + Ok(v) => v, + Err(e) => { + reconstruct_state.on_key_error( + meta.meta.key, + PageReconstructError::Other(anyhow!(e).context(format!( + "Failed to deserialize blob from virtual file {}", + self.file.path, + ))), + ); + + ignore_key_with_err = Some(meta.meta.key); + continue; + } + }; + + // Invariant: once a key reaches [`ValueReconstructSituation::Complete`] + // state, no further updates shall be made to it. The call below will + // panic if the invariant is violated. + reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value); + } + + buf = Some(blobs_buf.buf); } } @@ -816,12 +1058,11 @@ impl DeltaLayerInner { &'a self, ctx: &RequestContext, ) -> Result>> { - let file = &self.file; - + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, - file, + block_reader, ); let mut all_keys: Vec> = Vec::new(); @@ -834,9 +1075,7 @@ impl DeltaLayerInner { let delta_key = DeltaKey::from_slice(key); let val_ref = ValueRef { blob_ref: BlobRef(value), - reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter( - Adapter(self), - )), + layer: self, }; let pos = BlobRef(value).pos(); if let Some(last) = all_keys.last_mut() { @@ -862,30 +1101,227 @@ impl DeltaLayerInner { if let Some(last) = all_keys.last_mut() { // Last key occupies all space till end of value storage, // which corresponds to beginning of the index - last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size; + last.size = self.index_start_offset() - last.size; } Ok(all_keys) } + /// Using the given writer, write out a version which has the earlier Lsns than `until`. + /// + /// Return the amount of key value records pushed to the writer. 
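`copy_prefix` below determines each blob's extent by chaining a sentinel onto the index stream, so the end offset of one blob is simply the offset carried by the next item, and the final blob ends where the index begins. A stripped-down sketch of that pattern (a hypothetical helper using plain offsets instead of BlobRefs and streams):

fn blob_extents(starts: &[u64], data_end: u64) -> Vec<std::ops::Range<u64>> {
    // The chained `data_end` plays the role of Item::Sentinel below.
    let ends = starts.iter().copied().skip(1).chain(std::iter::once(data_end));
    starts.iter().copied().zip(ends).map(|(s, e)| s..e).collect()
}

// blob_extents(&[0, 10, 25], 40) == [0..10, 10..25, 25..40]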
+ pub(super) async fn copy_prefix( + &self, + writer: &mut DeltaLayerWriter, + until: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result { + use crate::tenant::vectored_blob_io::{ + BlobMeta, VectoredReadBuilder, VectoredReadExtended, + }; + use futures::stream::TryStreamExt; + + #[derive(Debug)] + enum Item { + Actual(Key, Lsn, BlobRef), + Sentinel, + } + + impl From for Option<(Key, Lsn, BlobRef)> { + fn from(value: Item) -> Self { + match value { + Item::Actual(key, lsn, blob) => Some((key, lsn, blob)), + Item::Sentinel => None, + } + } + } + + impl Item { + fn offset(&self) -> Option { + match self { + Item::Actual(_, _, blob) => Some(*blob), + Item::Sentinel => None, + } + } + + fn is_last(&self) -> bool { + matches!(self, Item::Sentinel) + } + } + + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); + + let stream = self.stream_index_forwards(tree_reader, &[0u8; DELTA_KEY_SIZE], ctx); + let stream = stream.map_ok(|(key, lsn, pos)| Item::Actual(key, lsn, pos)); + // put in a sentinel value for getting the end offset for last item, and not having to + // repeat the whole read part + let stream = stream.chain(futures::stream::once(futures::future::ready(Ok( + Item::Sentinel, + )))); + let mut stream = std::pin::pin!(stream); + + let mut prev: Option<(Key, Lsn, BlobRef)> = None; + + let mut read_builder: Option = None; + let read_mode = VectoredReadCoalesceMode::get(); + + let max_read_size = self + .max_vectored_read_bytes + .map(|x| x.0.get()) + .unwrap_or(8192); + + let mut buffer = Some(BytesMut::with_capacity(max_read_size)); + + // FIXME: buffering of DeltaLayerWriter + let mut per_blob_copy = Vec::new(); + + let mut records = 0; + + while let Some(item) = stream.try_next().await? 
{ + tracing::debug!(?item, "popped"); + let offset = item + .offset() + .unwrap_or(BlobRef::new(self.index_start_offset(), false)); + + let actionable = if let Some((key, lsn, start_offset)) = prev.take() { + let end_offset = offset; + + Some((BlobMeta { key, lsn }, start_offset..end_offset)) + } else { + None + }; + + let is_last = item.is_last(); + + prev = Option::from(item); + + let actionable = actionable.filter(|x| x.0.lsn < until); + + let builder = if let Some((meta, offsets)) = actionable { + // extend or create a new builder + if read_builder + .as_mut() + .map(|x| x.extend(offsets.start.pos(), offsets.end.pos(), meta)) + .unwrap_or(VectoredReadExtended::No) + == VectoredReadExtended::Yes + { + None + } else { + read_builder.replace(VectoredReadBuilder::new( + offsets.start.pos(), + offsets.end.pos(), + meta, + max_read_size, + read_mode, + )) + } + } else { + // nothing to do, except perhaps flush any existing for the last element + None + }; + + // flush the possible older builder and also the new one if the item was the last one + let builders = builder.into_iter(); + let builders = if is_last { + builders.chain(read_builder.take()) + } else { + builders.chain(None) + }; + + for builder in builders { + let read = builder.build(); + + let reader = VectoredBlobReader::new(&self.file); + + let mut buf = buffer.take().unwrap(); + + buf.clear(); + buf.reserve(read.size()); + let res = reader.read_blobs(&read, buf, ctx).await?; + + for blob in res.blobs { + let key = blob.meta.key; + let lsn = blob.meta.lsn; + let data = &res.buf[blob.start..blob.end]; + + #[cfg(debug_assertions)] + Value::des(data) + .with_context(|| { + format!( + "blob failed to deserialize for {}@{}, {}..{}: {:?}", + blob.meta.key, + blob.meta.lsn, + blob.start, + blob.end, + utils::Hex(data) + ) + }) + .unwrap(); + + // is it an image or will_init walrecord? 
+ // FIXME: this could be handled by threading the BlobRef to the + // VectoredReadBuilder + let will_init = crate::repository::ValueBytes::will_init(data) + .inspect_err(|_e| { + #[cfg(feature = "testing")] + tracing::error!(data=?utils::Hex(data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value"); + }) + .unwrap_or(false); + + per_blob_copy.clear(); + per_blob_copy.extend_from_slice(data); + + let (tmp, res) = writer + .put_value_bytes( + key, + lsn, + std::mem::take(&mut per_blob_copy).slice_len(), + will_init, + ctx, + ) + .await; + per_blob_copy = tmp.into_raw_slice().into_inner(); + + res?; + + records += 1; + } + + buffer = Some(res.buf); + } + } + + assert!( + read_builder.is_none(), + "with the sentinel above loop should had handled all" + ); + + Ok(records) + } + pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { println!( "index_start_blk: {}, root {}", self.index_start_blk, self.index_root_blk ); - let file = &self.file; + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, - file, + block_reader, ); tree_reader.dump().await?; let keys = self.load_keys(ctx).await?; - async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result { - let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?; + async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result { + let buf = val.load_raw(ctx).await?; let val = Value::des(&buf)?; let desc = match val { Value::Image(img) => { @@ -906,17 +1342,89 @@ impl DeltaLayerInner { for entry in keys { let DeltaEntry { key, lsn, val, .. } = entry; - let desc = match dump_blob(val, ctx).await { + let desc = match dump_blob(&val, ctx).await { Ok(desc) => desc, Err(err) => { format!("ERROR: {err}") } }; println!(" key {key} at {lsn}: {desc}"); + + // Print more details about CHECKPOINT records. Would be nice to print details + // of many other record types too, but these are particularly interesting, as + // have a lot of special processing for them in walingest.rs. + use pageserver_api::key::CHECKPOINT_KEY; + use postgres_ffi::CheckPoint; + if key == CHECKPOINT_KEY { + let val = val.load(ctx).await?; + match val { + Value::Image(img) => { + let checkpoint = CheckPoint::decode(&img)?; + println!(" CHECKPOINT: {:?}", checkpoint); + } + Value::WalRecord(_rec) => { + println!(" unexpected walrecord value for checkpoint key"); + } + } + } } Ok(()) } + + fn stream_index_forwards<'a, R>( + &'a self, + reader: DiskBtreeReader, + start: &'a [u8; DELTA_KEY_SIZE], + ctx: &'a RequestContext, + ) -> impl futures::stream::Stream< + Item = Result<(Key, Lsn, BlobRef), crate::tenant::disk_btree::DiskBtreeError>, + > + 'a + where + R: BlockReader + 'a, + { + use futures::stream::TryStreamExt; + let stream = reader.into_stream(start, ctx); + stream.map_ok(|(key, value)| { + let key = DeltaKey::from_slice(&key); + let (key, lsn) = (key.key(), key.lsn()); + let offset = BlobRef(value); + + (key, lsn, offset) + }) + } + + /// The file offset to the first block of index. + /// + /// The file structure is summary, values, and index. We often need this for the size of last blob. 
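Concretely, the index stores only each blob's start offset, so `load_keys` above derives the last entry's size as `index_start_offset() - last_offset` without touching the data section. For example, with index_start_blk = 24 and the 8 KiB page size, the index begins at byte 24 * 8192 = 196_608, so a final blob starting at byte 196_000 occupies 608 bytes.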
+    fn index_start_offset(&self) -> u64 {
+        let offset = self.index_start_blk as u64 * PAGE_SZ as u64;
+        let bref = BlobRef(offset);
+        tracing::debug!(
+            index_start_blk = self.index_start_blk,
+            offset,
+            pos = bref.pos(),
+            "index_start_offset"
+        );
+        offset
+    }
+
+    pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader =
+            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
+        DeltaLayerIterator {
+            delta_layer: self,
+            ctx,
+            index_iter: tree_reader.iter(&[0; DELTA_KEY_SIZE], ctx),
+            key_values_batch: std::collections::VecDeque::new(),
+            is_end: false,
+            planner: StreamingVectoredReadPlanner::new(
+                1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
+                1024,        // The default value. Unit tests might use a different value.
+            ),
+        }
+    }
 }
 
 /// A set of data associated with a delta layer key and its value
@@ -932,17 +1440,24 @@ pub struct DeltaEntry<'a> {
 
 /// Reference to an on-disk value
 pub struct ValueRef<'a> {
     blob_ref: BlobRef,
-    reader: BlockCursor<'a>,
+    layer: &'a DeltaLayerInner,
 }
 
 impl<'a> ValueRef<'a> {
     /// Loads the value from disk
     pub async fn load(&self, ctx: &RequestContext) -> Result<Value> {
-        // theoretically we *could* record an access time for each, but it does not really matter
-        let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?;
+        let buf = self.load_raw(ctx).await?;
         let val = Value::des(&buf)?;
         Ok(val)
     }
+
+    async fn load_raw(&self, ctx: &RequestContext) -> Result<Vec<u8>> {
+        let reader = BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(Adapter(
+            self.layer,
+        )));
+        let buf = reader.read_blob(self.blob_ref.pos(), ctx).await?;
+        Ok(buf)
+    }
 }
 
 pub(crate) struct Adapter<T>(T);
 
@@ -953,7 +1468,8 @@ impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
         blknum: u32,
         ctx: &RequestContext,
     ) -> Result<BlockLease, std::io::Error> {
-        self.0.as_ref().file.read_blk(blknum, ctx).await
+        let block_reader = FileBlockReader::new(&self.0.as_ref().file, self.0.as_ref().file_id);
+        block_reader.read_blk(blknum, ctx).await
     }
 }
 
@@ -962,3 +1478,799 @@ impl AsRef<DeltaLayerInner> for DeltaLayerInner {
         self
     }
 }
+
+impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for DeltaEntry<'a> {
+    fn key(&self) -> Key {
+        self.key
+    }
+    fn lsn(&self) -> Lsn {
+        self.lsn
+    }
+    fn size(&self) -> u64 {
+        self.size
+    }
+}
+
+pub struct DeltaLayerIterator<'a> {
+    delta_layer: &'a DeltaLayerInner,
+    ctx: &'a RequestContext,
+    planner: StreamingVectoredReadPlanner,
+    index_iter: DiskBtreeIterator<'a>,
+    key_values_batch: VecDeque<(Key, Lsn, Value)>,
+    is_end: bool,
+}
+
+impl<'a> DeltaLayerIterator<'a> {
+    pub(crate) fn layer_dbg_info(&self) -> String {
+        self.delta_layer.layer_dbg_info()
+    }
+
+    /// Retrieve a batch of key-value pairs into the iterator buffer.
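+    ///
+    /// A minimal usage sketch of the surrounding iterator (hypothetical call
+    /// site; `inner` is a loaded `DeltaLayerInner`):
+    ///
+    /// ```ignore
+    /// let mut iter = inner.iter(ctx);
+    /// while let Some((key, lsn, value)) = iter.next().await? {
+    ///     // `next` refills the buffer via `next_batch` as needed
+    /// }
+    /// ```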
+    async fn next_batch(&mut self) -> anyhow::Result<()> {
+        assert!(self.key_values_batch.is_empty());
+        assert!(!self.is_end);
+
+        let plan = loop {
+            if let Some(res) = self.index_iter.next().await {
+                let (raw_key, value) = res?;
+                let key = Key::from_slice(&raw_key[..KEY_SIZE]);
+                let lsn = DeltaKey::extract_lsn_from_buf(&raw_key);
+                let blob_ref = BlobRef(value);
+                let offset = blob_ref.pos();
+                if let Some(batch_plan) = self.planner.handle(key, lsn, offset) {
+                    break batch_plan;
+                }
+            } else {
+                self.is_end = true;
+                let data_end_offset = self.delta_layer.index_start_offset();
+                if let Some(item) = self.planner.handle_range_end(data_end_offset) {
+                    break item;
+                } else {
+                    return Ok(()); // TODO: test empty iterator
+                }
+            }
+        };
+        let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file);
+        let mut next_batch = std::collections::VecDeque::new();
+        let buf_size = plan.size();
+        let buf = BytesMut::with_capacity(buf_size);
+        let blobs_buf = vectored_blob_reader
+            .read_blobs(&plan, buf, self.ctx)
+            .await?;
+        let frozen_buf = blobs_buf.buf.freeze();
+        for meta in blobs_buf.blobs.iter() {
+            let value = Value::des(&frozen_buf[meta.start..meta.end])?;
+            next_batch.push_back((meta.meta.key, meta.meta.lsn, value));
+        }
+        self.key_values_batch = next_batch;
+        Ok(())
+    }
+
+    pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
+        if self.key_values_batch.is_empty() {
+            if self.is_end {
+                return Ok(None);
+            }
+            self.next_batch().await?;
+        }
+        Ok(Some(
+            self.key_values_batch
+                .pop_front()
+                .expect("should not be empty"),
+        ))
+    }
+}
+
+#[cfg(test)]
+pub(crate) mod test {
+    use std::collections::BTreeMap;
+
+    use itertools::MinMaxResult;
+    use rand::prelude::{SeedableRng, SliceRandom, StdRng};
+    use rand::RngCore;
+
+    use super::*;
+    use crate::repository::Value;
+    use crate::tenant::harness::TIMELINE_ID;
+    use crate::tenant::storage_layer::{Layer, ResidentLayer};
+    use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
+    use crate::tenant::{Tenant, Timeline};
+    use crate::{
+        context::DownloadBehavior,
+        task_mgr::TaskKind,
+        tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
+        DEFAULT_PG_VERSION,
+    };
+    use bytes::Bytes;
+
+    /// Construct an index for a fictional delta layer and then traverse it in
+    /// order to plan vectored reads for a query. Finally, verify that the
+    /// traversal fed the right index key and value pairs into the planner.
+    #[tokio::test]
+    async fn test_delta_layer_index_traversal() {
+        let base_key = Key {
+            field1: 0,
+            field2: 1663,
+            field3: 12972,
+            field4: 16396,
+            field5: 0,
+            field6: 246080,
+        };
+
+        // Populate the index with some entries
+        let entries: BTreeMap<Key, Vec<Lsn>> = BTreeMap::from([
+            (base_key, vec![Lsn(1), Lsn(5), Lsn(25), Lsn(26), Lsn(28)]),
+            (base_key.add(1), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]),
+            (base_key.add(2), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]),
+            (base_key.add(5), vec![Lsn(10), Lsn(15), Lsn(16), Lsn(20)]),
+        ]);
+
+        let mut disk = TestDisk::default();
+        let mut writer = DiskBtreeBuilder::<_, DELTA_KEY_SIZE>::new(&mut disk);
+
+        let mut disk_offset = 0;
+        for (key, lsns) in &entries {
+            for lsn in lsns {
+                let index_key = DeltaKey::from_key_lsn(key, *lsn);
+                let blob_ref = BlobRef::new(disk_offset, false);
+                writer
+                    .append(&index_key.0, blob_ref.0)
+                    .expect("In memory disk append should never fail");
+
+                disk_offset += 1;
+            }
+        }
+
+        // Prepare all the arguments for the call into `plan_reads` below
+        let (root_offset, _writer) = writer
+            .finish()
+            .expect("In memory disk finish should never fail");
+        let reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(0, root_offset, disk);
+        let planner = VectoredReadPlanner::new(100);
+        let mut reconstruct_state = ValuesReconstructState::new();
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+
+        let keyspace = KeySpace {
+            ranges: vec![
+                base_key..base_key.add(3),
+                base_key.add(3)..base_key.add(100),
+            ],
+        };
+        let lsn_range = Lsn(2)..Lsn(40);
+
+        // Plan and validate
+        let vectored_reads = DeltaLayerInner::plan_reads(
+            &keyspace,
+            lsn_range.clone(),
+            disk_offset,
+            reader,
+            planner,
+            &mut reconstruct_state,
+            &ctx,
+        )
+        .await
+        .expect("Read planning should not fail");
+
+        validate(keyspace, lsn_range, vectored_reads, entries);
+    }
+
+    fn validate(
+        keyspace: KeySpace,
+        lsn_range: Range<Lsn>,
+        vectored_reads: Vec<VectoredRead>,
+        index_entries: BTreeMap<Key, Vec<Lsn>>,
+    ) {
+        #[derive(Debug, PartialEq, Eq)]
+        struct BlobSpec {
+            key: Key,
+            lsn: Lsn,
+            at: u64,
+        }
+
+        let mut planned_blobs = Vec::new();
+        for read in vectored_reads {
+            for (at, meta) in read.blobs_at.as_slice() {
+                planned_blobs.push(BlobSpec {
+                    key: meta.key,
+                    lsn: meta.lsn,
+                    at: *at,
+                });
+            }
+        }
+
+        let mut expected_blobs = Vec::new();
+        let mut disk_offset = 0;
+        for (key, lsns) in index_entries {
+            for lsn in lsns {
+                let key_included = keyspace.ranges.iter().any(|range| range.contains(&key));
+                let lsn_included = lsn_range.contains(&lsn);
+
+                if key_included && lsn_included {
+                    expected_blobs.push(BlobSpec {
+                        key,
+                        lsn,
+                        at: disk_offset,
+                    });
+                }
+
+                disk_offset += 1;
+            }
+        }
+
+        assert_eq!(planned_blobs, expected_blobs);
+    }
+
+    mod constants {
+        use utils::lsn::Lsn;
+
+        /// Offset used by all LSNs in this test
+        pub(super) const LSN_OFFSET: Lsn = Lsn(0x08);
+        /// Number of unique keys included in the test data
+        pub(super) const KEY_COUNT: u8 = 60;
+        /// Max number of different LSNs for each key
+        pub(super) const MAX_ENTRIES_PER_KEY: u8 = 20;
+        /// Possible value sizes for each key along with a probability weight
+        pub(super) const VALUE_SIZES: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)];
+        /// Probability that there will be a gap between the current key and the next one (33.3%)
+        pub(super) const KEY_GAP_CHANGES: [(bool, u8); 2] = [(true, 1), (false, 2)];
+        /// The minimum size of a key range in all the generated reads
+        pub(super) const MIN_RANGE_SIZE: i128 = 10;
+        /// The number of ranges included in each vectored read
+        pub(super) const RANGES_COUNT: u8 = 2;
+        /// The number of vectored reads performed
+        pub(super) const READS_COUNT: u8 = 100;
+        /// Soft max size of a vectored read. Will be violated if we have to read keys
+        /// with values larger than the limit.
+        pub(super) const MAX_VECTORED_READ_BYTES: usize = 64 * 1024;
+    }
+
+    struct Entry {
+        key: Key,
+        lsn: Lsn,
+        value: Vec<u8>,
+    }
+
+    fn generate_entries(rng: &mut StdRng) -> Vec<Entry> {
+        let mut current_key = Key::MIN;
+
+        let mut entries = Vec::new();
+        for _ in 0..constants::KEY_COUNT {
+            let count = rng.gen_range(1..constants::MAX_ENTRIES_PER_KEY);
+            let mut lsns_iter =
+                std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| {
+                    Some(Lsn(lsn.0 + 0x08))
+                });
+            let mut lsns = Vec::new();
+            while lsns.len() < count as usize {
+                let take = rng.gen_bool(0.5);
+                let lsn = lsns_iter.next().unwrap();
+                if take {
+                    lsns.push(lsn);
+                }
+            }
+
+            for lsn in lsns {
+                let size = constants::VALUE_SIZES
+                    .choose_weighted(rng, |item| item.1)
+                    .unwrap()
+                    .0;
+                let mut buf = vec![0; size];
+                rng.fill_bytes(&mut buf);
+
+                entries.push(Entry {
+                    key: current_key,
+                    lsn,
+                    value: buf,
+                })
+            }
+
+            let gap = constants::KEY_GAP_CHANGES
+                .choose_weighted(rng, |item| item.1)
+                .unwrap()
+                .0;
+            if gap {
+                current_key = current_key.add(2);
+            } else {
+                current_key = current_key.add(1);
+            }
+        }
+
+        entries
+    }
+
+    struct EntriesMeta {
+        key_range: Range<Key>,
+        lsn_range: Range<Lsn>,
+        index: BTreeMap<(Key, Lsn), Vec<u8>>,
+    }
+
+    fn get_entries_meta(entries: &[Entry]) -> EntriesMeta {
+        let key_range = match entries.iter().minmax_by_key(|e| e.key) {
+            MinMaxResult::MinMax(min, max) => min.key..max.key.next(),
+            _ => panic!("More than one entry is always expected"),
+        };
+
+        let lsn_range = match entries.iter().minmax_by_key(|e| e.lsn) {
+            MinMaxResult::MinMax(min, max) => min.lsn..Lsn(max.lsn.0 + 1),
+            _ => panic!("More than one entry is always expected"),
+        };
+
+        let mut index = BTreeMap::new();
+        for entry in entries.iter() {
+            index.insert((entry.key, entry.lsn), entry.value.clone());
+        }
+
+        EntriesMeta {
+            key_range,
+            lsn_range,
+            index,
+        }
+    }
+
+    fn pick_random_keyspace(rng: &mut StdRng, key_range: &Range<Key>) -> KeySpace {
+        let start = key_range.start.to_i128();
+        let end = key_range.end.to_i128();
+
+        let mut keyspace = KeySpace::default();
+
+        for _ in 0..constants::RANGES_COUNT {
+            let mut range: Option<Range<Key>> = Option::default();
+            while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) {
+                let range_start = rng.gen_range(start..end);
+                let range_end_offset = range_start + constants::MIN_RANGE_SIZE;
+                if range_end_offset >= end {
+                    range = Some(Key::from_i128(range_start)..Key::from_i128(end));
+                } else {
+                    let range_end = rng.gen_range((range_start + constants::MIN_RANGE_SIZE)..end);
+                    range = Some(Key::from_i128(range_start)..Key::from_i128(range_end));
+                }
+            }
+            keyspace.ranges.push(range.unwrap());
+        }
+
+        keyspace
+    }
+
+    #[tokio::test]
+    async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        let timeline_id = TimelineId::generate();
+        let timeline = tenant
+            .create_test_timeline(timeline_id, constants::LSN_OFFSET, DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        tracing::info!("Generating test data ...");
+
+        let rng = &mut StdRng::seed_from_u64(0);
+        let entries = generate_entries(rng);
+        let entries_meta = get_entries_meta(&entries);
+
+        tracing::info!("Done generating {} entries",
entries.len()); + + tracing::info!("Writing test data to delta layer ..."); + let mut writer = DeltaLayerWriter::new( + harness.conf, + timeline_id, + harness.tenant_shard_id, + entries_meta.key_range.start, + entries_meta.lsn_range.clone(), + &ctx, + ) + .await?; + + for entry in entries { + let (_, res) = writer + .put_value_bytes(entry.key, entry.lsn, entry.value.slice_len(), false, &ctx) + .await; + res?; + } + + let (desc, path) = writer.finish(entries_meta.key_range.end, &ctx).await?; + let resident = Layer::finish_creating(harness.conf, &timeline, desc, &path)?; + + let inner = resident.get_as_delta(&ctx).await?; + + let file_size = inner.file.metadata().await?.len(); + tracing::info!( + "Done writing test data to delta layer. Resulting file size is: {}", + file_size + ); + + for i in 0..constants::READS_COUNT { + tracing::info!("Doing vectored read {}/{}", i + 1, constants::READS_COUNT); + + let block_reader = FileBlockReader::new(&inner.file, inner.file_id); + let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + inner.index_start_blk, + inner.index_root_blk, + block_reader, + ); + + let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES); + let mut reconstruct_state = ValuesReconstructState::new(); + let keyspace = pick_random_keyspace(rng, &entries_meta.key_range); + let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64; + + let vectored_reads = DeltaLayerInner::plan_reads( + &keyspace, + entries_meta.lsn_range.clone(), + data_end_offset, + index_reader, + planner, + &mut reconstruct_state, + &ctx, + ) + .await?; + + let vectored_blob_reader = VectoredBlobReader::new(&inner.file); + let buf_size = DeltaLayerInner::get_min_read_buffer_size( + &vectored_reads, + constants::MAX_VECTORED_READ_BYTES, + ); + let mut buf = Some(BytesMut::with_capacity(buf_size)); + + for read in vectored_reads { + let blobs_buf = vectored_blob_reader + .read_blobs(&read, buf.take().expect("Should have a buffer"), &ctx) + .await?; + for meta in blobs_buf.blobs.iter() { + let value = &blobs_buf.buf[meta.start..meta.end]; + assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]); + } + + buf = Some(blobs_buf.buf); + } + } + + Ok(()) + } + + #[tokio::test] + async fn copy_delta_prefix_smoke() { + use crate::walrecord::NeonWalRecord; + use bytes::Bytes; + + let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke") + .await + .unwrap(); + let (tenant, ctx) = h.load().await; + let ctx = &ctx; + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx) + .await + .unwrap(); + + let initdb_layer = timeline + .layers + .read() + .await + .likely_resident_layers() + .next() + .cloned() + .unwrap(); + + { + let mut writer = timeline.writer().await; + + let data = [ + (0x20, 12, Value::Image(Bytes::from_static(b"foobar"))), + ( + 0x30, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: false, + rec: Bytes::from_static(b"1"), + }), + ), + ( + 0x40, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: Bytes::from_static(b"2"), + }), + ), + // build an oversized value so we cannot extend and existing read over + // this + ( + 0x50, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: { + let mut buf = + vec![0u8; tenant.conf.max_vectored_read_bytes.0.get() + 1024]; + buf.iter_mut() + .enumerate() + .for_each(|(i, slot)| *slot = (i % 256) as u8); + Bytes::from(buf) + }, + }), + ), + // because the oversized read cannot be extended further, we are sure 
to exercise the + // builder created on the last round with this: + ( + 0x60, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: Bytes::from_static(b"3"), + }), + ), + ( + 0x60, + 9, + Value::Image(Bytes::from_static(b"something for a different key")), + ), + ]; + + let mut last_lsn = None; + + for (lsn, key, value) in data { + let key = Key::from_i128(key); + writer.put(key, Lsn(lsn), &value, ctx).await.unwrap(); + last_lsn = Some(lsn); + } + + writer.finish_write(Lsn(last_lsn.unwrap())); + } + timeline.freeze_and_flush().await.unwrap(); + + let new_layer = timeline + .layers + .read() + .await + .likely_resident_layers() + .find(|&x| x != &initdb_layer) + .cloned() + .unwrap(); + + // create a copy for the timeline, so we don't overwrite the file + let branch = tenant + .branch_timeline_test(&timeline, TimelineId::generate(), None, ctx) + .await + .unwrap(); + + assert_eq!(branch.get_ancestor_lsn(), Lsn(0x60)); + + // truncating at 0x61 gives us a full copy, otherwise just go backwards until there's just + // a single key + + for truncate_at in [0x61, 0x51, 0x41, 0x31, 0x21] { + let truncate_at = Lsn(truncate_at); + + let mut writer = DeltaLayerWriter::new( + tenant.conf, + branch.timeline_id, + tenant.tenant_shard_id, + Key::MIN, + Lsn(0x11)..truncate_at, + ctx, + ) + .await + .unwrap(); + + let new_layer = new_layer.download_and_keep_resident().await.unwrap(); + + new_layer + .copy_delta_prefix(&mut writer, truncate_at, ctx) + .await + .unwrap(); + + let (desc, path) = writer.finish(Key::MAX, ctx).await.unwrap(); + let copied_layer = Layer::finish_creating(tenant.conf, &branch, desc, &path).unwrap(); + + copied_layer.get_as_delta(ctx).await.unwrap(); + + assert_keys_and_values_eq( + new_layer.get_as_delta(ctx).await.unwrap(), + copied_layer.get_as_delta(ctx).await.unwrap(), + truncate_at, + ctx, + ) + .await; + } + } + + async fn assert_keys_and_values_eq( + source: &DeltaLayerInner, + truncated: &DeltaLayerInner, + truncated_at: Lsn, + ctx: &RequestContext, + ) { + use futures::future::ready; + use futures::stream::TryStreamExt; + + let start_key = [0u8; DELTA_KEY_SIZE]; + + let source_reader = FileBlockReader::new(&source.file, source.file_id); + let source_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + source.index_start_blk, + source.index_root_blk, + &source_reader, + ); + let source_stream = source.stream_index_forwards(source_tree, &start_key, ctx); + let source_stream = source_stream.filter(|res| match res { + Ok((_, lsn, _)) => ready(lsn < &truncated_at), + _ => ready(true), + }); + let mut source_stream = std::pin::pin!(source_stream); + + let truncated_reader = FileBlockReader::new(&truncated.file, truncated.file_id); + let truncated_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + truncated.index_start_blk, + truncated.index_root_blk, + &truncated_reader, + ); + let truncated_stream = truncated.stream_index_forwards(truncated_tree, &start_key, ctx); + let mut truncated_stream = std::pin::pin!(truncated_stream); + + let mut scratch_left = Vec::new(); + let mut scratch_right = Vec::new(); + + loop { + let (src, truncated) = (source_stream.try_next(), truncated_stream.try_next()); + let (src, truncated) = tokio::try_join!(src, truncated).unwrap(); + + if src.is_none() { + assert!(truncated.is_none()); + break; + } + + let (src, truncated) = (src.unwrap(), truncated.unwrap()); + + // because we've filtered the source with Lsn, we should always have the same keys from both. 
+ assert_eq!(src.0, truncated.0); + assert_eq!(src.1, truncated.1); + + // if this is needed for something else, just drop this assert. + assert!( + src.2.pos() >= truncated.2.pos(), + "value position should not go backwards {} vs. {}", + src.2.pos(), + truncated.2.pos() + ); + + scratch_left.clear(); + let src_cursor = source_reader.block_cursor(); + let left = src_cursor.read_blob_into_buf(src.2.pos(), &mut scratch_left, ctx); + scratch_right.clear(); + let trunc_cursor = truncated_reader.block_cursor(); + let right = trunc_cursor.read_blob_into_buf(truncated.2.pos(), &mut scratch_right, ctx); + + tokio::try_join!(left, right).unwrap(); + + assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right)); + } + } + + pub(crate) fn sort_delta( + (k1, l1, _): &(Key, Lsn, Value), + (k2, l2, _): &(Key, Lsn, Value), + ) -> std::cmp::Ordering { + (k1, l1).cmp(&(k2, l2)) + } + + pub(crate) fn sort_delta_value( + (k1, l1, v1): &(Key, Lsn, Value), + (k2, l2, v2): &(Key, Lsn, Value), + ) -> std::cmp::Ordering { + let order_1 = if v1.is_image() { 0 } else { 1 }; + let order_2 = if v2.is_image() { 0 } else { 1 }; + (k1, l1, order_1).cmp(&(k2, l2, order_2)) + } + + pub(crate) async fn produce_delta_layer( + tenant: &Tenant, + tline: &Arc, + mut deltas: Vec<(Key, Lsn, Value)>, + ctx: &RequestContext, + ) -> anyhow::Result { + deltas.sort_by(sort_delta); + let (key_start, _, _) = deltas.first().unwrap(); + let (key_max, _, _) = deltas.last().unwrap(); + let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); + let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); + let lsn_end = Lsn(lsn_max.0 + 1); + let mut writer = DeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + *key_start, + (*lsn_min)..lsn_end, + ctx, + ) + .await?; + let key_end = key_max.next(); + + for (key, lsn, value) in deltas { + writer.put_value(key, lsn, value, ctx).await?; + } + + let (desc, path) = writer.finish(key_end, ctx).await?; + let delta_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?; + + Ok::<_, anyhow::Error>(delta_layer) + } + + async fn assert_delta_iter_equal( + delta_iter: &mut DeltaLayerIterator<'_>, + expect: &[(Key, Lsn, Value)], + ) { + let mut expect_iter = expect.iter(); + loop { + let o1 = delta_iter.next().await.unwrap(); + let o2 = expect_iter.next(); + assert_eq!(o1.is_some(), o2.is_some()); + if o1.is_none() && o2.is_none() { + break; + } + let (k1, l1, v1) = o1.unwrap(); + let (k2, l2, v2) = o2.unwrap(); + assert_eq!(&k1, k2); + assert_eq!(l1, *l2); + assert_eq!(&v1, v2); + } + } + + #[tokio::test] + async fn delta_layer_iterator() { + let harness = TenantHarness::create("delta_layer_iterator").await.unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + const N: usize = 1000; + let test_deltas = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10), + Lsn(0x10 * ((idx as u64) % 10 + 1)), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer = produce_delta_layer(&tenant, &tline, test_deltas.clone(), &ctx) + .await + .unwrap(); + let delta_layer = resident_layer.get_as_delta(&ctx).await.unwrap(); + for max_read_size in [1, 1024] { + for batch_size in [1, 2, 4, 8, 3, 7, 13] { + println!("running with batch_size={batch_size} 
max_read_size={max_read_size}"); + // Test if the batch size is correctly determined + let mut iter = delta_layer.iter(&ctx); + iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut num_items = 0; + for _ in 0..3 { + iter.next_batch().await.unwrap(); + num_items += iter.key_values_batch.len(); + if max_read_size == 1 { + // every key should be a batch b/c the value is larger than max_read_size + assert_eq!(iter.key_values_batch.len(), 1); + } else { + assert!(iter.key_values_batch.len() <= batch_size); + } + if num_items >= N { + break; + } + iter.key_values_batch.clear(); + } + // Test if the result is correct + let mut iter = delta_layer.iter(&ctx); + iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + assert_delta_iter_equal(&mut iter, &test_deltas).await; + } + } + } +} diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index f03c7642eb..875e223c9c 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -1,7 +1,9 @@ //! An ImageLayer represents an image or a snapshot of a key-range at -//! one particular LSN. It contains an image of all key-value pairs -//! in its key-range. Any key that falls into the image layer's range -//! but does not exist in the layer, does not exist. +//! one particular LSN. +//! +//! It contains an image of all key-value pairs in its key-range. Any key +//! that falls into the image layer's range but does not exist in the layer, +//! does not exist. //! //! An image layer is stored in a file on disk. The file is stored in //! timelines/ directory. Currently, there are no @@ -25,31 +27,40 @@ //! actual page images are stored in the "values" part. 
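+//!
+//! Rough sketch of the resulting file (illustrative only; blk 0 holds the
+//! summary, values follow, and the B-tree index starts at `index_start_blk`):
+//!
+//! ```text
+//! +---------+--------------------------+--------------------+
+//! | summary | values (page images)     | B-tree key index   |
+//! +---------+--------------------------+--------------------+
+//! ```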
use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; -use crate::page_cache::PAGE_SZ; -use crate::repository::{Key, KEY_SIZE}; +use crate::page_cache::{self, FileId, PAGE_SZ}; +use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; -use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; -use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; -use crate::tenant::storage_layer::{ - LayerAccessStats, ValueReconstructResult, ValueReconstructState, +use crate::tenant::block_io::{BlockBuf, FileBlockReader}; +use crate::tenant::disk_btree::{ + DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; -use crate::tenant::Timeline; -use crate::virtual_file::VirtualFile; +use crate::tenant::timeline::GetVectoredError; +use crate::tenant::vectored_blob_io::{ + BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, +}; +use crate::tenant::{PageReconstructError, Timeline}; +use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; +use crate::virtual_file::{self, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; -use anyhow::{bail, ensure, Context, Result}; -use bytes::Bytes; +use anyhow::{anyhow, bail, ensure, Context, Result}; +use bytes::{Bytes, BytesMut}; use camino::{Utf8Path, Utf8PathBuf}; use hex; -use pageserver_api::models::LayerAccessKind; -use pageserver_api::shard::TenantShardId; +use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; +use std::collections::VecDeque; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; use std::os::unix::prelude::FileExt; +use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; +use tokio_stream::StreamExt; use tracing::*; use utils::{ @@ -58,8 +69,10 @@ use utils::{ lsn::Lsn, }; -use super::filename::ImageFileName; -use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer}; +use super::layer_name::ImageLayerName; +use super::{ + AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, +}; /// /// Header stored in the beginning of the file @@ -124,7 +137,6 @@ pub struct ImageLayer { pub desc: PersistentLayerDesc, // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn pub lsn: Lsn, - access_stats: LayerAccessStats, inner: OnceCell, } @@ -148,10 +160,24 @@ pub struct ImageLayerInner { index_start_blk: u32, index_root_blk: u32, + key_range: Range, lsn: Lsn, - /// Reader object for reading blocks from the file. 
- file: FileBlockReader, + file: VirtualFile, + file_id: FileId, + + max_vectored_read_bytes: Option, +} + +impl ImageLayerInner { + pub(crate) fn layer_dbg_info(&self) -> String { + format!( + "image {}..{} {}", + self.key_range().start, + self.key_range().end, + self.lsn() + ) + } } impl std::fmt::Debug for ImageLayerInner { @@ -165,9 +191,12 @@ impl std::fmt::Debug for ImageLayerInner { impl ImageLayerInner { pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { - let file = &self.file; - let tree_reader = - DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file); + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); tree_reader.dump().await?; @@ -208,7 +237,7 @@ impl ImageLayer { return Ok(()); } - let inner = self.load(LayerAccessKind::Dump, ctx).await?; + let inner = self.load(ctx).await?; inner.dump(ctx).await?; @@ -219,7 +248,7 @@ impl ImageLayer { conf: &PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, - fname: &ImageFileName, + fname: &ImageLayerName, ) -> Utf8PathBuf { let rand_string: String = rand::thread_rng() .sample_iter(&Alphanumeric) @@ -235,12 +264,7 @@ impl ImageLayer { /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. /// - async fn load( - &self, - access_kind: LayerAccessKind, - ctx: &RequestContext, - ) -> Result<&ImageLayerInner> { - self.access_stats.record_access(access_kind, ctx); + async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> { self.inner .get_or_try_init(|| self.load_inner(ctx)) .await @@ -250,18 +274,17 @@ impl ImageLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result { let path = self.path(); - let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx) - .await - .and_then(|res| res)?; + let loaded = + ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx).await?; // not production code - let actual_filename = path.file_name().unwrap().to_owned(); - let expected_filename = self.layer_desc().filename().file_name(); + let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); + let expected_layer_name = self.layer_desc().layer_name(); - if actual_filename != expected_filename { + if actual_layer_name != expected_layer_name { println!("warning: filename does not match what is expected from in-file summary"); - println!("actual: {:?}", actual_filename); - println!("expected: {:?}", expected_filename); + println!("actual: {:?}", actual_layer_name.to_string()); + println!("expected: {:?}", expected_layer_name.to_string()); } Ok(loaded) @@ -292,7 +315,6 @@ impl ImageLayer { metadata.len(), ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. 
lsn: summary.lsn, - access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: OnceCell::new(), }) } @@ -325,57 +347,58 @@ impl ImageLayer { where F: Fn(Summary) -> Summary, { - let file = VirtualFile::open_with_options( + let mut file = VirtualFile::open_with_options( path, - &*std::fs::OpenOptions::new().read(true).write(true), + virtual_file::OpenOptions::new().read(true).write(true), + ctx, ) .await .with_context(|| format!("Failed to open file '{}'", path))?; - let file = FileBlockReader::new(file); - let summary_blk = file.read_blk(0, ctx).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?; - let mut file = file.file; if actual_summary.magic != IMAGE_FILE_MAGIC { return Err(RewriteSummaryError::MagicMismatch); } let new_summary = rewrite(actual_summary); - let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + let mut buf = Vec::with_capacity(PAGE_SZ); + // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; - if buf.spilled() { - // The code in ImageLayerWriterInner just warn!()s for this. - // It should probably error out as well. - return Err(RewriteSummaryError::Other(anyhow::anyhow!( - "Used more than one page size for summary buffer: {}", - buf.len() - ))); - } file.seek(SeekFrom::Start(0)).await?; - file.write_all(&buf).await?; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; + res?; Ok(()) } } impl ImageLayerInner { - /// Returns nested result following Result, Critical>: - /// - inner has the success or transient failure - /// - outer has the permanent failure + pub(crate) fn key_range(&self) -> &Range { + &self.key_range + } + + pub(crate) fn lsn(&self) -> Lsn { + self.lsn + } + pub(super) async fn load( path: &Utf8Path, lsn: Lsn, summary: Option, + max_vectored_read_bytes: Option, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - let file = match VirtualFile::open(path).await { - Ok(file) => file, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), - }; - let file = FileBlockReader::new(file); - let summary_blk = match file.read_blk(0, ctx).await { - Ok(blk) => blk, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), - }; + ) -> anyhow::Result { + let file = VirtualFile::open(path, ctx) + .await + .context("open layer file")?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader + .read_blk(0, ctx) + .await + .context("read first block")?; // length is the only way how this could fail, so it's not actually likely at all unless // read_blk returns wrong sized block. 
@@ -388,6 +411,8 @@ impl ImageLayerInner { // production code path expected_summary.index_start_blk = actual_summary.index_start_blk; expected_summary.index_root_blk = actual_summary.index_root_blk; + // mask out the timeline_id, but still require the layers to be from the same tenant + expected_summary.timeline_id = actual_summary.timeline_id; if actual_summary != expected_summary { bail!( @@ -398,50 +423,229 @@ impl ImageLayerInner { } } - Ok(Ok(ImageLayerInner { + Ok(ImageLayerInner { index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, lsn, file, - })) + file_id, + max_vectored_read_bytes, + key_range: actual_summary.key_range, + }) } - pub(super) async fn get_value_reconstruct_data( + // Look up the keys in the provided keyspace and update + // the reconstruct state with whatever is found. + pub(super) async fn get_values_reconstruct_data( &self, - key: Key, - reconstruct_state: &mut ValueReconstructState, + keyspace: KeySpace, + reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, - ) -> anyhow::Result { - let file = &self.file; - let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file); + ) -> Result<(), GetVectoredError> { + let reads = self + .plan_reads(keyspace, None, ctx) + .await + .map_err(GetVectoredError::Other)?; - let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; - key.write_to_byte_slice(&mut keybuf); - if let Some(offset) = tree_reader - .get( - &keybuf, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::ImageLayerBtreeNode) - .build(), + self.do_reads_and_update_state(reads, reconstruct_state, ctx) + .await; + + reconstruct_state.on_image_layer_visited(&self.key_range); + + Ok(()) + } + + /// Traverse the layer's index to build read operations on the overlap of the input keyspace + /// and the keys in this layer. + /// + /// If shard_identity is provided, it will be used to filter keys down to those stored on + /// this shard. 
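+    ///
+    /// The per-key decision this boils down to, as a sketch of the loop body
+    /// below:
+    ///
+    /// ```ignore
+    /// let flag = match shard_identity {
+    ///     Some(identity) if identity.is_key_disposable(&key) => BlobFlag::Ignore,
+    ///     _ => BlobFlag::None,
+    /// };
+    /// ```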
+ async fn plan_reads( + &self, + keyspace: KeySpace, + shard_identity: Option<&ShardIdentity>, + ctx: &RequestContext, + ) -> anyhow::Result> { + let mut planner = VectoredReadPlanner::new( + self.max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(), + ); + + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); + + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::ImageLayerBtreeNode) + .build(); + + for range in keyspace.ranges.iter() { + let mut range_end_handled = false; + let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; + range.start.write_to_byte_slice(&mut search_key); + + let index_stream = tree_reader.clone().into_stream(&search_key, &ctx); + let mut index_stream = std::pin::pin!(index_stream); + + while let Some(index_entry) = index_stream.next().await { + let (raw_key, offset) = index_entry?; + + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + assert!(key >= range.start); + + let flag = if let Some(shard_identity) = shard_identity { + if shard_identity.is_key_disposable(&key) { + BlobFlag::Ignore + } else { + BlobFlag::None + } + } else { + BlobFlag::None + }; + + if key >= range.end { + planner.handle_range_end(offset); + range_end_handled = true; + break; + } else { + planner.handle(key, self.lsn, offset, flag); + } + } + + if !range_end_handled { + let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64; + planner.handle_range_end(payload_end); + } + } + + Ok(planner.finish()) + } + + /// Given a key range, select the parts of that range that should be retained by the ShardIdentity, + /// then execute vectored GET operations, passing the results of all read keys into the writer. + pub(super) async fn filter( + &self, + shard_identity: &ShardIdentity, + writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + // Fragment the range into the regions owned by this ShardIdentity + let plan = self + .plan_reads( + KeySpace { + // If asked for the total key space, plan_reads will give us all the keys in the layer + ranges: vec![Key::MIN..Key::MAX], + }, + Some(shard_identity), + ctx, ) - .await? 
- { - let blob = file - .block_cursor() - .read_blob( - offset, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::ImageLayerValue) - .build(), - ) - .await - .with_context(|| format!("failed to read value from offset {}", offset))?; - let value = Bytes::from(blob); + .await?; - reconstruct_state.img = Some((self.lsn, value)); - Ok(ValueReconstructResult::Complete) - } else { - Ok(ValueReconstructResult::Missing) + let vectored_blob_reader = VectoredBlobReader::new(&self.file); + let mut key_count = 0; + for read in plan.into_iter() { + let buf_size = read.size(); + + let buf = BytesMut::with_capacity(buf_size); + let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?; + + let frozen_buf = blobs_buf.buf.freeze(); + + for meta in blobs_buf.blobs.iter() { + let img_buf = frozen_buf.slice(meta.start..meta.end); + + key_count += 1; + writer + .put_image(meta.meta.key, img_buf, ctx) + .await + .context(format!("Storing key {}", meta.meta.key))?; + } + } + + Ok(key_count) + } + + async fn do_reads_and_update_state( + &self, + reads: Vec, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) { + let max_vectored_read_bytes = self + .max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(); + + let vectored_blob_reader = VectoredBlobReader::new(&self.file); + for read in reads.into_iter() { + let buf_size = read.size(); + + if buf_size > max_vectored_read_bytes { + // If the read is oversized, it should only contain one key. + let offenders = read + .blobs_at + .as_slice() + .iter() + .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn)) + .join(", "); + tracing::warn!( + "Oversized vectored read ({} > {}) for keys {}", + buf_size, + max_vectored_read_bytes, + offenders + ); + } + + let buf = BytesMut::with_capacity(buf_size); + let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await; + + match res { + Ok(blobs_buf) => { + let frozen_buf = blobs_buf.buf.freeze(); + + for meta in blobs_buf.blobs.iter() { + let img_buf = frozen_buf.slice(meta.start..meta.end); + reconstruct_state.update_key( + &meta.meta.key, + self.lsn, + Value::Image(img_buf), + ); + } + } + Err(err) => { + let kind = err.kind(); + for (_, blob_meta) in read.blobs_at.as_slice() { + reconstruct_state.on_key_error( + blob_meta.key, + PageReconstructError::from(anyhow!( + "Failed to read blobs from virtual file {}: {}", + self.file.path, + kind + )), + ); + } + } + }; + } + } + + pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> { + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); + ImageLayerIterator { + image_layer: self, + ctx, + index_iter: tree_reader.iter(&[0; KEY_SIZE], ctx), + key_values_batch: VecDeque::new(), + is_end: false, + planner: StreamingVectoredReadPlanner::new( + 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. + 1024, // The default value. 
Unit tests might use a different value + ), } } } @@ -465,8 +669,25 @@ struct ImageLayerWriterInner { key_range: Range, lsn: Lsn, + // Total uncompressed bytes passed into put_image + uncompressed_bytes: u64, + + // Like `uncompressed_bytes`, + // but only of images we might consider for compression + uncompressed_bytes_eligible: u64, + + // Like `uncompressed_bytes`, but only of images + // where we have chosen their compressed form + uncompressed_bytes_chosen: u64, + + // Number of keys in the layer. + num_keys: usize, + blob_writer: BlobWriter, tree: DiskBtreeBuilder, + + #[cfg(feature = "testing")] + last_written_key: Key, } impl ImageLayerWriterInner { @@ -479,6 +700,7 @@ impl ImageLayerWriterInner { tenant_shard_id: TenantShardId, key_range: &Range, lsn: Lsn, + ctx: &RequestContext, ) -> anyhow::Result { // Create the file initially with a temporary filename. // We'll atomically rename it to the final name when we're done. @@ -486,17 +708,22 @@ impl ImageLayerWriterInner { conf, timeline_id, tenant_shard_id, - &ImageFileName { + &ImageLayerName { key_range: key_range.clone(), lsn, }, ); - info!("new image layer {path}"); - let mut file = VirtualFile::open_with_options( - &path, - std::fs::OpenOptions::new().write(true).create_new(true), - ) - .await?; + trace!("creating image layer {}", path); + let mut file = { + VirtualFile::open_with_options( + &path, + virtual_file::OpenOptions::new() + .write(true) + .create_new(true), + ctx, + ) + .await? + }; // make room for the header block file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?; let blob_writer = BlobWriter::new(file, PAGE_SZ as u64); @@ -514,6 +741,12 @@ impl ImageLayerWriterInner { lsn, tree: tree_builder, blob_writer, + uncompressed_bytes: 0, + uncompressed_bytes_eligible: 0, + uncompressed_bytes_chosen: 0, + num_keys: 0, + #[cfg(feature = "testing")] + last_written_key: Key::MIN, }; Ok(writer) @@ -524,24 +757,64 @@ impl ImageLayerWriterInner { /// /// The page versions must be appended in blknum order. /// - async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> { + async fn put_image( + &mut self, + key: Key, + img: Bytes, + ctx: &RequestContext, + ) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); - let off = self.blob_writer.write_blob(img).await?; + let compression = self.conf.image_compression; + let uncompressed_len = img.len() as u64; + self.uncompressed_bytes += uncompressed_len; + self.num_keys += 1; + let (_img, res) = self + .blob_writer + .write_blob_maybe_compressed(img.slice_len(), ctx, compression) + .await; + // TODO: re-use the buffer for `img` further upstack + let (off, compression_info) = res?; + if compression_info.compressed_size.is_some() { + // The image has been considered for compression at least + self.uncompressed_bytes_eligible += uncompressed_len; + } + if compression_info.written_compressed { + // The image has been compressed + self.uncompressed_bytes_chosen += uncompressed_len; + } let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); self.tree.append(&keybuf, off)?; + #[cfg(feature = "testing")] + { + self.last_written_key = key; + } + Ok(()) } /// /// Finish writing the image layer. 
/// - async fn finish(self, timeline: &Arc) -> anyhow::Result { + async fn finish( + self, + timeline: &Arc, + ctx: &RequestContext, + end_key: Option, + ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; + // Calculate compression ratio + let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header + crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes); + crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED + .inc_by(self.uncompressed_bytes_eligible); + crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen); + crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size); + let mut file = self.blob_writer.into_inner(); // Write out the index @@ -549,32 +822,34 @@ impl ImageLayerWriterInner { .await?; let (index_root_blk, block_buf) = self.tree.finish()?; for buf in block_buf.blocks { - file.write_all(buf.as_ref()).await?; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; + res?; } + let final_key_range = if let Some(end_key) = end_key { + self.key_range.start..end_key + } else { + self.key_range.clone() + }; + // Fill in the summary on blk 0 let summary = Summary { magic: IMAGE_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, tenant_id: self.tenant_shard_id.tenant_id, timeline_id: self.timeline_id, - key_range: self.key_range.clone(), + key_range: final_key_range.clone(), lsn: self.lsn, index_start_blk, index_root_blk, }; - let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + let mut buf = Vec::with_capacity(PAGE_SZ); + // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; - if buf.spilled() { - // This is bad as we only have one free block for the summary - warn!( - "Used more than one page size for summary buffer: {}", - buf.len() - ); - } file.seek(SeekFrom::Start(0)).await?; - file.write_all(&buf).await?; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; + res?; let metadata = file .metadata() @@ -584,11 +859,19 @@ impl ImageLayerWriterInner { let desc = PersistentLayerDesc::new_img( self.tenant_shard_id, self.timeline_id, - self.key_range.clone(), + final_key_range, self.lsn, metadata.len(), ); + #[cfg(feature = "testing")] + if let Some(end_key) = end_key { + assert!( + self.last_written_key < end_key, + "written key violates end_key range" + ); + } + // Note: Because we open the file in write-only mode, we cannot // reuse the same VirtualFile for reading later. That's why we don't // set inner.file here. The first read will have to re-open it. @@ -599,7 +882,7 @@ impl ImageLayerWriterInner { // FIXME: why not carry the virtualfile here, it supports renaming? let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?; - trace!("created image layer {}", layer.local_path()); + info!("created image layer {}", layer.local_path()); Ok(layer) } @@ -641,10 +924,11 @@ impl ImageLayerWriter { tenant_shard_id: TenantShardId, key_range: &Range, lsn: Lsn, + ctx: &RequestContext, ) -> anyhow::Result { Ok(Self { inner: Some( - ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn) + ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn, ctx) .await?, ), }) @@ -655,8 +939,23 @@ impl ImageLayerWriter { /// /// The page versions must be appended in blknum order. 
/// - pub async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> { - self.inner.as_mut().unwrap().put_image(key, img).await + pub async fn put_image( + &mut self, + key: Key, + img: Bytes, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.inner.as_mut().unwrap().put_image(key, img, ctx).await + } + + /// Estimated size of the image layer. + pub(crate) fn estimated_size(&self) -> u64 { + let inner = self.inner.as_ref().unwrap(); + inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64 + } + + pub(crate) fn num_keys(&self) -> usize { + self.inner.as_ref().unwrap().num_keys } /// @@ -665,8 +964,23 @@ impl ImageLayerWriter { pub(crate) async fn finish( mut self, timeline: &Arc, + ctx: &RequestContext, ) -> anyhow::Result { - self.inner.take().unwrap().finish(timeline).await + self.inner.take().unwrap().finish(timeline, ctx, None).await + } + + /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive. + pub(super) async fn finish_with_end_key( + mut self, + timeline: &Arc, + end_key: Key, + ctx: &RequestContext, + ) -> anyhow::Result { + self.inner + .take() + .unwrap() + .finish(timeline, ctx, Some(end_key)) + .await } } @@ -677,3 +991,384 @@ impl Drop for ImageLayerWriter { } } } + +pub struct ImageLayerIterator<'a> { + image_layer: &'a ImageLayerInner, + ctx: &'a RequestContext, + planner: StreamingVectoredReadPlanner, + index_iter: DiskBtreeIterator<'a>, + key_values_batch: VecDeque<(Key, Lsn, Value)>, + is_end: bool, +} + +impl<'a> ImageLayerIterator<'a> { + pub(crate) fn layer_dbg_info(&self) -> String { + self.image_layer.layer_dbg_info() + } + + /// Retrieve a batch of key-value pairs into the iterator buffer. 
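+    ///
+    /// Note that every buffered item carries the layer's single LSN, since an
+    /// image layer is a snapshot at exactly one LSN; each item has the shape
+    /// (as a sketch):
+    ///
+    /// ```ignore
+    /// (key, self.image_layer.lsn, Value::Image(bytes))
+    /// ```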
+ async fn next_batch(&mut self) -> anyhow::Result<()> { + assert!(self.key_values_batch.is_empty()); + assert!(!self.is_end); + + let plan = loop { + if let Some(res) = self.index_iter.next().await { + let (raw_key, offset) = res?; + if let Some(batch_plan) = self.planner.handle( + Key::from_slice(&raw_key[..KEY_SIZE]), + self.image_layer.lsn, + offset, + ) { + break batch_plan; + } + } else { + self.is_end = true; + let payload_end = self.image_layer.index_start_blk as u64 * PAGE_SZ as u64; + if let Some(item) = self.planner.handle_range_end(payload_end) { + break item; + } else { + return Ok(()); // TODO: a test case on empty iterator + } + } + }; + let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file); + let mut next_batch = std::collections::VecDeque::new(); + let buf_size = plan.size(); + let buf = BytesMut::with_capacity(buf_size); + let blobs_buf = vectored_blob_reader + .read_blobs(&plan, buf, self.ctx) + .await?; + let frozen_buf: Bytes = blobs_buf.buf.freeze(); + for meta in blobs_buf.blobs.iter() { + let img_buf = frozen_buf.slice(meta.start..meta.end); + next_batch.push_back((meta.meta.key, self.image_layer.lsn, Value::Image(img_buf))); + } + self.key_values_batch = next_batch; + Ok(()) + } + + pub async fn next(&mut self) -> anyhow::Result> { + if self.key_values_batch.is_empty() { + if self.is_end { + return Ok(None); + } + self.next_batch().await?; + } + Ok(Some( + self.key_values_batch + .pop_front() + .expect("should not be empty"), + )) + } +} + +#[cfg(test)] +mod test { + use std::{sync::Arc, time::Duration}; + + use bytes::Bytes; + use itertools::Itertools; + use pageserver_api::{ + key::Key, + shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}, + }; + use utils::{ + generation::Generation, + id::{TenantId, TimelineId}, + lsn::Lsn, + }; + + use crate::{ + context::RequestContext, + repository::Value, + tenant::{ + config::TenantConf, + harness::{TenantHarness, TIMELINE_ID}, + storage_layer::ResidentLayer, + vectored_blob_io::StreamingVectoredReadPlanner, + Tenant, Timeline, + }, + DEFAULT_PG_VERSION, + }; + + use super::{ImageLayerIterator, ImageLayerWriter}; + + #[tokio::test] + async fn image_layer_rewrite() { + let tenant_conf = TenantConf { + gc_period: Duration::ZERO, + compaction_period: Duration::ZERO, + ..TenantConf::default() + }; + let tenant_id = TenantId::generate(); + let mut gen = Generation::new(0xdead0001); + let mut get_next_gen = || { + let ret = gen; + gen = gen.next(); + ret + }; + // The LSN at which we will create an image layer to filter + let lsn = Lsn(0xdeadbeef0000); + let timeline_id = TimelineId::generate(); + + // + // Create an unsharded parent with a layer. 
+ // + + let harness = TenantHarness::create_custom( + "test_image_layer_rewrite--parent", + tenant_conf.clone(), + tenant_id, + ShardIdentity::unsharded(), + get_next_gen(), + ) + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap(); + let range = input_start..input_end; + + // Build an image layer to filter + let resident = { + let mut writer = ImageLayerWriter::new( + harness.conf, + timeline_id, + harness.tenant_shard_id, + &range, + lsn, + &ctx, + ) + .await + .unwrap(); + + let foo_img = Bytes::from_static(&[1, 2, 3, 4]); + let mut key = range.start; + while key < range.end { + writer.put_image(key, foo_img.clone(), &ctx).await.unwrap(); + + key = key.next(); + } + writer.finish(&timeline, &ctx).await.unwrap() + }; + let original_size = resident.metadata().file_size; + + // + // Create child shards and do the rewrite, exercising filter(). + // TODO: abstraction in TenantHarness for splits. + // + + // Filter for various shards: this exercises cases like values at start of key range, end of key + // range, middle of key range. + let shard_count = ShardCount::new(4); + for shard_number in 0..shard_count.count() { + // + // mimic the shard split + // + let shard_identity = ShardIdentity::new( + ShardNumber(shard_number), + shard_count, + ShardStripeSize(0x8000), + ) + .unwrap(); + let harness = TenantHarness::create_custom( + Box::leak(Box::new(format!( + "test_image_layer_rewrite--child{}", + shard_identity.shard_slug() + ))), + tenant_conf.clone(), + tenant_id, + shard_identity, + // NB: in reality, the shards would each fork off their own gen number sequence from the parent. + // But here, all we care about is that the gen number is unique. + get_next_gen(), + ) + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + // + // use filter() and make assertions + // + + let mut filtered_writer = ImageLayerWriter::new( + harness.conf, + timeline_id, + harness.tenant_shard_id, + &range, + lsn, + &ctx, + ) + .await + .unwrap(); + + let wrote_keys = resident + .filter(&shard_identity, &mut filtered_writer, &ctx) + .await + .unwrap(); + let replacement = if wrote_keys > 0 { + Some(filtered_writer.finish(&timeline, &ctx).await.unwrap()) + } else { + None + }; + + // This exact size and those below will need updating as/when the layer encoding changes, but + // should be deterministic for a given version of the format, as we used no randomness generating the input. + assert_eq!(original_size, 1597440); + + match shard_number { + 0 => { + // We should have written out just one stripe for our shard identity + assert_eq!(wrote_keys, 0x8000); + let replacement = replacement.unwrap(); + + // We should have dropped some of the data + assert!(replacement.metadata().file_size < original_size); + assert!(replacement.metadata().file_size > 0); + + // Assert that we dropped ~3/4 of the data. 
+                    assert_eq!(replacement.metadata().file_size, 417792);
+                }
+                1 => {
+                    // Shard 1 has no keys in our input range
+                    assert_eq!(wrote_keys, 0x0);
+                    assert!(replacement.is_none());
+                }
+                2 => {
+                    // Shard 2 has one stripe in the input range
+                    assert_eq!(wrote_keys, 0x8000);
+                    let replacement = replacement.unwrap();
+                    assert!(replacement.metadata().file_size < original_size);
+                    assert!(replacement.metadata().file_size > 0);
+                    assert_eq!(replacement.metadata().file_size, 417792);
+                }
+                3 => {
+                    // Shard 3 has two stripes in the input range
+                    assert_eq!(wrote_keys, 0x10000);
+                    let replacement = replacement.unwrap();
+                    assert!(replacement.metadata().file_size < original_size);
+                    assert!(replacement.metadata().file_size > 0);
+                    assert_eq!(replacement.metadata().file_size, 811008);
+                }
+                _ => unreachable!(),
+            }
+        }
+    }
+
+    async fn produce_image_layer(
+        tenant: &Tenant,
+        tline: &Arc<Timeline>,
+        mut images: Vec<(Key, Bytes)>,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ResidentLayer> {
+        images.sort();
+        let (key_start, _) = images.first().unwrap();
+        let (key_last, _) = images.last().unwrap();
+        let key_end = key_last.next();
+        let key_range = *key_start..key_end;
+        let mut writer = ImageLayerWriter::new(
+            tenant.conf,
+            tline.timeline_id,
+            tenant.tenant_shard_id,
+            &key_range,
+            lsn,
+            ctx,
+        )
+        .await?;
+
+        for (key, img) in images {
+            writer.put_image(key, img, ctx).await?;
+        }
+        let img_layer = writer.finish(tline, ctx).await?;
+
+        Ok::<_, anyhow::Error>(img_layer)
+    }
+
+    async fn assert_img_iter_equal(
+        img_iter: &mut ImageLayerIterator<'_>,
+        expect: &[(Key, Bytes)],
+        expect_lsn: Lsn,
+    ) {
+        let mut expect_iter = expect.iter();
+        loop {
+            let o1 = img_iter.next().await.unwrap();
+            let o2 = expect_iter.next();
+            match (o1, o2) {
+                (None, None) => break,
+                (Some((k1, l1, v1)), Some((k2, i2))) => {
+                    let Value::Image(i1) = v1 else {
+                        panic!("expect Value::Image")
+                    };
+                    assert_eq!(&k1, k2);
+                    assert_eq!(l1, expect_lsn);
+                    assert_eq!(&i1, i2);
+                }
+                (o1, o2) => panic!("iterators length mismatch: {:?}, {:?}", o1, o2),
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn image_layer_iterator() {
+        let harness = TenantHarness::create("image_layer_iterator").await.unwrap();
+        let (tenant, ctx) = harness.load().await;
+
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+
+        fn get_key(id: u32) -> Key {
+            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+        const N: usize = 1000;
+        let test_imgs = (0..N)
+            .map(|idx| (get_key(idx as u32), Bytes::from(format!("img{idx:05}"))))
+            .collect_vec();
+        let resident_layer =
+            produce_image_layer(&tenant, &tline, test_imgs.clone(), Lsn(0x10), &ctx)
+                .await
+                .unwrap();
+        let img_layer = resident_layer.get_as_image(&ctx).await.unwrap();
+        for max_read_size in [1, 1024] {
+            for batch_size in [1, 2, 4, 8, 3, 7, 13] {
+                println!("running with batch_size={batch_size} max_read_size={max_read_size}");
+                // Test if the batch size is correctly determined
+                let mut iter = img_layer.iter(&ctx);
+                iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size);
+                let mut num_items = 0;
+                for _ in 0..3 {
+                    iter.next_batch().await.unwrap();
+                    num_items += iter.key_values_batch.len();
+                    if max_read_size == 1 {
+                        // every key should be a batch b/c the value is larger than max_read_size
+                        assert_eq!(iter.key_values_batch.len(), 1);
+                    } else {
+                        assert!(iter.key_values_batch.len() <= batch_size);
+                    }
+                    if num_items >= N {
+                        break;
+                    }
iter.key_values_batch.clear(); + } + // Test if the result is correct + let mut iter = img_layer.iter(&ctx); + iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + assert_img_iter_equal(&mut iter, &test_imgs, Lsn(0x10)).await; + } + } + } +} diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 7c9103eea8..e487bee1f2 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -4,33 +4,51 @@ //! held in an ephemeral file, not in memory. The metadata for each page version, i.e. //! its position in the file, is kept in memory, though. //! +use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::repository::{Key, Value}; -use crate::tenant::block_io::BlockReader; use crate::tenant::ephemeral_file::EphemeralFile; -use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState}; -use crate::tenant::Timeline; -use crate::walrecord; -use anyhow::{ensure, Result}; +use crate::tenant::timeline::GetVectoredError; +use crate::tenant::PageReconstructError; +use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; +use crate::{l0_flush, page_cache}; +use anyhow::{anyhow, Context, Result}; +use bytes::Bytes; +use camino::Utf8PathBuf; +use pageserver_api::key::CompactKey; +use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::sync::{Arc, OnceLock}; +use std::time::Instant; use tracing::*; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods -use std::fmt::Write as _; +use crate::metrics::TIMELINE_EPHEMERAL_BYTES; +use std::cmp::Ordering; +use std::fmt::Write; use std::ops::Range; -use tokio::sync::{RwLock, RwLockWriteGuard}; +use std::sync::atomic::Ordering as AtomicOrdering; +use std::sync::atomic::{AtomicU64, AtomicUsize}; +use tokio::sync::RwLock; -use super::{DeltaLayerWriter, ResidentLayer}; +use super::{ + DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState, +}; + +pub(crate) mod vectored_dio_read; + +#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] +pub(crate) struct InMemoryLayerFileId(page_cache::FileId); pub struct InMemoryLayer { conf: &'static PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + file_id: InMemoryLayerFileId, /// This layer contains all the changes from 'start_lsn'. The /// start is inclusive. @@ -38,7 +56,12 @@ pub struct InMemoryLayer { /// Frozen layers have an exclusive end LSN. /// Writes are only allowed when this is `None`. - end_lsn: OnceLock, + pub(crate) end_lsn: OnceLock, + + /// Used for traversal path. Cached representation of the in-memory layer after frozen. + frozen_local_path_str: OnceLock>, + + opened_at: Instant, /// The above fields never change, except for `end_lsn`, which is only set once. /// All other changing parts are in `inner`, and protected by a mutex. @@ -56,15 +79,165 @@ impl std::fmt::Debug for InMemoryLayer { } pub struct InMemoryLayerInner { - /// All versions of all pages in the layer are kept here. Indexed - /// by block number and LSN. 
The value is an offset into the + /// All versions of all pages in the layer are kept here. Indexed + /// by block number and LSN. The [`IndexEntry`] is an offset into the /// ephemeral file where the page version is stored. - index: HashMap>, + index: BTreeMap>, /// The values are stored in a serialized format in this file. /// Each serialized Value is preceded by a 'u32' length field. /// PerSeg::page_versions map stores offsets into this file. file: EphemeralFile, + + resource_units: GlobalResourceUnits, +} + +/// Support the same max blob length as blob_io, because ultimately +/// all the InMemoryLayer contents end up being written into a delta layer, +/// using the [`crate::tenant::blob_io`]. +const MAX_SUPPORTED_BLOB_LEN: usize = crate::tenant::blob_io::MAX_SUPPORTED_BLOB_LEN; +const MAX_SUPPORTED_BLOB_LEN_BITS: usize = { + let trailing_ones = MAX_SUPPORTED_BLOB_LEN.trailing_ones() as usize; + let leading_zeroes = MAX_SUPPORTED_BLOB_LEN.leading_zeros() as usize; + assert!(trailing_ones + leading_zeroes == std::mem::size_of::() * 8); + trailing_ones +}; + +/// See [`InMemoryLayerInner::index`]. +/// +/// For memory efficiency, the data is packed into a u64. +/// +/// Layout: +/// - 1 bit: `will_init` +/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len` +/// - [`MAX_SUPPORTED_POS_BITS`]: `pos` +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct IndexEntry(u64); + +impl IndexEntry { + /// See [`Self::MAX_SUPPORTED_POS`]. + const MAX_SUPPORTED_POS_BITS: usize = { + let remainder = 64 - 1 - MAX_SUPPORTED_BLOB_LEN_BITS; + if remainder < 32 { + panic!("pos can be u32 as per type system, support that"); + } + remainder + }; + /// The maximum supported blob offset that can be represented by [`Self`]. + /// See also [`Self::validate_checkpoint_distance`]. + const MAX_SUPPORTED_POS: usize = (1 << Self::MAX_SUPPORTED_POS_BITS) - 1; + + // Layout + const WILL_INIT_RANGE: Range = 0..1; + const LEN_RANGE: Range = + Self::WILL_INIT_RANGE.end..Self::WILL_INIT_RANGE.end + MAX_SUPPORTED_BLOB_LEN_BITS; + const POS_RANGE: Range = + Self::LEN_RANGE.end..Self::LEN_RANGE.end + Self::MAX_SUPPORTED_POS_BITS; + const _ASSERT: () = { + if Self::POS_RANGE.end != 64 { + panic!("we don't want undefined bits for our own sanity") + } + }; + + /// Fails if and only if the offset or length encoded in `arg` is too large to be represented by [`Self`]. + /// + /// The only reason why that can happen in the system is if the [`InMemoryLayer`] grows too long. + /// The [`InMemoryLayer`] size is determined by the checkpoint distance, enforced by [`crate::tenant::Timeline::should_roll`]. + /// + /// Thus, to avoid failure of this function, whenever we start up and/or change checkpoint distance, + /// call [`Self::validate_checkpoint_distance`] with the new checkpoint distance value. + /// + /// TODO: this check should happen ideally at config parsing time (and in the request handler when a change to checkpoint distance is requested) + /// When cleaning this up, also look into the s3 max file size check that is performed in delta layer writer. 
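+    ///
+    /// Illustration with hypothetical numbers: if `MAX_SUPPORTED_BLOB_LEN` were `0x0fff_ffff`
+    /// (28 trailing one-bits), the layout packed here would be bit 0 = `will_init`,
+    /// bits 1..29 = `len`, and bits 29..64 = `pos`, i.e. `MAX_SUPPORTED_POS = 2^35 - 1`.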
+ #[inline(always)] + fn new(arg: IndexEntryNewArgs) -> anyhow::Result { + let IndexEntryNewArgs { + base_offset, + batch_offset, + len, + will_init, + } = arg; + + let pos = base_offset + .checked_add(batch_offset) + .ok_or_else(|| anyhow::anyhow!("base_offset + batch_offset overflows u64: base_offset={base_offset} batch_offset={batch_offset}"))?; + + if pos.into_usize() > Self::MAX_SUPPORTED_POS { + anyhow::bail!( + "base_offset+batch_offset exceeds the maximum supported value: base_offset={base_offset} batch_offset={batch_offset} (+)={pos} max={max}", + max = Self::MAX_SUPPORTED_POS + ); + } + + if len > MAX_SUPPORTED_BLOB_LEN { + anyhow::bail!( + "len exceeds the maximum supported length: len={len} max={MAX_SUPPORTED_BLOB_LEN}", + ); + } + + let mut data: u64 = 0; + use bit_field::BitField; + data.set_bits(Self::WILL_INIT_RANGE, if will_init { 1 } else { 0 }); + data.set_bits(Self::LEN_RANGE, len.into_u64()); + data.set_bits(Self::POS_RANGE, pos); + + Ok(Self(data)) + } + + #[inline(always)] + fn unpack(&self) -> IndexEntryUnpacked { + use bit_field::BitField; + IndexEntryUnpacked { + will_init: self.0.get_bits(Self::WILL_INIT_RANGE) != 0, + len: self.0.get_bits(Self::LEN_RANGE), + pos: self.0.get_bits(Self::POS_RANGE), + } + } + + /// See [`Self::new`]. + pub(crate) const fn validate_checkpoint_distance( + checkpoint_distance: u64, + ) -> Result<(), &'static str> { + if checkpoint_distance > Self::MAX_SUPPORTED_POS as u64 { + return Err("exceeds the maximum supported value"); + } + let res = u64_to_usize(checkpoint_distance).checked_add(MAX_SUPPORTED_BLOB_LEN); + if res.is_none() { + return Err( + "checkpoint distance + max supported blob len overflows in-memory addition", + ); + } + + // NB: it is ok for the result of the addition to be larger than MAX_SUPPORTED_POS + + Ok(()) + } + + const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = { + let res = Self::validate_checkpoint_distance( + pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE, + ); + if res.is_err() { + panic!("default checkpoint distance is valid") + } + }; +} + +/// Args to [`IndexEntry::new`]. +#[derive(Clone, Copy)] +struct IndexEntryNewArgs { + base_offset: u64, + batch_offset: u64, + len: usize, + will_init: bool, +} + +/// Unpacked representation of the bitfielded [`IndexEntry`]. +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +struct IndexEntryUnpacked { + will_init: bool, + len: u64, + pos: u64, } impl std::fmt::Debug for InMemoryLayerInner { @@ -73,7 +246,126 @@ impl std::fmt::Debug for InMemoryLayerInner { } } +/// State shared by all in-memory (ephemeral) layers. Updated infrequently during background ticks in Timeline, +/// to minimize contention. +/// +/// This global state is used to implement behaviors that require a global view of the system, e.g. +/// rolling layers proactively to limit the total amount of dirty data. +pub(crate) struct GlobalResources { + // Limit on how high dirty_bytes may grow before we start freezing layers to reduce it. + // Zero means unlimited. + pub(crate) max_dirty_bytes: AtomicU64, + // How many bytes are in all EphemeralFile objects + dirty_bytes: AtomicU64, + // How many layers are contributing to dirty_bytes + dirty_layers: AtomicUsize, +} + +// Per-timeline RAII struct for its contribution to [`GlobalResources`] +struct GlobalResourceUnits { + // How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible + // for decrementing the global counter by this many bytes when dropped. 
+    dirty_bytes: u64,
+}
+
+impl GlobalResourceUnits {
+    // Hint for the layer append path to update us when the layer size differs from the last
+    // call to publish_size by this much. If we don't reach this threshold, we'll still get
+    // updated when the Timeline "ticks" in the background.
+    const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024;
+
+    fn new() -> Self {
+        GLOBAL_RESOURCES
+            .dirty_layers
+            .fetch_add(1, AtomicOrdering::Relaxed);
+        Self { dirty_bytes: 0 }
+    }
+
+    /// Do not call this frequently: all timelines will write to these same global atomics,
+    /// so this is a relatively expensive operation. Wait at least a few seconds between calls.
+    ///
+    /// Returns the effective layer size limit that should be applied, if any, to keep
+    /// the total number of dirty bytes below the configured maximum.
+    fn publish_size(&mut self, size: u64) -> Option<u64> {
+        let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
+            Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
+            Ordering::Greater => {
+                let delta = size - self.dirty_bytes;
+                let old = GLOBAL_RESOURCES
+                    .dirty_bytes
+                    .fetch_add(delta, AtomicOrdering::Relaxed);
+                old + delta
+            }
+            Ordering::Less => {
+                let delta = self.dirty_bytes - size;
+                let old = GLOBAL_RESOURCES
+                    .dirty_bytes
+                    .fetch_sub(delta, AtomicOrdering::Relaxed);
+                old - delta
+            }
+        };
+
+        // This is a sloppy update: concurrent updates to the counter will race, and the exact
+        // value of the metric might not be the exact latest value of GLOBAL_RESOURCES::dirty_bytes.
+        // That's okay: as long as the metric contains some recent value, it doesn't have to always
+        // be literally the last update.
+        TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);
+
+        self.dirty_bytes = size;
+
+        let max_dirty_bytes = GLOBAL_RESOURCES
+            .max_dirty_bytes
+            .load(AtomicOrdering::Relaxed);
+        if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes {
+            // Set the layer file limit to the average layer size: this implies that all above-average
+            // sized layers will be eligible for freezing. They will be frozen in the order they
+            // next enter publish_size.
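+            // Example with made-up numbers: 100 dirty layers totalling 10 GiB against an
+            // 8 GiB limit yields a cap of ~100 MiB, so only above-average layers freeze.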
+            Some(
+                new_global_dirty_bytes
+                    / GLOBAL_RESOURCES.dirty_layers.load(AtomicOrdering::Relaxed) as u64,
+            )
+        } else {
+            None
+        }
+    }
+
+    // Call publish_size if the input size differs from the last published size by more than
+    // the drift limit
+    fn maybe_publish_size(&mut self, size: u64) {
+        let publish = match size.cmp(&self.dirty_bytes) {
+            Ordering::Equal => false,
+            Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT,
+            Ordering::Less => self.dirty_bytes - size > Self::MAX_SIZE_DRIFT,
+        };
+
+        if publish {
+            self.publish_size(size);
+        }
+    }
+}
+
+impl Drop for GlobalResourceUnits {
+    fn drop(&mut self) {
+        GLOBAL_RESOURCES
+            .dirty_layers
+            .fetch_sub(1, AtomicOrdering::Relaxed);
+
+        // Subtract our contribution to the global total dirty bytes
+        self.publish_size(0);
+    }
+}
+
+pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
+    max_dirty_bytes: AtomicU64::new(0),
+    dirty_bytes: AtomicU64::new(0),
+    dirty_layers: AtomicUsize::new(0),
+};
+
 impl InMemoryLayer {
+    pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
+        self.file_id
+    }
+
     pub(crate) fn get_timeline_id(&self) -> TimelineId {
         self.timeline_id
     }
@@ -88,6 +380,10 @@ impl InMemoryLayer {
         }
     }
 
+    pub(crate) fn try_len(&self) -> Option<u64> {
+        self.inner.try_read().map(|i| i.file.len()).ok()
+    }
+
     pub(crate) fn assert_writable(&self) {
         assert!(self.end_lsn.get().is_none());
     }
@@ -103,9 +399,7 @@ impl InMemoryLayer {
     /// debugging function to print out the contents of the layer
     ///
     /// this is likely completely unused
-    pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
-        let inner = self.inner.read().await;
-
+    pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
         let end_str = self.end_lsn_or_max();
 
         println!(
@@ -113,101 +407,192 @@ impl InMemoryLayer {
             self.timeline_id, self.start_lsn, end_str,
         );
 
-        if !verbose {
-            return Ok(());
-        }
-
-        let cursor = inner.file.block_cursor();
-        let mut buf = Vec::new();
-        for (key, vec_map) in inner.index.iter() {
-            for (lsn, pos) in vec_map.as_slice() {
-                let mut desc = String::new();
-                cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
-                let val = Value::des(&buf);
-                match val {
-                    Ok(Value::Image(img)) => {
-                        write!(&mut desc, " img {} bytes", img.len())?;
-                    }
-                    Ok(Value::WalRecord(rec)) => {
-                        let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
-                        write!(
-                            &mut desc,
-                            " rec {} bytes will_init: {} {}",
-                            buf.len(),
-                            rec.will_init(),
-                            wal_desc
-                        )?;
-                    }
-                    Err(err) => {
-                        write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
-                    }
-                }
-                println!("  key {} at {}: {}", key, lsn, desc);
-            }
-        }
-
         Ok(())
     }
 
-    /// Look up given value in the layer.
-    pub(crate) async fn get_value_reconstruct_data(
+    // Look up the keys in the provided keyspace and update
+    // the reconstruct state with whatever is found.
+    //
+    // If the key is cached, go no further than the cached Lsn.
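+    //
+    // The implementation runs in two phases: first plan all value reads while holding the
+    // read lock on `inner`, then execute them as a single vectored read pass and feed the
+    // deserialized values into `reconstruct_state`.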
+ pub(crate) async fn get_values_reconstruct_data( &self, - key: Key, - lsn_range: Range, - reconstruct_state: &mut ValueReconstructState, + keyspace: KeySpace, + end_lsn: Lsn, + reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, - ) -> anyhow::Result { - ensure!(lsn_range.start >= self.start_lsn); - let mut need_image = true; - + ) -> Result<(), GetVectoredError> { let ctx = RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) .build(); let inner = self.inner.read().await; - let reader = inner.file.block_cursor(); + struct ValueRead { + entry_lsn: Lsn, + read: vectored_dio_read::LogicalRead>, + } + let mut reads: HashMap> = HashMap::new(); - // Scan the page versions backwards, starting from `lsn`. - if let Some(vec_map) = inner.index.get(&key) { - let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, pos) in slice.iter().rev() { - let buf = reader.read_blob(*pos, &ctx).await?; - let value = Value::des(&buf)?; - match value { - Value::Image(img) => { - reconstruct_state.img = Some((*entry_lsn, img)); - return Ok(ValueReconstructResult::Complete); - } - Value::WalRecord(rec) => { - let will_init = rec.will_init(); - reconstruct_state.records.push((*entry_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } + for range in keyspace.ranges.iter() { + for (key, vec_map) in inner + .index + .range(range.start.to_compact()..range.end.to_compact()) + { + let key = Key::from_compact(*key); + let lsn_range = match reconstruct_state.get_cached_lsn(&key) { + Some(cached_lsn) => (cached_lsn + 1)..end_lsn, + None => self.start_lsn..end_lsn, + }; + + let slice = vec_map.slice_range(lsn_range); + + for (entry_lsn, index_entry) in slice.iter().rev() { + let IndexEntryUnpacked { + pos, + len, + will_init, + } = index_entry.unpack(); + reads.entry(key).or_default().push(ValueRead { + entry_lsn: *entry_lsn, + read: vectored_dio_read::LogicalRead::new( + pos, + Vec::with_capacity(len as usize), + ), + }); + if will_init { + break; } } } } - // release lock on 'inner' + // Execute the reads. - // If an older page image is needed to reconstruct the page, let the - // caller know. 
- if need_image { - Ok(ValueReconstructResult::Continue) - } else { - Ok(ValueReconstructResult::Complete) + let f = vectored_dio_read::execute( + &inner.file, + reads + .iter() + .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)), + &ctx, + ); + send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865 + .await; + + // Process results into the reconstruct state + 'next_key: for (key, value_reads) in reads { + for ValueRead { entry_lsn, read } in value_reads { + match read.into_result().expect("we run execute() above") { + Err(e) => { + reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e))); + continue 'next_key; + } + Ok(value_buf) => { + let value = Value::des(&value_buf); + if let Err(e) = value { + reconstruct_state + .on_key_error(key, PageReconstructError::from(anyhow!(e))); + continue 'next_key; + } + + let key_situation = + reconstruct_state.update_key(&key, entry_lsn, value.unwrap()); + if key_situation == ValueReconstructSituation::Complete { + // TODO: metric to see if we fetched more values than necessary + continue 'next_key; + } + + // process the next value in the next iteration of the loop + } + } + } } + + reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn); + + Ok(()) } } +/// Offset of a particular Value within a serialized batch. +struct SerializedBatchOffset { + key: CompactKey, + lsn: Lsn, + // TODO: separate type when we start serde-serializing this value, to avoid coupling + // in-memory representation to serialization format. + index_entry: IndexEntry, +} + +pub struct SerializedBatch { + /// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`]. + pub(crate) raw: Vec, + + /// Index of values in [`Self::raw`], using offsets relative to the start of the buffer. + offsets: Vec, + + /// The highest LSN of any value in the batch + pub(crate) max_lsn: Lsn, +} + +impl SerializedBatch { + pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> anyhow::Result { + // Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by + // [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`] + let buffer_size = batch.iter().map(|i| i.2).sum::(); + let mut cursor = std::io::Cursor::new(Vec::::with_capacity(buffer_size)); + + let mut offsets: Vec = Vec::with_capacity(batch.len()); + let mut max_lsn: Lsn = Lsn(0); + for (key, lsn, val_ser_size, val) in batch { + let relative_off = cursor.position(); + + val.ser_into(&mut cursor) + .expect("Writing into in-memory buffer is infallible"); + + offsets.push(SerializedBatchOffset { + key, + lsn, + index_entry: IndexEntry::new(IndexEntryNewArgs { + base_offset: 0, + batch_offset: relative_off, + len: val_ser_size, + will_init: val.will_init(), + }) + .context("higher-level code ensures that values are within supported ranges")?, + }); + max_lsn = std::cmp::max(max_lsn, lsn); + } + + let buffer = cursor.into_inner(); + + // Assert that we didn't do any extra allocations while building buffer. 
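+        // (`<=` rather than `==`: `buffer_size` was computed from the callers' serialized-size
+        // hints, so it is an upper bound on what `ser_into` actually wrote.)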
+ debug_assert!(buffer.len() <= buffer_size); + + Ok(Self { + raw: buffer, + offsets, + max_lsn, + }) + } +} + +fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result { + write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0) +} + +fn inmem_layer_log_display( + mut f: impl Write, + timeline: TimelineId, + start_lsn: Lsn, + end_lsn: Lsn, +) -> std::fmt::Result { + write!(f, "timeline {} in-memory ", timeline)?; + inmem_layer_display(f, start_lsn, end_lsn) +} + impl std::fmt::Display for InMemoryLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let end_lsn = self.end_lsn_or_max(); - write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0) + inmem_layer_display(f, self.start_lsn, end_lsn) } } @@ -224,92 +609,112 @@ impl InMemoryLayer { timeline_id: TimelineId, tenant_shard_id: TenantShardId, start_lsn: Lsn, + gate_guard: utils::sync::gate::GateGuard, + ctx: &RequestContext, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?; + let file = + EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?; + let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { + file_id: key, + frozen_local_path_str: OnceLock::new(), conf, timeline_id, tenant_shard_id, start_lsn, end_lsn: OnceLock::new(), + opened_at: Instant::now(), inner: RwLock::new(InMemoryLayerInner { - index: HashMap::new(), + index: BTreeMap::new(), file, + resource_units: GlobalResourceUnits::new(), }), }) } - // Write operations - - /// Common subroutine of the public put_wal_record() and put_page_image() functions. - /// Adds the page version to the in-memory tree - pub(crate) async fn put_value( + /// Write path. + /// + /// Errors are not retryable, the [`InMemoryLayer`] must be discarded, and not be read from. + /// The reason why it's not retryable is that the [`EphemeralFile`] writes are not retryable. + /// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors. + pub async fn put_batch( &self, - key: Key, - lsn: Lsn, - val: &Value, + serialized_batch: SerializedBatch, ctx: &RequestContext, - ) -> Result<()> { + ) -> anyhow::Result<()> { let mut inner = self.inner.write().await; self.assert_writable(); - self.put_value_locked(&mut inner, key, lsn, val, ctx).await - } - pub(crate) async fn put_values( - &self, - values: &HashMap>, - ctx: &RequestContext, - ) -> Result<()> { - let mut inner = self.inner.write().await; - self.assert_writable(); - for (key, vals) in values { - for (lsn, val) in vals { - self.put_value_locked(&mut inner, *key, *lsn, val, ctx) - .await?; + let base_offset = inner.file.len(); + + let SerializedBatch { + raw, + mut offsets, + max_lsn: _, + } = serialized_batch; + + // Add the base_offset to the batch's index entries which are relative to the batch start. + for offset in &mut offsets { + let IndexEntryUnpacked { + will_init, + len, + pos, + } = offset.index_entry.unpack(); + offset.index_entry = IndexEntry::new(IndexEntryNewArgs { + base_offset, + batch_offset: pos, + len: len.into_usize(), + will_init, + })?; + } + + // Write the batch to the file + inner.file.write_raw(&raw, ctx).await?; + let new_size = inner.file.len(); + let expected_new_len = base_offset + .checked_add(raw.len().into_u64()) + // write_raw would error if we were to overflow u64. 
+            // also IndexEntry and higher levels in the code don't allow the file to grow that large
+            .unwrap();
+        assert_eq!(new_size, expected_new_len);
+
+        // Update the index with the new entries
+        for SerializedBatchOffset {
+            key,
+            lsn,
+            index_entry,
+        } in offsets
+        {
+            let vec_map = inner.index.entry(key).or_default();
+            let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
+            if old.is_some() {
+                // This should not break anything, but is unexpected: ingestion code aims to filter out
+                // multiple writes to the same key at the same LSN. This happens in cases where our
+                // ingestion code generates some write like an empty page, and we see a write from postgres
+                // to the same key in the same wal record. If one such write makes it through, we
+                // index the most recent write, implicitly ignoring the earlier write. We log a warning
+                // because this case is unexpected, and we would like tests to fail if this happens.
+                warn!("Key {} at {} written twice at same LSN", key, lsn);
+            }
+        }
+
+        inner.resource_units.maybe_publish_size(new_size);
+
+        Ok(())
     }
 
-    async fn put_value_locked(
-        &self,
-        locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
-        key: Key,
-        lsn: Lsn,
-        val: &Value,
-        ctx: &RequestContext,
-    ) -> Result<()> {
-        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
+    pub(crate) fn get_opened_at(&self) -> Instant {
+        self.opened_at
+    }
 
-        let off = {
-            // Avoid doing allocations for "small" values.
-            // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
-            // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
-            let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
-            buf.clear();
-            val.ser_into(&mut buf)?;
-            locked_inner
-                .file
-                .write_blob(
-                    &buf,
-                    &RequestContextBuilder::extend(ctx)
-                        .page_content_kind(PageContentKind::InMemoryLayer)
-                        .build(),
-                )
-                .await?
-        };
-
-        let vec_map = locked_inner.index.entry(key).or_default();
-        let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
-        if old.is_some() {
-            // We already had an entry for this LSN. That's odd..
-            warn!("Key {} at {} already exists", key, lsn);
-        }
-
-        Ok(())
+    pub(crate) async fn tick(&self) -> Option<u64> {
+        let mut inner = self.inner.write().await;
+        let size = inner.file.len();
+        inner.resource_units.publish_size(size)
     }
 
     pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
@@ -320,26 +725,45 @@
     /// Records the end_lsn for non-dropped layers.
     /// `end_lsn` is exclusive
     pub async fn freeze(&self, end_lsn: Lsn) {
-        let inner = self.inner.write().await;
-
-        assert!(self.start_lsn < end_lsn);
+        assert!(
+            self.start_lsn < end_lsn,
+            "{} >= {}",
+            self.start_lsn,
+            end_lsn
+        );
         self.end_lsn.set(end_lsn).expect("end_lsn set only once");
 
-        for vec_map in inner.index.values() {
-            for (lsn, _pos) in vec_map.as_slice() {
-                assert!(*lsn < end_lsn);
+        self.frozen_local_path_str
+            .set({
+                let mut buf = String::new();
+                inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn)
+                    .unwrap();
+                buf.into()
+            })
+            .expect("frozen_local_path_str set only once");
+
+        #[cfg(debug_assertions)]
+        {
+            let inner = self.inner.write().await;
+            for vec_map in inner.index.values() {
+                for (lsn, _) in vec_map.as_slice() {
+                    assert!(*lsn < end_lsn);
+                }
             }
         }
     }
 
-    /// Write this frozen in-memory layer to disk.
+    /// Write this frozen in-memory layer to disk.
If `key_range` is set, the delta + /// layer will only contain the key range the user specifies, and may return `None` + /// if there are no matching keys. /// /// Returns a new delta layer with all the same data as this in-memory layer - pub(crate) async fn write_to_disk( + pub async fn write_to_disk( &self, - timeline: &Arc, ctx: &RequestContext, - ) -> Result { + key_range: Option>, + l0_flush_global_state: &l0_flush::Inner, + ) -> Result> { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. There's one exception @@ -351,46 +775,216 @@ impl InMemoryLayer { // rare though, so we just accept the potential latency hit for now. let inner = self.inner.read().await; + use l0_flush::Inner; + let _concurrency_permit = match l0_flush_global_state { + Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await), + }; + let end_lsn = *self.end_lsn.get().unwrap(); + let key_count = if let Some(key_range) = key_range { + let key_range = key_range.start.to_compact()..key_range.end.to_compact(); + + inner + .index + .iter() + .filter(|(k, _)| key_range.contains(k)) + .count() + } else { + inner.index.len() + }; + if key_count == 0 { + return Ok(None); + } + let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, Key::MIN, self.start_lsn..end_lsn, + ctx, ) .await?; - let mut buf = Vec::new(); + match l0_flush_global_state { + l0_flush::Inner::Direct { .. } => { + let file_contents: Vec = inner.file.load_to_vec(ctx).await?; - let cursor = inner.file.block_cursor(); + let file_contents = Bytes::from(file_contents); - // Sort the keys because delta layer writer expects them sorted. - // - // NOTE: this sort can take up significant time if the layer has millions of - // keys. To speed up all the comparisons we convert the key to i128 and - // keep the value as a reference. - let mut keys: Vec<_> = inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect(); - keys.sort_unstable_by_key(|k| k.0); - - let ctx = RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::InMemoryLayer) - .build(); - for (key, vec_map) in keys.iter() { - let key = Key::from_i128(*key); - // Write all page versions - for (lsn, pos) in vec_map.as_slice() { - cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; - let will_init = Value::des(&buf)?.will_init(); - delta_layer_writer - .put_value_bytes(key, *lsn, &buf, will_init) - .await?; + for (key, vec_map) in inner.index.iter() { + // Write all page versions + for (lsn, entry) in vec_map + .as_slice() + .iter() + .map(|(lsn, entry)| (lsn, entry.unpack())) + { + let IndexEntryUnpacked { + pos, + len, + will_init, + } = entry; + let buf = Bytes::slice(&file_contents, pos as usize..(pos + len) as usize); + let (_buf, res) = delta_layer_writer + .put_value_bytes( + Key::from_compact(*key), + *lsn, + buf.slice_len(), + will_init, + ctx, + ) + .await; + res?; + } + } } } // MAX is used here because we identify L0 layers by full key range - let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?; - Ok(delta_layer) + let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?; + + // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``. 
+        //
+        // If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of
+        // the `file_contents: Vec<u8>` until the IO is done, but not the permit's lifetime.
+        // Thus, we'd have more concurrent `Vec<u8>`s in existence than the semaphore allows.
+        //
+        // We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages
+        // we dirtied when writing to the filesystem have been flushed and marked !dirty.
+        drop(_concurrency_permit);
+
+        Ok(Some((desc, path)))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_index_entry() {
+        const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS;
+        use IndexEntryNewArgs as Args;
+        use IndexEntryUnpacked as Unpacked;
+
+        let roundtrip = |args, expect: Unpacked| {
+            let res = IndexEntry::new(args).expect("this test expects no errors");
+            let IndexEntryUnpacked {
+                will_init,
+                len,
+                pos,
+            } = res.unpack();
+            assert_eq!(will_init, expect.will_init);
+            assert_eq!(len, expect.len);
+            assert_eq!(pos, expect.pos);
+        };
+
+        // basic roundtrip
+        for pos in [0, MAX_SUPPORTED_POS] {
+            for len in [0, MAX_SUPPORTED_BLOB_LEN] {
+                for will_init in [true, false] {
+                    let expect = Unpacked {
+                        will_init,
+                        len: len.into_u64(),
+                        pos: pos.into_u64(),
+                    };
+                    roundtrip(
+                        Args {
+                            will_init,
+                            base_offset: pos.into_u64(),
+                            batch_offset: 0,
+                            len,
+                        },
+                        expect,
+                    );
+                    roundtrip(
+                        Args {
+                            will_init,
+                            base_offset: 0,
+                            batch_offset: pos.into_u64(),
+                            len,
+                        },
+                        expect,
+                    );
+                }
+            }
+        }
+
+        // too-large len
+        let too_large = Args {
+            will_init: false,
+            len: MAX_SUPPORTED_BLOB_LEN + 1,
+            base_offset: 0,
+            batch_offset: 0,
+        };
+        assert!(IndexEntry::new(too_large).is_err());
+
+        // too-large pos
+        {
+            let too_large = Args {
+                will_init: false,
+                len: 0,
+                base_offset: MAX_SUPPORTED_POS.into_u64() + 1,
+                batch_offset: 0,
+            };
+            assert!(IndexEntry::new(too_large).is_err());
+            let too_large = Args {
+                will_init: false,
+                len: 0,
+                base_offset: 0,
+                batch_offset: MAX_SUPPORTED_POS.into_u64() + 1,
+            };
+            assert!(IndexEntry::new(too_large).is_err());
+        }
+
+        // too large (base_offset + batch_offset)
+        {
+            let too_large = Args {
+                will_init: false,
+                len: 0,
+                base_offset: MAX_SUPPORTED_POS.into_u64(),
+                batch_offset: 1,
+            };
+            assert!(IndexEntry::new(too_large).is_err());
+            let too_large = Args {
+                will_init: false,
+                len: 0,
+                base_offset: MAX_SUPPORTED_POS.into_u64() - 1,
+                batch_offset: MAX_SUPPORTED_POS.into_u64() - 1,
+            };
+            assert!(IndexEntry::new(too_large).is_err());
+        }
+
+        // valid special cases
+        // - area past the max supported pos that is accessible by len
+        for len in [1, MAX_SUPPORTED_BLOB_LEN] {
+            roundtrip(
+                Args {
+                    will_init: false,
+                    len,
+                    base_offset: MAX_SUPPORTED_POS.into_u64(),
+                    batch_offset: 0,
+                },
+                Unpacked {
+                    will_init: false,
+                    len: len as u64,
+                    pos: MAX_SUPPORTED_POS.into_u64(),
+                },
+            );
+            roundtrip(
+                Args {
+                    will_init: false,
+                    len,
+                    base_offset: 0,
+                    batch_offset: MAX_SUPPORTED_POS.into_u64(),
+                },
+                Unpacked {
+                    will_init: false,
+                    len: len as u64,
+                    pos: MAX_SUPPORTED_POS.into_u64(),
+                },
+            );
+        }
+    }
+}
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
new file mode 100644
index 0000000000..0683e15659
--- /dev/null
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
@@ -0,0 +1,937 @@
+use std::{
+    collections::BTreeMap,
+    sync::{Arc, RwLock},
+};
+
+use itertools::Itertools;
+use tokio_epoll_uring::{BoundedBuf,
IoBufMut, Slice}; + +use crate::{ + assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}, + context::RequestContext, +}; + +/// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`]. +pub trait File: Send { + /// Attempt to read the bytes in `self` in range `[start,start+dst.bytes_total())` + /// and return the number of bytes read (let's call it `nread`). + /// The bytes read are placed in `dst`, i.e., `&dst[..nread]` will contain the read bytes. + /// + /// The only reason why the read may be short (i.e., `nread != dst.bytes_total()`) + /// is if the file is shorter than `start+dst.len()`. + /// + /// This is unlike [`std::os::unix::fs::FileExt::read_exact_at`] which returns an + /// [`std::io::ErrorKind::UnexpectedEof`] error if the file is shorter than `start+dst.len()`. + /// + /// No guarantees are made about the remaining bytes in `dst` in case of a short read. + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + &'b self, + start: u64, + dst: Slice, + ctx: &'a RequestContext, + ) -> std::io::Result<(Slice, usize)>; +} + +/// A logical read from [`File`]. See [`Self::new`]. +pub struct LogicalRead { + pos: u64, + state: RwLockRefCell>, +} + +enum LogicalReadState { + NotStarted(B), + Ongoing(B), + Ok(B), + Error(Arc), + Undefined, +} + +impl LogicalRead { + /// Create a new [`LogicalRead`] from [`File`] of the data in the file in range `[ pos, pos + buf.cap() )`. + pub fn new(pos: u64, buf: B) -> Self { + Self { + pos, + state: RwLockRefCell::new(LogicalReadState::NotStarted(buf)), + } + } + pub fn into_result(self) -> Option>> { + match self.state.into_inner() { + LogicalReadState::Ok(buf) => Some(Ok(buf)), + LogicalReadState::Error(e) => Some(Err(e)), + LogicalReadState::NotStarted(_) | LogicalReadState::Ongoing(_) => None, + LogicalReadState::Undefined => unreachable!(), + } + } +} + +/// The buffer into which a [`LogicalRead`] result is placed. +pub trait Buffer: std::ops::Deref { + /// Immutable. + fn cap(&self) -> usize; + /// Changes only through [`Self::extend_from_slice`]. + fn len(&self) -> usize; + /// Panics if the total length would exceed the initialized capacity. + fn extend_from_slice(&mut self, src: &[u8]); +} + +/// The minimum alignment and size requirement for disk offsets and memory buffer size for direct IO. +const DIO_CHUNK_SIZE: usize = 512; + +/// If multiple chunks need to be read, merge adjacent chunk reads into batches of max size `MAX_CHUNK_BATCH_SIZE`. +/// (The unit is the number of chunks.) +const MAX_CHUNK_BATCH_SIZE: usize = { + let desired = 128 * 1024; // 128k + if desired % DIO_CHUNK_SIZE != 0 { + panic!("MAX_CHUNK_BATCH_SIZE must be a multiple of DIO_CHUNK_SIZE") + // compile-time error + } + desired / DIO_CHUNK_SIZE +}; + +/// Execute the given logical `reads` against `file`. +/// The results are placed in the buffers of the [`LogicalRead`]s. +/// Retrieve the results by calling [`LogicalRead::into_result`] on each [`LogicalRead`]. +/// +/// The [`LogicalRead`]s must be freshly created using [`LogicalRead::new`] when calling this function. +/// Otherwise, this function panics. 
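+///
+/// Hedged usage sketch (`file` is any [`File`] impl; the names here are illustrative only):
+/// ```ignore
+/// let read = LogicalRead::new(pos, Vec::with_capacity(len));
+/// execute(&file, [&read], &ctx).await;
+/// let bytes = read.into_result().expect("execute() ran").expect("no I/O error");
+/// ```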
+pub async fn execute<'a, I, F, B>(file: &F, reads: I, ctx: &RequestContext) +where + I: IntoIterator>, + F: File, + B: Buffer + IoBufMut + Send, +{ + // Terminology: + // logical read = a request to read an arbitrary range of bytes from `file`; byte-level granularity + // chunk = we conceptually divide up the byte range of `file` into DIO_CHUNK_SIZEs ranges + // interest = a range within a chunk that a logical read is interested in; one logical read gets turned into many interests + // physical read = the read request we're going to issue to the OS; covers a range of chunks; chunk-level granularity + + // Preserve a copy of the logical reads for debug assertions at the end + #[cfg(debug_assertions)] + let (reads, assert_logical_reads) = { + let (reads, assert) = reads.into_iter().tee(); + (reads, Some(Vec::from_iter(assert))) + }; + #[cfg(not(debug_assertions))] + let (reads, assert_logical_reads): (_, Option>>) = (reads, None); + + // Plan which parts of which chunks need to be appended to which buffer + let mut by_chunk: BTreeMap>> = BTreeMap::new(); + struct Interest<'a, B: Buffer> { + logical_read: &'a LogicalRead, + offset_in_chunk: u64, + len: u64, + } + for logical_read in reads { + let LogicalRead { pos, state } = logical_read; + let mut state = state.borrow_mut(); + + // transition from NotStarted to Ongoing + let cur = std::mem::replace(&mut *state, LogicalReadState::Undefined); + let req_len = match cur { + LogicalReadState::NotStarted(buf) => { + if buf.len() != 0 { + panic!("The `LogicalRead`s that are passed in must be freshly created using `LogicalRead::new`"); + } + // buf.cap() == 0 is ok + + // transition into Ongoing state + let req_len = buf.cap(); + *state = LogicalReadState::Ongoing(buf); + req_len + } + x => panic!("must only call with fresh LogicalReads, got another state, leaving Undefined state behind state={x:?}"), + }; + + // plan which chunks we need to read from + let mut remaining = req_len; + let mut chunk_no = *pos / (DIO_CHUNK_SIZE.into_u64()); + let mut offset_in_chunk = pos.into_usize() % DIO_CHUNK_SIZE; + while remaining > 0 { + let remaining_in_chunk = std::cmp::min(remaining, DIO_CHUNK_SIZE - offset_in_chunk); + by_chunk.entry(chunk_no).or_default().push(Interest { + logical_read, + offset_in_chunk: offset_in_chunk.into_u64(), + len: remaining_in_chunk.into_u64(), + }); + offset_in_chunk = 0; + chunk_no += 1; + remaining -= remaining_in_chunk; + } + } + + // At this point, we could iterate over by_chunk, in chunk order, + // read each chunk from disk, and fill the buffers. + // However, we can merge adjacent chunks into batches of MAX_CHUNK_BATCH_SIZE + // so we issue fewer IOs = fewer roundtrips = lower overall latency. 
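+    // For example (hypothetical chunk numbers): interests in chunks 3, 4, 5 and 9 become two
+    // physical reads, one covering chunks 3..=5 and one covering chunk 9, because the
+    // adjacency run breaks at the gap (and runs are also capped at MAX_CHUNK_BATCH_SIZE).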
+ struct PhysicalRead<'a, B: Buffer> { + start_chunk_no: u64, + nchunks: usize, + dsts: Vec>, + } + struct PhysicalInterest<'a, B: Buffer> { + logical_read: &'a LogicalRead, + offset_in_physical_read: u64, + len: u64, + } + let mut physical_reads: Vec> = Vec::new(); + let mut by_chunk = by_chunk.into_iter().peekable(); + loop { + let mut last_chunk_no = None; + let to_merge: Vec<(u64, Vec>)> = by_chunk + .peeking_take_while(|(chunk_no, _)| { + if let Some(last_chunk_no) = last_chunk_no { + if *chunk_no != last_chunk_no + 1 { + return false; + } + } + last_chunk_no = Some(*chunk_no); + true + }) + .take(MAX_CHUNK_BATCH_SIZE) + .collect(); // TODO: avoid this .collect() + let Some(start_chunk_no) = to_merge.first().map(|(chunk_no, _)| *chunk_no) else { + break; + }; + let nchunks = to_merge.len(); + let dsts = to_merge + .into_iter() + .enumerate() + .flat_map(|(i, (_, dsts))| { + dsts.into_iter().map( + move |Interest { + logical_read, + offset_in_chunk, + len, + }| { + PhysicalInterest { + logical_read, + offset_in_physical_read: i + .checked_mul(DIO_CHUNK_SIZE) + .unwrap() + .into_u64() + + offset_in_chunk, + len, + } + }, + ) + }) + .collect(); + physical_reads.push(PhysicalRead { + start_chunk_no, + nchunks, + dsts, + }); + } + drop(by_chunk); + + // Execute physical reads and fill the logical read buffers + // TODO: pipelined reads; prefetch; + let get_io_buffer = |nchunks| Vec::with_capacity(nchunks * DIO_CHUNK_SIZE); + for PhysicalRead { + start_chunk_no, + nchunks, + dsts, + } in physical_reads + { + let all_done = dsts + .iter() + .all(|PhysicalInterest { logical_read, .. }| logical_read.state.borrow().is_terminal()); + if all_done { + continue; + } + let read_offset = start_chunk_no + .checked_mul(DIO_CHUNK_SIZE.into_u64()) + .expect("we produce chunk_nos by dividing by DIO_CHUNK_SIZE earlier"); + let io_buf = get_io_buffer(nchunks).slice_full(); + let req_len = io_buf.len(); + let (io_buf_slice, nread) = match file.read_exact_at_eof_ok(read_offset, io_buf, ctx).await + { + Ok(t) => t, + Err(e) => { + let e = Arc::new(e); + for PhysicalInterest { logical_read, .. 
} in dsts { + *logical_read.state.borrow_mut() = LogicalReadState::Error(Arc::clone(&e)); + // this will make later reads for the given LogicalRead short-circuit, see top of loop body + } + continue; + } + }; + let io_buf = io_buf_slice.into_inner(); + assert!( + nread <= io_buf.len(), + "the last chunk in the file can be a short read, so, no ==" + ); + let io_buf = &io_buf[..nread]; + for PhysicalInterest { + logical_read, + offset_in_physical_read, + len, + } in dsts + { + let mut logical_read_state_borrow = logical_read.state.borrow_mut(); + let logical_read_buf = match &mut *logical_read_state_borrow { + LogicalReadState::NotStarted(_) => { + unreachable!("we transition it into Ongoing at function entry") + } + LogicalReadState::Ongoing(buf) => buf, + LogicalReadState::Ok(_) | LogicalReadState::Error(_) => { + continue; + } + LogicalReadState::Undefined => unreachable!(), + }; + let range_in_io_buf = std::ops::Range { + start: offset_in_physical_read as usize, + end: offset_in_physical_read as usize + len as usize, + }; + assert!(range_in_io_buf.end >= range_in_io_buf.start); + if range_in_io_buf.end > nread { + let msg = format!( + "physical read returned EOF where this logical read expected more data in the file: offset=0x{read_offset:x} req_len=0x{req_len:x} nread=0x{nread:x} {:?}", + &*logical_read_state_borrow + ); + logical_read_state_borrow.transition_to_terminal(Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + msg, + ))); + continue; + } + let data = &io_buf[range_in_io_buf]; + + // Copy data from io buffer into the logical read buffer. + // (And in debug mode, validate that the buffer impl adheres to the Buffer trait spec.) + let pre = if cfg!(debug_assertions) { + Some((logical_read_buf.len(), logical_read_buf.cap())) + } else { + None + }; + logical_read_buf.extend_from_slice(data); + let post = if cfg!(debug_assertions) { + Some((logical_read_buf.len(), logical_read_buf.cap())) + } else { + None + }; + match (pre, post) { + (None, None) => {} + (Some(_), None) | (None, Some(_)) => unreachable!(), + (Some((pre_len, pre_cap)), Some((post_len, post_cap))) => { + assert_eq!(pre_len + len as usize, post_len); + assert_eq!(pre_cap, post_cap); + } + } + + if logical_read_buf.len() == logical_read_buf.cap() { + logical_read_state_borrow.transition_to_terminal(Ok(())); + } + } + } + + if let Some(assert_logical_reads) = assert_logical_reads { + for logical_read in assert_logical_reads { + assert!(logical_read.state.borrow().is_terminal()); + } + } +} + +impl LogicalReadState { + fn is_terminal(&self) -> bool { + match self { + LogicalReadState::NotStarted(_) | LogicalReadState::Ongoing(_) => false, + LogicalReadState::Ok(_) | LogicalReadState::Error(_) => true, + LogicalReadState::Undefined => unreachable!(), + } + } + fn transition_to_terminal(&mut self, err: std::io::Result<()>) { + let cur = std::mem::replace(self, LogicalReadState::Undefined); + let buf = match cur { + LogicalReadState::Ongoing(buf) => buf, + x => panic!("must only call in state Ongoing, got {x:?}"), + }; + *self = match err { + Ok(()) => LogicalReadState::Ok(buf), + Err(e) => LogicalReadState::Error(Arc::new(e)), + }; + } +} + +impl std::fmt::Debug for LogicalReadState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + #[derive(Debug)] + #[allow(unused)] + struct BufferDebug { + len: usize, + cap: usize, + } + impl<'a> From<&'a dyn Buffer> for BufferDebug { + fn from(buf: &'a dyn Buffer) -> Self { + Self { + len: buf.len(), + cap: buf.cap(), + } + } + } + match self { + 
LogicalReadState::NotStarted(b) => { + write!(f, "NotStarted({:?})", BufferDebug::from(b as &dyn Buffer)) + } + LogicalReadState::Ongoing(b) => { + write!(f, "Ongoing({:?})", BufferDebug::from(b as &dyn Buffer)) + } + LogicalReadState::Ok(b) => write!(f, "Ok({:?})", BufferDebug::from(b as &dyn Buffer)), + LogicalReadState::Error(e) => write!(f, "Error({:?})", e), + LogicalReadState::Undefined => write!(f, "Undefined"), + } + } +} + +#[derive(Debug)] +struct RwLockRefCell(RwLock); +impl RwLockRefCell { + fn new(value: T) -> Self { + Self(RwLock::new(value)) + } + fn borrow(&self) -> impl std::ops::Deref + '_ { + self.0.try_read().unwrap() + } + fn borrow_mut(&self) -> impl std::ops::DerefMut + '_ { + self.0.try_write().unwrap() + } + fn into_inner(self) -> T { + self.0.into_inner().unwrap() + } +} + +impl Buffer for Vec { + fn cap(&self) -> usize { + self.capacity() + } + + fn len(&self) -> usize { + self.len() + } + + fn extend_from_slice(&mut self, src: &[u8]) { + if self.len() + src.len() > self.cap() { + panic!("Buffer capacity exceeded"); + } + Vec::extend_from_slice(self, src); + } +} + +#[cfg(test)] +#[allow(clippy::assertions_on_constants)] +mod tests { + use rand::Rng; + + use crate::{ + context::DownloadBehavior, task_mgr::TaskKind, + virtual_file::owned_buffers_io::slice::SliceMutExt, + }; + + use super::*; + use std::{cell::RefCell, collections::VecDeque}; + + struct InMemoryFile { + content: Vec, + } + + impl InMemoryFile { + fn new_random(len: usize) -> Self { + Self { + content: rand::thread_rng() + .sample_iter(rand::distributions::Standard) + .take(len) + .collect(), + } + } + fn test_logical_read(&self, pos: u64, len: usize) -> TestLogicalRead { + let expected_result = if pos as usize + len > self.content.len() { + Err("InMemoryFile short read".to_string()) + } else { + Ok(self.content[pos as usize..pos as usize + len].to_vec()) + }; + TestLogicalRead::new(pos, len, expected_result) + } + } + + #[test] + fn test_in_memory_file() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let file = InMemoryFile::new_random(10); + let test_read = |pos, len| { + let buf = vec![0; len]; + let fut = file.read_exact_at_eof_ok(pos, buf.slice_full(), &ctx); + use futures::FutureExt; + let (slice, nread) = fut + .now_or_never() + .expect("impl never awaits") + .expect("impl never errors"); + let mut buf = slice.into_inner(); + buf.truncate(nread); + buf + }; + assert_eq!(test_read(0, 1), &file.content[0..1]); + assert_eq!(test_read(1, 2), &file.content[1..3]); + assert_eq!(test_read(9, 2), &file.content[9..]); + assert!(test_read(10, 2).is_empty()); + assert!(test_read(11, 2).is_empty()); + } + + impl File for InMemoryFile { + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + &'b self, + start: u64, + mut dst: Slice, + _ctx: &'a RequestContext, + ) -> std::io::Result<(Slice, usize)> { + let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed(); + let nread = { + let req_len = dst_slice.len(); + let len = std::cmp::min(req_len, self.content.len().saturating_sub(start as usize)); + if start as usize >= self.content.len() { + 0 + } else { + dst_slice[..len] + .copy_from_slice(&self.content[start as usize..start as usize + len]); + len + } + }; + rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[nread..]); // to discover bugs + Ok((dst, nread)) + } + } + + #[derive(Clone)] + struct TestLogicalRead { + pos: u64, + len: usize, + expected_result: Result, String>, + } + + impl TestLogicalRead { + fn new(pos: u64, len: usize, expected_result: 
Result<Vec<u8>, String>) -> Self {
+            Self {
+                pos,
+                len,
+                expected_result,
+            }
+        }
+        fn make_logical_read(&self) -> LogicalRead<Vec<u8>> {
+            LogicalRead::new(self.pos, Vec::with_capacity(self.len))
+        }
+    }
+
+    async fn execute_and_validate_test_logical_reads<I, F>(
+        file: &F,
+        test_logical_reads: I,
+        ctx: &RequestContext,
+    ) where
+        I: IntoIterator<Item = TestLogicalRead>,
+        F: File,
+    {
+        let (tmp, test_logical_reads) = test_logical_reads.into_iter().tee();
+        let logical_reads = tmp.map(|tr| tr.make_logical_read()).collect::<Vec<_>>();
+        execute(file, logical_reads.iter(), ctx).await;
+        for (logical_read, test_logical_read) in logical_reads.into_iter().zip(test_logical_reads) {
+            let actual = logical_read.into_result().expect("we call execute()");
+            match (actual, test_logical_read.expected_result) {
+                (Ok(actual), Ok(expected)) if actual == expected => {}
+                (Err(actual), Err(expected)) => {
+                    assert_eq!(actual.to_string(), expected);
+                }
+                (actual, expected) => panic!("expected {expected:?}\nactual {actual:?}"),
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn test_blackbox() {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let cs = DIO_CHUNK_SIZE;
+        let cs_u64 = cs.into_u64();
+
+        let file = InMemoryFile::new_random(10 * cs);
+
+        let test_logical_reads = vec![
+            file.test_logical_read(0, 1),
+            // adjacent to logical_read0
+            file.test_logical_read(1, 2),
+            // gap
+            // spans adjacent chunks
+            file.test_logical_read(cs_u64 - 1, 2),
+            // gap
+            // tail of chunk 3, all of chunk 4, and 2 bytes of chunk 5
+            file.test_logical_read(3 * cs_u64 - 1, cs + 2),
+            // gap
+            file.test_logical_read(5 * cs_u64, 1),
+        ];
+        let num_test_logical_reads = test_logical_reads.len();
+        let test_logical_reads_perms = test_logical_reads
+            .into_iter()
+            .permutations(num_test_logical_reads);
+
+        // test all orderings of LogicalReads, the order shouldn't matter for the results
+        for test_logical_reads in test_logical_reads_perms {
+            execute_and_validate_test_logical_reads(&file, test_logical_reads, &ctx).await;
+        }
+    }
+
+    #[tokio::test]
+    #[should_panic]
+    async fn test_reusing_logical_reads_panics() {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let file = InMemoryFile::new_random(DIO_CHUNK_SIZE);
+        let a = file.test_logical_read(23, 10);
+        let logical_reads = vec![a.make_logical_read()];
+        execute(&file, &logical_reads, &ctx).await;
+        // reuse panics
+        execute(&file, &logical_reads, &ctx).await;
+    }
+
+    struct RecorderFile<'a> {
+        recorded: RefCell<Vec<RecordedRead>>,
+        file: &'a InMemoryFile,
+    }
+
+    struct RecordedRead {
+        pos: u64,
+        req_len: usize,
+        res: Vec<u8>,
+    }
+
+    impl<'a> RecorderFile<'a> {
+        fn new(file: &'a InMemoryFile) -> RecorderFile<'a> {
+            Self {
+                recorded: Default::default(),
+                file,
+            }
+        }
+    }
+
+    impl<'x> File for RecorderFile<'x> {
+        async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
+            &'b self,
+            start: u64,
+            dst: Slice<B>,
+            ctx: &'a RequestContext,
+        ) -> std::io::Result<(Slice<B>, usize)> {
+            let (dst, nread) = self.file.read_exact_at_eof_ok(start, dst, ctx).await?;
+            self.recorded.borrow_mut().push(RecordedRead {
+                pos: start,
+                req_len: dst.bytes_total(),
+                res: Vec::from(&dst[..nread]),
+            });
+            Ok((dst, nread))
+        }
+    }
+
+    #[tokio::test]
+    async fn test_logical_reads_to_same_chunk_are_merged_into_one_chunk_read() {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+
+        let file = InMemoryFile::new_random(2 * DIO_CHUNK_SIZE);
+
+        let a = file.test_logical_read(DIO_CHUNK_SIZE.into_u64(), 10);
+        let b = file.test_logical_read(DIO_CHUNK_SIZE.into_u64() + 30, 20);
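+        // Both reads fall inside chunk 1, so the planner should coalesce them into a
+        // single chunk-sized physical read; the RecorderFile below verifies that.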
+ + let recorder = RecorderFile::new(&file); + + execute_and_validate_test_logical_reads(&recorder, vec![a, b], &ctx).await; + + let recorded = recorder.recorded.borrow(); + assert_eq!(recorded.len(), 1); + let RecordedRead { pos, req_len, .. } = &recorded[0]; + assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64()); + assert_eq!(*req_len, DIO_CHUNK_SIZE); + } + + #[tokio::test] + async fn test_max_chunk_batch_size_is_respected() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let file = InMemoryFile::new_random(4 * MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE); + + // read the 10th byte of each chunk 3 .. 3+2*MAX_CHUNK_BATCH_SIZE + assert!(3 < MAX_CHUNK_BATCH_SIZE, "test assumption"); + assert!(10 < DIO_CHUNK_SIZE, "test assumption"); + let mut test_logical_reads = Vec::new(); + for i in 3..3 + MAX_CHUNK_BATCH_SIZE + MAX_CHUNK_BATCH_SIZE / 2 { + test_logical_reads + .push(file.test_logical_read(i.into_u64() * DIO_CHUNK_SIZE.into_u64() + 10, 1)); + } + + let recorder = RecorderFile::new(&file); + + execute_and_validate_test_logical_reads(&recorder, test_logical_reads, &ctx).await; + + let recorded = recorder.recorded.borrow(); + assert_eq!(recorded.len(), 2); + { + let RecordedRead { pos, req_len, .. } = &recorded[0]; + assert_eq!(*pos as usize, 3 * DIO_CHUNK_SIZE); + assert_eq!(*req_len, MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE); + } + { + let RecordedRead { pos, req_len, .. } = &recorded[1]; + assert_eq!(*pos as usize, (3 + MAX_CHUNK_BATCH_SIZE) * DIO_CHUNK_SIZE); + assert_eq!(*req_len, MAX_CHUNK_BATCH_SIZE / 2 * DIO_CHUNK_SIZE); + } + } + + #[tokio::test] + async fn test_batch_breaks_if_chunk_is_not_interesting() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + assert!(MAX_CHUNK_BATCH_SIZE > 10, "test assumption"); + let file = InMemoryFile::new_random(3 * DIO_CHUNK_SIZE); + + let a = file.test_logical_read(0, 1); // chunk 0 + let b = file.test_logical_read(2 * DIO_CHUNK_SIZE.into_u64(), 1); // chunk 2 + + let recorder = RecorderFile::new(&file); + + execute_and_validate_test_logical_reads(&recorder, vec![a, b], &ctx).await; + + let recorded = recorder.recorded.borrow(); + + assert_eq!(recorded.len(), 2); + { + let RecordedRead { pos, req_len, .. } = &recorded[0]; + assert_eq!(*pos, 0); + assert_eq!(*req_len, DIO_CHUNK_SIZE); + } + { + let RecordedRead { pos, req_len, .. } = &recorded[1]; + assert_eq!(*pos, 2 * DIO_CHUNK_SIZE.into_u64()); + assert_eq!(*req_len, DIO_CHUNK_SIZE); + } + } + + struct ExpectedRead { + expect_pos: u64, + expect_len: usize, + respond: Result, String>, + } + + struct MockFile { + expected: RefCell>, + } + + impl Drop for MockFile { + fn drop(&mut self) { + assert!( + self.expected.borrow().is_empty(), + "expected reads not satisfied" + ); + } + } + + macro_rules! mock_file { + ($($pos:expr , $len:expr => $respond:expr),* $(,)?) 
=> {{ + MockFile { + expected: RefCell::new(VecDeque::from(vec![$(ExpectedRead { + expect_pos: $pos, + expect_len: $len, + respond: $respond, + }),*])), + } + }}; + } + + impl File for MockFile { + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + &'b self, + start: u64, + mut dst: Slice, + _ctx: &'a RequestContext, + ) -> std::io::Result<(Slice, usize)> { + let ExpectedRead { + expect_pos, + expect_len, + respond, + } = self + .expected + .borrow_mut() + .pop_front() + .expect("unexpected read"); + assert_eq!(start, expect_pos); + assert_eq!(dst.bytes_total(), expect_len); + match respond { + Ok(mocked_bytes) => { + let len = std::cmp::min(dst.bytes_total(), mocked_bytes.len()); + let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed(); + dst_slice[..len].copy_from_slice(&mocked_bytes[..len]); + rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[len..]); // to discover bugs + Ok((dst, len)) + } + Err(e) => Err(std::io::Error::new(std::io::ErrorKind::Other, e)), + } + } + } + + #[tokio::test] + async fn test_mock_file() { + // Self-test to ensure the relevant features of mock file work as expected. + + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let mock_file = mock_file! { + 0 , 512 => Ok(vec![0; 512]), + 512 , 512 => Ok(vec![1; 512]), + 1024 , 512 => Ok(vec![2; 10]), + 2048, 1024 => Err("foo".to_owned()), + }; + + let buf = Vec::with_capacity(512); + let (buf, nread) = mock_file + .read_exact_at_eof_ok(0, buf.slice_full(), &ctx) + .await + .unwrap(); + assert_eq!(nread, 512); + assert_eq!(&buf.into_inner()[..nread], &[0; 512]); + + let buf = Vec::with_capacity(512); + let (buf, nread) = mock_file + .read_exact_at_eof_ok(512, buf.slice_full(), &ctx) + .await + .unwrap(); + assert_eq!(nread, 512); + assert_eq!(&buf.into_inner()[..nread], &[1; 512]); + + let buf = Vec::with_capacity(512); + let (buf, nread) = mock_file + .read_exact_at_eof_ok(1024, buf.slice_full(), &ctx) + .await + .unwrap(); + assert_eq!(nread, 10); + assert_eq!(&buf.into_inner()[..nread], &[2; 10]); + + let buf = Vec::with_capacity(1024); + let err = mock_file + .read_exact_at_eof_ok(2048, buf.slice_full(), &ctx) + .await + .err() + .unwrap(); + assert_eq!(err.to_string(), "foo"); + } + + #[tokio::test] + async fn test_error_on_one_chunk_read_fails_only_dependent_logical_reads() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let test_logical_reads = vec![ + // read spanning two batches + TestLogicalRead::new( + DIO_CHUNK_SIZE.into_u64() / 2, + MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE, + Err("foo".to_owned()), + ), + // second read in failing chunk + TestLogicalRead::new( + (MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE).into_u64() + DIO_CHUNK_SIZE.into_u64() - 10, + 5, + Err("foo".to_owned()), + ), + // read unaffected + TestLogicalRead::new( + (MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE).into_u64() + + 2 * DIO_CHUNK_SIZE.into_u64() + + 10, + 5, + Ok(vec![1; 5]), + ), + ]; + let (tmp, test_logical_reads) = test_logical_reads.into_iter().tee(); + let test_logical_read_perms = tmp.permutations(test_logical_reads.len()); + + for test_logical_reads in test_logical_read_perms { + let file = mock_file!( + 0, MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE => Ok(vec![0; MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE]), + (MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE).into_u64(), DIO_CHUNK_SIZE => Err("foo".to_owned()), + (MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE + 2*DIO_CHUNK_SIZE).into_u64(), DIO_CHUNK_SIZE => Ok(vec![1; DIO_CHUNK_SIZE]), + ); + 
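+            // The failing middle chunk read must fail only the two logical reads that
+            // depend on it; the final logical read still succeeds.
+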
+            execute_and_validate_test_logical_reads(&file, test_logical_reads, &ctx).await;
+        }
+    }
+
+    struct TestShortReadsSetup {
+        ctx: RequestContext,
+        file: InMemoryFile,
+        written: u64,
+    }
+    fn setup_short_chunk_read_tests() -> TestShortReadsSetup {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        assert!(DIO_CHUNK_SIZE > 20, "test assumption");
+        let written = (2 * DIO_CHUNK_SIZE - 10).into_u64();
+        let file = InMemoryFile::new_random(written as usize);
+        TestShortReadsSetup { ctx, file, written }
+    }
+
+    #[tokio::test]
+    async fn test_short_chunk_read_from_written_range() {
+        // Test what happens if there are logical reads
+        // that start within the last chunk, and
+        // the last chunk is not the full chunk length.
+        //
+        // The read should succeed despite the short chunk length.
+        let TestShortReadsSetup { ctx, file, written } = setup_short_chunk_read_tests();
+
+        let a = file.test_logical_read(written - 10, 5);
+        let recorder = RecorderFile::new(&file);
+
+        execute_and_validate_test_logical_reads(&recorder, vec![a], &ctx).await;
+
+        let recorded = recorder.recorded.borrow();
+        assert_eq!(recorded.len(), 1);
+        let RecordedRead { pos, req_len, res } = &recorded[0];
+        assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64());
+        assert_eq!(*req_len, DIO_CHUNK_SIZE);
+        assert_eq!(res, &file.content[DIO_CHUNK_SIZE..(written as usize)]);
+    }
+
+    #[tokio::test]
+    async fn test_short_chunk_read_and_logical_read_from_unwritten_range() {
+        // Test what happens if there are logical reads
+        // that start within the last chunk, and
+        // the last chunk is not the full chunk length, and
+        // the logical reads end in the unwritten range.
+        //
+        // All should fail with UnexpectedEof and have the same IO pattern.
+        async fn the_impl(offset_delta: i64) {
+            let TestShortReadsSetup { ctx, file, written } = setup_short_chunk_read_tests();
+
+            let offset = u64::try_from(
+                i64::try_from(written)
+                    .unwrap()
+                    .checked_add(offset_delta)
+                    .unwrap(),
+            )
+            .unwrap();
+            let a = file.test_logical_read(offset, 5);
+            let recorder = RecorderFile::new(&file);
+            let a_vr = a.make_logical_read();
+            execute(&recorder, vec![&a_vr], &ctx).await;
+
+            // validate the LogicalRead result
+            let a_res = a_vr.into_result().unwrap();
+            let a_err = a_res.unwrap_err();
+            assert_eq!(a_err.kind(), std::io::ErrorKind::UnexpectedEof);
+
+            // validate the IO pattern
+            let recorded = recorder.recorded.borrow();
+            assert_eq!(recorded.len(), 1);
+            let RecordedRead { pos, req_len, res } = &recorded[0];
+            assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64());
+            assert_eq!(*req_len, DIO_CHUNK_SIZE);
+            assert_eq!(res, &file.content[DIO_CHUNK_SIZE..(written as usize)]);
+        }
+
+        the_impl(-1).await; // start == length - 1
+        the_impl(0).await; // start == length
+        the_impl(1).await; // start == length + 1
+    }
+
+    // TODO: mixed: some valid, some UnexpectedEof
+
+    // TODO: same tests but with merges
+}
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 12af866810..b15cd4da39 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1,31 +1,41 @@
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
-use pageserver_api::models::{
-    HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
-};
-use pageserver_api::shard::ShardIndex;
+use pageserver_api::keyspace::KeySpace;
+use pageserver_api::models::HistoricLayerInfo;
+use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
use std::ops::Range;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Weak};
-use std::time::SystemTime;
+use std::time::{Duration, SystemTime};
use tracing::Instrument;
+use utils::id::TimelineId;
use utils::lsn::Lsn;
-use utils::sync::heavier_once_cell;
+use utils::sync::{gate, heavier_once_cell};

use crate::config::PageServerConf;
-use crate::context::RequestContext;
-use crate::repository::Key;
+use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
+use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
+use crate::task_mgr::TaskKind;
+use crate::tenant::timeline::{CompactionError, GetVectoredError};
use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};

use super::delta_layer::{self, DeltaEntry};
-use super::image_layer;
+use super::image_layer::{self};
use super::{
-    AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc,
-    ValueReconstructResult, ValueReconstructState,
+    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
+    LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState,
};

use utils::generation::Generation;

+#[cfg(test)]
+mod tests;
+
+#[cfg(test)]
+mod failpoints;
+
+pub const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
+
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
/// range of LSNs.
///
@@ -40,7 +50,41 @@ use utils::generation::Generation;
/// An image layer is a snapshot of all the data in a key-range, at a single
/// LSN.
///
-/// This type models the on-disk layers, which can be evicted and on-demand downloaded.
+/// This type models the on-disk layers, which can be evicted and on-demand downloaded. As a
+/// general goal, read accesses should always win eviction and eviction should not wait for
+/// download.
+///
+/// ### State transitions
+///
+/// The internal state of `Layer` is composed of, most importantly, the on-filesystem state and
+/// the [`ResidentOrWantedEvicted`] enum. The on-filesystem state can be either present (fully
+/// downloaded, right size) or deleted.
+///
+/// Reads will always win requests to evict until `wait_for_turn_and_evict` has acquired the
+/// `heavier_once_cell::InitPermit` and has started to `evict_blocking`. Before the
+/// `heavier_once_cell::InitPermit` has been acquired, any read request
+/// (`get_or_maybe_download`) can "re-initialize" using the existing downloaded file and thus
+/// cancel the eviction.
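+///
+/// As a simplified model of that permit discipline (hypothetical names, not the real
+/// `heavier_once_cell` API): downloads and evictions may only touch the filesystem while
+/// holding the single init permit, while reads first try the cheap path of cloning what is
+/// already initialized.
+///
+/// ```ignore
+/// struct PermitCell<T: Clone> {
+///     value: std::sync::Mutex<Option<T>>,
+///     init: tokio::sync::Semaphore, // constructed with one permit
+/// }
+///
+/// impl<T: Clone> PermitCell<T> {
+///     async fn get_or_init<F: std::future::Future<Output = T>>(&self, init: F) -> T {
+///         if let Some(v) = self.value.lock().unwrap().clone() {
+///             return v; // fast path: reads need no permit
+///         }
+///         let _permit = self.init.acquire().await.unwrap(); // serialize initializers
+///         if let Some(v) = self.value.lock().unwrap().clone() {
+///             return v; // someone else initialized while we waited
+///         }
+///         let v = init.await;
+///         *self.value.lock().unwrap() = Some(v.clone());
+///         v
+///     }
+/// }
+/// ```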
+///
+/// ```text
+///  +-----------------+   get_or_maybe_download    +--------------------------------+
+///  | not initialized |--------------------------->| Resident(Arc<DownloadedLayer>) |
+///  |     ENOENT      |                         /->|                                |
+///  +-----------------+                         |  +--------------------------------+
+///                  ^                           |                  |          ^
+///                  |    get_or_maybe_download  |                  |          | get_or_maybe_download, either:
+///   evict_blocking | /-------------------------/                  |          | - upgrade weak to strong
+///                  | |                                            |          | - re-initialize without download
+///                  | |                             evict_and_wait |          |
+///  +-----------------+                                            v          |
+///  | not initialized |    on_downloaded_layer_drop   +--------------------------------------+
+///  | file is present |<------------------------------| WantedEvicted(Weak<DownloadedLayer>) |
+///  +-----------------+                               +--------------------------------------+
+/// ```
+///
+/// ### Unsupported
+///
+/// - Evicting by the operator deleting files from the filesystem
///
/// [`InMemoryLayer`]: super::inmemory_layer::InMemoryLayer
#[derive(Clone)]
@@ -48,16 +92,12 @@ pub(crate) struct Layer(Arc<LayerInner>);
impl std::fmt::Display for Layer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        if matches!(self.0.generation, Generation::Broken) {
-            write!(f, "{}-broken", self.layer_desc().short_id())
-        } else {
-            write!(
-                f,
-                "{}{}",
-                self.layer_desc().short_id(),
-                self.0.generation.get_suffix()
-            )
-        }
+        write!(
+            f,
+            "{}{}",
+            self.layer_desc().short_id(),
+            self.0.generation.get_suffix()
+        )
    }
}

@@ -73,27 +113,56 @@ impl AsLayerDesc for Layer {
    }
}

+impl PartialEq for Layer {
+    fn eq(&self, other: &Self) -> bool {
+        Arc::as_ptr(&self.0) == Arc::as_ptr(&other.0)
+    }
+}
+
+pub(crate) fn local_layer_path(
+    conf: &PageServerConf,
+    tenant_shard_id: &TenantShardId,
+    timeline_id: &TimelineId,
+    layer_file_name: &LayerName,
+    generation: &Generation,
+) -> Utf8PathBuf {
+    let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id);
+
+    if generation.is_none() {
+        // Without a generation, we may only use legacy path style
+        timeline_path.join(layer_file_name.to_string())
+    } else {
+        timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix()))
+    }
+}
+
impl Layer {
    /// Creates a layer value for a file we know to not be resident.
    pub(crate) fn for_evicted(
        conf: &'static PageServerConf,
        timeline: &Arc<Timeline>,
-        file_name: LayerFileName,
+        file_name: LayerName,
        metadata: LayerFileMetadata,
    ) -> Self {
+        let local_path = local_layer_path(
+            conf,
+            &timeline.tenant_shard_id,
+            &timeline.timeline_id,
+            &file_name,
+            &metadata.generation,
+        );
+
        let desc = PersistentLayerDesc::from_filename(
            timeline.tenant_shard_id,
            timeline.timeline_id,
            file_name,
-            metadata.file_size(),
+            metadata.file_size,
        );

-        let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
-
        let owner = Layer(Arc::new(LayerInner::new(
            conf,
            timeline,
-            access_stats,
+            local_path,
            desc,
            None,
            metadata.generation,
@@ -109,18 +178,17 @@ impl Layer {
    pub(crate) fn for_resident(
        conf: &'static PageServerConf,
        timeline: &Arc<Timeline>,
-        file_name: LayerFileName,
+        local_path: Utf8PathBuf,
+        file_name: LayerName,
        metadata: LayerFileMetadata,
    ) -> ResidentLayer {
        let desc = PersistentLayerDesc::from_filename(
            timeline.tenant_shard_id,
            timeline.timeline_id,
            file_name,
-            metadata.file_size(),
+            metadata.file_size,
        );

-        let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
-
        let mut resident = None;

        let owner = Layer(Arc::new_cyclic(|owner| {
@@ -134,7 +202,7 @@ impl Layer {
            LayerInner::new(
                conf,
                timeline,
-                access_stats,
+                local_path,
                desc,
                Some(inner),
                metadata.generation,
@@ -148,7 +216,7 @@ impl Layer {

        timeline
            .metrics
-            .resident_physical_size_add(metadata.file_size());
+            .resident_physical_size_add(metadata.file_size);

        ResidentLayer { downloaded, owner }
    }
@@ -170,15 +238,19 @@ impl Layer {
                version: 0,
            });
            resident = Some(inner.clone());
-            let access_stats = LayerAccessStats::empty_will_record_residence_event_later();
-            access_stats.record_residence_event(
-                LayerResidenceStatus::Resident,
-                LayerResidenceEventReason::LayerCreate,
+
+            let local_path = local_layer_path(
+                conf,
+                &timeline.tenant_shard_id,
+                &timeline.timeline_id,
+                &desc.layer_name(),
+                &timeline.generation,
            );
+
            LayerInner::new(
                conf,
                timeline,
-                access_stats,
+                local_path,
                desc,
                Some(inner),
                timeline.generation,
@@ -188,8 +260,10 @@ impl Layer {

        let downloaded = resident.expect("just initialized");

-        // if the rename works, the path is as expected
-        std::fs::rename(temp_path, owner.local_path())
+        // We never want to overwrite an existing file, so we use `RENAME_NOREPLACE`.
+        // TODO: this leaves the temp file in place if the rename fails, risking us running
+        // out of space. Should we clean it up here or does the calling context deal with this?
+        utils::fs_ext::rename_noreplace(temp_path.as_std_path(), owner.local_path().as_std_path())
            .with_context(|| format!("rename temporary file as correct path for {owner}"))?;

        Ok(ResidentLayer { downloaded, owner })
@@ -202,10 +276,14 @@ impl Layer {
    /// If for a bad luck or blocking of the executor, we miss the actual eviction and the layer is
    /// re-downloaded, [`EvictionError::Downloaded`] is returned.
    ///
+    /// Timeout is mandatory, because waiting for eviction is only needed for our tests; eviction
+    /// will happen regardless of whether the future returned by this method completes, unless
+    /// there is a read access before eviction gets to complete.
+    ///
    /// Technically cancellation safe, but cancelling might shift the viewpoint of what generation
    /// of download-evict cycle on retry.
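+    ///
+    /// For illustration, a test-side caller might bound the wait like this (a hedged sketch of
+    /// call-site handling, using the error variants defined further below in this file):
+    ///
+    /// ```ignore
+    /// match layer.evict_and_wait(std::time::Duration::from_secs(5)).await {
+    ///     Ok(()) => {}                          // eviction observed
+    ///     Err(EvictionError::NotFound) => {}    // nothing resident to evict
+    ///     Err(EvictionError::Downloaded) => {}  // a read re-downloaded it first
+    ///     Err(EvictionError::Timeout) => {}     // eviction still pending after 5s
+    /// }
+    /// ```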
-    pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
-        self.0.evict_and_wait().await
+    pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
+        self.0.evict_and_wait(timeout).await
    }

    /// Delete the layer file when the `self` gets dropped, also try to schedule a remote index upload
@@ -223,42 +301,36 @@ impl Layer {
        self.0.delete_on_drop();
    }

-    /// Return data needed to reconstruct given page at LSN.
-    ///
-    /// It is up to the caller to collect more data from the previous layer and
-    /// perform WAL redo, if necessary.
-    ///
-    /// # Cancellation-Safety
-    ///
-    /// This method is cancellation-safe.
-    pub(crate) async fn get_value_reconstruct_data(
+    pub(crate) async fn get_values_reconstruct_data(
        &self,
-        key: Key,
+        keyspace: KeySpace,
        lsn_range: Range<Lsn>,
-        reconstruct_data: &mut ValueReconstructState,
+        reconstruct_data: &mut ValuesReconstructState,
        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        use anyhow::ensure;
+    ) -> Result<(), GetVectoredError> {
+        let layer = self
+            .0
+            .get_or_maybe_download(true, Some(ctx))
+            .await
+            .map_err(|err| match err {
+                DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => {
+                    GetVectoredError::Cancelled
+                }
+                other => GetVectoredError::Other(anyhow::anyhow!(other)),
+            })?;

-        let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
-        self.0
-            .access_stats
-            .record_access(LayerAccessKind::GetValueReconstructData, ctx);
-
-        if self.layer_desc().is_delta {
-            ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
-            ensure!(self.layer_desc().key_range.contains(&key));
-        } else {
-            ensure!(self.layer_desc().key_range.contains(&key));
-            ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn());
-            ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn());
-        }
+        self.record_access(ctx);

        layer
-            .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
-            .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
+            .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
+            .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
            .await
-            .with_context(|| format!("get_value_reconstruct_data for layer {self}"))
+            .map_err(|err| match err {
+                GetVectoredError::Other(err) => GetVectoredError::Other(
+                    err.context(format!("get_values_reconstruct_data for layer {self}")),
+                ),
+                err => err,
+            })
    }

    /// Download the layer if evicted.
@@ -272,25 +344,32 @@ impl Layer {

    /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction
    /// while the guard exists.
    ///
-    /// Returns None if the layer is currently evicted.
-    pub(crate) async fn keep_resident(&self) -> anyhow::Result<Option<ResidentLayer>> {
-        let downloaded = match self.0.get_or_maybe_download(false, None).await {
-            Ok(d) => d,
-            // technically there are a lot of possible errors, but in practice it should only be
-            // DownloadRequired which is tripped up. could work to improve this situation
-            // statically later.
-            Err(DownloadError::DownloadRequired) => return Ok(None),
-            Err(e) => return Err(e.into()),
-        };
+    /// Returns None if the layer is currently evicted or becoming evicted.
+    #[cfg(test)]
+    pub(crate) async fn keep_resident(&self) -> Option<ResidentLayer> {
+        let downloaded = self.0.inner.get().and_then(|rowe| rowe.get())?;

-        Ok(Some(ResidentLayer {
+        Some(ResidentLayer {
            downloaded,
            owner: self.clone(),
-        }))
+        })
+    }
+
+    /// Weak indicator of whether the layer is resident or not. Good enough for eviction, which
+    /// can deal with `EvictionError::NotFound`.
+    ///
+    /// Returns `true` if this layer might be resident, or `false` if it is most likely evicted
+    /// or will be unless a read happens soon.
+    pub(crate) fn is_likely_resident(&self) -> bool {
+        self.0
+            .inner
+            .get()
+            .map(|rowe| rowe.is_likely_resident())
+            .unwrap_or(false)
    }

    /// Downloads if necessary and creates a guard, which will keep this layer from being evicted.
-    pub(crate) async fn download_and_keep_resident(&self) -> anyhow::Result<ResidentLayer> {
+    pub(crate) async fn download_and_keep_resident(&self) -> Result<ResidentLayer, DownloadError> {
        let downloaded = self.0.get_or_maybe_download(true, None).await?;

        Ok(ResidentLayer {
@@ -303,8 +382,12 @@ impl Layer {
        self.0.info(reset)
    }

-    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
-        &self.0.access_stats
+    pub(crate) fn latest_activity(&self) -> SystemTime {
+        self.0.access_stats.latest_activity()
+    }
+
+    pub(crate) fn visibility(&self) -> LayerVisibilityHint {
+        self.0.access_stats.visibility()
    }

    pub(crate) fn local_path(&self) -> &Utf8Path {
@@ -315,6 +398,13 @@ impl Layer {
        self.0.metadata()
    }

+    pub(crate) fn get_timeline_id(&self) -> Option<TimelineId> {
+        self.0
+            .timeline
+            .upgrade()
+            .map(|timeline| timeline.timeline_id)
+    }
+
    /// Traditional debug dumping facility
    #[allow(unused)]
    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> {
@@ -334,25 +424,69 @@ impl Layer {
    ///
    /// Does not start local deletion, use [`Self::delete_on_drop`] for that
    /// separatedly.
-    #[cfg(feature = "testing")]
+    #[cfg(any(feature = "testing", test))]
    pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
-        let mut rx = self.0.status.subscribe();
+        let mut rx = self.0.status.as_ref().unwrap().subscribe();

        async move {
            loop {
-                if let Err(tokio::sync::broadcast::error::RecvError::Closed) = rx.recv().await {
+                if rx.changed().await.is_err() {
                    break;
                }
            }
        }
    }
+
+    fn record_access(&self, ctx: &RequestContext) {
+        if self.0.access_stats.record_access(ctx) {
+            // Visibility was modified to Visible
+            tracing::info!(
+                "Layer {} became visible as a result of access",
+                self.0.desc.key()
+            );
+            if let Some(tl) = self.0.timeline.upgrade() {
+                tl.metrics
+                    .visible_physical_size_gauge
+                    .add(self.0.desc.file_size)
+            }
+        }
+    }
+
+    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
+        let old_visibility = self.0.access_stats.set_visibility(visibility.clone());
+        use LayerVisibilityHint::*;
+        match (old_visibility, visibility) {
+            (Visible, Covered) => {
+                // Subtract this layer's contribution to the visible size metric
+                if let Some(tl) = self.0.timeline.upgrade() {
+                    debug_assert!(
+                        tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
+                    );
+                    tl.metrics
+                        .visible_physical_size_gauge
+                        .sub(self.0.desc.file_size)
+                }
+            }
+            (Covered, Visible) => {
+                // Add this layer's contribution to the visible size metric
+                if let Some(tl) = self.0.timeline.upgrade() {
+                    tl.metrics
+                        .visible_physical_size_gauge
+                        .add(self.0.desc.file_size)
+                }
+            }
+            (Covered, Covered) | (Visible, Visible) => {
+                // no change
+            }
+        }
+    }
}

/// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
///
/// However when we want something evicted, we cannot evict it right away as there might be current
/// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
-/// read with [`Layer::get_value_reconstruct_data`].
+/// read with [`Layer::get_values_reconstruct_data`].
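+///
+/// As a minimal sketch of the mechanism (hypothetical names, simplified from the real enum
+/// below): an in-flight eviction holds only a `Weak`, so current readers keep the value alive,
+/// and a new read can revive the strong reference and thereby cancel the eviction.
+///
+/// ```ignore
+/// enum Slot<T> {
+///     Resident(std::sync::Arc<T>),
+///     WantedEvicted(std::sync::Weak<T>),
+/// }
+///
+/// impl<T> Slot<T> {
+///     fn read(&mut self) -> Option<std::sync::Arc<T>> {
+///         match self {
+///             Slot::Resident(strong) => Some(strong.clone()),
+///             Slot::WantedEvicted(weak) => {
+///                 // revive: the read wins over the pending eviction
+///                 let strong = weak.upgrade()?;
+///                 *self = Slot::Resident(strong.clone());
+///                 Some(strong)
+///             }
+///         }
+///     }
+/// }
+/// ```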
///
/// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
#[derive(Debug)]
@@ -362,6 +496,32 @@ enum ResidentOrWantedEvicted {
}

impl ResidentOrWantedEvicted {
+    /// Non-mutating access to a DownloadedLayer, if possible.
+    ///
+    /// This is not used on the read path (anything that calls
+    /// [`LayerInner::get_or_maybe_download`]) because it was decided that reads always win
+    /// evictions, and part of that winning is using [`ResidentOrWantedEvicted::get_and_upgrade`].
+    #[cfg(test)]
+    fn get(&self) -> Option<Arc<DownloadedLayer>> {
+        match self {
+            ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()),
+            ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.upgrade(),
+        }
+    }
+
+    /// Best-effort query for residency right now, not as strong a guarantee as receiving a strong
+    /// reference from `ResidentOrWantedEvicted::get`.
+    fn is_likely_resident(&self) -> bool {
+        match self {
+            ResidentOrWantedEvicted::Resident(_) => true,
+            ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.strong_count() > 0,
+        }
+    }
+
+    /// Upgrades any weak to strong if possible.
+    ///
+    /// Returns a strong reference if possible, along with a boolean telling if an upgrade
+    /// happened.
    fn get_and_upgrade(&mut self) -> Option<(Arc<DownloadedLayer>, bool)> {
        match self {
            ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
@@ -382,7 +542,7 @@ impl ResidentOrWantedEvicted {
    ///
    /// Returns `Some` if this was the first time eviction was requested. Care should be taken to
    /// drop the possibly last strong reference outside of the mutex of
-    /// heavier_once_cell::OnceCell.
+    /// [`heavier_once_cell::OnceCell`].
    fn downgrade(&mut self) -> Option<Arc<DownloadedLayer>> {
        match self {
            ResidentOrWantedEvicted::Resident(strong) => {
@@ -410,35 +570,46 @@ struct LayerInner {
    desc: PersistentLayerDesc,

    /// Timeline access is needed for remote timeline client and metrics.
+    ///
+    /// There should be no access to the timeline for any reason without entering the
+    /// [`Timeline::gate`] at the same time.
    timeline: Weak<Timeline>,

-    /// Cached knowledge of [`Timeline::remote_client`] being `Some`.
-    have_remote_client: bool,
-
    access_stats: LayerAccessStats,

    /// This custom OnceCell is backed by std mutex, but only held for short time periods.
-    /// Initialization and deinitialization are done while holding a permit.
+    ///
+    /// Filesystem changes (download, evict) are only done while holding a permit which the
+    /// `heavier_once_cell` provides.
+    ///
+    /// A number of fields in `Layer` are meant to only be updated when holding the InitPermit, but
+    /// possibly read while not holding it.
    inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,

    /// Do we want to delete locally and remotely this when `LayerInner` is dropped
    wanted_deleted: AtomicBool,

-    /// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses
-    /// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger
-    /// [`LayerInner::on_downloaded_layer_drop`].
-    wanted_evicted: AtomicBool,
-
-    /// Version is to make sure we will only evict a specific download of a file.
+    /// Version is to make sure we will only evict a specific initialization of the downloaded file.
    ///
-    /// Incremented for each download, stored in `DownloadedLayer::version` or
+    /// Incremented for each initialization, stored in `DownloadedLayer::version` or
    /// `ResidentOrWantedEvicted::WantedEvicted`.
    version: AtomicUsize,

-    /// Allow subscribing to when the layer actually gets evicted.
- status: tokio::sync::broadcast::Sender, + /// Allow subscribing to when the layer actually gets evicted, a non-cancellable download + /// starts, or completes. + /// + /// Updates must only be posted while holding the InitPermit or the heavier_once_cell::Guard. + /// Holding the InitPermit is the only time we can do state transitions, but we also need to + /// cancel a pending eviction on upgrading a [`ResidentOrWantedEvicted::WantedEvicted`] back to + /// [`ResidentOrWantedEvicted::Resident`] on access. + /// + /// The sender is wrapped in an Option to facilitate moving it out on [`LayerInner::drop`]. + status: Option>, - /// Counter for exponential backoff with the download + /// Counter for exponential backoff with the download. + /// + /// This is atomic only for the purposes of having additional data only accessed while holding + /// the InitPermit. consecutive_failures: AtomicUsize, /// The generation of this Layer. @@ -456,7 +627,13 @@ struct LayerInner { /// a shard split since the layer was originally written. shard: ShardIndex, + /// When the Layer was last evicted but has not been downloaded since. + /// + /// This is used solely for updating metrics. See [`LayerImplMetrics::redownload_after`]. last_evicted_at: std::sync::Mutex>, + + #[cfg(test)] + failpoints: std::sync::Mutex>, } impl std::fmt::Display for LayerInner { @@ -473,34 +650,76 @@ impl AsLayerDesc for LayerInner { #[derive(Debug, Clone, Copy)] enum Status { + Resident, Evicted, - Downloaded, + Downloading, } impl Drop for LayerInner { fn drop(&mut self) { + // if there was a pending eviction, mark it cancelled here to balance metrics + if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit() + { + // eviction has already been started + LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); + + // eviction request is intentionally not honored as no one is present to wait for it + // and we could be delaying shutdown for nothing. + } + + if let Some(timeline) = self.timeline.upgrade() { + // Only need to decrement metrics if the timeline still exists: otherwise + // it will have already de-registered these metrics via TimelineMetrics::shutdown + if self.desc.is_delta() { + timeline.metrics.layer_count_delta.dec(); + timeline.metrics.layer_size_delta.sub(self.desc.file_size); + } else { + timeline.metrics.layer_count_image.dec(); + timeline.metrics.layer_size_image.sub(self.desc.file_size); + } + + if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) { + debug_assert!( + timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size + ); + timeline + .metrics + .visible_physical_size_gauge + .sub(self.desc.file_size); + } + } + if !*self.wanted_deleted.get_mut() { - // should we try to evict if the last wish was for eviction? 
- // feels like there's some hazard of overcrowding near shutdown near by, but we don't - // run drops during shutdown (yet) return; } let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id); let path = std::mem::take(&mut self.path); - let file_name = self.layer_desc().filename(); + let file_name = self.layer_desc().layer_name(); let file_size = self.layer_desc().file_size; let timeline = self.timeline.clone(); let meta = self.metadata(); - let status = self.status.clone(); + let status = self.status.take(); - crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || { + Self::spawn_blocking(move || { let _g = span.entered(); // carry this until we are finished for [`Layer::wait_drop`] support let _status = status; + let Some(timeline) = timeline.upgrade() else { + // no need to nag that timeline is gone: under normal situation on + // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped. + LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone); + return; + }; + + let Ok(_guard) = timeline.gate.enter() else { + LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone); + return; + }; + let removed = match std::fs::remove_file(path) { Ok(()) => true, Err(e) if e.kind() == std::io::ErrorKind::NotFound => { @@ -519,75 +738,85 @@ impl Drop for LayerInner { } }; - if let Some(timeline) = timeline.upgrade() { - if removed { - timeline.metrics.resident_physical_size_sub(file_size); - } - if let Some(remote_client) = timeline.remote_client.as_ref() { - let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]); + if removed { + timeline.metrics.resident_physical_size_sub(file_size); + } + let res = timeline + .remote_client + .schedule_deletion_of_unlinked(vec![(file_name, meta)]); - if let Err(e) = res { - // test_timeline_deletion_with_files_stuck_in_upload_queue is good at - // demonstrating this deadlock (without spawn_blocking): stop will drop - // queued items, which will have ResidentLayer's, and those drops would try - // to re-entrantly lock the RemoteTimelineClient inner state. - if !timeline.is_active() { - tracing::info!("scheduling deletion on drop failed: {e:#}"); - } else { - tracing::warn!("scheduling deletion on drop failed: {e:#}"); - } - LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed); - } else { - LAYER_IMPL_METRICS.inc_completed_deletes(); - } + if let Err(e) = res { + // test_timeline_deletion_with_files_stuck_in_upload_queue is good at + // demonstrating this deadlock (without spawn_blocking): stop will drop + // queued items, which will have ResidentLayer's, and those drops would try + // to re-entrantly lock the RemoteTimelineClient inner state. + if !timeline.is_active() { + tracing::info!("scheduling deletion on drop failed: {e:#}"); + } else { + tracing::warn!("scheduling deletion on drop failed: {e:#}"); } + LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed); } else { - // no need to nag that timeline is gone: under normal situation on - // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped. 
- LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone); + LAYER_IMPL_METRICS.inc_completed_deletes(); } }); } } impl LayerInner { + #[allow(clippy::too_many_arguments)] fn new( conf: &'static PageServerConf, timeline: &Arc, - access_stats: LayerAccessStats, + local_path: Utf8PathBuf, desc: PersistentLayerDesc, downloaded: Option>, generation: Generation, shard: ShardIndex, ) -> Self { - let path = conf - .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id) - .join(desc.filename().to_string()); - - let (inner, version) = if let Some(inner) = downloaded { + let (inner, version, init_status) = if let Some(inner) = downloaded { let version = inner.version; let resident = ResidentOrWantedEvicted::Resident(inner); - (heavier_once_cell::OnceCell::new(resident), version) + ( + heavier_once_cell::OnceCell::new(resident), + version, + Status::Resident, + ) } else { - (heavier_once_cell::OnceCell::default(), 0) + (heavier_once_cell::OnceCell::default(), 0, Status::Evicted) }; + // This object acts as a RAII guard on these metrics: increment on construction + if desc.is_delta() { + timeline.metrics.layer_count_delta.inc(); + timeline.metrics.layer_size_delta.add(desc.file_size); + } else { + timeline.metrics.layer_count_image.inc(); + timeline.metrics.layer_size_image.add(desc.file_size); + } + + // New layers are visible by default. This metric is later updated on drop or in set_visibility + timeline + .metrics + .visible_physical_size_gauge + .add(desc.file_size); + LayerInner { conf, - path, + path: local_path, desc, timeline: Arc::downgrade(timeline), - have_remote_client: timeline.remote_client.is_some(), - access_stats, + access_stats: Default::default(), wanted_deleted: AtomicBool::new(false), - wanted_evicted: AtomicBool::new(false), inner, version: AtomicUsize::new(version), - status: tokio::sync::broadcast::channel(1).0, + status: Some(tokio::sync::watch::channel(init_status).0), consecutive_failures: AtomicUsize::new(0), generation, shard, last_evicted_at: std::sync::Mutex::default(), + #[cfg(test)] + failpoints: Default::default(), } } @@ -603,48 +832,67 @@ impl LayerInner { /// Cancellation safe, however dropping the future and calling this method again might result /// in a new attempt to evict OR join the previously started attempt. - pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> { - use tokio::sync::broadcast::error::RecvError; + #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret, err(level = tracing::Level::DEBUG), fields(layer=%self))] + pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> { + let mut rx = self.status.as_ref().unwrap().subscribe(); - assert!(self.have_remote_client); - - let mut rx = self.status.subscribe(); + { + let current = rx.borrow_and_update(); + match &*current { + Status::Resident => { + // we might get lucky and evict this; continue + } + Status::Evicted | Status::Downloading => { + // it is already evicted + return Err(EvictionError::NotFound); + } + } + } let strong = { match self.inner.get() { - Some(mut either) => { - self.wanted_evicted.store(true, Ordering::Relaxed); - either.downgrade() + Some(mut either) => either.downgrade(), + None => { + // we already have a scheduled eviction, which just has not gotten to run yet. + // it might still race with a read access, but that could also get cancelled, + // so let's say this is not evictable. 
+ return Err(EvictionError::NotFound); } - None => return Err(EvictionError::NotFound), } }; if strong.is_some() { // drop the DownloadedLayer outside of the holding the guard drop(strong); + + // idea here is that only one evicter should ever get to witness a strong reference, + // which means whenever get_or_maybe_download upgrades a weak, it must mark up a + // cancelled eviction and signal us, like it currently does. + // + // a second concurrent evict_and_wait will not see a strong reference. LAYER_IMPL_METRICS.inc_started_evictions(); } - match rx.recv().await { - Ok(Status::Evicted) => Ok(()), - Ok(Status::Downloaded) => Err(EvictionError::Downloaded), - Err(RecvError::Closed) => { - unreachable!("sender cannot be dropped while we are in &self method") - } - Err(RecvError::Lagged(_)) => { - // this is quite unlikely, but we are blocking a lot in the async context, so - // we might be missing this because we are stuck on a LIFO slot on a thread - // which is busy blocking for a 1TB database create_image_layers. - // - // use however late (compared to the initial expressing of wanted) as the - // "outcome" now - LAYER_IMPL_METRICS.inc_broadcast_lagged(); - match self.inner.get() { - Some(_) => Err(EvictionError::Downloaded), - None => Ok(()), - } - } + let changed = rx.changed(); + let changed = tokio::time::timeout(timeout, changed).await; + + let Ok(changed) = changed else { + return Err(EvictionError::Timeout); + }; + + let _: () = changed.expect("cannot be closed, because we are holding a strong reference"); + + let current = rx.borrow_and_update(); + + match &*current { + // the easiest case + Status::Evicted => Ok(()), + // it surely was evicted in between, but then there was a new access now; we can't know + // if it'll succeed so lets just call it evicted + Status::Downloading => Ok(()), + // either the download which was started after eviction completed already, or it was + // never evicted + Status::Resident => Err(EvictionError::Downloaded), } } @@ -654,154 +902,122 @@ impl LayerInner { allow_download: bool, ctx: Option<&RequestContext>, ) -> Result, DownloadError> { - let mut init_permit = None; + let (weak, permit) = { + // get_or_init_detached can: + // - be fast (mutex lock) OR uncontested semaphore permit acquire + // - be slow (wait for semaphore permit or closing) + let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - loop { - let download = move |permit| { - async move { - // disable any scheduled but not yet running eviction deletions for this - let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); + let locked = self + .inner + .get_or_init_detached() + .await + .map(|mut guard| guard.get_and_upgrade().ok_or(guard)); - // count cancellations, which currently remain largely unexpected - let init_cancelled = - scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); + scopeguard::ScopeGuard::into_inner(init_cancelled); - // no need to make the evict_and_wait wait for the actual download to complete - drop(self.status.send(Status::Downloaded)); - - let timeline = self - .timeline - .upgrade() - .ok_or_else(|| DownloadError::TimelineShutdown)?; - - // FIXME: grab a gate - - let can_ever_evict = timeline.remote_client.as_ref().is_some(); - - // check if we really need to be downloaded; could have been already downloaded by a - // cancelled previous attempt. 
- let needs_download = self - .needs_download() - .await - .map_err(DownloadError::PreStatFailed)?; - - let permit = if let Some(reason) = needs_download { - if let NeedsDownload::NotFile(ft) = reason { - return Err(DownloadError::NotFile(ft)); - } - - // only reset this after we've decided we really need to download. otherwise it'd - // be impossible to mark cancelled downloads for eviction, like one could imagine - // we would like to do for prefetching which was not needed. - self.wanted_evicted.store(false, Ordering::Release); - - if !can_ever_evict { - return Err(DownloadError::NoRemoteStorage); - } - - if let Some(ctx) = ctx { - self.check_expected_download(ctx)?; - } - - if !allow_download { - // this does look weird, but for LayerInner the "downloading" means also changing - // internal once related state ... - return Err(DownloadError::DownloadRequired); - } - - tracing::info!(%reason, "downloading on-demand"); - - self.spawn_download_and_wait(timeline, permit).await? - } else { - // the file is present locally, probably by a previous but cancelled call to - // get_or_maybe_download. alternatively we might be running without remote storage. - LAYER_IMPL_METRICS.inc_init_needed_no_download(); - - permit - }; - - let since_last_eviction = - self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed()); - if let Some(since_last_eviction) = since_last_eviction { - // FIXME: this will not always be recorded correctly until #6028 (the no - // download needed branch above) - LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); - } - - let res = Arc::new(DownloadedLayer { - owner: Arc::downgrade(self), - kind: tokio::sync::OnceCell::default(), - version: next_version, - }); - - self.access_stats.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::ResidenceChange, - ); - - let waiters = self.inner.initializer_count(); - if waiters > 0 { - tracing::info!( - waiters, - "completing the on-demand download for other tasks" - ); - } - - scopeguard::ScopeGuard::into_inner(init_cancelled); - - Ok((ResidentOrWantedEvicted::Resident(res), permit)) - } - .instrument(tracing::info_span!("get_or_maybe_download", layer=%self)) - }; - - if let Some(init_permit) = init_permit.take() { - // use the already held initialization permit because it is impossible to hit the - // below paths anymore essentially limiting the max loop iterations to 2. - let (value, init_permit) = download(init_permit).await?; - let mut guard = self.inner.set(value, init_permit); - let (strong, _upgraded) = guard - .get_and_upgrade() - .expect("init creates strong reference, we held the init permit"); - return Ok(strong); - } - - let (weak, permit) = { - let mut locked = self.inner.get_or_init(download).await?; - - if let Some((strong, upgraded)) = locked.get_and_upgrade() { - if upgraded { - // when upgraded back, the Arc is still available, but - // previously a `evict_and_wait` was received. - self.wanted_evicted.store(false, Ordering::Relaxed); - - // error out any `evict_and_wait` - drop(self.status.send(Status::Downloaded)); - LAYER_IMPL_METRICS - .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess); - } + match locked { + // this path could had been a RwLock::read + Ok(Ok((strong, upgraded))) if !upgraded => return Ok(strong), + Ok(Ok((strong, _))) => { + // when upgraded back, the Arc is still available, but + // previously a `evict_and_wait` was received. this is the only place when we + // send out an update without holding the InitPermit. 
+ // + // note that we also have dropped the Guard; this is fine, because we just made + // a state change and are holding a strong reference to be returned. + self.status.as_ref().unwrap().send_replace(Status::Resident); + LAYER_IMPL_METRICS + .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess); return Ok(strong); - } else { - // path to here: the evict_blocking is stuck on spawn_blocking queue. - // - // reset the contents, deactivating the eviction and causing a - // EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed. - locked.take_and_deinit() } - }; - - // unlock first, then drop the weak, but because upgrade failed, we - // know it cannot be a problem. + Ok(Err(guard)) => { + // path to here: we won the eviction, the file should still be on the disk. + let (weak, permit) = guard.take_and_deinit(); + (Some(weak), permit) + } + Err(permit) => (None, permit), + } + }; + if let Some(weak) = weak { + // only drop the weak after dropping the heavier_once_cell guard assert!( matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)), "unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug" ); - - init_permit = Some(permit); - - LAYER_IMPL_METRICS.inc_retried_get_or_maybe_download(); } + + let timeline = self + .timeline + .upgrade() + .ok_or_else(|| DownloadError::TimelineShutdown)?; + + // count cancellations, which currently remain largely unexpected + let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); + + // check if we really need to be downloaded: this can happen if a read access won the + // semaphore before eviction. + // + // if we are cancelled while doing this `stat` the `self.inner` will be uninitialized. a + // pending eviction will try to evict even upon finding an uninitialized `self.inner`. + let needs_download = self + .needs_download() + .await + .map_err(DownloadError::PreStatFailed); + + scopeguard::ScopeGuard::into_inner(init_cancelled); + + let needs_download = needs_download?; + + let Some(reason) = needs_download else { + // the file is present locally because eviction has not had a chance to run yet + + #[cfg(test)] + self.failpoint(failpoints::FailpointKind::AfterDeterminingLayerNeedsNoDownload) + .await?; + + LAYER_IMPL_METRICS.inc_init_needed_no_download(); + + return Ok(self.initialize_after_layer_is_on_disk(permit)); + }; + + // we must download; getting cancelled before spawning the download is not an issue as + // any still running eviction would not find anything to evict. 
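+        // The `init_cancelled` guards above follow the classic drop-guard idiom: arm a guard
+        // that records a cancellation, run the cancellable work, and disarm the guard on the
+        // success path. A minimal sketch of the idiom (`count_cancellation` and
+        // `do_cancellable_work` are illustrative placeholders, not helpers in this file):
+        //
+        //     let armed = scopeguard::guard((), |()| count_cancellation());
+        //     do_cancellable_work().await;
+        //     // reached only on completion; defuses the guard without running the closure
+        //     let () = scopeguard::ScopeGuard::into_inner(armed);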
+ + if let NeedsDownload::NotFile(ft) = reason { + return Err(DownloadError::NotFile(ft)); + } + + if let Some(ctx) = ctx { + self.check_expected_download(ctx)?; + } + + if !allow_download { + // this is only used from tests, but it is hard to test without the boolean + return Err(DownloadError::DownloadRequired); + } + + let download_ctx = ctx + .map(|ctx| ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download)) + .unwrap_or(RequestContext::new( + TaskKind::LayerDownload, + DownloadBehavior::Download, + )); + + async move { + tracing::info!(%reason, "downloading on-demand"); + + let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); + let res = self + .download_init_and_wait(timeline, permit, download_ctx) + .await?; + scopeguard::ScopeGuard::into_inner(init_cancelled); + Ok(res) + } + .instrument(tracing::info_span!("get_or_maybe_download", layer=%self)) + .await } /// Nag or fail per RequestContext policy @@ -831,131 +1047,188 @@ impl LayerInner { } /// Actual download, at most one is executed at the time. - async fn spawn_download_and_wait( + async fn download_init_and_wait( self: &Arc, timeline: Arc, permit: heavier_once_cell::InitPermit, - ) -> Result { - let task_name = format!("download layer {}", self); + ctx: RequestContext, + ) -> Result, DownloadError> { + debug_assert_current_span_has_tenant_and_timeline_id(); let (tx, rx) = tokio::sync::oneshot::channel(); - // this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot - // block tenant::mgr::remove_tenant_from_memory. - let this: Arc = self.clone(); - crate::task_mgr::spawn( - &tokio::runtime::Handle::current(), - crate::task_mgr::TaskKind::RemoteDownloadTask, - Some(self.desc.tenant_shard_id), - Some(self.desc.timeline_id), - &task_name, - false, + let guard = timeline + .gate + .enter() + .map_err(|_| DownloadError::DownloadCancelled)?; + + Self::spawn( async move { + let _guard = guard; - let client = timeline - .remote_client + // now that we have commited to downloading, send out an update to: + // - unhang any pending eviction + // - break out of evict_and_wait + this.status .as_ref() - .expect("checked above with have_remote_client"); + .unwrap() + .send_replace(Status::Downloading); - let result = client.download_layer_file( - &this.desc.filename(), - &this.metadata(), - &crate::task_mgr::shutdown_token() - ) - .await; + #[cfg(test)] + this.failpoint(failpoints::FailpointKind::WaitBeforeDownloading) + .await + .unwrap(); - let result = match result { - Ok(size) => { - timeline.metrics.resident_physical_size_add(size); - Ok(()) - } - Err(e) => { - let consecutive_failures = - this.consecutive_failures.fetch_add(1, Ordering::Relaxed); + let res = this.download_and_init(timeline, permit, &ctx).await; - let backoff = utils::backoff::exponential_backoff_duration_seconds( - consecutive_failures.min(u32::MAX as usize) as u32, - 1.5, - 60.0, - ); - - let backoff = std::time::Duration::from_secs_f64(backoff); - - tokio::select! { - _ = tokio::time::sleep(backoff) => {}, - _ = crate::task_mgr::shutdown_token().cancelled_owned() => {}, - _ = timeline.cancel.cancelled() => {}, - }; - - Err(e) - } - }; - - if let Err(res) = tx.send((result, permit)) { + if let Err(res) = tx.send(res) { match res { - (Ok(()), _) => { - // our caller is cancellation safe so this is fine; if someone - // else requests the layer, they'll find it already downloaded. 
- // - // See counter [`LayerImplMetrics::inc_init_needed_no_download`] - // - // FIXME(#6028): however, could be that we should consider marking the - // layer for eviction? alas, cannot: because only DownloadedLayer will - // handle that. - }, - (Err(e), _) => { - // our caller is cancellation safe, but we might be racing with - // another attempt to initialize. before we have cancellation - // token support: these attempts should converge regardless of - // their completion order. - tracing::error!("layer file download failed, and additionally failed to communicate this to caller: {e:?}"); + Ok(_res) => { + tracing::debug!("layer initialized, but caller has been cancelled"); + LAYER_IMPL_METRICS.inc_init_completed_without_requester(); + } + Err(e) => { + tracing::info!( + "layer file download failed, and caller has been cancelled: {e:?}" + ); LAYER_IMPL_METRICS.inc_download_failed_without_requester(); } } } - - Ok(()) } .in_current_span(), ); + match rx.await { - Ok((Ok(()), permit)) => { - if let Some(reason) = self - .needs_download() - .await - .map_err(DownloadError::PostStatFailed)? - { - // this is really a bug in needs_download or remote timeline client - panic!("post-condition failed: needs_download returned {reason:?}"); - } - - self.consecutive_failures.store(0, Ordering::Relaxed); - tracing::info!("on-demand download successful"); - - Ok(permit) - } - Ok((Err(e), _permit)) => { - // sleep already happened in the spawned task, if it was not cancelled - let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed); - - match e.downcast_ref::() { - // If the download failed due to its cancellation token, - // propagate the cancellation error upstream. - Some(remote_storage::DownloadError::Cancelled) => { - Err(DownloadError::DownloadCancelled) - } - _ => { - tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); - Err(DownloadError::DownloadFailed) - } - } + Ok(Ok(res)) => Ok(res), + Ok(Err(remote_storage::DownloadError::Cancelled)) => { + Err(DownloadError::DownloadCancelled) } + Ok(Err(_)) => Err(DownloadError::DownloadFailed), Err(_gone) => Err(DownloadError::DownloadCancelled), } } + async fn download_and_init( + self: &Arc, + timeline: Arc, + permit: heavier_once_cell::InitPermit, + ctx: &RequestContext, + ) -> Result, remote_storage::DownloadError> { + let result = timeline + .remote_client + .download_layer_file( + &self.desc.layer_name(), + &self.metadata(), + &self.path, + &timeline.cancel, + ctx, + ) + .await; + + match result { + Ok(size) => { + assert_eq!(size, self.desc.file_size); + + match self.needs_download().await { + Ok(Some(reason)) => { + // this is really a bug in needs_download or remote timeline client + panic!("post-condition failed: needs_download returned {reason:?}"); + } + Ok(None) => { + // as expected + } + Err(e) => { + panic!("post-condition failed: needs_download errored: {e:?}"); + } + } + + tracing::info!(size=%self.desc.file_size, "on-demand download successful"); + timeline + .metrics + .resident_physical_size_add(self.desc.file_size); + self.consecutive_failures.store(0, Ordering::Relaxed); + + let since_last_eviction = self + .last_evicted_at + .lock() + .unwrap() + .take() + .map(|ts| ts.elapsed()); + if let Some(since_last_eviction) = since_last_eviction { + LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); + } + + self.access_stats.record_residence_event(); + + Ok(self.initialize_after_layer_is_on_disk(permit)) + } + Err(e) => { + let consecutive_failures = + 1 + 
self.consecutive_failures.fetch_add(1, Ordering::Relaxed); + + if timeline.cancel.is_cancelled() { + // If we're shutting down, drop out before logging the error + return Err(e); + } + + tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); + + let backoff = utils::backoff::exponential_backoff_duration_seconds( + consecutive_failures.min(u32::MAX as usize) as u32, + 1.5, + 60.0, + ); + + let backoff = std::time::Duration::from_secs_f64(backoff); + + tokio::select! { + _ = tokio::time::sleep(backoff) => {}, + _ = timeline.cancel.cancelled() => {}, + }; + + Err(e) + } + } + } + + /// Initializes the `Self::inner` to a "resident" state. + /// + /// Callers are assumed to ensure that the file is actually on disk with `Self::needs_download` + /// before calling this method. + /// + /// If this method is ever made async, it needs to be cancellation safe so that no state + /// changes are made before we can write to the OnceCell in non-cancellable fashion. + fn initialize_after_layer_is_on_disk( + self: &Arc, + permit: heavier_once_cell::InitPermit, + ) -> Arc { + debug_assert_current_span_has_tenant_and_timeline_id(); + + // disable any scheduled but not yet running eviction deletions for this initialization + let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); + self.status.as_ref().unwrap().send_replace(Status::Resident); + + let res = Arc::new(DownloadedLayer { + owner: Arc::downgrade(self), + kind: tokio::sync::OnceCell::default(), + version: next_version, + }); + + let waiters = self.inner.initializer_count(); + if waiters > 0 { + tracing::info!(waiters, "completing layer init for other tasks"); + } + + let value = ResidentOrWantedEvicted::Resident(res.clone()); + + self.inner.set(value, permit); + + res + } + async fn needs_download(&self) -> Result, std::io::Error> { match tokio::fs::metadata(&self.path).await { Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()), @@ -987,11 +1260,13 @@ impl LayerInner { } fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { - let layer_file_name = self.desc.filename().file_name(); + let layer_name = self.desc.layer_name().to_string(); - // this is not accurate: we could have the file locally but there was a cancellation - // and now we are not in sync, or we are currently downloading it. - let remote = self.inner.get().is_none(); + let resident = self + .inner + .get() + .map(|rowe| rowe.is_likely_resident()) + .unwrap_or(false); let access_stats = self.access_stats.as_api_model(reset); @@ -999,122 +1274,225 @@ impl LayerInner { let lsn_range = &self.desc.lsn_range; HistoricLayerInfo::Delta { - layer_file_name, + layer_file_name: layer_name, layer_file_size: self.desc.file_size, lsn_start: lsn_range.start, lsn_end: lsn_range.end, - remote, + remote: !resident, access_stats, + l0: crate::tenant::layer_map::LayerMap::is_l0( + &self.layer_desc().key_range, + self.layer_desc().is_delta, + ), } } else { let lsn = self.desc.image_layer_lsn(); HistoricLayerInfo::Image { - layer_file_name, + layer_file_name: layer_name, layer_file_size: self.desc.file_size, lsn_start: lsn, - remote, + remote: !resident, access_stats, } } } /// `DownloadedLayer` is being dropped, so it calls this method. 
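+    ///
+    /// The general shape of the notification is a guard type whose `Drop` pings its owner
+    /// through a weak back-reference plus a version, so a drop can never keep the owner alive
+    /// and a stale drop can be told apart from the current download (hypothetical names, a
+    /// hedged sketch rather than the code below):
+    ///
+    /// ```ignore
+    /// struct Guard {
+    ///     owner: std::sync::Weak<Owner>,
+    ///     version: usize,
+    /// }
+    ///
+    /// impl Drop for Guard {
+    ///     fn drop(&mut self) {
+    ///         if let Some(owner) = self.owner.upgrade() {
+    ///             // may schedule an eviction; version guards against evicting a newer download
+    ///             owner.on_guard_drop(self.version);
+    ///         }
+    ///     }
+    /// }
+    /// ```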
- fn on_downloaded_layer_drop(self: Arc, version: usize) { - let delete = self.wanted_deleted.load(Ordering::Acquire); - let evict = self.wanted_evicted.load(Ordering::Acquire); - let can_evict = self.have_remote_client; + fn on_downloaded_layer_drop(self: Arc, only_version: usize) { + // we cannot know without inspecting LayerInner::inner if we should evict or not, even + // though here it is very likely + let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, version=%only_version); - if delete { - // do nothing now, only in LayerInner::drop -- this was originally implemented because - // we could had already scheduled the deletion at the time. - // - // FIXME: this is not true anymore, we can safely evict wanted deleted files. - } else if can_evict && evict { - let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version); + // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might + // drop while the `self.inner` is being locked, leading to a deadlock. - // downgrade for queueing, in case there's a tear down already ongoing we should not - // hold it alive. - let this = Arc::downgrade(&self); - drop(self); + let start_evicting = async move { + #[cfg(test)] + self.failpoint(failpoints::FailpointKind::WaitBeforeStartingEvicting) + .await + .expect("failpoint should not have errored"); - // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might - // drop while the `self.inner` is being locked, leading to a deadlock. + tracing::debug!("eviction started"); - crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || { - let _g = span.entered(); + let res = self.wait_for_turn_and_evict(only_version).await; + // metrics: ignore the Ok branch, it is not done yet + if let Err(e) = res { + tracing::debug!(res=?Err::<(), _>(&e), "eviction completed"); + LAYER_IMPL_METRICS.inc_eviction_cancelled(e); + } + }; - // if LayerInner is already dropped here, do nothing because the delete on drop - // has already ran while we were in queue - let Some(this) = this.upgrade() else { - LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); - return; - }; - match this.evict_blocking(version) { - Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(), - Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason), - } - }); - } + Self::spawn(start_evicting.instrument(span)); } - fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> { - // deleted or detached timeline, don't do anything. 
-        let Some(timeline) = self.timeline.upgrade() else {
+    async fn wait_for_turn_and_evict(
+        self: Arc<Self>,
+        only_version: usize,
+    ) -> Result<(), EvictionCancelled> {
+        fn is_good_to_continue(status: &Status) -> Result<(), EvictionCancelled> {
+            use Status::*;
+            match status {
+                Resident => Ok(()),
+                Evicted => Err(EvictionCancelled::UnexpectedEvictedState),
+                Downloading => Err(EvictionCancelled::LostToDownload),
+            }
+        }
+
+        let timeline = self
+            .timeline
+            .upgrade()
+            .ok_or(EvictionCancelled::TimelineGone)?;
+
+        let mut rx = self
+            .status
+            .as_ref()
+            .expect("LayerInner cannot be dropped, holding strong ref")
+            .subscribe();
+
+        is_good_to_continue(&rx.borrow_and_update())?;
+
+        let Ok(gate) = timeline.gate.enter() else {
            return Err(EvictionCancelled::TimelineGone);
        };

-        // to avoid starting a new download while we evict, keep holding on to the
-        // permit.
-        let _permit = {
-            let maybe_downloaded = self.inner.get();
+        let permit = {
+            // we cannot just `std::fs::remove_file` because there might already be a
+            // get_or_maybe_download which will inspect the filesystem and reinitialize. filesystem
+            // operations must be done while holding the heavier_once_cell::InitPermit
+            let mut wait = std::pin::pin!(self.inner.get_or_init_detached());

-            let (_weak, permit) = match maybe_downloaded {
-                Some(mut guard) => {
-                    if let ResidentOrWantedEvicted::WantedEvicted(_weak, version) = &*guard {
-                        if *version == only_version {
-                            guard.take_and_deinit()
-                        } else {
-                            // this was not for us; maybe there's another eviction job
-                            // TODO: does it make any sense to stall here? unique versions do not
-                            // matter, we only want to make sure not to evict a resident, which we
-                            // are not doing.
-                            return Err(EvictionCancelled::VersionCheckFailed);
-                        }
-                    } else {
-                        return Err(EvictionCancelled::AlreadyReinitialized);
+            let waited = loop {
+                // we must race to the Downloading starting, otherwise we would have to wait until the
+                // completion of the download. waiting for download could be long and hinder our
+                // efforts to alert on "hanging" evictions.
+                tokio::select! {
+                    res = &mut wait => break res,
+                    _ = rx.changed() => {
+                        is_good_to_continue(&rx.borrow_and_update())?;
+                        // two possibilities for Status::Resident:
+                        // - the layer was found locally from disk by a read
+                        // - we missed a bunch of updates and now the layer is
+                        //   again downloaded -- assume we'll fail later on with
+                        //   version check or AlreadyReinitialized
                    }
                }
-                None => {
-                    // already deinitialized, perhaps get_or_maybe_download did this and is
-                    // currently waiting to reinitialize it
-                    return Err(EvictionCancelled::LostToDownload);
+            };
+
+            // re-check now that we have the guard or permit; all updates should have happened
+            // while holding the permit.
+            is_good_to_continue(&rx.borrow_and_update())?;
+
+            // the term "deinitialize" is used here because clearing out the Weak will eventually
+            // lead to deallocating the reference counted value, and the value we take via
+            // `Guard::take_and_deinit` is likely to be the last one, because the Weak is never cloned.
+            let (_weak, permit) = match waited {
+                Ok(guard) => {
+                    match &*guard {
+                        ResidentOrWantedEvicted::WantedEvicted(_weak, version)
+                            if *version == only_version =>
+                        {
+                            tracing::debug!(version, "deinitializing matching WantedEvicted");
+                            let (weak, permit) = guard.take_and_deinit();
+                            (Some(weak), permit)
+                        }
+                        ResidentOrWantedEvicted::WantedEvicted(_, version) => {
+                            // if we were not doing the version check, we would need to try to
+                            // upgrade the weak here to see if it really is dropped. the version check
+                            tracing::debug!(
+                                version,
+                                only_version,
+                                "version mismatch, not deinitializing"
+                            );
+                            return Err(EvictionCancelled::VersionCheckFailed);
+                        }
+                        ResidentOrWantedEvicted::Resident(_) => {
+                            return Err(EvictionCancelled::AlreadyReinitialized);
+                        }
+                    }
+                }
+                Err(permit) => {
+                    tracing::debug!("continuing after cancelled get_or_maybe_download or eviction");
+                    (None, permit)
                 }
             };

             permit
         };

-        // now accesses to inner.get_or_init wait on the semaphore or the `_permit`
+        let span = tracing::Span::current();

-        self.access_stats.record_residence_event(
-            LayerResidenceStatus::Evicted,
-            LayerResidenceEventReason::ResidenceChange,
-        );
+        let spawned_at = std::time::Instant::now();

-        let res = match capture_mtime_and_remove(&self.path) {
+        // this is on purpose a detached spawn; we don't need to wait for it
+        //
+        // eviction completion reporting is the only thing hinging on this, and it can be done
+        // just as well from a spawn_blocking thread.
+        //
+        // important to note that now that we've acquired the permit we have made sure the evicted
+        // file is either the exact `WantedEvicted` we wanted to evict, or uninitialized in case
+        // there are multiple evictions. The rest is not cancellable, and we've now committed to
+        // evicting.
+        //
+        // If spawn_blocking has a queue and the maximum number of threads is in use, we could
+        // stall reads. We will need to add cancellation for that if necessary.
+        Self::spawn_blocking(move || {
+            let _span = span.entered();
+
+            let res = self.evict_blocking(&timeline, &gate, &permit);
+
+            let waiters = self.inner.initializer_count();
+
+            if waiters > 0 {
+                LAYER_IMPL_METRICS.inc_evicted_with_waiters();
+            }
+
+            let completed_in = spawned_at.elapsed();
+            LAYER_IMPL_METRICS.record_time_to_evict(completed_in);
+
+            match res {
+                Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
+                Err(e) => LAYER_IMPL_METRICS.inc_eviction_cancelled(e),
+            }
+
+            tracing::debug!(?res, elapsed_ms=%completed_in.as_millis(), %waiters, "eviction completed");
+        });
+
+        Ok(())
+    }
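The `tokio::select!` loop above — drive a slow initialization while watching a status channel for a reason to bail out early — is the heart of `wait_for_turn_and_evict`. A minimal, self-contained sketch of the same race, with hypothetical names and a toy payload instead of the pageserver's types:

```rust
use tokio::sync::watch;

#[derive(Clone, Copy, Debug, PartialEq)]
enum Status {
    Resident,
    Evicted,
    Downloading,
}

// Race `init` against status updates; bail out as soon as an observed
// status makes finishing the work pointless (here: a download won).
async fn race_init_against_status(
    mut rx: watch::Receiver<Status>,
    init: impl std::future::Future<Output = u32>,
) -> Result<u32, &'static str> {
    tokio::pin!(init);
    loop {
        tokio::select! {
            v = &mut init => return Ok(v),
            changed = rx.changed() => {
                changed.map_err(|_| "status sender dropped")?;
                if *rx.borrow_and_update() == Status::Downloading {
                    // waiting for a download could take long; give up instead
                    return Err("lost to download");
                }
            }
        }
    }
}
```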
+    /// This is blocking only to do just one spawn_blocking hop compared to multiple via tokio::fs.
+    fn evict_blocking(
+        &self,
+        timeline: &Timeline,
+        _gate: &gate::GateGuard,
+        _permit: &heavier_once_cell::InitPermit,
+    ) -> Result<(), EvictionCancelled> {
+        // now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit`
+
+        match capture_mtime_and_remove(&self.path) {
             Ok(local_layer_mtime) => {
                 let duration = SystemTime::now().duration_since(local_layer_mtime);
                 match duration {
                     Ok(elapsed) => {
-                        timeline
-                            .metrics
-                            .evictions_with_low_residence_duration
-                            .read()
-                            .unwrap()
-                            .observe(elapsed);
+                        let accessed_and_visible = self.access_stats.accessed()
+                            && self.access_stats.visibility() == LayerVisibilityHint::Visible;
+                        if accessed_and_visible {
+                            // Only layers used for reads contribute to our "low residence" metric that is used
+                            // to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed
+                            // to be rapidly evicted without contributing to this metric.
+                            timeline
+                                .metrics
+                                .evictions_with_low_residence_duration
+                                .read()
+                                .unwrap()
+                                .observe(elapsed);
+                        }
+
                         tracing::info!(
                             residence_millis = elapsed.as_millis(),
+                            accessed_and_visible,
                             "evicted layer after known residence period"
                         );
                     }
@@ -1126,33 +1504,57 @@ impl LayerInner {
                 timeline
                     .metrics
                     .resident_physical_size_sub(self.desc.file_size);
-
-                Ok(())
             }
             Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                 tracing::error!(
                     layer_size = %self.desc.file_size,
-                    "failed to evict layer from disk, it was already gone (metrics will be inaccurate)"
+                    "failed to evict layer from disk, it was already gone"
                 );
-                Err(EvictionCancelled::FileNotFound)
+                return Err(EvictionCancelled::FileNotFound);
             }
             Err(e) => {
+                // FIXME: this should probably be an abort
                 tracing::error!("failed to evict file from disk: {e:#}");
-                Err(EvictionCancelled::RemoveFailed)
+                return Err(EvictionCancelled::RemoveFailed);
             }
-        };
+        }

-        // we are still holding the permit, so no new spawn_download_and_wait can happen
-        drop(self.status.send(Status::Evicted));
+        self.access_stats.record_residence_event();
+
+        self.status.as_ref().unwrap().send_replace(Status::Evicted);

         *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());

-        res
+        Ok(())
     }

     fn metadata(&self) -> LayerFileMetadata {
         LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard)
     }
+
+    /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME.
+    ///
+    /// Synchronizing with spawned tasks is very complicated otherwise.
+    fn spawn<F>(fut: F)
+    where
+        F: std::future::Future<Output = ()> + Send + 'static,
+    {
+        #[cfg(test)]
+        tokio::task::spawn(fut);
+        #[cfg(not(test))]
+        crate::task_mgr::BACKGROUND_RUNTIME.spawn(fut);
+    }
+
+    /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME.
+    fn spawn_blocking<F>(f: F)
+    where
+        F: FnOnce() + Send + 'static,
+    {
+        #[cfg(test)]
+        tokio::task::spawn_blocking(f);
+        #[cfg(not(test))]
+        crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(f);
+    }
 }

 fn capture_mtime_and_remove(path: &Utf8Path) -> Result<SystemTime, std::io::Error> {
@@ -1170,15 +1572,16 @@ pub(crate) enum EvictionError {
     /// Evictions must always lose to downloads in races, and this time it happened.
#[error("layer was downloaded instead")] Downloaded, + + #[error("eviction did not happen within timeout")] + Timeout, } /// Error internal to the [`LayerInner::get_or_maybe_download`] #[derive(Debug, thiserror::Error)] -enum DownloadError { +pub(crate) enum DownloadError { #[error("timeline has already shutdown")] TimelineShutdown, - #[error("no remote storage configured")] - NoRemoteStorage, #[error("context denies downloading")] ContextAndConfigReallyDeniesDownloads, #[error("downloading is really required but not allowed by this method")] @@ -1193,8 +1596,16 @@ enum DownloadError { DownloadCancelled, #[error("pre-condition: stat before download failed")] PreStatFailed(#[source] std::io::Error), - #[error("post-condition: stat after download failed")] - PostStatFailed(#[source] std::io::Error), + + #[cfg(test)] + #[error("failpoint: {0:?}")] + Failpoint(failpoints::FailpointKind), +} + +impl DownloadError { + pub(crate) fn is_cancelled(&self) -> bool { + matches!(self, DownloadError::DownloadCancelled) + } } #[derive(Debug, PartialEq)] @@ -1240,14 +1651,16 @@ impl Drop for DownloadedLayer { if let Some(owner) = self.owner.upgrade() { owner.on_downloaded_layer_drop(self.version); } else { - // no need to do anything, we are shutting down + // Layer::drop will handle cancelling the eviction; because of drop order and + // `DownloadedLayer` never leaking, we cannot know here if eviction was requested. } } } impl DownloadedLayer { - /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`], or fails to - /// initialize it permanently. + /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`]. + /// Failure to load the layer is sticky, i.e., future `get()` calls will return + /// the initial load failure immediately. /// /// `owner` parameter is a strong reference at the same `LayerInner` as the /// `DownloadedLayer::owner` would be when upgraded. 
Given how this method ends up called, @@ -1265,16 +1678,27 @@ impl DownloadedLayer { ); let res = if owner.desc.is_delta { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(crate::context::PageContentKind::DeltaLayerSummary) + .build(); let summary = Some(delta_layer::Summary::expected( owner.desc.tenant_shard_id.tenant_id, owner.desc.timeline_id, owner.desc.key_range.clone(), owner.desc.lsn_range.clone(), )); - delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx) - .await - .map(|res| res.map(LayerKind::Delta)) + delta_layer::DeltaLayerInner::load( + &owner.path, + summary, + Some(owner.conf.max_vectored_read_bytes), + &ctx, + ) + .await + .map(LayerKind::Delta) } else { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(crate::context::PageContentKind::ImageLayerSummary) + .build(); let lsn = owner.desc.image_layer_lsn(); let summary = Some(image_layer::Summary::expected( owner.desc.tenant_shard_id.tenant_id, @@ -1282,53 +1706,60 @@ impl DownloadedLayer { owner.desc.key_range.clone(), lsn, )); - image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx) - .await - .map(|res| res.map(LayerKind::Image)) + image_layer::ImageLayerInner::load( + &owner.path, + lsn, + summary, + Some(owner.conf.max_vectored_read_bytes), + &ctx, + ) + .await + .map(LayerKind::Image) }; match res { - Ok(Ok(layer)) => Ok(Ok(layer)), - Ok(Err(transient)) => Err(transient), - Err(permanent) => { + Ok(layer) => Ok(layer), + Err(err) => { LAYER_IMPL_METRICS.inc_permanent_loading_failures(); - // TODO(#5815): we are not logging all errors, so temporarily log them **once** - // here as well - let permanent = permanent.context("load layer"); - tracing::error!("layer loading failed permanently: {permanent:#}"); - Ok(Err(permanent)) + // We log this message once over the lifetime of `Self` + // => Ok and good to log backtrace and path here. + tracing::error!( + "layer load failed, assuming permanent failure: {}: {err:?}", + owner.path + ); + Err(err) } } }; self.kind - .get_or_try_init(init) - // return transient errors using `?` - .await? + .get_or_init(init) + .await .as_ref() - .map_err(|e| { - // errors are not clonabled, cannot but stringify - // test_broken_timeline matches this string - anyhow::anyhow!("layer loading failed: {e:#}") - }) + // We already logged the full backtrace above, once. Don't repeat that here. + .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}")) } - async fn get_value_reconstruct_data( + async fn get_values_reconstruct_data( &self, - key: Key, + keyspace: KeySpace, lsn_range: Range, - reconstruct_data: &mut ValueReconstructState, + reconstruct_data: &mut ValuesReconstructState, owner: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result<(), GetVectoredError> { use LayerKind::*; - match self.get(owner, ctx).await? { + match self + .get(owner, ctx) + .await + .map_err(GetVectoredError::Other)? + { Delta(d) => { - d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx) + d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx) .await } Image(i) => { - i.get_value_reconstruct_data(key, reconstruct_data, ctx) + i.get_values_reconstruct_data(keyspace, reconstruct_data, ctx) .await } } @@ -1380,7 +1811,7 @@ impl ResidentLayer { } /// Loads all keys stored in the layer. Returns key, lsn and value size. 
-    #[tracing::instrument(skip_all, fields(layer=%self))]
+    #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
     pub(crate) async fn load_keys<'a>(
         &'a self,
         ctx: &RequestContext,
@@ -1388,21 +1819,67 @@
         use LayerKind::*;

         let owner = &self.owner.0;
-
         match self.downloaded.get(owner, ctx).await? {
             Delta(ref d) => {
-                owner
-                    .access_stats
-                    .record_access(LayerAccessKind::KeyIter, ctx);
-
                 // this is valid because the DownloadedLayer::kind is a OnceCell, not a
                 // Mutex, so we cannot go and deinitialize the value with OnceCell::take
                 // while it's being held.
+                self.owner.record_access(ctx);
+
                 delta_layer::DeltaLayerInner::load_keys(d, ctx)
                     .await
-                    .context("Layer index is corrupted")
+                    .with_context(|| format!("Layer index is corrupted for {self}"))
             }
-            Image(_) => anyhow::bail!("cannot load_keys on a image layer"),
+            Image(_) => anyhow::bail!(format!("cannot load_keys on an image layer {self}")),
+        }
+    }
+
+    /// Read all the keys in this layer which match the ShardIdentity, and write them all to
+    /// the provided writer. Return the number of keys written.
+    #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
+    pub(crate) async fn filter(
+        &self,
+        shard_identity: &ShardIdentity,
+        writer: &mut ImageLayerWriter,
+        ctx: &RequestContext,
+    ) -> Result<usize, CompactionError> {
+        use LayerKind::*;
+
+        match self
+            .downloaded
+            .get(&self.owner.0, ctx)
+            .await
+            .map_err(CompactionError::Other)?
+        {
+            Delta(_) => {
+                return Err(CompactionError::Other(anyhow::anyhow!(format!(
+                    "cannot filter() on a delta layer {self}"
+                ))));
+            }
+            Image(i) => i
+                .filter(shard_identity, writer, ctx)
+                .await
+                .map_err(CompactionError::Other),
+        }
+    }
+
+    /// Returns the number of keys and values written to the writer.
+    pub(crate) async fn copy_delta_prefix(
+        &self,
+        writer: &mut super::delta_layer::DeltaLayerWriter,
+        until: Lsn,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<usize> {
+        use LayerKind::*;
+
+        let owner = &self.owner.0;
+
+        match self.downloaded.get(owner, ctx).await? {
+            Delta(ref d) => d
+                .copy_prefix(writer, until, ctx)
+                .await
+                .with_context(|| format!("copy_delta_prefix until {until} of {self}")),
+            Image(_) => anyhow::bail!(format!("cannot copy_lsn_prefix of image layer {self}")),
         }
     }

@@ -1410,13 +1887,33 @@
         &self.owner.0.path
     }

-    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
-        self.owner.access_stats()
-    }
-
     pub(crate) fn metadata(&self) -> LayerFileMetadata {
         self.owner.metadata()
     }
+
+    /// Cast the layer to a delta, return an error if it is an image layer.
+    pub(crate) async fn get_as_delta(
+        &self,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<&delta_layer::DeltaLayerInner> {
+        use LayerKind::*;
+        match self.downloaded.get(&self.owner.0, ctx).await? {
+            Delta(ref d) => Ok(d),
+            Image(_) => Err(anyhow::anyhow!("image layer")),
+        }
+    }
+
+    /// Cast the layer to an image, return an error if it is a delta layer.
+    pub(crate) async fn get_as_image(
+        &self,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<&image_layer::ImageLayerInner> {
+        use LayerKind::*;
+        match self.downloaded.get(&self.owner.0, ctx).await? {
+            Image(ref d) => Ok(d),
+            Delta(_) => Err(anyhow::anyhow!("delta layer")),
+        }
+    }
 }

 impl AsLayerDesc for ResidentLayer {
@@ -1452,6 +1949,7 @@ pub(crate) struct LayerImplMetrics {
     rare_counters: enum_map::EnumMap<RareEvent, metrics::IntCounter>,
     inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
     redownload_after: metrics::Histogram,
+    time_to_evict: metrics::Histogram,
 }

 impl Default for LayerImplMetrics {
@@ -1547,6 +2045,13 @@ impl Default for LayerImplMetrics {
             .unwrap()
         };

+        let time_to_evict = metrics::register_histogram!(
+            "pageserver_layer_eviction_held_permit_seconds",
+            "Time eviction held the permit.",
+            vec![0.001, 0.010, 0.100, 0.500, 1.000, 5.000]
+        )
+        .unwrap();
+
         Self {
             started_evictions,
             completed_evictions,
@@ -1559,6 +2064,7 @@
             rare_counters,
             inits_cancelled,
             redownload_after,
+            time_to_evict,
         }
     }
 }
@@ -1590,9 +2096,10 @@ impl LayerImplMetrics {
         self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
     }

-    /// Expected rare because requires a race with `evict_blocking` and `get_or_maybe_download`.
-    fn inc_retried_get_or_maybe_download(&self) {
-        self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc();
+    /// Expected rare just as cancellations are rare, but we could have cancellations separate from
+    /// the single caller which can start the download, so use this counter to separate them.
+    fn inc_init_completed_without_requester(&self) {
+        self.rare_counters[RareEvent::InitCompletedWithoutRequester].inc();
     }

     /// Expected rare because cancellations are unexpected, and failures are unexpected
@@ -1619,10 +2126,6 @@ impl LayerImplMetrics {
         self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
     }

-    fn inc_broadcast_lagged(&self) {
-        self.rare_counters[RareEvent::EvictAndWaitLagged].inc();
-    }
-
     fn inc_init_cancelled(&self) {
         self.inits_cancelled.inc()
     }
@@ -1630,9 +2133,22 @@
     fn record_redownloaded_after(&self, duration: std::time::Duration) {
         self.redownload_after.observe(duration.as_secs_f64())
     }
+
+    /// This would be bad if it ever happened, or mean extreme disk pressure. We should probably
+    /// instead cancel eviction if we would have read waiters. We cannot however separate reads
+    /// from other evictions, so this could have noise as well.
+    fn inc_evicted_with_waiters(&self) {
+        self.rare_counters[RareEvent::EvictedWithWaiters].inc();
+    }
+
+    /// Recorded at least initially, as the permit is now acquired in async context before the
+    /// spawn_blocking action.
+    fn record_time_to_evict(&self, duration: std::time::Duration) {
+        self.time_to_evict.observe(duration.as_secs_f64())
+    }
 }
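For readers unfamiliar with the histogram being registered above: the buckets are in seconds and deliberately skewed sub-second, since holding the eviction permit for long would itself be a problem. A rough stand-alone equivalent using the `prometheus` crate directly (the pageserver wraps this in its own `metrics` helpers; treat names as illustrative):

```rust
use prometheus::{Histogram, HistogramOpts};
use std::time::Duration;

fn main() -> Result<(), prometheus::Error> {
    let time_to_evict = Histogram::with_opts(
        HistogramOpts::new(
            "pageserver_layer_eviction_held_permit_seconds",
            "Time eviction held the permit.",
        )
        // same bucket layout as the register_histogram! call above
        .buckets(vec![0.001, 0.010, 0.100, 0.500, 1.000, 5.000]),
    )?;

    // record_time_to_evict boils down to an observe() in seconds
    time_to_evict.observe(Duration::from_millis(42).as_secs_f64());
    Ok(())
}
```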
-#[derive(enum_map::Enum)]
+#[derive(Debug, Clone, Copy, enum_map::Enum)]
 enum EvictionCancelled {
     LayerGone,
     TimelineGone,
@@ -1644,6 +2160,7 @@ enum EvictionCancelled {
     LostToDownload,
     /// After eviction, there was a new layer access which cancelled the eviction.
     UpgradedBackOnAccess,
+    UnexpectedEvictedState,
 }

 impl EvictionCancelled {
@@ -1657,6 +2174,7 @@ impl EvictionCancelled {
             EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
             EvictionCancelled::LostToDownload => "lost_to_download",
             EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
+            EvictionCancelled::UnexpectedEvictedState => "unexpected_evicted_state",
         }
     }
 }
@@ -1679,12 +2197,12 @@ impl DeleteFailed {
 #[derive(enum_map::Enum)]
 enum RareEvent {
     RemoveOnDropFailed,
-    RetriedGetOrMaybeDownload,
+    InitCompletedWithoutRequester,
     DownloadFailedWithoutRequester,
     UpgradedWantedEvicted,
     InitWithoutDownload,
     PermanentLoadingFailure,
-    EvictAndWaitLagged,
+    EvictedWithWaiters,
 }

 impl RareEvent {
@@ -1693,12 +2211,12 @@
         match self {
             RemoveOnDropFailed => "remove_on_drop_failed",
-            RetriedGetOrMaybeDownload => "retried_gomd",
+            InitCompletedWithoutRequester => "init_completed_without",
             DownloadFailedWithoutRequester => "download_failed_without",
             UpgradedWantedEvicted => "raced_wanted_evicted",
             InitWithoutDownload => "init_needed_no_download",
             PermanentLoadingFailure => "permanent_loading_failure",
-            EvictAndWaitLagged => "broadcast_lagged",
+            EvictedWithWaiters => "evicted_with_waiters",
         }
     }
 }
diff --git a/pageserver/src/tenant/storage_layer/layer/failpoints.rs b/pageserver/src/tenant/storage_layer/layer/failpoints.rs
new file mode 100644
index 0000000000..6cedc41d98
--- /dev/null
+++ b/pageserver/src/tenant/storage_layer/layer/failpoints.rs
@@ -0,0 +1,119 @@
+//! failpoints for unit tests, implying `#[cfg(test)]`.
+//!
+//! These are not accessible over http.
+
+use super::*;
+
+impl Layer {
+    /// Enable a failpoint from a unit test.
+    pub(super) fn enable_failpoint(&self, failpoint: Failpoint) {
+        self.0.failpoints.lock().unwrap().push(failpoint);
+    }
+}
+
+impl LayerInner {
+    /// Query if this failpoint is enabled, as in, arrive at a failpoint.
+    ///
+    /// Calls to this method need to be `#[cfg(test)]` guarded.
+    pub(super) async fn failpoint(&self, kind: FailpointKind) -> Result<(), FailpointHit> {
+        let fut = {
+            let mut fps = self.failpoints.lock().unwrap();
+            // find the *last* failpoint for cases in which we need to use multiple for the same
+            // thing (two blocked evictions)
+            let fp = fps.iter_mut().rfind(|x| x.kind() == kind);
+
+            let Some(fp) = fp else {
+                return Ok(());
+            };
+
+            fp.hit()
+        };
+
+        fut.await
+    }
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub(crate) enum FailpointKind {
+    /// Failpoint acts as an accurately timed cancelled-by-drop here; see the only site of use.
+    AfterDeterminingLayerNeedsNoDownload,
+    /// Failpoint for stalling eviction starting
+    WaitBeforeStartingEvicting,
+    /// Failpoint hit in the spawned task
+    WaitBeforeDownloading,
+}
+
+pub(crate) enum Failpoint {
+    AfterDeterminingLayerNeedsNoDownload,
+    WaitBeforeStartingEvicting(
+        Option<utils::completion::Completion>,
+        utils::completion::Barrier,
+    ),
+    WaitBeforeDownloading(
+        Option<utils::completion::Completion>,
+        utils::completion::Barrier,
+    ),
+}
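The `(Option<Completion>, Barrier)` pair gives each failpoint a two-phase handshake: the task hitting the failpoint first signals that it has arrived, then parks until the test releases it. The same rendezvous can be sketched with plain `tokio::sync::oneshot` channels (hypothetical names; `utils::completion` is the crate-internal equivalent used here):

```rust
use tokio::sync::oneshot;

#[tokio::main]
async fn main() {
    // One-shot rendezvous in the spirit of the (arrival, barrier) pair above:
    // the blocked task first signals arrival, then parks until released.
    let (arrived_tx, arrived_rx) = oneshot::channel::<()>();
    let (release_tx, release_rx) = oneshot::channel::<()>();

    let task = tokio::spawn(async move {
        // ... reach the failpoint ...
        let _ = arrived_tx.send(()); // signal arrival to the test
        let _ = release_rx.await; // wait for the test to release us
        // ... continue past the failpoint ...
    });

    arrived_rx.await.unwrap(); // the test now knows the task is parked
    // ... run assertions while the task is deterministically blocked ...
    let _ = release_tx.send(()); // unblock the task
    task.await.unwrap();
}
```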
+
+impl Failpoint {
+    fn kind(&self) -> FailpointKind {
+        match self {
+            Failpoint::AfterDeterminingLayerNeedsNoDownload => {
+                FailpointKind::AfterDeterminingLayerNeedsNoDownload
+            }
+            Failpoint::WaitBeforeStartingEvicting(..) => FailpointKind::WaitBeforeStartingEvicting,
+            Failpoint::WaitBeforeDownloading(..) => FailpointKind::WaitBeforeDownloading,
+        }
+    }
+
+    fn hit(&mut self) -> impl std::future::Future<Output = Result<(), FailpointHit>> + 'static {
+        use futures::future::FutureExt;
+
+        // use boxed futures to avoid Either hurdles
+        match self {
+            Failpoint::AfterDeterminingLayerNeedsNoDownload => {
+                let kind = self.kind();
+
+                async move { Err(FailpointHit(kind)) }.boxed()
+            }
+            Failpoint::WaitBeforeStartingEvicting(arrival, b)
+            | Failpoint::WaitBeforeDownloading(arrival, b) => {
+                // first one signals arrival
+                drop(arrival.take());
+
+                let b = b.clone();
+
+                async move {
+                    tracing::trace!("waiting on a failpoint barrier");
+                    b.wait().await;
+                    tracing::trace!("done waiting on a failpoint barrier");
+                    Ok(())
+                }
+                .boxed()
+            }
+        }
+    }
+}
+
+impl std::fmt::Display for FailpointKind {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        std::fmt::Debug::fmt(self, f)
+    }
+}
+
+#[derive(Debug)]
+pub(crate) struct FailpointHit(FailpointKind);
+
+impl std::fmt::Display for FailpointHit {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        std::fmt::Debug::fmt(self, f)
+    }
+}
+
+impl std::error::Error for FailpointHit {}
+
+impl From<FailpointHit> for DownloadError {
+    fn from(value: FailpointHit) -> Self {
+        DownloadError::Failpoint(value.0)
+    }
+}
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
new file mode 100644
index 0000000000..0b9bde4f57
--- /dev/null
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -0,0 +1,1039 @@
+use std::time::UNIX_EPOCH;
+
+use pageserver_api::key::CONTROLFILE_KEY;
+use tokio::task::JoinSet;
+use utils::{
+    completion::{self, Completion},
+    id::TimelineId,
+};
+
+use super::failpoints::{Failpoint, FailpointKind};
+use super::*;
+use crate::{context::DownloadBehavior, tenant::storage_layer::LayerVisibilityHint};
+use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};
+
+/// Used in tests to advance a future to the wanted await point, and not further.
+const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600);
+
+/// Used in tests to indicate a forever-long timeout; has to be longer than the amount of ADVANCE
+/// the timeouts use to advance futures.
+const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_secs() * 24 * 7);
+
+/// Demonstrate the API and resident -> evicted -> resident -> deleted transitions.
+#[tokio::test]
+async fn smoke_test() {
+    let handle = tokio::runtime::Handle::current();
+
+    let h = TenantHarness::create("smoke_test").await.unwrap();
+    let span = h.span();
+    let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
+    let (tenant, _) = h.load().await;
+
+    let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
+
+    let timeline = tenant
+        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .await
+        .unwrap();
+
+    let layer = {
+        let mut layers = {
+            let layers = timeline.layers.read().await;
+            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
+        };
+
+        assert_eq!(layers.len(), 1);
+
+        layers.swap_remove(0)
+    };
+
+    // all layers created at pageserver are like `layer`, initialized with a strong
+    // Arc.
+
+    let controlfile_keyspace = KeySpace {
+        ranges: vec![CONTROLFILE_KEY..CONTROLFILE_KEY.next()],
+    };
+
+    let img_before = {
+        let mut data = ValuesReconstructState::default();
+        layer
+            .get_values_reconstruct_data(
+                controlfile_keyspace.clone(),
+                Lsn(0x10)..Lsn(0x11),
+                &mut data,
+                &ctx,
+            )
+            .await
+            .unwrap();
+        data.keys
+            .remove(&CONTROLFILE_KEY)
+            .expect("must be present")
+            .expect("should not error")
+            .img
+            .take()
+            .expect("tenant harness writes the control file")
+    };
+
+    // the important part is evicting the layer, which can be done when there are no more
+    // ResidentLayer instances -- there currently are none, only two `Layer` values, one in the
+    // layermap and one in scope.
+    layer.evict_and_wait(FOREVER).await.unwrap();
+
+    // double-evict returns an error, which is valid if both eviction_task and disk usage based
+    // eviction would both evict the same layer at the same time.
+
+    let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
+    assert!(matches!(e, EvictionError::NotFound));
+
+    // on accesses when the layer is evicted, it will automatically be downloaded.
+    let img_after = {
+        let mut data = ValuesReconstructState::default();
+        layer
+            .get_values_reconstruct_data(
+                controlfile_keyspace.clone(),
+                Lsn(0x10)..Lsn(0x11),
+                &mut data,
+                &ctx,
+            )
+            .instrument(download_span.clone())
+            .await
+            .unwrap();
+        data.keys
+            .remove(&CONTROLFILE_KEY)
+            .expect("must be present")
+            .expect("should not error")
+            .img
+            .take()
+            .expect("tenant harness writes the control file")
+    };
+
+    assert_eq!(img_before, img_after);
+
+    // evict_and_wait can timeout, but it doesn't cancel the evicting itself
+    //
+    // ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to
+    // artificially slow it down.
+    let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(&handle).await;
+
+    match layer
+        .evict_and_wait(std::time::Duration::ZERO)
+        .await
+        .unwrap_err()
+    {
+        EvictionError::Timeout => {
+            // expected, but note that the eviction is "still ongoing"
+            helper.release().await;
+            // exhaust spawn_blocking pool to ensure it is now complete
+            SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle)
+                .await;
+        }
+        other => unreachable!("{other:?}"),
+    }
+
+    // the only way to query if a layer is resident is to acquire a ResidentLayer instance.
+    // Layer::keep_resident never downloads, but it might initialize if the layer file is found
+    // downloaded locally.
+    let none = layer.keep_resident().await;
+    assert!(
+        none.is_none(),
+        "Expected none, because eviction removed the local file, found: {none:?}"
+    );
+
+    // plain downloading is rarely needed
+    layer
+        .download_and_keep_resident()
+        .instrument(download_span)
+        .await
+        .unwrap();
+
+    // the last important part is deletion on drop: gc and compaction use it for compacted L0
+    // layers or fully garbage collected layers. deletion means deleting the local file, and
+    // scheduling a deletion of the already unlinked from index_part.json remote file.
+    //
+    // marking a layer to be deleted on drop is irreversible; there is no technical reason against
+    // reversibility, but currently it is not needed so it is not provided.
+    layer.delete_on_drop();
+
+    let path = layer.local_path().to_owned();
+
+    // wait_drop produces a future, unconnected to the Layer, which will resolve when the
+    // LayerInner::drop has completed.
+ let mut wait_drop = std::pin::pin!(layer.wait_drop()); + + // paused time doesn't really work well with timeouts and evict_and_wait, so delay pausing + // until here + tokio::time::pause(); + tokio::time::timeout(ADVANCE, &mut wait_drop) + .await + .expect_err("should had timed out because two strong references exist"); + + tokio::fs::metadata(&path) + .await + .expect("the local layer file still exists"); + + let rtc = &timeline.remote_client; + + { + let layers = &[layer]; + let mut g = timeline.layers.write().await; + g.open_mut().unwrap().finish_gc_timeline(layers); + // this just updates the remote_physical_size for demonstration purposes + rtc.schedule_gc_update(layers).unwrap(); + } + + // when strong references are dropped, the file is deleted and remote deletion is scheduled + wait_drop.await; + + let e = tokio::fs::metadata(&path) + .await + .expect_err("the local file is deleted"); + assert_eq!(e.kind(), std::io::ErrorKind::NotFound); + + rtc.wait_completion().await.unwrap(); + + assert_eq!(rtc.get_remote_physical_size(), 0); + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) +} + +/// This test demonstrates a previous hang when a eviction and deletion were requested at the same +/// time. Now both of them complete per Arc drop semantics. +#[tokio::test(start_paused = true)] +async fn evict_and_wait_on_wanted_deleted() { + // this is the runtime on which Layer spawns the blocking tasks on + let handle = tokio::runtime::Handle::current(); + + let h = TenantHarness::create("evict_and_wait_on_wanted_deleted") + .await + .unwrap(); + utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); + let (tenant, ctx) = h.load().await; + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.likely_resident_layers().cloned().collect::>() + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // setup done + + let resident = layer.keep_resident().await.unwrap(); + + { + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // drive the future to await on the status channel + tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + + layer.delete_on_drop(); + + drop(resident); + + // make sure the eviction task gets to run + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; + + let resident = layer.keep_resident().await; + assert!( + resident.is_none(), + "keep_resident should not have re-initialized: {resident:?}" + ); + + evict_and_wait + .await + .expect("evict_and_wait should had succeeded"); + + // works as intended + } + + // assert that once we remove the `layer` from the layer map and drop our reference, + // the deletion of the layer in remote_storage happens. 
+ { + let mut layers = timeline.layers.write().await; + layers.open_mut().unwrap().finish_gc_timeline(&[layer]); + } + + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; + + assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get()); + assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get()); + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get()); + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) +} + +/// This test ensures we are able to read the layer while the layer eviction has been +/// started but not completed. +#[test] +fn read_wins_pending_eviction() { + let rt = tokio::runtime::Builder::new_current_thread() + .max_blocking_threads(1) + .enable_all() + .start_paused(true) + .build() + .unwrap(); + + rt.block_on(async move { + // this is the runtime on which Layer spawns the blocking tasks on + let handle = tokio::runtime::Handle::current(); + let h = TenantHarness::create("read_wins_pending_eviction") + .await + .unwrap(); + let (tenant, ctx) = h.load().await; + let span = h.span(); + let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.likely_resident_layers().cloned().collect::>() + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // setup done + + let resident = layer.keep_resident().await.unwrap(); + + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // drive the future to await on the status channel + tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + + let (completion, barrier) = utils::completion::channel(); + let (arrival, arrived_at_barrier) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); + + // now the eviction cannot proceed because the threads are consumed while completion exists + drop(resident); + arrived_at_barrier.wait().await; + assert!(!layer.is_likely_resident()); + + // because no actual eviction happened, we get to just reinitialize the DownloadedLayer + layer + .0 + .get_or_maybe_download(false, None) + .instrument(download_span) + .await + .expect("should had reinitialized without downloading"); + + assert!(layer.is_likely_resident()); + + // reinitialization notifies of new resident status, which should error out all evict_and_wait + let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect("no timeout, because get_or_maybe_download re-initialized") + .expect_err("eviction should not have succeeded because re-initialized"); + + // works as intended: evictions lose to "downloads" + assert!(matches!(e, EvictionError::Downloaded), "{e:?}"); + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // this is not wrong: the eviction is technically still "on the way" as it's still queued + // because of a failpoint + assert_eq!( + 0, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + drop(completion); + + tokio::time::sleep(ADVANCE).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1) + .await; + + 
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // now we finally can observe the original eviction failing + // it would had been possible to observe it earlier, but here it is guaranteed to have + // happened. + assert_eq!( + 1, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + assert_eq!( + 1, + LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::AlreadyReinitialized].get() + ); + + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) + }); +} + +/// Use failpoint to delay an eviction starting to get a VersionCheckFailed. +#[test] +fn multiple_pending_evictions_in_order() { + let name = "multiple_pending_evictions_in_order"; + let in_order = true; + multiple_pending_evictions_scenario(name, in_order); +} + +/// Use failpoint to reorder later eviction before first to get a UnexpectedEvictedState. +#[test] +fn multiple_pending_evictions_out_of_order() { + let name = "multiple_pending_evictions_out_of_order"; + let in_order = false; + multiple_pending_evictions_scenario(name, in_order); +} + +fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { + let rt = tokio::runtime::Builder::new_current_thread() + .max_blocking_threads(1) + .enable_all() + .start_paused(true) + .build() + .unwrap(); + + rt.block_on(async move { + // this is the runtime on which Layer spawns the blocking tasks on + let handle = tokio::runtime::Handle::current(); + let h = TenantHarness::create(name).await.unwrap(); + let (tenant, ctx) = h.load().await; + let span = h.span(); + let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.likely_resident_layers().cloned().collect::>() + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // setup done + + let resident = layer.keep_resident().await.unwrap(); + + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // drive the future to await on the status channel + tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + + let (completion1, barrier) = utils::completion::channel(); + let mut completion1 = Some(completion1); + let (arrival, arrived_at_barrier) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); + + // now the eviction cannot proceed because we are simulating arbitrary long delay for the + // eviction task start. 
+ drop(resident); + assert!(!layer.is_likely_resident()); + + arrived_at_barrier.wait().await; + + // because no actual eviction happened, we get to just reinitialize the DownloadedLayer + layer + .0 + .get_or_maybe_download(false, None) + .instrument(download_span) + .await + .expect("should had reinitialized without downloading"); + + assert!(layer.is_likely_resident()); + + // reinitialization notifies of new resident status, which should error out all evict_and_wait + let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect("no timeout, because get_or_maybe_download re-initialized") + .expect_err("eviction should not have succeeded because re-initialized"); + + // works as intended: evictions lose to "downloads" + assert!(matches!(e, EvictionError::Downloaded), "{e:?}"); + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // this is not wrong: the eviction is technically still "on the way" as it's still queued + // because of a failpoint + assert_eq!( + 0, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // configure another failpoint for the second eviction -- evictions are per initialization, + // so now that we've reinitialized the inner, we get to run two of them at the same time. + let (completion2, barrier) = utils::completion::channel(); + let (arrival, arrived_at_barrier) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); + + let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // advance to the wait on the queue + tokio::time::timeout(ADVANCE, &mut second_eviction) + .await + .expect_err("timeout because failpoint is blocking"); + + arrived_at_barrier.wait().await; + + assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get()); + + let mut release_earlier_eviction = |expected_reason| { + assert_eq!( + 0, + LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(), + ); + + drop(completion1.take().unwrap()); + + let handle = &handle; + + async move { + tokio::time::sleep(ADVANCE).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0( + handle, 1, + ) + .await; + + assert_eq!( + 1, + LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(), + ); + } + }; + + if in_order { + release_earlier_eviction(EvictionCancelled::VersionCheckFailed).await; + } + + // release the later eviction which is for the current version + drop(completion2); + tokio::time::sleep(ADVANCE).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1) + .await; + + if !in_order { + release_earlier_eviction(EvictionCancelled::UnexpectedEvictedState).await; + } + + tokio::time::timeout(ADVANCE, &mut second_eviction) + .await + .expect("eviction goes through now that spawn_blocking is unclogged") + .expect("eviction should succeed, because version matches"); + + assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get()); + + // ensure the cancelled are unchanged + assert_eq!( + 1, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) + }); +} + +/// The test ensures with a failpoint that a pending eviction is not cancelled by what is currently +/// a `Layer::keep_resident` call. 
+///
+/// This matters because cancelling the eviction would leave us in a state where the file is on
+/// disk but the layer internal state says it has not been initialized. Furthermore, it allows us
+/// to have a non-repairing `Layer::is_likely_resident`.
+#[tokio::test(start_paused = true)]
+async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
+    let handle = tokio::runtime::Handle::current();
+    let h = TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction")
+        .await
+        .unwrap();
+    let (tenant, ctx) = h.load().await;
+
+    let timeline = tenant
+        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .await
+        .unwrap();
+
+    let layer = {
+        let mut layers = {
+            let layers = timeline.layers.read().await;
+            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
+        };
+
+        assert_eq!(layers.len(), 1);
+
+        layers.swap_remove(0)
+    };
+
+    // this failpoint will simulate the `get_or_maybe_download` becoming cancelled (by returning an
+    // Err) at the right time as in "during" the `LayerInner::needs_download`.
+    layer.enable_failpoint(Failpoint::AfterDeterminingLayerNeedsNoDownload);
+
+    let (completion, barrier) = utils::completion::channel();
+    let (arrival, arrived_at_barrier) = utils::completion::channel();
+
+    layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
+        Some(arrival),
+        barrier,
+    ));
+
+    tokio::time::timeout(ADVANCE, layer.evict_and_wait(FOREVER))
+        .await
+        .expect_err("should have advanced to waiting on channel");
+
+    arrived_at_barrier.wait().await;
+
+    // simulate a cancelled read which is cancelled before it gets to re-initialize
+    let e = layer
+        .0
+        .get_or_maybe_download(false, None)
+        .await
+        .unwrap_err();
+    assert!(
+        matches!(
+            e,
+            DownloadError::Failpoint(FailpointKind::AfterDeterminingLayerNeedsNoDownload)
+        ),
+        "{e:?}"
+    );
+
+    assert!(
+        layer.0.needs_download().await.unwrap().is_none(),
+        "file is still on disk"
+    );
+
+    // release the eviction task
+    drop(completion);
+    tokio::time::sleep(ADVANCE).await;
+    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
+
+    // failpoint is still enabled, but it is not hit
+    let e = layer
+        .0
+        .get_or_maybe_download(false, None)
+        .await
+        .unwrap_err();
+    assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}");
+
+    // failpoint is not counted as cancellation either
+    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
+}
+
+#[tokio::test(start_paused = true)]
+async fn evict_and_wait_does_not_wait_for_download() {
+    // let handle = tokio::runtime::Handle::current();
+    let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download")
+        .await
+        .unwrap();
+    let (tenant, ctx) = h.load().await;
+    let span = h.span();
+    let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
+
+    let timeline = tenant
+        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .await
+        .unwrap();
+
+    let layer = {
+        let mut layers = {
+            let layers = timeline.layers.read().await;
+            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
+        };
+
+        assert_eq!(layers.len(), 1);
+
+        layers.swap_remove(0)
+    };
+
+    // kind of forced setup: start an eviction but do not allow it to progress until we are
+    // downloading
+    let (eviction_can_continue, barrier) = utils::completion::channel();
+    let (arrival, eviction_arrived) = utils::completion::channel();
+    layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
+        Some(arrival),
+        barrier,
+    ));
+
+    let mut evict_and_wait =
std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // use this once-awaited other_evict to synchronize with the eviction + let other_evict = layer.evict_and_wait(FOREVER); + + tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect_err("should had advanced"); + eviction_arrived.wait().await; + drop(eviction_can_continue); + other_evict.await.unwrap(); + + // now the layer is evicted, and the "evict_and_wait" is waiting on the receiver + assert!(!layer.is_likely_resident()); + + // following new evict_and_wait will fail until we've completed the download + let e = layer.evict_and_wait(FOREVER).await.unwrap_err(); + assert!(matches!(e, EvictionError::NotFound), "{e:?}"); + + let (download_can_continue, barrier) = utils::completion::channel(); + let (arrival, _download_arrived) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeDownloading(Some(arrival), barrier)); + + let mut download = std::pin::pin!(layer + .0 + .get_or_maybe_download(true, None) + .instrument(download_span)); + + assert!( + !layer.is_likely_resident(), + "during download layer is evicted" + ); + + tokio::time::timeout(ADVANCE, &mut download) + .await + .expect_err("should had timed out because of failpoint"); + + // now we finally get to continue, and because the latest state is downloading, we deduce that + // original eviction succeeded + evict_and_wait.await.unwrap(); + + // however a new evict_and_wait will fail + let e = layer.evict_and_wait(FOREVER).await.unwrap_err(); + assert!(matches!(e, EvictionError::NotFound), "{e:?}"); + + assert!(!layer.is_likely_resident()); + + drop(download_can_continue); + download.await.expect("download should had succeeded"); + assert!(layer.is_likely_resident()); + + // only now can we evict + layer.evict_and_wait(FOREVER).await.unwrap(); +} + +/// Asserts that there is no miscalculation when Layer is dropped while it is being kept resident, +/// which is the last value. +/// +/// Also checks that the same does not happen on a non-evicted layer (regression test). 
+#[tokio::test(start_paused = true)]
+async fn eviction_cancellation_on_drop() {
+    use crate::repository::Value;
+    use bytes::Bytes;
+
+    // this is the runtime on which Layer spawns the blocking tasks
+    let handle = tokio::runtime::Handle::current();
+
+    let h = TenantHarness::create("eviction_cancellation_on_drop")
+        .await
+        .unwrap();
+    utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
+    let (tenant, ctx) = h.load().await;
+
+    let timeline = tenant
+        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .await
+        .unwrap();
+
+    {
+        // create_test_timeline wrote us one layer, write another
+        let mut writer = timeline.writer().await;
+        writer
+            .put(
+                crate::repository::Key::from_i128(5),
+                Lsn(0x20),
+                &Value::Image(Bytes::from_static(b"this does not matter either")),
+                &ctx,
+            )
+            .await
+            .unwrap();
+
+        writer.finish_write(Lsn(0x20));
+    }
+
+    timeline.freeze_and_flush().await.unwrap();
+
+    // wait for the upload to complete so our Arc::strong_count assertion holds
+    timeline.remote_client.wait_completion().await.unwrap();
+
+    let (evicted_layer, not_evicted) = {
+        let mut layers = {
+            let mut guard = timeline.layers.write().await;
+            let layers = guard.likely_resident_layers().cloned().collect::<Vec<_>>();
+            // remove the layers from layermap
+            guard.open_mut().unwrap().finish_gc_timeline(&layers);
+
+            layers
+        };
+
+        assert_eq!(layers.len(), 2);
+
+        (layers.pop().unwrap(), layers.pop().unwrap())
+    };
+
+    let victims = [(evicted_layer, true), (not_evicted, false)];
+
+    for (victim, evict) in victims {
+        let resident = victim.keep_resident().await.unwrap();
+        drop(victim);
+
+        assert_eq!(Arc::strong_count(&resident.owner.0), 1);
+
+        if evict {
+            let evict_and_wait = resident.owner.evict_and_wait(FOREVER);
+
+            // drive the future to await on the status channel, and then drop it
+            tokio::time::timeout(ADVANCE, evict_and_wait)
+                .await
+                .expect_err("should have been a timeout since we are holding the layer resident");
+        }
+
+        // 1 == we only evict one of the layers
+        assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
+
+        drop(resident);
+
+        // run any spawned
+        tokio::time::sleep(ADVANCE).await;
+
+        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
+
+        assert_eq!(
+            1,
+            LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::LayerGone].get()
+        );
+    }
+}
+
+/// A test case to remind you of the cost of these structures. You can bump the size limit
+/// below if it is really necessary to add more fields to the structures.
+#[test]
+#[cfg(target_arch = "x86_64")]
+fn layer_size() {
+    assert_eq!(size_of::(), 8);
+    assert_eq!(size_of::(), 104);
+    assert_eq!(size_of::(), 296);
+    // it also has the utf8 path
+}
+
+struct SpawnBlockingPoolHelper {
+    awaited_by_spawn_blocking_tasks: Completion,
+    blocking_tasks: JoinSet<()>,
+}
+
+impl SpawnBlockingPoolHelper {
+    /// All `crate::task_mgr::BACKGROUND_RUNTIME` spawn_blocking threads will be consumed until
+    /// release is called.
+    ///
+    /// In the tests this can be used to ensure something cannot be started on the target runtime's
+    /// spawn_blocking pool.
+    ///
+    /// This should be no issue nowadays, because nextest runs each test in its own process.
+ async fn consume_all_spawn_blocking_threads(handle: &tokio::runtime::Handle) -> Self { + let default_max_blocking_threads = 512; + + Self::consume_all_spawn_blocking_threads0(handle, default_max_blocking_threads).await + } + + async fn consume_all_spawn_blocking_threads0( + handle: &tokio::runtime::Handle, + threads: usize, + ) -> Self { + assert_ne!(threads, 0); + + let (completion, barrier) = completion::channel(); + let (started, starts_completed) = completion::channel(); + + let mut blocking_tasks = JoinSet::new(); + + for _ in 0..threads { + let barrier = barrier.clone(); + let started = started.clone(); + blocking_tasks.spawn_blocking_on( + move || { + drop(started); + tokio::runtime::Handle::current().block_on(barrier.wait()); + }, + handle, + ); + } + + drop(started); + + starts_completed.wait().await; + + drop(barrier); + + tracing::trace!("consumed all threads"); + + SpawnBlockingPoolHelper { + awaited_by_spawn_blocking_tasks: completion, + blocking_tasks, + } + } + + /// Release all previously blocked spawn_blocking threads + async fn release(self) { + let SpawnBlockingPoolHelper { + awaited_by_spawn_blocking_tasks, + mut blocking_tasks, + } = self; + + drop(awaited_by_spawn_blocking_tasks); + + while let Some(res) = blocking_tasks.join_next().await { + res.expect("none of the tasks should had panicked"); + } + + tracing::trace!("released all threads"); + } + + /// In the tests it is used as an easy way of making sure something scheduled on the target + /// runtimes `spawn_blocking` has completed, because it must've been scheduled and completed + /// before our tasks have a chance to schedule and complete. + async fn consume_and_release_all_of_spawn_blocking_threads(handle: &tokio::runtime::Handle) { + Self::consume_and_release_all_of_spawn_blocking_threads0(handle, 512).await + } + + async fn consume_and_release_all_of_spawn_blocking_threads0( + handle: &tokio::runtime::Handle, + threads: usize, + ) { + Self::consume_all_spawn_blocking_threads0(handle, threads) + .await + .release() + .await + } +} + +#[test] +fn spawn_blocking_pool_helper_actually_works() { + // create a custom runtime for which we know and control how many blocking threads it has + // + // because the amount is not configurable for our helper, expect the same amount as + // BACKGROUND_RUNTIME using the tokio defaults would have. + let rt = tokio::runtime::Builder::new_current_thread() + .max_blocking_threads(1) + .enable_all() + .build() + .unwrap(); + + let handle = rt.handle(); + + rt.block_on(async move { + // this will not return until all threads are spun up and actually executing the code + // waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d. 
+ let consumed = + SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads0(handle, 1).await; + + println!("consumed"); + + let mut jh = std::pin::pin!(tokio::task::spawn_blocking(move || { + // this will not get to run before we release + })); + + println!("spawned"); + + tokio::time::timeout(std::time::Duration::from_secs(1), &mut jh) + .await + .expect_err("the task should not have gotten to run yet"); + + println!("tried to join"); + + consumed.release().await; + + println!("released"); + + tokio::time::timeout(std::time::Duration::from_secs(1), jh) + .await + .expect("no timeout") + .expect("no join error"); + + println!("joined"); + }); +} + +/// Drop the low bits from a time, to emulate the precision loss in LayerAccessStats +fn lowres_time(hires: SystemTime) -> SystemTime { + let ts = hires.duration_since(UNIX_EPOCH).unwrap().as_secs(); + UNIX_EPOCH + Duration::from_secs(ts) +} + +#[test] +fn access_stats() { + let access_stats = LayerAccessStats::default(); + // Default is visible + assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible); + + access_stats.set_visibility(LayerVisibilityHint::Covered); + assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered); + access_stats.set_visibility(LayerVisibilityHint::Visible); + assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible); + + let rtime = UNIX_EPOCH + Duration::from_secs(2000000000); + access_stats.record_residence_event_at(rtime); + assert_eq!(access_stats.latest_activity(), lowres_time(rtime)); + + let atime = UNIX_EPOCH + Duration::from_secs(2100000000); + access_stats.record_access_at(atime); + assert_eq!(access_stats.latest_activity(), lowres_time(atime)); + + // Setting visibility doesn't clobber access time + access_stats.set_visibility(LayerVisibilityHint::Covered); + assert_eq!(access_stats.latest_activity(), lowres_time(atime)); + access_stats.set_visibility(LayerVisibilityHint::Visible); + assert_eq!(access_stats.latest_activity(), lowres_time(atime)); +} + +#[test] +fn access_stats_2038() { + // The access stats structure uses a timestamp representation that will run out + // of bits in 2038. One year before that, this unit test will start failing. + + let one_year_from_now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap() + + Duration::from_secs(3600 * 24 * 365); + + assert!(one_year_from_now.as_secs() < (2 << 31)); +} diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index bf24407fc5..e90ff3c4b2 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -5,17 +5,19 @@ use utils::{id::TimelineId, lsn::Lsn}; use crate::repository::Key; -use super::{DeltaFileName, ImageFileName, LayerFileName}; +use super::{DeltaLayerName, ImageLayerName, LayerName}; use serde::{Deserialize, Serialize}; #[cfg(test)] use utils::id::TenantId; -/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the -/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides +/// A unique identifier of a persistent layer. +/// +/// This is different from `LayerDescriptor`, which is only used in the benchmarks. +/// This struct contains all necessary information to find the image / delta layer. It also provides /// a unified way to generate layer information like file name. 
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)]
 pub struct PersistentLayerDesc {
     pub tenant_shard_id: TenantShardId,
     pub timeline_id: TimelineId,
@@ -25,7 +27,7 @@ pub struct PersistentLayerDesc {
     ///
     /// - For an open in-memory layer, the end bound is MAX_LSN
     /// - For a frozen in-memory layer or a delta layer, the end bound is a valid lsn after the
-    /// range start
+    ///   range start
     /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
     pub lsn_range: Range<Lsn>,
     /// Whether this is a delta layer, and also, is this incremental.
@@ -41,6 +43,20 @@ pub struct PersistentLayerKey {
     pub is_delta: bool,
 }

+impl std::fmt::Display for PersistentLayerKey {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{}..{} {}..{} is_delta={}",
+            self.key_range.start,
+            self.key_range.end,
+            self.lsn_range.start,
+            self.lsn_range.end,
+            self.is_delta
+        )
+    }
+}
+
 impl PersistentLayerDesc {
     pub fn key(&self) -> PersistentLayerKey {
         PersistentLayerKey {
@@ -51,17 +67,17 @@ impl PersistentLayerDesc {
     }

     pub fn short_id(&self) -> impl Display {
-        self.filename()
+        self.layer_name()
     }

     #[cfg(test)]
-    pub fn new_test(key_range: Range<Key>) -> Self {
+    pub fn new_test(key_range: Range<Key>, lsn_range: Range<Lsn>, is_delta: bool) -> Self {
         Self {
             tenant_shard_id: TenantShardId::unsharded(TenantId::generate()),
             timeline_id: TimelineId::generate(),
             key_range,
-            lsn_range: Lsn(0)..Lsn(1),
-            is_delta: false,
+            lsn_range,
+            is_delta,
             file_size: 0,
         }
     }
@@ -103,14 +119,14 @@
     pub fn from_filename(
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
-        filename: LayerFileName,
+        filename: LayerName,
         file_size: u64,
     ) -> Self {
         match filename {
-            LayerFileName::Image(i) => {
+            LayerName::Image(i) => {
                 Self::new_img(tenant_shard_id, timeline_id, i.key_range, i.lsn, file_size)
             }
-            LayerFileName::Delta(d) => Self::new_delta(
+            LayerName::Delta(d) => Self::new_delta(
                 tenant_shard_id,
                 timeline_id,
                 d.key_range,
@@ -132,34 +148,34 @@
         lsn..(lsn + 1)
     }

-    /// Get a delta file name for this layer.
+    /// Get a delta layer name for this layer.
     ///
     /// Panic: if this is not a delta layer.
-    pub fn delta_file_name(&self) -> DeltaFileName {
+    pub fn delta_layer_name(&self) -> DeltaLayerName {
         assert!(self.is_delta);
-        DeltaFileName {
+        DeltaLayerName {
             key_range: self.key_range.clone(),
             lsn_range: self.lsn_range.clone(),
         }
     }

-    /// Get a delta file name for this layer.
+    /// Get an image layer name for this layer.
     ///
     /// Panic: if this is not an image layer, or the lsn range is invalid
-    pub fn image_file_name(&self) -> ImageFileName {
+    pub fn image_layer_name(&self) -> ImageLayerName {
         assert!(!self.is_delta);
         assert!(self.lsn_range.start + 1 == self.lsn_range.end);
-        ImageFileName {
+        ImageLayerName {
             key_range: self.key_range.clone(),
             lsn: self.lsn_range.start,
         }
     }

-    pub fn filename(&self) -> LayerFileName {
+    pub fn layer_name(&self) -> LayerName {
         if self.is_delta {
-            self.delta_file_name().into()
+            self.delta_layer_name().into()
         } else {
-            self.image_file_name().into()
+            self.image_layer_name().into()
         }
     }
diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/layer_name.rs
similarity index 56%
rename from pageserver/src/tenant/storage_layer/filename.rs
rename to pageserver/src/tenant/storage_layer/layer_name.rs
index a98be0842b..ffe7ca5f3e 100644
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -2,40 +2,42 @@
 //! Helper functions for dealing with filenames of the image and delta layer files.
 //!
 use crate::repository::Key;
+use std::borrow::Cow;
 use std::cmp::Ordering;
 use std::fmt;
 use std::ops::Range;
 use std::str::FromStr;

+use regex::Regex;
 use utils::lsn::Lsn;

 use super::PersistentLayerDesc;

 // Note: Timeline::load_layer_map() relies on this sort order
 #[derive(PartialEq, Eq, Clone, Hash)]
-pub struct DeltaFileName {
+pub struct DeltaLayerName {
     pub key_range: Range<Key>,
     pub lsn_range: Range<Lsn>,
 }

-impl std::fmt::Debug for DeltaFileName {
+impl std::fmt::Debug for DeltaLayerName {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         use super::RangeDisplayDebug;

-        f.debug_struct("DeltaFileName")
+        f.debug_struct("DeltaLayerName")
             .field("key_range", &RangeDisplayDebug(&self.key_range))
             .field("lsn_range", &self.lsn_range)
             .finish()
     }
 }

-impl PartialOrd for DeltaFileName {
+impl PartialOrd for DeltaLayerName {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }

-impl Ord for DeltaFileName {
+impl Ord for DeltaLayerName {
     fn cmp(&self, other: &Self) -> Ordering {
         let mut cmp = self.key_range.start.cmp(&other.key_range.start);
         if cmp != Ordering::Equal {
@@ -55,16 +57,14 @@ impl Ord for DeltaFileName {
     }
 }

-/// Represents the filename of a DeltaLayer
+/// Represents the region of the LSN-Key space covered by a DeltaLayer
 ///
 /// ```text
 /// <key_start>-<key_end>__<lsn_start>-<lsn_end>
 /// ```
-impl DeltaFileName {
-    ///
-    /// Parse a string as a delta file name. Returns None if the filename does not
-    /// match the expected pattern.
-    ///
+impl DeltaLayerName {
+    /// Parse the part of a delta layer's file name that represents the LayerName. Returns None
+    /// if the filename does not match the expected pattern.
     pub fn parse_str(fname: &str) -> Option<Self> {
         let mut parts = fname.split("__");
         let mut key_parts = parts.next()?.split('-');
@@ -74,10 +74,19 @@ impl DeltaFileName {
         let key_end_str = key_parts.next()?;
         let lsn_start_str = lsn_parts.next()?;
         let lsn_end_str = lsn_parts.next()?;
+
         if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() {
             return None;
         }

+        if key_start_str.len() != 36
+            || key_end_str.len() != 36
+            || lsn_start_str.len() != 16
+            || lsn_end_str.len() != 16
+        {
+            return None;
+        }
+
         let key_start = Key::from_hex(key_start_str).ok()?;
         let key_end = Key::from_hex(key_end_str).ok()?;
@@ -94,14 +103,14 @@
             // or panic?
} - Some(DeltaFileName { + Some(DeltaLayerName { key_range: key_start..key_end, lsn_range: start_lsn..end_lsn, }) } } -impl fmt::Display for DeltaFileName { +impl fmt::Display for DeltaLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, @@ -115,29 +124,29 @@ } #[derive(PartialEq, Eq, Clone, Hash)] -pub struct ImageFileName { +pub struct ImageLayerName { pub key_range: Range<Key>, pub lsn: Lsn, } -impl std::fmt::Debug for ImageFileName { +impl std::fmt::Debug for ImageLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use super::RangeDisplayDebug; - f.debug_struct("ImageFileName") + f.debug_struct("ImageLayerName") .field("key_range", &RangeDisplayDebug(&self.key_range)) .field("lsn", &self.lsn) .finish() } } -impl PartialOrd for ImageFileName { +impl PartialOrd for ImageLayerName { fn partial_cmp(&self, other: &Self) -> Option<Ordering> { Some(self.cmp(other)) } } -impl Ord for ImageFileName { +impl Ord for ImageLayerName { fn cmp(&self, other: &Self) -> Ordering { let mut cmp = self.key_range.start.cmp(&other.key_range.start); if cmp != Ordering::Equal { @@ -153,7 +162,7 @@ impl Ord for ImageFileName { } } -impl ImageFileName { +impl ImageLayerName { pub fn lsn_as_range(&self) -> Range<Lsn> { // Saves from having to copypaste this all over PersistentLayerDesc::image_layer_lsn_range(self.lsn) } } /// -/// Represents the filename of an ImageLayer +/// Represents the part of the Key-LSN space covered by an ImageLayer /// /// ```text /// <key start>-<key end>__<LSN> /// ``` -impl ImageFileName { - /// - /// Parse a string as an image file name. Returns None if the filename does not - /// match the expected pattern. - /// +impl ImageLayerName { + /// Parse a string as the LayerName part of an image layer file name. Returns None if the + /// filename does not match the expected pattern. pub fn parse_str(fname: &str) -> Option<Self> { let mut parts = fname.split("__"); let mut key_parts = parts.next()?.split('-'); @@ -182,19 +189,23 @@ return None; } + if key_start_str.len() != 36 || key_end_str.len() != 36 || lsn_str.len() != 16 { + return None; + } + let key_start = Key::from_hex(key_start_str).ok()?; let key_end = Key::from_hex(key_end_str).ok()?; let lsn = Lsn::from_hex(lsn_str).ok()?; - Some(ImageFileName { + Some(ImageLayerName { key_range: key_start..key_end, lsn, }) } } -impl fmt::Display for ImageFileName { +impl fmt::Display for ImageLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, @@ -205,21 +216,25 @@ ) } } + +/// LayerName is the logical identity of a layer within a LayerMap at a moment in time. +/// +/// The LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations +/// over time (e.g. across shard splits or compression).
The physical filenames of layers in local +/// storage and object names in remote storage consist of the LayerName plus some extra qualifiers +/// that uniquely identify the physical incarnation of a layer (see [`crate::tenant::remote_timeline_client::remote_layer_path`] +/// and [`crate::tenant::storage_layer::layer::local_layer_path`]) #[derive(Debug, PartialEq, Eq, Hash, Clone)] -pub enum LayerFileName { - Image(ImageFileName), - Delta(DeltaFileName), +pub enum LayerName { + Image(ImageLayerName), + Delta(DeltaLayerName), } -impl LayerFileName { - pub fn file_name(&self) -> String { - self.to_string() - } - +impl LayerName { /// Determines if this layer file is considered to be in the future, meaning we will discard these /// layers during timeline initialization from the given disk_consistent_lsn. pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool { - use LayerFileName::*; + use LayerName::*; match self { Image(file_name) if file_name.lsn > disk_consistent_lsn => true, Delta(file_name) if file_name.lsn_range.end > disk_consistent_lsn + 1 => true, @@ -228,15 +243,27 @@ } pub(crate) fn kind(&self) -> &'static str { - use LayerFileName::*; + use LayerName::*; match self { Delta(_) => "delta", Image(_) => "image", } } + + /// Gets the key range encoded in the layer name. + pub fn key_range(&self) -> &Range<Key> { + match &self { + LayerName::Image(layer) => &layer.key_range, + LayerName::Delta(layer) => &layer.key_range, + } + } + + pub fn is_delta(&self) -> bool { + matches!(self, LayerName::Delta(_)) + } } -impl fmt::Display for LayerFileName { +impl fmt::Display for LayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::Image(fname) => write!(f, "{fname}"), @@ -245,23 +272,36 @@ } } -impl From<ImageFileName> for LayerFileName { - fn from(fname: ImageFileName) -> Self { +impl From<ImageLayerName> for LayerName { + fn from(fname: ImageLayerName) -> Self { Self::Image(fname) } } -impl From<DeltaFileName> for LayerFileName { - fn from(fname: DeltaFileName) -> Self { +impl From<DeltaLayerName> for LayerName { + fn from(fname: DeltaLayerName) -> Self { Self::Delta(fname) } } -impl FromStr for LayerFileName { +impl FromStr for LayerName { type Err = String; + /// Conversion from either a physical layer filename, or the string-ization of + /// Self. When loading a physical layer filename, we drop any extra information + /// not needed to build Self.
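// Editorial sketch, not part of the patch: the intent of the parsing below is that a
// physical file name carrying a generation suffix maps to the same logical LayerName
// as the bare name (the name here is a placeholder; the tests at the bottom of this
// file use real values):
//
//     let a: LayerName = "<key range>__<lsn range>-v1-00000001".parse()?;
//     let b: LayerName = "<key range>__<lsn range>".parse()?;
//     assert_eq!(a, b); // the `-v1-<8 hex digits>` qualifier is dropped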
fn from_str(value: &str) -> Result<Self, Self::Err> { - let delta = DeltaFileName::parse_str(value); - let image = ImageFileName::parse_str(value); + let gen_suffix_regex = Regex::new("^(?<base>.+)(?<generation>-v1-[0-9a-f]{8})$").unwrap(); + let file_name: Cow<str> = match gen_suffix_regex.captures(value) { + Some(captures) => captures + .name("base") + .expect("Non-optional group") + .as_str() + .into(), + None => value.into(), + }; + + let delta = DeltaLayerName::parse_str(&file_name); + let image = ImageLayerName::parse_str(&file_name); let ok = match (delta, image) { (None, None) => { return Err(format!( @@ -276,7 +316,7 @@ } } -impl serde::Serialize for LayerFileName { +impl serde::Serialize for LayerName { fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> where S: serde::Serializer, @@ -288,19 +328,19 @@ } } -impl<'de> serde::Deserialize<'de> for LayerFileName { +impl<'de> serde::Deserialize<'de> for LayerName { fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> where D: serde::Deserializer<'de>, { - deserializer.deserialize_string(LayerFileNameVisitor) + deserializer.deserialize_string(LayerNameVisitor) } } -struct LayerFileNameVisitor; +struct LayerNameVisitor; -impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor { - type Value = LayerFileName; +impl<'de> serde::de::Visitor<'de> for LayerNameVisitor { + type Value = LayerName; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { write!( @@ -315,3 +355,38 @@ v.parse().map_err(|e| E::custom(e)) } } + +#[cfg(test)] +mod test { + use super::*; + #[test] + fn image_layer_parse() { + let expected = LayerName::Image(ImageLayerName { + key_range: Key::from_i128(0) + ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), + lsn: Lsn::from_hex("00000000014FED58").unwrap(), + }); + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap(); + assert_eq!(parsed, expected,); + + // Omitting generation suffix is valid + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap(); + assert_eq!(parsed, expected,); + } + + #[test] + fn delta_layer_parse() { + let expected = LayerName::Delta(DeltaLayerName { + key_range: Key::from_i128(0) + ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), + lsn_range: Lsn::from_hex("00000000014FED58").unwrap() + ..Lsn::from_hex("000000000154C481").unwrap(), + }); + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap(); + assert_eq!(parsed, expected); + + // Omitting generation suffix is valid + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap(); + assert_eq!(parsed, expected); + } +} diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs new file mode 100644 index 0000000000..0831fd9530 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -0,0 +1,590 @@ +use std::{ + cmp::Ordering, + collections::{binary_heap, BinaryHeap}, +}; + +use anyhow::bail; +use pageserver_api::key::Key; +use utils::lsn::Lsn; + +use crate::{context::RequestContext, repository::Value}; + +use super::{ + delta_layer::{DeltaLayerInner,
DeltaLayerIterator}, + image_layer::{ImageLayerInner, ImageLayerIterator}, +}; + +#[derive(Clone, Copy)] +enum LayerRef<'a> { + Image(&'a ImageLayerInner), + Delta(&'a DeltaLayerInner), +} + +impl<'a> LayerRef<'a> { + fn iter(self, ctx: &'a RequestContext) -> LayerIterRef<'a> { + match self { + Self::Image(x) => LayerIterRef::Image(x.iter(ctx)), + Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)), + } + } + + fn layer_dbg_info(&self) -> String { + match self { + Self::Image(x) => x.layer_dbg_info(), + Self::Delta(x) => x.layer_dbg_info(), + } + } +} + +enum LayerIterRef<'a> { + Image(ImageLayerIterator<'a>), + Delta(DeltaLayerIterator<'a>), +} + +impl LayerIterRef<'_> { + async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> { + match self { + Self::Delta(x) => x.next().await, + Self::Image(x) => x.next().await, + } + } + + fn layer_dbg_info(&self) -> String { + match self { + Self::Image(x) => x.layer_dbg_info(), + Self::Delta(x) => x.layer_dbg_info(), + } + } +} + +/// This type plays several roles at once +/// 1. Unified iterator for image and delta layers. +/// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge). +/// 3. Lazy creation of the real delta/image iterator. +enum IteratorWrapper<'a> { + NotLoaded { + ctx: &'a RequestContext, + first_key_lower_bound: (Key, Lsn), + layer: LayerRef<'a>, + }, + Loaded { + iter: PeekableLayerIterRef<'a>, + }, +} + +struct PeekableLayerIterRef<'a> { + iter: LayerIterRef<'a>, + peeked: Option<(Key, Lsn, Value)>, // None == end +} + +impl<'a> PeekableLayerIterRef<'a> { + async fn create(mut iter: LayerIterRef<'a>) -> anyhow::Result<Self> { + let peeked = iter.next().await?; + Ok(Self { iter, peeked }) + } + + fn peek(&self) -> &Option<(Key, Lsn, Value)> { + &self.peeked + } + + async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> { + let result = self.peeked.take(); + self.peeked = self.iter.next().await?; + if let (Some((k1, l1, _)), Some((k2, l2, _))) = (&self.peeked, &result) { + if (k1, l1) < (k2, l2) { + bail!("iterator is not ordered: {}", self.iter.layer_dbg_info()); + } + } + Ok(result) + } +} + +impl<'a> std::cmp::PartialEq for IteratorWrapper<'a> { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl<'a> std::cmp::Eq for IteratorWrapper<'a> {} + +impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> { + fn partial_cmp(&self, other: &Self) -> Option<Ordering> { + Some(self.cmp(other)) + } +} + +impl<'a> std::cmp::Ord for IteratorWrapper<'a> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + use std::cmp::Ordering; + let a = self.peek_next_key_lsn_value(); + let b = other.peek_next_key_lsn_value(); + match (a, b) { + (Some((k1, l1, v1)), Some((k2, l2, v2))) => { + fn map_value_to_num(val: &Option<&Value>) -> usize { + match val { + None => 0, + Some(Value::Image(_)) => 1, + Some(Value::WalRecord(_)) => 2, + } + } + let order_1 = map_value_to_num(&v1); + let order_2 = map_value_to_num(&v2); + // When key_lsn are the same, the unloaded iter will always appear before the loaded one. + // And note that we do a reverse at the end of the comparison, so it works with the max heap.
+ (k1, l1, order_1).cmp(&(k2, l2, order_2)) + } + (Some(_), None) => Ordering::Less, + (None, Some(_)) => Ordering::Greater, + (None, None) => Ordering::Equal, + } + .reverse() + } +} + +impl<'a> IteratorWrapper<'a> { + pub fn create_from_image_layer( + image_layer: &'a ImageLayerInner, + ctx: &'a RequestContext, + ) -> Self { + Self::NotLoaded { + layer: LayerRef::Image(image_layer), + first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()), + ctx, + } + } + + pub fn create_from_delta_layer( + delta_layer: &'a DeltaLayerInner, + ctx: &'a RequestContext, + ) -> Self { + Self::NotLoaded { + layer: LayerRef::Delta(delta_layer), + first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start), + ctx, + } + } + + fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> { + match self { + Self::Loaded { iter } => iter + .peek() + .as_ref() + .map(|(key, lsn, val)| (key, *lsn, Some(val))), + Self::NotLoaded { + first_key_lower_bound: (key, lsn), + .. + } => Some((key, *lsn, None)), + } + } + + // CORRECTNESS: this function must always take `&mut self`, never `&self`. + // + // The reason is that `impl Ord for Self` evaluates differently after this function + // returns. We're called through a `PeekMut::deref_mut`, which causes heap repair when + // the PeekMut gets returned. So, it's critical that we actually run through `PeekMut::deref_mut` + // and not just `PeekMut::deref`. + // If we don't take `&mut self`, the heap is never repaired and its ordering goes stale. + async fn load(&mut self) -> anyhow::Result<()> { + assert!(!self.is_loaded()); + let Self::NotLoaded { + ctx, + first_key_lower_bound, + layer, + } = self + else { + unreachable!() + }; + let iter = layer.iter(ctx); + let iter = PeekableLayerIterRef::create(iter).await?; + if let Some((k1, l1, _)) = iter.peek() { + let (k2, l2) = first_key_lower_bound; + if (k1, l1) < (k2, l2) { + bail!( + "layer key range did not include the first key in the layer: {}", + layer.layer_dbg_info() + ); + } + } + *self = Self::Loaded { iter }; + Ok(()) + } + + fn is_loaded(&self) -> bool { + matches!(self, Self::Loaded { .. }) + } + + /// Correctness: must load the iterator before using. + /// + /// Given this iterator wrapper is private to the merge iterator, users won't be able to mis-use it. + /// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and + /// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. + async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> { + let Self::Loaded { iter } = self else { + panic!("must load the iterator before using") + }; + iter.next().await + } +} + +/// A merge iterator over delta/image layer iterators. +/// +/// When duplicated records are found, the iterator will not perform any +/// deduplication, and the caller should handle these situations. Duplicated +/// records can arise in several ways: +/// +/// * Two identical deltas at the same LSN. +/// * Two identical images at the same LSN. +/// * Delta/image at the same LSN where the image has already applied the delta. +/// +/// The iterator will always put the image before the delta.
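// Editorial sketch, not part of the patch: intended use of the k-merge, with
// hypothetical `deltas` and `images` slices and error handling elided:
//
//     let mut iter = MergeIterator::create(&deltas, &images, &ctx);
//     while let Some((key, lsn, value)) = iter.next().await? {
//         // Entries arrive ordered by (key, lsn); on a tie, images sort
//         // before WAL records, per `map_value_to_num` above.
//     }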
+pub struct MergeIterator<'a> { + heap: BinaryHeap<IteratorWrapper<'a>>, +} + +impl<'a> MergeIterator<'a> { + pub fn create( + deltas: &[&'a DeltaLayerInner], + images: &[&'a ImageLayerInner], + ctx: &'a RequestContext, + ) -> Self { + let mut heap = Vec::with_capacity(images.len() + deltas.len()); + for image in images { + heap.push(IteratorWrapper::create_from_image_layer(image, ctx)); + } + for delta in deltas { + heap.push(IteratorWrapper::create_from_delta_layer(delta, ctx)); + } + Self { + heap: BinaryHeap::from(heap), + } + } + + pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> { + while let Some(mut iter) = self.heap.peek_mut() { + if !iter.is_loaded() { + // Once we load the iterator, we can know the real first key-value pair in the iterator. + // We put it back into the heap so that a potentially unloaded layer may have a key between + // [potential_first_key, loaded_first_key). + iter.load().await?; + continue; + } + let Some(item) = iter.next().await? else { + // If the iterator returns None, we pop this iterator. Actually, in the current implementation, + // we order None > Some, and all the rest of the iterators should return None. + binary_heap::PeekMut::pop(iter); + continue; + }; + return Ok(Some(item)); + } + Ok(None) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use itertools::Itertools; + use pageserver_api::key::Key; + use utils::lsn::Lsn; + + use crate::{ + tenant::{ + harness::{TenantHarness, TIMELINE_ID}, + storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value}, + }, + walrecord::NeonWalRecord, + DEFAULT_PG_VERSION, + }; + + async fn assert_merge_iter_equal( + merge_iter: &mut MergeIterator<'_>, + expect: &[(Key, Lsn, Value)], + ) { + let mut expect_iter = expect.iter(); + loop { + let o1 = merge_iter.next().await.unwrap(); + let o2 = expect_iter.next(); + assert_eq!(o1.is_some(), o2.is_some()); + if o1.is_none() && o2.is_none() { + break; + } + let (k1, l1, v1) = o1.unwrap(); + let (k2, l2, v2) = o2.unwrap(); + assert_eq!(&k1, k2); + assert_eq!(l1, *l2); + assert_eq!(&v1, v2); + } + } + + #[tokio::test] + async fn merge_in_between() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("merge_iterator_merge_in_between") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + let test_deltas1 = vec![ + ( + get_key(0), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ( + get_key(5), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ]; + let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) + .await + .unwrap(); + let test_deltas2 = vec![ + ( + get_key(3), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ( + get_key(4), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ]; + let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx) + .await + .unwrap(); + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + let mut expect = Vec::new(); + expect.extend(test_deltas1); + expect.extend(test_deltas2); + expect.sort_by(sort_delta); + assert_merge_iter_equal(&mut merge_iter,
&expect).await; + } + + #[tokio::test] + async fn delta_merge() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("merge_iterator_delta_merge") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + const N: usize = 1000; + let test_deltas1 = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10), + Lsn(0x20 * ((idx as u64) % 10 + 1)), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) + .await + .unwrap(); + let test_deltas2 = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10), + Lsn(0x20 * ((idx as u64) % 10 + 1) + 0x10), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx) + .await + .unwrap(); + let test_deltas3 = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10 + N as u32), + Lsn(0x10 * ((idx as u64) % 10 + 1)), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx) + .await + .unwrap(); + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + resident_layer_3.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + let mut expect = Vec::new(); + expect.extend(test_deltas1); + expect.extend(test_deltas2); + expect.extend(test_deltas3); + expect.sort_by(sort_delta); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + + // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge + } + + #[tokio::test] + async fn delta_image_mixed_merge() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + // In this test case, we want to test if the iterator still works correctly with multiple copies + // of a delta+image at the same LSN, for example, the following sequence a@10=+a, a@10=+a, a@10=ab, a@10=ab. + // Duplicated deltas/images are possible for old tenants before the full L0 compaction file name fix. + // An incomplete compaction could produce multiple exactly-the-same delta layers. Force image generation + // could produce overlapping images. Apart from duplicated deltas/images, in the current storage implementation + // one key-lsn could have a delta in the delta layer and one image in the image layer. The iterator should + // correctly process these situations and return everything as-is, and the upper layer of the system + // will handle duplicated LSNs. 
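// Editorial note, not part of the patch: for identical (key, lsn) pairs,
// `IteratorWrapper::cmp` orders images before WAL records, so a hypothetical
// tie at key `a`, LSN 0x10 would be yielded as:
//
//     (a, 0x10, Value::Image(..))     // map_value_to_num == 1
//     (a, 0x10, Value::WalRecord(..)) // map_value_to_num == 2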
+ let test_deltas1 = vec![ + ( + get_key(0), + Lsn(0x10), + Value::WalRecord(NeonWalRecord::wal_init()), + ), + ( + get_key(0), + Lsn(0x18), + Value::WalRecord(NeonWalRecord::wal_append("a")), + ), + ( + get_key(5), + Lsn(0x10), + Value::WalRecord(NeonWalRecord::wal_init()), + ), + ( + get_key(5), + Lsn(0x18), + Value::WalRecord(NeonWalRecord::wal_append("b")), + ), + ]; + let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) + .await + .unwrap(); + let mut test_deltas2 = test_deltas1.clone(); + test_deltas2.push(( + get_key(10), + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"test")), + )); + let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx) + .await + .unwrap(); + let test_deltas3 = vec![ + ( + get_key(0), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"")), + ), + ( + get_key(5), + Lsn(0x18), + Value::Image(Bytes::copy_from_slice(b"b")), + ), + ( + get_key(15), + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ]; + let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx) + .await + .unwrap(); + let mut test_deltas4 = test_deltas3.clone(); + test_deltas4.push(( + get_key(20), + Lsn(0x20), + Value::Image(Bytes::copy_from_slice(b"test")), + )); + let resident_layer_4 = produce_delta_layer(&tenant, &tline, test_deltas4.clone(), &ctx) + .await + .unwrap(); + let mut expect = Vec::new(); + expect.extend(test_deltas1); + expect.extend(test_deltas2); + expect.extend(test_deltas3); + expect.extend(test_deltas4); + expect.sort_by(sort_delta_value); + + // Test with different layer order for MergeIterator::create to ensure the order + // is stable. + + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_4.get_as_delta(&ctx).await.unwrap(), + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + resident_layer_3.get_as_delta(&ctx).await.unwrap(), + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + resident_layer_4.get_as_delta(&ctx).await.unwrap(), + resident_layer_3.get_as_delta(&ctx).await.unwrap(), + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + + is_send(merge_iter); + } + + fn is_send(_: impl Send) {} +} diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs new file mode 100644 index 0000000000..7c1ac863bf --- /dev/null +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -0,0 +1,698 @@ +use std::{future::Future, ops::Range, sync::Arc}; + +use bytes::Bytes; +use pageserver_api::key::{Key, KEY_SIZE}; +use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; + +use crate::tenant::storage_layer::Layer; +use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline}; + +use super::layer::S3_UPLOAD_LIMIT; +use super::{ + DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer, +}; + +pub(crate) enum SplitWriterResult { + Produced(ResidentLayer), + Discarded(PersistentLayerKey), +} + +#[cfg(test)] +impl SplitWriterResult { + fn into_resident_layer(self) -> ResidentLayer { + match self { + SplitWriterResult::Produced(layer) => layer, + SplitWriterResult::Discarded(_) => panic!("unexpected discarded layer"), + } + } + + fn 
into_discarded_layer(self) -> PersistentLayerKey { + match self { + SplitWriterResult::Produced(_) => panic!("unexpected produced layer"), + SplitWriterResult::Discarded(layer) => layer, + } + } +} + +/// An image writer that takes images and produces multiple image layers. +/// +/// The interface does not guarantee atomicity (i.e., if the image layer generation +/// fails, there might be leftover files to be cleaned up) +#[must_use] +pub struct SplitImageLayerWriter { + inner: ImageLayerWriter, + target_layer_size: u64, + generated_layers: Vec<SplitWriterResult>, + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + lsn: Lsn, + start_key: Key, +} + +impl SplitImageLayerWriter { + pub async fn new( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + start_key: Key, + lsn: Lsn, + target_layer_size: u64, + ctx: &RequestContext, + ) -> anyhow::Result<Self> { + Ok(Self { + target_layer_size, + inner: ImageLayerWriter::new( + conf, + timeline_id, + tenant_shard_id, + &(start_key..Key::MAX), + lsn, + ctx, + ) + .await?, + generated_layers: Vec::new(), + conf, + timeline_id, + tenant_shard_id, + lsn, + start_key, + }) + } + + pub async fn put_image_with_discard_fn<D, F>( + &mut self, + key: Key, + img: Bytes, + tline: &Arc<Timeline>, + ctx: &RequestContext, + discard: D, + ) -> anyhow::Result<()> + where + D: FnOnce(&PersistentLayerKey) -> F, + F: Future<Output = bool>, + { + // The current estimation is an upper bound of the space that the key/image could take + // because we did not consider compression in this estimation. The resulting image layer + // could be smaller than the target size. + let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64; + if self.inner.num_keys() >= 1 + && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size + { + let next_image_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &(key..Key::MAX), + self.lsn, + ctx, + ) + .await?; + let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); + let layer_key = PersistentLayerKey { + key_range: self.start_key..key, + lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn), + is_delta: false, + }; + self.start_key = key; + + if discard(&layer_key).await { + drop(prev_image_writer); + self.generated_layers + .push(SplitWriterResult::Discarded(layer_key)); + } else { + self.generated_layers.push(SplitWriterResult::Produced( + prev_image_writer + .finish_with_end_key(tline, key, ctx) + .await?, + )); + } + } + self.inner.put_image(key, img, ctx).await + } + + #[cfg(test)] + pub async fn put_image( + &mut self, + key: Key, + img: Bytes, + tline: &Arc<Timeline>, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.put_image_with_discard_fn(key, img, tline, ctx, |_| async { false }) + .await + } + + pub(crate) async fn finish_with_discard_fn<D, F>( + self, + tline: &Arc<Timeline>, + ctx: &RequestContext, + end_key: Key, + discard: D, + ) -> anyhow::Result<Vec<SplitWriterResult>> + where + D: FnOnce(&PersistentLayerKey) -> F, + F: Future<Output = bool>, + { + let Self { + mut generated_layers, + inner, + ..
+ } = self; + if inner.num_keys() == 0 { + return Ok(generated_layers); + } + let layer_key = PersistentLayerKey { + key_range: self.start_key..end_key, + lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn), + is_delta: false, + }; + if discard(&layer_key).await { + generated_layers.push(SplitWriterResult::Discarded(layer_key)); + } else { + generated_layers.push(SplitWriterResult::Produced( + inner.finish_with_end_key(tline, end_key, ctx).await?, + )); + } + Ok(generated_layers) + } + + #[cfg(test)] + pub(crate) async fn finish( + self, + tline: &Arc<Timeline>, + ctx: &RequestContext, + end_key: Key, + ) -> anyhow::Result<Vec<SplitWriterResult>> { + self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false }) + .await + } + + /// When split writer fails, the caller should call this function and handle partially generated layers. + pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, ImageLayerWriter)> { + Ok((self.generated_layers, self.inner)) + } +} + +/// A delta writer that takes key-lsn-values and produces multiple delta layers. +/// +/// The interface does not guarantee atomicity (i.e., if the delta layer generation fails, +/// there might be leftover files to be cleaned up). +/// +/// Note that if updates of a single key exceed the target size limit, all of the updates will be batched +/// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm +/// will split them into multiple files based on size. +#[must_use] +pub struct SplitDeltaLayerWriter { + inner: DeltaLayerWriter, + target_layer_size: u64, + generated_layers: Vec<SplitWriterResult>, + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + lsn_range: Range<Lsn>, + last_key_written: Key, + start_key: Key, +} + +impl SplitDeltaLayerWriter { + pub async fn new( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_shard_id: TenantShardId, + start_key: Key, + lsn_range: Range<Lsn>, + target_layer_size: u64, + ctx: &RequestContext, + ) -> anyhow::Result<Self> { + Ok(Self { + target_layer_size, + inner: DeltaLayerWriter::new( + conf, + timeline_id, + tenant_shard_id, + start_key, + lsn_range.clone(), + ctx, + ) + .await?, + generated_layers: Vec::new(), + conf, + timeline_id, + tenant_shard_id, + lsn_range, + last_key_written: Key::MIN, + start_key, + }) + } + + /// Put a value into the layer writer. If the writer decides to produce a layer and the discard fn returns true, no layer file will be written in the end. + pub async fn put_value_with_discard_fn<D, F>( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + tline: &Arc<Timeline>, + ctx: &RequestContext, + discard: D, + ) -> anyhow::Result<()> + where + D: FnOnce(&PersistentLayerKey) -> F, + F: Future<Output = bool>, + { + // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate + // number, and therefore the final layer size could be a little bit larger or smaller than the target. + // + // Also, keep all updates of a single key in a single file. TODO: split them using the legacy compaction + // strategy.
https://github.com/neondatabase/neon/issues/8837 + let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */; + if self.inner.num_keys() >= 1 + && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size + { + if key != self.last_key_written { + let next_delta_writer = DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + key, + self.lsn_range.clone(), + ctx, + ) + .await?; + let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer); + let layer_key = PersistentLayerKey { + key_range: self.start_key..key, + lsn_range: self.lsn_range.clone(), + is_delta: true, + }; + self.start_key = key; + if discard(&layer_key).await { + drop(prev_delta_writer); + self.generated_layers + .push(SplitWriterResult::Discarded(layer_key)); + } else { + let (desc, path) = prev_delta_writer.finish(key, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + self.generated_layers + .push(SplitWriterResult::Produced(delta_layer)); + } + } else if self.inner.estimated_size() >= S3_UPLOAD_LIMIT { + // We have to produce a very large file b/c a key is updated too often. + anyhow::bail!( + "a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced", + key, + self.inner.estimated_size() + ); + } + } + self.last_key_written = key; + self.inner.put_value(key, lsn, val, ctx).await + } + + pub async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + tline: &Arc<Timeline>, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.put_value_with_discard_fn(key, lsn, val, tline, ctx, |_| async { false }) + .await + } + + pub(crate) async fn finish_with_discard_fn<D, F>( + self, + tline: &Arc<Timeline>, + ctx: &RequestContext, + end_key: Key, + discard: D, + ) -> anyhow::Result<Vec<SplitWriterResult>> + where + D: FnOnce(&PersistentLayerKey) -> F, + F: Future<Output = bool>, + { + let Self { + mut generated_layers, + inner, + .. + } = self; + if inner.num_keys() == 0 { + return Ok(generated_layers); + } + let layer_key = PersistentLayerKey { + key_range: self.start_key..end_key, + lsn_range: self.lsn_range.clone(), + is_delta: true, + }; + if discard(&layer_key).await { + generated_layers.push(SplitWriterResult::Discarded(layer_key)); + } else { + let (desc, path) = inner.finish(end_key, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + generated_layers.push(SplitWriterResult::Produced(delta_layer)); + } + Ok(generated_layers) + } + + #[cfg(test)] + pub(crate) async fn finish( + self, + tline: &Arc<Timeline>, + ctx: &RequestContext, + end_key: Key, + ) -> anyhow::Result<Vec<SplitWriterResult>> { + self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false }) + .await + } + + /// When split writer fails, the caller should call this function and handle partially generated layers.
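// Editorial sketch, not part of the patch: the assumed cleanup pattern on a
// failed write; `delete_partial_files` is a hypothetical helper, not defined
// in this diff:
//
//     if let Err(e) = writer.put_value_with_discard_fn(/* ... */).await {
//         let (generated_layers, inner) = writer.take()?;
//         delete_partial_files(generated_layers, inner); // remove leftovers
//         return Err(e);
//     }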
+ pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, DeltaLayerWriter)> { + Ok((self.generated_layers, self.inner)) + } +} + +#[cfg(test)] +mod tests { + use itertools::Itertools; + use rand::{RngCore, SeedableRng}; + + use crate::{ + tenant::{ + harness::{TenantHarness, TIMELINE_ID}, + storage_layer::AsLayerDesc, + }, + DEFAULT_PG_VERSION, + }; + + use super::*; + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + fn get_img(id: u32) -> Bytes { + format!("{id:064}").into() + } + + fn get_large_img() -> Bytes { + let mut rng = rand::rngs::SmallRng::seed_from_u64(42); + let mut data = vec![0; 8192]; + rng.fill_bytes(&mut data); + data.into() + } + + #[tokio::test] + async fn write_one_image() { + let harness = TenantHarness::create("split_writer_write_one_image") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let mut image_writer = SplitImageLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18)..Lsn(0x20), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + + image_writer + .put_image(get_key(0), get_img(0), &tline, &ctx) + .await + .unwrap(); + let layers = image_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); + assert_eq!(layers.len(), 1); + + delta_writer + .put_value( + get_key(0), + Lsn(0x18), + Value::Image(get_img(0)), + &tline, + &ctx, + ) + .await + .unwrap(); + let layers = delta_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); + assert_eq!(layers.len(), 1); + } + + #[tokio::test] + async fn write_split() { + write_split_helper("split_writer_write_split", false).await; + } + + #[tokio::test] + async fn write_split_discard() { + write_split_helper("split_writer_write_split_discard", true).await; + } + + async fn write_split_helper(harness_name: &'static str, discard: bool) { + let harness = TenantHarness::create(harness_name).await.unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let mut image_writer = SplitImageLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18)..Lsn(0x20), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + const N: usize = 2000; + for i in 0..N { + let i = i as u32; + image_writer + .put_image_with_discard_fn(get_key(i), get_large_img(), &tline, &ctx, |_| async { + discard + }) + .await + .unwrap(); + delta_writer + .put_value_with_discard_fn( + get_key(i), + Lsn(0x20), + Value::Image(get_large_img()), + &tline, + &ctx, + |_| async { discard }, + ) + .await + .unwrap(); + } + let image_layers = image_writer + .finish(&tline, &ctx, get_key(N as u32)) + .await + .unwrap(); + let delta_layers = delta_writer + .finish(&tline, &ctx, get_key(N as u32)) + .await + .unwrap(); + if discard { + for layer in image_layers { + layer.into_discarded_layer(); + } + for layer in delta_layers {
layer.into_discarded_layer(); + } + } else { + let image_layers = image_layers + .into_iter() + .map(|x| x.into_resident_layer()) + .collect_vec(); + let delta_layers = delta_layers + .into_iter() + .map(|x| x.into_resident_layer()) + .collect_vec(); + assert_eq!(image_layers.len(), N / 512 + 1); + assert_eq!(delta_layers.len(), N / 512 + 1); + for idx in 0..image_layers.len() { + assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN); + assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX); + assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN); + assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX); + if idx > 0 { + assert_eq!( + image_layers[idx - 1].layer_desc().key_range.end, + image_layers[idx].layer_desc().key_range.start + ); + assert_eq!( + delta_layers[idx - 1].layer_desc().key_range.end, + delta_layers[idx].layer_desc().key_range.start + ); + } + } + } + } + + #[tokio::test] + async fn write_large_img() { + let harness = TenantHarness::create("split_writer_write_large_img") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let mut image_writer = SplitImageLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18), + 4 * 1024, + &ctx, + ) + .await + .unwrap(); + + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x18)..Lsn(0x20), + 4 * 1024, + &ctx, + ) + .await + .unwrap(); + + image_writer + .put_image(get_key(0), get_img(0), &tline, &ctx) + .await + .unwrap(); + image_writer + .put_image(get_key(1), get_large_img(), &tline, &ctx) + .await + .unwrap(); + let layers = image_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); + assert_eq!(layers.len(), 2); + + delta_writer + .put_value( + get_key(0), + Lsn(0x18), + Value::Image(get_img(0)), + &tline, + &ctx, + ) + .await + .unwrap(); + delta_writer + .put_value( + get_key(1), + Lsn(0x1A), + Value::Image(get_large_img()), + &tline, + &ctx, + ) + .await + .unwrap(); + let layers = delta_writer + .finish(&tline, &ctx, get_key(10)) + .await + .unwrap(); + assert_eq!(layers.len(), 2); + } + + #[tokio::test] + async fn write_split_single_key() { + let harness = TenantHarness::create("split_writer_write_split_single_key") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + const N: usize = 2000; + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + get_key(0), + Lsn(0x10)..Lsn(N as u64 * 16 + 0x10), + 4 * 1024 * 1024, + &ctx, + ) + .await + .unwrap(); + + for i in 0..N { + let i = i as u32; + delta_writer + .put_value( + get_key(0), + Lsn(i as u64 * 16 + 0x10), + Value::Image(get_large_img()), + &tline, + &ctx, + ) + .await + .unwrap(); + } + let delta_layers = delta_writer + .finish(&tline, &ctx, get_key(N as u32)) + .await + .unwrap(); + assert_eq!(delta_layers.len(), 1); + } +} diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 5f39c46a84..478e9bb4f0 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -2,6 +2,7 @@ //! 
such as compaction and GC use std::ops::ControlFlow; +use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -9,15 +10,17 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::throttle::Stats; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; +use rand::Rng; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::{backoff, completion}; +use utils::{backoff, completion, pausable_failpoint}; static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = once_cell::sync::Lazy::new(|| { - let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS; + let total_threads = task_mgr::TOKIO_WORKER_THREADS.get(); let permits = usize::max( 1, // while a lot of the work is done on spawn_blocking, we still do @@ -37,12 +40,13 @@ static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy &'static str { - let s: &'static str = self.into(); - s + self.into() } } @@ -61,16 +64,15 @@ impl BackgroundLoopKind { pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, -) -> impl Drop { - let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE - .with_label_values(&[loop_kind.as_static_str()]) - .guard(); +) -> tokio::sync::SemaphorePermit<'static> { + let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind); pausable_failpoint!( "initial-size-calculation-permit-pause", loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation ); + // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id(); match CONCURRENT_BACKGROUND_TASKS.acquire().await { Ok(permit) => permit, Err(_closed) => unreachable!("we never close the semaphore"), @@ -86,10 +88,9 @@ pub fn start_background_loops( task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, - Some(tenant_shard_id), + tenant_shard_id, None, &format!("compactor for tenant {tenant_shard_id}"), - false, { let tenant = Arc::clone(tenant); let background_jobs_can_start = background_jobs_can_start.cloned(); @@ -100,6 +101,7 @@ pub fn start_background_loops( _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} }; compaction_loop(tenant, cancel) + // If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; Ok(()) @@ -109,10 +111,9 @@ pub fn start_background_loops( task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::GarbageCollector, - Some(tenant_shard_id), + tenant_shard_id, None, &format!("garbage collector for tenant {tenant_shard_id}"), - false, { let tenant = Arc::clone(tenant); let background_jobs_can_start = background_jobs_can_start.cloned(); @@ -129,6 +130,29 @@ pub fn start_background_loops( } }, ); + + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::IngestHousekeeping, + tenant_shard_id, + None, + &format!("ingest housekeeping for tenant {tenant_shard_id}"), + { + let tenant = Arc::clone(tenant); + let background_jobs_can_start = background_jobs_can_start.cloned(); + async move { + let cancel = task_mgr::shutdown_token(); + tokio::select! 
{ + _ = cancel.cancelled() => { return Ok(()) }, + _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} + }; + ingest_housekeeping_loop(tenant, cancel) + .instrument(info_span!("ingest_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) + .await; + Ok(()) + } + }, + ); } /// @@ -139,6 +163,8 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { // How many errors we have seen consequtively let mut error_run_count = 0; + let mut last_throttle_flag_reset_at = Instant::now(); + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); @@ -165,41 +191,79 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } } - let started_at = Instant::now(); - let sleep_duration = if period == Duration::ZERO { + + let sleep_duration; + if period == Duration::ZERO { #[cfg(not(feature = "testing"))] info!("automatic compaction is disabled"); // check again in 10 seconds, in case it's been enabled again. - Duration::from_secs(10) + sleep_duration = Duration::from_secs(10) } else { + let iteration = Iteration { + started_at: Instant::now(), + period, + kind: BackgroundLoopKind::Compaction, + }; + // Run compaction - if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count + 1, - 1.0, - MAX_BACKOFF_SECS, - ); - error_run_count += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); - log_compaction_error( - &e, - error_run_count, - &wait_duration, - cancel.is_cancelled(), - ); - wait_duration - } else { - error_run_count = 0; - period + let IterationResult { output, elapsed } = iteration.run(tenant.compaction_iteration(&cancel, &ctx)).await; + match output { + Ok(has_pending_task) => { + error_run_count = 0; + // schedule the next compaction immediately in case there is a pending compaction task + sleep_duration = if has_pending_task { Duration::ZERO } else { period }; + } + Err(e) => { + let wait_duration = backoff::exponential_backoff_duration_seconds( + error_run_count + 1, + 1.0, + MAX_BACKOFF_SECS, + ); + error_run_count += 1; + let wait_duration = Duration::from_secs_f64(wait_duration); + log_compaction_error( + &e, + error_run_count, + &wait_duration, + cancel.is_cancelled(), + ); + sleep_duration = wait_duration; + } } + + // the duration is recorded by performance tests by enabling debug in this function + tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete"); }; - warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction); // Perhaps we did no work and the walredo process has been idle for some time: // give it a chance to shut down to avoid leaving walredo process running indefinitely. - tenant.walredo_mgr.maybe_quiesce(period * 10); + if let Some(walredo_mgr) = &tenant.walredo_mgr { + walredo_mgr.maybe_quiesce(period * 10); + } + + // TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off, + // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens. 
+ info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { + let now = Instant::now(); + let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); + let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats(); + if count_throttled == 0 { + return; + } + let allowed_rps = tenant.timeline_get_throttle.steady_rps(); + let delta = now - prev; + info!( + n_seconds=%format_args!("{:.3}", + delta.as_secs_f64()), + count_accounted, + count_throttled, + sum_throttled_usecs, + allowed_rps=%format_args!("{allowed_rps:.0}"), + "shard was throttled in the last n_seconds" + ); + }); // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) @@ -280,6 +344,7 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { // cutoff specified as time. let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); + let mut first = true; loop { tokio::select! { @@ -296,45 +361,67 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { if first { first = false; - if random_init_delay(period, &cancel).await.is_err() { + + let delays = async { + delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?; + random_init_delay(period, &cancel).await?; + Ok::<_, Cancelled>(()) + }; + + if delays.await.is_err() { break; } } - let started_at = Instant::now(); - let gc_horizon = tenant.get_gc_horizon(); - let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 { + let sleep_duration; + if period == Duration::ZERO || gc_horizon == 0 { #[cfg(not(feature = "testing"))] info!("automatic GC is disabled"); // check again in 10 seconds, in case it's been enabled again. - Duration::from_secs(10) + sleep_duration = Duration::from_secs(10); } else { + let iteration = Iteration { + started_at: Instant::now(), + period, + kind: BackgroundLoopKind::Gc, + }; // Run gc - let res = tenant - .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx) + let IterationResult { output, elapsed: _ } = + iteration.run(tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)) .await; - if let Err(e) = res { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count + 1, - 1.0, - MAX_BACKOFF_SECS, - ); - error_run_count += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); - error!( - "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}", - ); - wait_duration - } else { - error_run_count = 0; - period + match output { + Ok(_) => { + error_run_count = 0; + sleep_duration = period; + } + Err(crate::tenant::GcError::TenantCancelled) => { + return; + } + Err(e) => { + let wait_duration = backoff::exponential_backoff_duration_seconds( + error_run_count + 1, + 1.0, + MAX_BACKOFF_SECS, + ); + error_run_count += 1; + let wait_duration = Duration::from_secs_f64(wait_duration); + + if matches!(e, crate::tenant::GcError::TimelineCancelled) { + // Timeline was cancelled during gc. We might either be in an event + // that affects the entire tenant (tenant deletion, pageserver shutdown), + // or in one that affects the timeline only (timeline deletion). + // Therefore, don't exit the loop. 
+ info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); + } else { + error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); + } + + sleep_duration = wait_duration; + } + } }; - warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc); - - // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) .await .is_ok() @@ -347,6 +434,61 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) { TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); } +async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) { + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + async { + loop { + tokio::select! { + _ = cancel.cancelled() => { + return; + }, + tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { + ControlFlow::Break(()) => return, + ControlFlow::Continue(()) => (), + }, + } + + // We run ingest housekeeping with the same frequency as compaction: it is not worth + // having a distinct setting. But we don't run it in the same task, because compaction + // blocks on acquiring the background job semaphore. + let period = tenant.get_compaction_period(); + + // If compaction period is set to zero (to disable it), then we will use a reasonable default + let period = if period == Duration::ZERO { + humantime::Duration::from_str( + pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD, + ) + .unwrap() + .into() + } else { + period + }; + + // Jitter the period by +/- 5% + let period = + rand::thread_rng().gen_range((period * (95)) / 100..(period * (105)) / 100); + + // Always sleep first: we do not need to do ingest housekeeping early in the lifetime of + // a tenant, since it won't have started writing any ephemeral files yet. + if tokio::time::timeout(period, cancel.cancelled()) + .await + .is_ok() + { + break; + } + + let iteration = Iteration { + started_at: Instant::now(), + period, + kind: BackgroundLoopKind::IngestHouseKeeping, + }; + iteration.run(tenant.ingest_housekeeping()).await; + } + } + .await; + TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); +} + async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> { // if the tenant has a proper status already, no need to wait for anything if tenant.current_state() == TenantState::Active { @@ -388,8 +530,6 @@ pub(crate) async fn random_init_delay( period: Duration, cancel: &CancellationToken, ) -> Result<(), Cancelled> { - use rand::Rng; - if period == Duration::ZERO { return Ok(()); } @@ -405,6 +545,69 @@ } } +/// Delays GC by the default lease length at restart. +/// +/// We do this as the lease mappings are not persisted to disk. By delaying GC by the default +/// lease length, we guarantee that all the leases we granted before the restart will expire +/// when we run GC for the first time after the restart.
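// Editorial sketch, not part of the patch: assume a default lease length L
// (for illustration only). A lease granted just before a restart at t=0 is
// guaranteed to have expired before the first post-restart GC run:
//
//     t = 0        restart; the in-memory lease map is lost
//     0 <= t < L   delay_by_lease_length(L, &cancel).await   // GC waits
//     t >= L       all pre-restart leases have expired; GC may proceed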
+pub(crate) async fn delay_by_lease_length( + length: Duration, + cancel: &CancellationToken, +) -> Result<(), Cancelled> { + match tokio::time::timeout(length, cancel.cancelled()).await { + Ok(_) => Err(Cancelled), + Err(_) => Ok(()), + } +} + +struct Iteration { + started_at: Instant, + period: Duration, + kind: BackgroundLoopKind, +} + +struct IterationResult<O> { + output: O, + elapsed: Duration, +} + +impl Iteration { + #[instrument(skip_all)] + pub(crate) async fn run<Fut, O>(self, fut: Fut) -> IterationResult<O> + where + Fut: std::future::Future<Output = O>, + { + let Self { + started_at, + period, + kind, + } = self; + + let mut fut = std::pin::pin!(fut); + + // Wrap `fut` into a future that logs a message every `period` so that we get a + // very obvious breadcrumb in the logs _while_ a slow iteration is happening. + let liveness_logger = async move { + loop { + match tokio::time::timeout(period, &mut fut).await { + Ok(x) => return x, + Err(_) => { + // info level as per the same rationale why warn_when_period_overrun is info + // => https://github.com/neondatabase/neon/pull/5724 + info!("still running"); + } + } + } + }; + + let output = liveness_logger.await; + + let elapsed = started_at.elapsed(); + warn_when_period_overrun(elapsed, period, kind); + + IterationResult { output, elapsed } + } +} /// Attention: the `task` and `period` become labels of a pageserver-wide prometheus metric. pub(crate) fn warn_when_period_overrun( elapsed: Duration, diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs new file mode 100644 index 0000000000..f222e708e1 --- /dev/null +++ b/pageserver/src/tenant/throttle.rs @@ -0,0 +1,171 @@ +use std::{ + str::FromStr, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, Mutex, + }, + time::{Duration, Instant}, +}; + +use arc_swap::ArcSwap; +use enumset::EnumSet; +use tracing::{error, warn}; +use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; + +use crate::{context::RequestContext, task_mgr::TaskKind}; + +/// Throttle for `async` functions. +/// +/// Runtime reconfigurable. +/// +/// To share a throttle among multiple entities, wrap it in an [`Arc`]. +/// +/// The initial use case for this is tenant-wide throttling of getpage@lsn requests. +pub struct Throttle<M: Metric> { + inner: ArcSwap<Inner>, + metric: M, + /// will be turned into [`Stats::count_accounted`] + count_accounted: AtomicU64, + /// will be turned into [`Stats::count_throttled`] + count_throttled: AtomicU64, + /// will be turned into [`Stats::sum_throttled_usecs`] + sum_throttled_usecs: AtomicU64, +} + +pub struct Inner { + task_kinds: EnumSet<TaskKind>, + rate_limiter: Arc<RateLimiter>, +} + +pub type Config = pageserver_api::models::ThrottleConfig; + +pub struct Observation { + pub wait_time: Duration, +} +pub trait Metric { + fn observe_throttling(&self, observation: &Observation); +} + +/// See [`Throttle::reset_stats`]. +pub struct Stats { + // Number of requests that were subject to throttling, i.e., requests of the configured [`Config::task_kinds`]. + pub count_accounted: u64, + // Subset of the `accounted` requests that were actually throttled. + // Note that the numbers are stored as two independent atomics, so, there might be a slight drift. + pub count_throttled: u64, + // Sum of microseconds that throttled requests spent waiting for throttling.
+    pub sum_throttled_usecs: u64,
+}
+
+impl<M> Throttle<M>
+where
+    M: Metric,
+{
+    pub fn new(config: Config, metric: M) -> Self {
+        Self {
+            inner: ArcSwap::new(Arc::new(Self::new_inner(config))),
+            metric,
+            count_accounted: AtomicU64::new(0),
+            count_throttled: AtomicU64::new(0),
+            sum_throttled_usecs: AtomicU64::new(0),
+        }
+    }
+    fn new_inner(config: Config) -> Inner {
+        let Config {
+            task_kinds,
+            initial,
+            refill_interval,
+            refill_amount,
+            max,
+        } = config;
+        let task_kinds: EnumSet<TaskKind> = task_kinds
+            .iter()
+            .filter_map(|s| match TaskKind::from_str(s) {
+                Ok(v) => Some(v),
+                Err(e) => {
+                    // TODO: avoid this failure mode
+                    error!(
+                        "cannot parse task kind, ignoring for rate limiting {}",
+                        utils::error::report_compact_sources(&e)
+                    );
+                    None
+                }
+            })
+            .collect();
+
+        // steady rate, we expect `refill_amount` requests per `refill_interval`.
+        // dividing gives us the rps.
+        let rps = f64::from(refill_amount.get()) / refill_interval.as_secs_f64();
+        let config = LeakyBucketConfig::new(rps, f64::from(max));
+
+        // initial tracks how many tokens are available to put in the bucket
+        // we want how many tokens are currently in the bucket
+        let initial_tokens = max - initial;
+
+        let rate_limiter = RateLimiter::with_initial_tokens(config, f64::from(initial_tokens));
+
+        Inner {
+            task_kinds,
+            rate_limiter: Arc::new(rate_limiter),
+        }
+    }
+    pub fn reconfigure(&self, config: Config) {
+        self.inner.store(Arc::new(Self::new_inner(config)));
+    }
+
+    /// The [`Throttle`] keeps an internal flag that is true if there was ever any actual throttling.
+    /// This method allows retrieving & resetting that flag.
+    /// Useful for periodic reporting.
+    pub fn reset_stats(&self) -> Stats {
+        let count_accounted = self.count_accounted.swap(0, Ordering::Relaxed);
+        let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed);
+        let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed);
+        Stats {
+            count_accounted,
+            count_throttled,
+            sum_throttled_usecs,
+        }
+    }
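As context for `new_inner` above: the public config speaks in terms of a refill schedule, while the leaky bucket wants a steady rate plus a starting fill. A small illustrative calculation, with standalone values standing in for the real `ThrottleConfig` fields:

```rust
fn main() {
    // Hypothetical config values, standing in for the ThrottleConfig fields above.
    let refill_amount: u32 = 100; // tokens added per refill tick
    let refill_interval_secs = 0.5; // seconds between ticks
    let max: u32 = 1_000; // bucket capacity
    let initial: u32 = 250; // tokens still *available to add*, per the comment above

    // Steady rate: refill_amount tokens per refill_interval => requests per second.
    let rps = f64::from(refill_amount) / refill_interval_secs;
    assert_eq!(rps, 200.0);

    // The limiter wants how many tokens are *in* the bucket at start.
    let initial_tokens = f64::from(max - initial);
    assert_eq!(initial_tokens, 750.0);
}
```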
+
+    /// See [`Config::steady_rps`].
+    pub fn steady_rps(&self) -> f64 {
+        self.inner.load().rate_limiter.steady_rps()
+    }
+
+    pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option<Duration> {
+        let inner = self.inner.load_full(); // clones the `Inner` Arc
+        if !inner.task_kinds.contains(ctx.task_kind()) {
+            return None;
+        };
+        let start = std::time::Instant::now();
+
+        let did_throttle = inner.rate_limiter.acquire(key_count).await;
+
+        self.count_accounted.fetch_add(1, Ordering::Relaxed);
+        if did_throttle {
+            self.count_throttled.fetch_add(1, Ordering::Relaxed);
+            let now = Instant::now();
+            let wait_time = now - start;
+            self.sum_throttled_usecs
+                .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
+            let observation = Observation { wait_time };
+            self.metric.observe_throttling(&observation);
+            match ctx.micros_spent_throttled.add(wait_time) {
+                Ok(res) => res,
+                Err(error) => {
+                    use once_cell::sync::Lazy;
+                    use utils::rate_limit::RateLimit;
+                    static WARN_RATE_LIMIT: Lazy<Mutex<RateLimit>> =
+                        Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
+                    let mut guard = WARN_RATE_LIMIT.lock().unwrap();
+                    guard.call(move || {
+                        warn!(error, "error adding time spent throttled; this message is logged at a global rate limit");
+                    });
+                }
+            }
+            Some(wait_time)
+        } else {
+            None
+        }
+    }
+}
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index c21fe94d01..262dccac7d 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,5 +1,9 @@
+pub(crate) mod analysis;
+pub(crate) mod compaction;
 pub mod delete;
+pub(crate) mod detach_ancestor;
 mod eviction_task;
+pub(crate) mod handle;
 mod init;
 pub mod layer_manager;
 pub(crate) mod logical_size;
@@ -8,19 +12,27 @@ pub mod uninit;
 mod walreceiver;

 use anyhow::{anyhow, bail, ensure, Context, Result};
+use arc_swap::ArcSwap;
 use bytes::Bytes;
-use camino::{Utf8Path, Utf8PathBuf};
+use camino::Utf8Path;
+use chrono::{DateTime, Utc};
 use enumset::EnumSet;
 use fail::fail_point;
-use itertools::Itertools;
+use handle::ShardTimelineId;
+use once_cell::sync::Lazy;
 use pageserver_api::{
-    keyspace::{key_range_size, KeySpaceAccum},
+    key::{
+        CompactKey, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
+        NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
+    },
+    keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
     models::{
+        AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings,
         DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
-        LayerMapInfo, TimelineState,
+        InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState,
     },
     reltag::BlockNumber,
-    shard::{ShardIdentity, TenantShardId},
+    shard::{ShardIdentity, ShardNumber, TenantShardId},
 };
 use rand::Rng;
 use serde_with::serde_as;
@@ -31,51 +43,71 @@ use tokio::{
 };
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::sync::gate::Gate;
+use utils::{
+    fs_ext, pausable_failpoint,
+    sync::gate::{Gate, GateGuard},
+};

-use std::collections::{BTreeMap, BinaryHeap, HashMap, HashSet};
-use std::ops::{Deref, Range};
 use std::pin::pin;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
 use std::{
-    cmp::{max, min, Ordering},
-    ops::ControlFlow,
+    array,
+    collections::{BTreeMap, HashMap, HashSet},
+    sync::atomic::AtomicU64,
+};
+use std::{cmp::min, ops::ControlFlow};
+use std::{
+    collections::btree_map::Entry,
+    ops::{Deref, Range},
 };

-use
crate::tenant::timeline::logical_size::CurrentLogicalSize; -use crate::tenant::{ - layer_map::{LayerMap, SearchResult}, - metadata::{save_metadata, TimelineMetadata}, - par_fsync, +use crate::{ + aux_file::AuxFileSizeEstimator, + tenant::{ + layer_map::{LayerMap, SearchResult}, + metadata::TimelineMetadata, + storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc}, + }, + walredo, }; use crate::{ - context::{AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder}, + context::{DownloadBehavior, RequestContext}, disk_usage_eviction_task::DiskUsageEvictionInfo, + pgdatadir_mapping::CollectKeySpaceError, }; -use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError}; use crate::{ disk_usage_eviction_task::finite_f32, tenant::storage_layer::{ AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, - LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult, - ValueReconstructState, + LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructState, + ValuesReconstructState, }, }; use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; +use crate::{ + l0_flush::{self, L0FlushGlobalState}, + metrics::GetKind, +}; +use crate::{ + metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, +}; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey}; +use crate::{ + pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, + virtual_file::{MaybeFatalIo, VirtualFile}, +}; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; -use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum}; -use crate::metrics::{ - TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, -}; +use crate::keyspace::{KeyPartitioning, KeySpace}; +use crate::metrics::TimelineMetrics; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::config::TenantConfOpt; -use pageserver_api::key::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key}; use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardIndex; @@ -90,7 +122,6 @@ use utils::{ simple_rcu::{Rcu, RcuReadGuard}, }; -use crate::page_cache; use crate::repository::GcResult; use crate::repository::{Key, Value}; use crate::task_mgr; @@ -104,14 +135,23 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::config::TenantConf; -use super::remote_timeline_client::index::{IndexLayerMetadata, IndexPart}; -use super::remote_timeline_client::RemoteTimelineClient; -use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; +use super::{ + config::TenantConf, storage_layer::inmemory_layer, storage_layer::LayerVisibilityHint, + upload_queue::NotInitialized, +}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; +use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; +use super::{ + remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, + storage_layer::ReadableLayer, +}; +use super::{ + secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, + GcError, +}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub(super) enum FlushLoopState { +pub(crate) enum 
FlushLoopState { NotStarted, Running { #[cfg(test)] @@ -122,28 +162,28 @@ pub(super) enum FlushLoopState { Exited, } -/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Hole { - key_range: Range, - coverage_size: usize, +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum ImageLayerCreationMode { + /// Try to create image layers based on `time_for_new_image_layer`. Used in compaction code path. + Try, + /// Force creating the image layers if possible. For now, no image layers will be created + /// for metadata keys. Used in compaction code path with force flag enabled. + Force, + /// Initial ingestion of the data, and no data should be dropped in this function. This + /// means that no metadata keys should be included in the partitions. Used in flush frozen layer + /// code path. + Initial, } -impl Ord for Hole { - fn cmp(&self, other: &Self) -> Ordering { - other.coverage_size.cmp(&self.coverage_size) // inverse order - } -} - -impl PartialOrd for Hole { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) +impl std::fmt::Display for ImageLayerCreationMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self) } } /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things. /// Can be removed after all refactors are done. -fn drop_rlock(rlock: tokio::sync::OwnedRwLockReadGuard) { +fn drop_rlock(rlock: tokio::sync::RwLockReadGuard) { drop(rlock) } @@ -155,13 +195,31 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { /// The outward-facing resources required to build a Timeline pub struct TimelineResources { - pub remote_client: Option, - pub deletion_queue_client: DeletionQueueClient, + pub remote_client: RemoteTimelineClient, + pub timeline_get_throttle: Arc< + crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, + >, + pub l0_flush_global_state: l0_flush::L0FlushGlobalState, +} + +pub(crate) struct AuxFilesState { + pub(crate) dir: Option, + pub(crate) n_deltas: usize, +} + +/// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL +/// ingestion considerably, because WAL ingestion needs to check on most records if the record +/// implicitly extends the relation. At startup, `complete_as_of` is initialized to the current end +/// of the timeline (disk_consistent_lsn). It's used on reads of relation sizes to check if the +/// value can be used to also update the cache, see [`Timeline::update_cached_rel_size`]. +pub(crate) struct RelSizeCache { + pub(crate) complete_as_of: Lsn, + pub(crate) map: HashMap, } pub struct Timeline { - conf: &'static PageServerConf, - tenant_conf: Arc>, + pub(crate) conf: &'static PageServerConf, + tenant_conf: Arc>, myself: Weak, @@ -198,29 +256,20 @@ pub struct Timeline { /// /// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`, /// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`. - pub(crate) layers: Arc>, - - /// Set of key ranges which should be covered by image layers to - /// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored. - /// It is used by compaction task when it checks if new image layer should be created. 
-    /// Newly created image layer doesn't help to remove the delta layer, until the
-    /// newly created image layer falls off the PITR horizon. So on next GC cycle,
-    /// gc_timeline may still want the new image layer to be created. To avoid redundant
-    /// image layers creation we should check if image layer exists but beyond PITR horizon.
-    /// This is why we need remember GC cutoff LSN.
-    ///
-    wanted_image_layers: Mutex<Option<(KeySpace, Lsn)>>,
+    pub(crate) layers: tokio::sync::RwLock<LayerManager>,

     last_freeze_at: AtomicLsn,
     // Atomic would be more appropriate here.
     last_freeze_ts: RwLock<Instant>,

-    // WAL redo manager
-    walredo_mgr: Arc<super::WalRedoManager>,
+    pub(crate) standby_horizon: AtomicLsn,
+
+    // WAL redo manager. `None` only for broken tenants.
+    walredo_mgr: Option<Arc<super::WalRedoManager>>,

     /// Remote storage client.
     /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
-    pub remote_client: Option<Arc<RemoteTimelineClient>>,
+    pub remote_client: Arc<RemoteTimelineClient>,

     // What page versions do we hold in the repository? If we get a
     // request > last_record_lsn, we need to wait until we receive all
@@ -257,21 +306,27 @@ pub struct Timeline {
     // in `crate::page_service` writes these metrics.
     pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline,

+    directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM],
+
     /// Ensures layers aren't frozen by checkpointer between
     /// [`Timeline::get_layer_for_write`] and layer reads.
     /// Locked automatically by [`TimelineWriter`] and checkpointer.
     /// Must always be acquired before the layer map/individual layer lock
     /// to avoid deadlock.
-    write_lock: tokio::sync::Mutex<()>,
+    ///
+    /// The state is cleared upon freezing.
+    write_lock: tokio::sync::Mutex<Option<TimelineWriterState>>,

     /// Used to avoid multiple `flush_loop` tasks running
     pub(super) flush_loop_state: Mutex<FlushLoopState>,

     /// layer_flush_start_tx can be used to wake up the layer-flushing task.
-    /// The value is a counter, incremented every time a new flush cycle is requested.
-    /// The flush cycle counter is sent back on the layer_flush_done channel when
-    /// the flush finishes. You can use that to wait for the flush to finish.
-    layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
+    /// - The u64 value is a counter, incremented every time a new flush cycle is requested.
+    ///   The flush cycle counter is sent back on the layer_flush_done channel when
+    ///   the flush finishes. You can use that to wait for the flush to finish.
+    /// - The LSN is updated to max() of its current value and the latest disk_consistent_lsn
+    ///   read by whoever sends an update
+    layer_flush_start_tx: tokio::sync::watch::Sender<(u64, Lsn)>,
     /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
     layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,
@@ -280,7 +335,7 @@ pub struct Timeline {

     // List of child timelines and their branch points. This is needed to avoid
     // garbage collecting data that is still needed by the child timelines.
-    pub gc_info: std::sync::RwLock<GcInfo>,
+    pub(crate) gc_info: std::sync::RwLock<GcInfo>,

     // It may change across major versions so for simplicity
     // keep it after running initdb for a timeline.
@@ -290,12 +345,15 @@ pub struct Timeline {
     // though let's keep them both for better error visibility.
     pub initdb_lsn: Lsn,

-    /// When did we last calculate the partitioning?
-    partitioning: Mutex<(KeyPartitioning, Lsn)>,
+    /// When did we last calculate the partitioning? Made `pub(super)` so test cases can access it.
+    pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,

     /// Configuration: how often should the partitioning be recalculated.
     repartition_threshold: u64,

+    last_image_layer_creation_check_at: AtomicLsn,
+    last_image_layer_creation_check_instant: std::sync::Mutex<Option<Instant>>,
+
     /// Current logical size of the "datadir", at the last LSN.
     current_logical_size: LogicalSize,

@@ -306,7 +364,7 @@ pub struct Timeline {
     pub walreceiver: Mutex<Option<WalReceiver>>,

     /// Relation size cache
-    pub rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
+    pub(crate) rel_size_cache: RwLock<RelSizeCache>,

     download_all_remote_layers_task_info: RwLock<Option<DownloadRemoteLayersTaskInfo>>,

@@ -333,7 +391,7 @@ pub struct Timeline {
     ///
     /// Must only be taken in two places:
     /// - [`Timeline::compact`] (this file)
-    /// - [`delete::delete_local_layer_files`]
+    /// - [`delete::delete_local_timeline_directory`]
     ///
     /// Timeline deletion will acquire both compaction and gc locks in whatever order.
     compaction_lock: tokio::sync::Mutex<()>,
@@ -342,10 +400,36 @@ pub struct Timeline {
     ///
     /// Must only be taken in two places:
     /// - [`Timeline::gc`] (this file)
-    /// - [`delete::delete_local_layer_files`]
+    /// - [`delete::delete_local_timeline_directory`]
     ///
     /// Timeline deletion will acquire both compaction and gc locks in whatever order.
     gc_lock: tokio::sync::Mutex<()>,
+
+    /// Cloned from [`super::Tenant::timeline_get_throttle`] on construction.
+    timeline_get_throttle: Arc<
+        crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
+    >,
+
+    /// Keep aux directory cache to avoid its reconstruction on each update
+    pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
+
+    /// Size estimator for aux file v2
+    pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,
+
+    /// Indicate whether aux file v2 storage is enabled.
+    pub(crate) last_aux_file_policy: AtomicAuxFilePolicy,
+
+    /// Some test cases directly place keys into the timeline without actually modifying the directory
+    /// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that
+    /// these keys won't get garbage-collected during compaction/GC. This field only modifies the dense
+    /// keyspace return value of `collect_keyspace`. For sparse keyspaces, use AUX keys for testing, and
+    /// in the future, add `extra_test_sparse_keyspace` if necessary.
+    #[cfg(test)]
+    pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,
+
+    pub(crate) l0_flush_global_state: L0FlushGlobalState,
+
+    pub(crate) handles: handle::PerTimelineState,
 }

 pub struct WalReceiverInfo {
@@ -354,54 +438,164 @@ pub struct WalReceiverInfo {
     pub last_received_msg_ts: u128,
 }

-///
 /// Information about how much history needs to be retained, needed by
 /// Garbage Collection.
-///
-pub struct GcInfo {
+#[derive(Default)]
+pub(crate) struct GcInfo {
     /// Specific LSNs that are needed.
     ///
     /// Currently, this includes all points where child branches have
     /// been forked off from. In the future, could also include
     /// explicit user-defined snapshot points.
-    pub retain_lsns: Vec<Lsn>,
+    pub(crate) retain_lsns: Vec<(Lsn, TimelineId)>,

-    /// In addition to 'retain_lsns', keep everything newer than this
-    /// point.
-    ///
-    /// This is calculated by subtracting 'gc_horizon' setting from
-    /// last-record LSN
-    ///
-    /// FIXME: is this inclusive or exclusive?
-    pub horizon_cutoff: Lsn,
+    /// The cutoff coordinates, which are combined by selecting the minimum.
+    pub(crate) cutoffs: GcCutoffs,

-    /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this
-    /// point.
- /// - /// This is calculated by finding a number such that a record is needed for PITR - /// if only if its LSN is larger than 'pitr_cutoff'. - pub pitr_cutoff: Lsn, + /// Leases granted to particular LSNs. + pub(crate) leases: BTreeMap, + + /// Whether our branch point is within our ancestor's PITR interval (for cost estimation) + pub(crate) within_ancestor_pitr: bool, +} + +impl GcInfo { + pub(crate) fn min_cutoff(&self) -> Lsn { + self.cutoffs.select_min() + } + + pub(super) fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) { + self.retain_lsns.push((child_lsn, child_id)); + self.retain_lsns.sort_by_key(|i| i.0); + } + + pub(super) fn remove_child(&mut self, child_id: TimelineId) { + self.retain_lsns.retain(|i| i.1 != child_id); + } +} + +/// The `GcInfo` component describing which Lsns need to be retained. Functionally, this +/// is a single number (the oldest LSN which we must retain), but it internally distinguishes +/// between time-based and space-based retention for observability and consumption metrics purposes. +#[derive(Debug, Clone)] +pub(crate) struct GcCutoffs { + /// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much + /// history we must keep to retain a specified number of bytes of WAL. + pub(crate) space: Lsn, + + /// Calculated from [`TenantConf::pitr_interval`], this LSN indicates how much + /// history we must keep to enable reading back at least the PITR interval duration. + pub(crate) time: Lsn, +} + +impl Default for GcCutoffs { + fn default() -> Self { + Self { + space: Lsn::INVALID, + time: Lsn::INVALID, + } + } +} + +impl GcCutoffs { + fn select_min(&self) -> Lsn { + std::cmp::min(self.space, self.time) + } +} + +pub(crate) struct TimelineVisitOutcome { + completed_keyspace: KeySpace, + image_covered_keyspace: KeySpace, } /// An error happened in a get() operation. 
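The `GcCutoffs` pair above replaces the old `horizon_cutoff`/`pitr_cutoff` fields: GC must retain everything newer than the *older* (smaller) of the space-based and time-based LSNs, hence the min. A self-contained sketch with a stand-in `Lsn` newtype (the real type lives in `utils::lsn`):

```rust
// Stand-in for utils::lsn::Lsn, for illustration only.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Lsn(u64);

struct GcCutoffs {
    space: Lsn, // derived from gc_horizon (bytes of WAL to keep)
    time: Lsn,  // derived from pitr_interval (duration of history to keep)
}

impl GcCutoffs {
    fn select_min(&self) -> Lsn {
        std::cmp::min(self.space, self.time)
    }
}

fn main() {
    let c = GcCutoffs { space: Lsn(0x30), time: Lsn(0x10) };
    // PITR demands more history than the space horizon here, so the time cutoff wins.
    assert_eq!(c.select_min(), Lsn(0x10));
}
```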
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum PageReconstructError {
     #[error(transparent)]
-    Other(#[from] anyhow::Error),
+    Other(anyhow::Error),

     #[error("Ancestor LSN wait error: {0}")]
-    AncestorLsnTimeout(#[from] WaitLsnError),
+    AncestorLsnTimeout(WaitLsnError),

     #[error("timeline shutting down")]
     Cancelled,

-    /// The ancestor of this is being stopped
-    #[error("ancestor timeline {0} is being stopped")]
-    AncestorStopping(TimelineId),
-
     /// An error happened replaying WAL records
     #[error(transparent)]
     WalRedo(anyhow::Error),
+
+    #[error("{0}")]
+    MissingKey(MissingKeyError),
+}
+
+impl From<anyhow::Error> for PageReconstructError {
+    fn from(value: anyhow::Error) -> Self {
+        // in walingest.rs, many PageReconstructErrors are wrapped in an anyhow::Error
+        match value.downcast::<PageReconstructError>() {
+            Ok(pre) => pre,
+            Err(other) => PageReconstructError::Other(other),
+        }
+    }
+}
+
+impl From<utils::bin_ser::DeserializeError> for PageReconstructError {
+    fn from(value: utils::bin_ser::DeserializeError) -> Self {
+        PageReconstructError::Other(anyhow::Error::new(value).context("deserialization failure"))
+    }
+}
+
+impl From<layer_manager::Shutdown> for PageReconstructError {
+    fn from(_: layer_manager::Shutdown) -> Self {
+        PageReconstructError::Cancelled
+    }
+}
+
+impl GetVectoredError {
+    #[cfg(test)]
+    pub(crate) fn is_missing_key_error(&self) -> bool {
+        matches!(self, Self::MissingKey(_))
+    }
+}
+
+impl From<layer_manager::Shutdown> for GetVectoredError {
+    fn from(_: layer_manager::Shutdown) -> Self {
+        GetVectoredError::Cancelled
+    }
+}
+
+#[derive(thiserror::Error)]
+pub struct MissingKeyError {
+    key: Key,
+    shard: ShardNumber,
+    cont_lsn: Lsn,
+    request_lsn: Lsn,
+    ancestor_lsn: Option<Lsn>,
+    backtrace: Option<std::backtrace::Backtrace>,
+}
+
+impl std::fmt::Debug for MissingKeyError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self)
+    }
+}
+
+impl std::fmt::Display for MissingKeyError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "could not find data for key {} (shard {:?}) at LSN {}, request LSN {}",
+            self.key, self.shard, self.cont_lsn, self.request_lsn
+        )?;
+        if let Some(ref ancestor_lsn) = self.ancestor_lsn {
+            write!(f, ", ancestor {}", ancestor_lsn)?;
+        }
+
+        if let Some(ref backtrace) = self.backtrace {
+            write!(f, "\n{}", backtrace)?;
+        }
+
+        Ok(())
+    }
 }

 impl PageReconstructError {
@@ -409,40 +603,74 @@ impl PageReconstructError {
     pub(crate) fn is_stopping(&self) -> bool {
         use PageReconstructError::*;
         match self {
-            Other(_) => false,
-            AncestorLsnTimeout(_) => false,
-            Cancelled | AncestorStopping(_) => true,
-            WalRedo(_) => false,
+            Cancelled => true,
+            Other(_) | AncestorLsnTimeout(_) | WalRedo(_) | MissingKey(_) => false,
         }
     }
 }

 #[derive(thiserror::Error, Debug)]
-enum CreateImageLayersError {
+pub(crate) enum CreateImageLayersError {
     #[error("timeline shutting down")]
     Cancelled,

-    #[error(transparent)]
-    GetVectoredError(GetVectoredError),
+    #[error("read failed")]
+    GetVectoredError(#[source] GetVectoredError),

-    #[error(transparent)]
-    PageReconstructError(PageReconstructError),
+    #[error("reconstruction failed")]
+    PageReconstructError(#[source] PageReconstructError),

     #[error(transparent)]
     Other(#[from] anyhow::Error),
 }

-#[derive(thiserror::Error, Debug)]
-enum FlushLayerError {
+impl From<layer_manager::Shutdown> for CreateImageLayersError {
+    fn from(_: layer_manager::Shutdown) -> Self {
+        CreateImageLayersError::Cancelled
+    }
+}
+
+#[derive(thiserror::Error, Debug, Clone)]
+pub(crate) enum FlushLayerError {
     /// Timeline cancellation token was cancelled
     #[error("timeline shutting down")]
     Cancelled,

-    #[error(transparent)]
-
CreateImageLayersError(CreateImageLayersError), + /// We tried to flush a layer while the Timeline is in an unexpected state + #[error("cannot flush frozen layers when flush_loop is not running, state is {0:?}")] + NotRunning(FlushLoopState), - #[error(transparent)] - Other(#[from] anyhow::Error), + // Arc<> the following non-clonable error types: we must be Clone-able because the flush error is propagated from the flush + // loop via a watch channel, where we can only borrow it. + #[error("create image layers (shared)")] + CreateImageLayersError(Arc), + + #[error("other (shared)")] + Other(#[from] Arc), +} + +impl FlushLayerError { + // When crossing from generic anyhow errors to this error type, we explicitly check + // for timeline cancellation to avoid logging inoffensive shutdown errors as warn/err. + fn from_anyhow(timeline: &Timeline, err: anyhow::Error) -> Self { + let cancelled = timeline.cancel.is_cancelled() + // The upload queue might have been shut down before the official cancellation of the timeline. + || err + .downcast_ref::() + .map(NotInitialized::is_stopping) + .unwrap_or_default(); + if cancelled { + Self::Cancelled + } else { + Self::Other(Arc::new(err)) + } + } +} + +impl From for FlushLayerError { + fn from(_: layer_manager::Shutdown) -> Self { + FlushLayerError::Cancelled + } } #[derive(thiserror::Error, Debug)] @@ -450,11 +678,47 @@ pub(crate) enum GetVectoredError { #[error("timeline shutting down")] Cancelled, - #[error("Requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)] + #[error("requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)] Oversized(u64), - #[error("Requested at invalid LSN: {0}")] + #[error("requested at invalid LSN: {0}")] InvalidLsn(Lsn), + + #[error("requested key not found: {0}")] + MissingKey(MissingKeyError), + + #[error("ancestry walk")] + GetReadyAncestorError(#[source] GetReadyAncestorError), + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl From for GetVectoredError { + fn from(value: GetReadyAncestorError) -> Self { + use GetReadyAncestorError::*; + match value { + Cancelled => GetVectoredError::Cancelled, + AncestorLsnTimeout(_) | BadState { .. } => { + GetVectoredError::GetReadyAncestorError(value) + } + } + } +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum GetReadyAncestorError { + #[error("ancestor LSN wait error")] + AncestorLsnTimeout(#[from] WaitLsnError), + + #[error("bad state on timeline {timeline_id}: {state:?}")] + BadState { + timeline_id: TimelineId, + state: TimelineState, + }, + + #[error("cancelled")] + Cancelled, } #[derive(Clone, Copy)] @@ -473,6 +737,9 @@ pub enum GetLogicalSizePriority { #[derive(enumset::EnumSetType)] pub(crate) enum CompactFlags { ForceRepartition, + ForceImageLayerCreation, + EnhancedGcBottomMostCompaction, + DryRun, } impl std::fmt::Debug for Timeline { @@ -488,8 +755,8 @@ pub(crate) enum WaitLsnError { Shutdown, // Called on an timeline not in active state or shutting down - #[error("Bad state (not active)")] - BadState, + #[error("Bad timeline state: {0:?}")] + BadState(TimelineState), // Timeout expired while waiting for LSN to catch up with goal. 
#[error("{0}")] @@ -503,6 +770,9 @@ impl From for CompactionError { fn from(e: CreateImageLayersError) -> Self { match e { CreateImageLayersError::Cancelled => CompactionError::ShuttingDown, + CreateImageLayersError::Other(e) => { + CompactionError::Other(e.context("create image layers")) + } _ => CompactionError::Other(e.into()), } } @@ -512,7 +782,7 @@ impl From for FlushLayerError { fn from(e: CreateImageLayersError) -> Self { match e { CreateImageLayersError::Cancelled => FlushLayerError::Cancelled, - any => FlushLayerError::CreateImageLayersError(any), + any => FlushLayerError::CreateImageLayersError(Arc::new(any)), } } } @@ -535,22 +805,87 @@ impl From for CreateImageLayersError { } } +impl From for PageReconstructError { + fn from(e: GetVectoredError) -> Self { + match e { + GetVectoredError::Cancelled => PageReconstructError::Cancelled, + GetVectoredError::InvalidLsn(_) => PageReconstructError::Other(anyhow!("Invalid LSN")), + err @ GetVectoredError::Oversized(_) => PageReconstructError::Other(err.into()), + GetVectoredError::MissingKey(err) => PageReconstructError::MissingKey(err), + GetVectoredError::GetReadyAncestorError(err) => PageReconstructError::from(err), + GetVectoredError::Other(err) => PageReconstructError::Other(err), + } + } +} + +impl From for PageReconstructError { + fn from(e: GetReadyAncestorError) -> Self { + use GetReadyAncestorError::*; + match e { + AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err), + bad_state @ BadState { .. } => PageReconstructError::Other(anyhow::anyhow!(bad_state)), + Cancelled => PageReconstructError::Cancelled, + } + } +} + +pub(crate) enum WaitLsnWaiter<'a> { + Timeline(&'a Timeline), + Tenant, + PageService, +} + +/// Argument to [`Timeline::shutdown`]. +#[derive(Debug, Clone, Copy)] +pub(crate) enum ShutdownMode { + /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then + /// also to remote storage. This method can easily take multiple seconds for a busy timeline. + /// + /// While we are flushing, we continue to accept read I/O for LSNs ingested before + /// the call to [`Timeline::shutdown`]. + FreezeAndFlush, + /// Shut down immediately, without waiting for any open layers to flush. + Hard, +} + +struct ImageLayerCreationOutcome { + image: Option, + next_start_key: Key, +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created - pub fn get_ancestor_lsn(&self) -> Lsn { + pub(crate) fn get_ancestor_lsn(&self) -> Lsn { self.ancestor_lsn } /// Get the ancestor's timeline id - pub fn get_ancestor_timeline_id(&self) -> Option { + pub(crate) fn get_ancestor_timeline_id(&self) -> Option { self.ancestor_timeline .as_ref() .map(|ancestor| ancestor.timeline_id) } + /// Get the ancestor timeline + pub(crate) fn ancestor_timeline(&self) -> Option<&Arc> { + self.ancestor_timeline.as_ref() + } + + /// Get the bytes written since the PITR cutoff on this branch, and + /// whether this branch's ancestor_lsn is within its parent's PITR. 
+ pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) { + let gc_info = self.gc_info.read().unwrap(); + let history = self + .get_last_record_lsn() + .checked_sub(gc_info.cutoffs.time) + .unwrap_or(Lsn(0)) + .0; + (history, gc_info.within_ancestor_pitr) + } + /// Lock and get timeline's GC cutoff - pub fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { + pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { self.latest_gc_cutoff_lsn.read() } @@ -559,6 +894,8 @@ impl Timeline { /// If a remote layer file is needed, it is downloaded as part of this /// call. /// + /// This method enforces [`Self::timeline_get_throttle`] internally. + /// /// NOTE: It is considered an error to 'get' a key that doesn't exist. The /// abstraction above this needs to store suitable metadata to track what /// data exists with what keys, in separate metadata entries. If a @@ -569,6 +906,7 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. + #[inline(always)] pub(crate) async fn get( &self, key: Key, @@ -584,77 +922,48 @@ impl Timeline { // page_service. debug_assert!(!self.shard_identity.is_key_disposable(&key)); - // XXX: structured stats collection for layer eviction here. - trace!( - "get page request for {}@{} from task kind {:?}", - key, - lsn, - ctx.task_kind() - ); + self.timeline_get_throttle.throttle(ctx, 1).await; - // Check the page cache. We will get back the most recent page with lsn <= `lsn`. - // The cached image can be returned directly if there is no WAL between the cached image - // and requested LSN. The cached image can also be used to reduce the amount of WAL needed - // for redo. - let cached_page_img = match self.lookup_cached_page(&key, lsn, ctx).await { - Some((cached_lsn, cached_img)) => { - match cached_lsn.cmp(&lsn) { - Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check - Ordering::Equal => { - MATERIALIZED_PAGE_CACHE_HIT_DIRECT.inc(); - return Ok(cached_img); // exact LSN match, return the image - } - Ordering::Greater => { - unreachable!("the returned lsn should never be after the requested lsn") - } + let keyspace = KeySpace { + ranges: vec![key..key.next()], + }; + + // Initialise the reconstruct state for the key with the cache + // entry returned above. 
+ let mut reconstruct_state = ValuesReconstructState::new(); + + let vectored_res = self + .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) + .await; + + let key_value = vectored_res?.pop_first(); + match key_value { + Some((got_key, value)) => { + if got_key != key { + error!( + "Expected {}, but singular vectored get returned {}", + key, got_key + ); + Err(PageReconstructError::Other(anyhow!( + "Singular vectored get returned wrong key" + ))) + } else { + value } - Some((cached_lsn, cached_img)) } - None => None, - }; - - let mut reconstruct_state = ValueReconstructState { - records: Vec::new(), - img: cached_page_img, - }; - - let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer(); - let path = self - .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) - .await?; - timer.stop_and_record(); - - let start = Instant::now(); - let res = self.reconstruct_value(key, lsn, reconstruct_state).await; - let elapsed = start.elapsed(); - crate::metrics::RECONSTRUCT_TIME - .for_result(&res) - .observe(elapsed.as_secs_f64()); - - if cfg!(feature = "testing") && res.is_err() { - // it can only be walredo issue - use std::fmt::Write; - - let mut msg = String::new(); - - path.into_iter().for_each(|(res, cont_lsn, layer)| { - writeln!( - msg, - "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}", - layer(), - ) - .expect("string grows") - }); - - // this is to rule out or provide evidence that we could in some cases read a duplicate - // walrecord - tracing::info!("walredo failed, path:\n{msg}"); + None => Err(PageReconstructError::MissingKey(MissingKeyError { + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn: Lsn(0), + request_lsn: lsn, + ancestor_lsn: None, + backtrace: None, + })), } - - res } pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; + pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0; /// Look up multiple page versions at a given LSN /// @@ -662,7 +971,7 @@ impl Timeline { /// which actually vectorizes the read path. 
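The singular `get` above now funnels into the vectored path: it issues a one-key keyspace query and unwraps the single entry, treating an empty or mismatched result as an error. A minimal sketch of that pop-first-and-verify shape with simplified stand-in types (plain `u64` keys and `String` errors):

```rust
use std::collections::BTreeMap;

fn singular_from_vectored(
    key: u64,
    mut vectored: BTreeMap<u64, Result<Vec<u8>, String>>,
) -> Result<Vec<u8>, String> {
    match vectored.pop_first() {
        // The single returned entry must be for the key we asked about.
        Some((got_key, value)) if got_key == key => value,
        Some((got_key, _)) => Err(format!("expected key {key}, got {got_key}")),
        None => Err(format!("missing key {key}")),
    }
}

fn main() {
    let mut m = BTreeMap::new();
    m.insert(7u64, Ok(vec![1, 2, 3]));
    assert!(singular_from_vectored(7, m).is_ok());
}
```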
     pub(crate) async fn get_vectored(
         &self,
-        key_ranges: &[Range<Key>],
+        keyspace: KeySpace,
         lsn: Lsn,
         ctx: &RequestContext,
     ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
@@ -670,94 +979,257 @@ impl Timeline {
             return Err(GetVectoredError::InvalidLsn(lsn));
         }

-        let key_count = key_ranges
-            .iter()
-            .map(|range| key_range_size(range) as u64)
-            .sum();
+        let key_count = keyspace.total_raw_size().try_into().unwrap();
         if key_count > Timeline::MAX_GET_VECTORED_KEYS {
             return Err(GetVectoredError::Oversized(key_count));
         }

-        let mut values = BTreeMap::new();
-        for range in key_ranges {
+        for range in &keyspace.ranges {
             let mut key = range.start;
             while key != range.end {
                 assert!(!self.shard_identity.is_key_disposable(&key));
-
-                let block = self.get(key, lsn, ctx).await;
-
-                if matches!(
-                    block,
-                    Err(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
-                ) {
-                    return Err(GetVectoredError::Cancelled);
-                }
-
-                values.insert(key, block);
                 key = key.next();
             }
         }

-        Ok(values)
+        trace!(
+            "get vectored request for {:?}@{} from task kind {:?}",
+            keyspace,
+            lsn,
+            ctx.task_kind(),
+        );
+
+        let start = crate::metrics::GET_VECTORED_LATENCY
+            .for_task_kind(ctx.task_kind())
+            .map(|metric| (metric, Instant::now()));
+
+        // start counting after throttle so that throttle time
+        // is always less than observation time
+        let throttled = self
+            .timeline_get_throttle
+            .throttle(ctx, key_count as usize)
+            .await;
+
+        let res = self
+            .get_vectored_impl(
+                keyspace.clone(),
+                lsn,
+                &mut ValuesReconstructState::new(),
+                ctx,
+            )
+            .await;
+
+        if let Some((metric, start)) = start {
+            let elapsed = start.elapsed();
+            let ex_throttled = if let Some(throttled) = throttled {
+                elapsed.checked_sub(throttled)
+            } else {
+                Some(elapsed)
+            };
+
+            if let Some(ex_throttled) = ex_throttled {
+                metric.observe(ex_throttled.as_secs_f64());
+            } else {
+                use utils::rate_limit::RateLimit;
+                static LOGGED: Lazy<Mutex<RateLimit>> =
+                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
+                let mut rate_limit = LOGGED.lock().unwrap();
+                rate_limit.call(|| {
+                    warn!("error deducting time spent throttled; this message is logged at a global rate limit");
+                });
+            }
+        }
+
+        res
+    }
+
+    /// Scan the keyspace and return all existing key-values in the keyspace. This is currently
+    /// implemented on top of vectored get. A normal vectored get throws an error when a key in the
+    /// keyspace is not found during the search, but the scan interface instead returns all existing
+    /// key-value pairs and does not expect every key in the keyspace to be found. The semantics are
+    /// closer to the RocksDB scan iterator interface. We could optimize this interface later to avoid
+    /// some checks in the vectored get path, maintaining and splitting the probed and to-be-probed
+    /// keyspace. We also need to ensure that the scan operation will not cause OOM in the future.
+    pub(crate) async fn scan(
+        &self,
+        keyspace: KeySpace,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
+        if !lsn.is_valid() {
+            return Err(GetVectoredError::InvalidLsn(lsn));
+        }
+
+        trace!(
+            "key-value scan request for {:?}@{} from task kind {:?}",
+            keyspace,
+            lsn,
+            ctx.task_kind()
+        );
+
+        // We should generalize this into Keyspace::contains in the future.
+ for range in &keyspace.ranges { + if range.start.field1 < METADATA_KEY_BEGIN_PREFIX + || range.end.field1 > METADATA_KEY_END_PREFIX + { + return Err(GetVectoredError::Other(anyhow::anyhow!( + "only metadata keyspace can be scanned" + ))); + } + } + + let start = crate::metrics::SCAN_LATENCY + .for_task_kind(ctx.task_kind()) + .map(ScanLatencyOngoingRecording::start_recording); + + // start counting after throttle so that throttle time + // is always less than observation time + let throttled = self + .timeline_get_throttle + // assume scan = 1 quota for now until we find a better way to process this + .throttle(ctx, 1) + .await; + + let vectored_res = self + .get_vectored_impl( + keyspace.clone(), + lsn, + &mut ValuesReconstructState::default(), + ctx, + ) + .await; + + if let Some(recording) = start { + recording.observe(throttled); + } + + vectored_res + } + + pub(super) async fn get_vectored_impl( + &self, + keyspace: KeySpace, + lsn: Lsn, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result>, GetVectoredError> { + let get_kind = if keyspace.total_raw_size() == 1 { + GetKind::Singular + } else { + GetKind::Vectored + }; + + let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME + .for_get_kind(get_kind) + .start_timer(); + self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) + .await?; + get_data_timer.stop_and_record(); + + let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME + .for_get_kind(get_kind) + .start_timer(); + let mut results: BTreeMap> = BTreeMap::new(); + let layers_visited = reconstruct_state.get_layers_visited(); + + for (key, res) in std::mem::take(&mut reconstruct_state.keys) { + match res { + Err(err) => { + results.insert(key, Err(err)); + } + Ok(state) => { + let state = ValueReconstructState::from(state); + + let reconstruct_res = self.reconstruct_value(key, lsn, state).await; + results.insert(key, reconstruct_res); + } + } + } + reconstruct_timer.stop_and_record(); + + // For aux file keys (v1 or v2) the vectored read path does not return an error + // when they're missing. Instead they are omitted from the resulting btree + // (this is a requirement, not a bug). Skip updating the metric in these cases + // to avoid infinite results. + if !results.is_empty() { + let avg = layers_visited as f64 / results.len() as f64; + if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + tracing::info!( + shard_id = %self.tenant_shard_id.shard_slug(), + lsn = %lsn, + "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned", + keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size()); + }); + } + + // Note that this is an approximation. Tracking the exact number of layers visited + // per key requires virtually unbounded memory usage and is inefficient + // (i.e. segment tree tracking each range queried from a layer) + crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg); + } + + Ok(results) } /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. 
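A note on the latency bookkeeping in `get_vectored` and `scan` above: the time a request spent parked in the throttle is subtracted before the latency is observed, so the metric reflects work done rather than queueing. A minimal sketch of that deduction (hypothetical helper name; `checked_sub` yields `None` if the arithmetic cannot be performed, in which case the code above logs at a rate limit instead of observing):

```rust
use std::time::{Duration, Instant};

fn observe_ex_throttled(start: Instant, throttled: Option<Duration>) -> Option<Duration> {
    let elapsed = start.elapsed();
    match throttled {
        // Deduct throttle wait; None here means "skip the observation, log instead".
        Some(t) => elapsed.checked_sub(t),
        None => Some(elapsed),
    }
}

fn main() {
    let start = Instant::now();
    assert!(observe_ex_throttled(start, None).is_some());
}
```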
- pub fn get_last_record_lsn(&self) -> Lsn { + pub(crate) fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last } - pub fn get_prev_record_lsn(&self) -> Lsn { + pub(crate) fn get_prev_record_lsn(&self) -> Lsn { self.last_record_lsn.load().prev } /// Atomically get both last and prev. - pub fn get_last_record_rlsn(&self) -> RecordLsn { + pub(crate) fn get_last_record_rlsn(&self) -> RecordLsn { self.last_record_lsn.load() } - pub fn get_disk_consistent_lsn(&self) -> Lsn { + /// Subscribe to callers of wait_lsn(). The value of the channel is None if there are no + /// wait_lsn() calls in progress, and Some(Lsn) if there is an active waiter for wait_lsn(). + pub(crate) fn subscribe_for_wait_lsn_updates(&self) -> watch::Receiver> { + self.last_record_lsn.status_receiver() + } + + pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn { self.disk_consistent_lsn.load() } /// remote_consistent_lsn from the perspective of the tenant's current generation, /// not validated with control plane yet. /// See [`Self::get_remote_consistent_lsn_visible`]. - pub fn get_remote_consistent_lsn_projected(&self) -> Option { - if let Some(remote_client) = &self.remote_client { - remote_client.remote_consistent_lsn_projected() - } else { - None - } + pub(crate) fn get_remote_consistent_lsn_projected(&self) -> Option { + self.remote_client.remote_consistent_lsn_projected() } /// remote_consistent_lsn which the tenant is guaranteed not to go backward from, /// i.e. a value of remote_consistent_lsn_projected which has undergone /// generation validation in the deletion queue. - pub fn get_remote_consistent_lsn_visible(&self) -> Option { - if let Some(remote_client) = &self.remote_client { - remote_client.remote_consistent_lsn_visible() - } else { - None - } + pub(crate) fn get_remote_consistent_lsn_visible(&self) -> Option { + self.remote_client.remote_consistent_lsn_visible() } /// The sum of the file size of all historic layers in the layer map. /// This method makes no distinction between local and remote layers. /// Hence, the result **does not represent local filesystem usage**. - pub async fn layer_size_sum(&self) -> u64 { + pub(crate) async fn layer_size_sum(&self) -> u64 { let guard = self.layers.read().await; - let layer_map = guard.layer_map(); - let mut size = 0; - for l in layer_map.iter_historic_layers() { - size += l.file_size(); - } - size + guard.layer_size_sum() } - pub fn resident_physical_size(&self) -> u64 { + pub(crate) fn resident_physical_size(&self) -> u64 { self.metrics.resident_physical_size_get() } + pub(crate) fn get_directory_metrics(&self) -> [u64; DirectoryKind::KINDS_NUM] { + array::from_fn(|idx| self.directory_metrics[idx].load(AtomicOrdering::Relaxed)) + } + /// /// Wait until WAL has been received and processed up to this LSN. /// @@ -767,28 +1239,38 @@ impl Timeline { pub(crate) async fn wait_lsn( &self, lsn: Lsn, - _ctx: &RequestContext, /* Prepare for use by cancellation */ + who_is_waiting: WaitLsnWaiter<'_>, + ctx: &RequestContext, /* Prepare for use by cancellation */ ) -> Result<(), WaitLsnError> { - if self.cancel.is_cancelled() { + let state = self.current_state(); + if self.cancel.is_cancelled() || matches!(state, TimelineState::Stopping) { return Err(WaitLsnError::Shutdown); - } else if !self.is_active() { - return Err(WaitLsnError::BadState); + } else if !matches!(state, TimelineState::Active) { + return Err(WaitLsnError::BadState(state)); } - // This should never be called from the WAL receiver, because that could lead - // to a deadlock. 
- debug_assert!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager), - "wait_lsn cannot be called in WAL receiver" - ); - debug_assert!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler), - "wait_lsn cannot be called in WAL receiver" - ); - debug_assert!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller), - "wait_lsn cannot be called in WAL receiver" - ); + if cfg!(debug_assertions) { + match ctx.task_kind() { + TaskKind::WalReceiverManager + | TaskKind::WalReceiverConnectionHandler + | TaskKind::WalReceiverConnectionPoller => { + let is_myself = match who_is_waiting { + WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself), + WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()), + }; + if is_myself { + if let Err(current) = self.last_record_lsn.would_wait_for(lsn) { + // walingest is the only one that can advance last_record_lsn; it should make sure to never reach here + panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock"); + } + } else { + // if another timeline's is waiting for us, there's no deadlock risk because + // our walreceiver task can make progress independent of theirs + } + } + _ => {} + } + } let _timer = crate::metrics::WAIT_LSN_TIME.start_timer(); @@ -830,34 +1312,221 @@ impl Timeline { } /// Check that it is valid to request operations with that lsn. - pub fn check_lsn_is_in_scope( + pub(crate) fn check_lsn_is_in_scope( &self, lsn: Lsn, latest_gc_cutoff_lsn: &RcuReadGuard, ) -> anyhow::Result<()> { ensure!( lsn >= **latest_gc_cutoff_lsn, - "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)", + "LSN {} is earlier than latest GC cutoff {} (we might've already garbage collected needed data)", lsn, **latest_gc_cutoff_lsn, ); Ok(()) } - /// Flush to disk all data that was written with the put_* functions - #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] - pub async fn freeze_and_flush(&self) -> anyhow::Result<()> { - self.freeze_inmem_layer(false).await; - self.flush_frozen_layers_and_wait().await + /// Obtains a temporary lease blocking garbage collection for the given LSN. + /// + /// This function will error if the requesting LSN is less than the `latest_gc_cutoff_lsn` and there is also + /// no existing lease to renew. If there is an existing lease in the map, the lease will be renewed only if + /// the request extends the lease. The returned lease is therefore the maximum between the existing lease and + /// the requesting lease. 
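Before the `make_lsn_lease` implementation below, here is the renewal rule it enforces, in isolation: a lease entry is only ever extended, never shortened, so the stored expiry is the maximum of the existing and the requested one. A minimal sketch with a stand-in map keyed by LSN as `u64` (it omits the "already GC-ed LSN" rejection that the real function performs):

```rust
use std::collections::BTreeMap;
use std::time::{Duration, SystemTime};

/// Illustrative stand-in for the lease map: LSN (as u64) -> expiry time.
fn renew(leases: &mut BTreeMap<u64, SystemTime>, lsn: u64, length: Duration) -> SystemTime {
    let valid_until = SystemTime::now() + length;
    let entry = leases.entry(lsn).or_insert(valid_until);
    if valid_until > *entry {
        *entry = valid_until; // the request extends the lease
    }
    *entry // otherwise the longer existing lease wins: leases never shrink
}

fn main() {
    let mut leases = BTreeMap::new();
    let first = renew(&mut leases, 42, Duration::from_secs(600));
    let second = renew(&mut leases, 42, Duration::from_secs(60)); // shorter request
    assert!(second >= first); // the longer lease is retained
}
```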
+    pub(crate) fn make_lsn_lease(
+        &self,
+        lsn: Lsn,
+        length: Duration,
+        _ctx: &RequestContext,
+    ) -> anyhow::Result<LsnLease> {
+        let lease = {
+            let mut gc_info = self.gc_info.write().unwrap();
+
+            let valid_until = SystemTime::now() + length;
+
+            let entry = gc_info.leases.entry(lsn);
+
+            let lease = {
+                if let Entry::Occupied(mut occupied) = entry {
+                    let existing_lease = occupied.get_mut();
+                    if valid_until > existing_lease.valid_until {
+                        existing_lease.valid_until = valid_until;
+                        let dt: DateTime<Utc> = valid_until.into();
+                        info!("lease extended to {}", dt);
+                    } else {
+                        let dt: DateTime<Utc> = existing_lease.valid_until.into();
+                        info!("existing lease covers greater length, valid until {}", dt);
+                    }
+
+                    existing_lease.clone()
+                } else {
+                    // Reject already GC-ed LSN (lsn < latest_gc_cutoff)
+                    let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn();
+                    if lsn < *latest_gc_cutoff_lsn {
+                        bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
+                    }
+
+                    let dt: DateTime<Utc> = valid_until.into();
+                    info!("lease created, valid until {}", dt);
+                    entry.or_insert(LsnLease { valid_until }).clone()
+                }
+            };
+
+            lease
+        };
+
+        Ok(lease)
     }

-    /// Outermost timeline compaction operation; downloads needed layers.
+    /// Flush to disk all data that was written with the put_* functions
+    #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
+    pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> {
+        self.freeze_and_flush0().await
+    }
+
+    // This exists to provide a non-span creating version of `freeze_and_flush` we can call without
+    // polluting the span hierarchy.
+    pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
+        let token = {
+            // Freeze the current open in-memory layer. It will be written to disk on next
+            // iteration.
+            let mut g = self.write_lock.lock().await;
+
+            let to_lsn = self.get_last_record_lsn();
+            self.freeze_inmem_layer_at(to_lsn, &mut g).await?
+        };
+        self.wait_flush_completion(token).await
+    }
+
+    // Check if an open ephemeral layer should be closed: this provides
+    // background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping
+    // an ephemeral layer open forever when idle. It also freezes layers if the global limit on
+    // ephemeral layer bytes has been breached.
+    pub(super) async fn maybe_freeze_ephemeral_layer(&self) {
+        let Ok(mut write_guard) = self.write_lock.try_lock() else {
+            // If the write lock is held, there is an active wal receiver: rolling open layers
+            // is their responsibility while they hold this lock.
+            return;
+        };
+
+        // FIXME: why not early exit? because before #7927 the state would have been cleared every
+        // time, and this was missed.
+        // if write_guard.is_none() { return; }
+
+        let Ok(layers_guard) = self.layers.try_read() else {
+            // Don't block if the layer lock is busy
+            return;
+        };
+
+        let Ok(lm) = layers_guard.layer_map() else {
+            return;
+        };
+
+        let Some(open_layer) = &lm.open_layer else {
+            // If there is no open layer, we have no layer freezing to do. However, we might need to generate
+            // some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions
+            // that didn't result in writes to this shard.

+            // Must not hold the layers lock while waiting for a flush.
+            drop(layers_guard);
+
+            let last_record_lsn = self.get_last_record_lsn();
+            let disk_consistent_lsn = self.get_disk_consistent_lsn();
+            if last_record_lsn > disk_consistent_lsn {
+                // We have no open layer, but disk_consistent_lsn is behind the last record: this indicates
+                // we are a sharded tenant and have skipped some WAL
+                let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
+                if last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
+                    // Only do this if have been layer-less longer than get_checkpoint_timeout, so that a shard
+                    // without any data ingested (yet) doesn't write a remote index as soon as it
+                    // sees its LSN advance: we only do this if we've been layer-less
+                    // for some time.
+                    tracing::debug!(
+                        "Advancing disk_consistent_lsn past WAL ingest gap {} -> {}",
+                        disk_consistent_lsn,
+                        last_record_lsn
+                    );
+
+                    // The flush loop will update remote consistent LSN as well as disk consistent LSN.
+                    // We know there is no open layer, so we can request freezing without actually
+                    // freezing anything. This is true even if we have dropped the layers_guard, we
+                    // still hold the write_guard.
+                    let _ = async {
+                        let token = self
+                            .freeze_inmem_layer_at(last_record_lsn, &mut write_guard)
+                            .await?;
+                        self.wait_flush_completion(token).await
+                    }
+                    .await;
+                }
+            }
+
+            return;
+        };
+
+        let Some(current_size) = open_layer.try_len() else {
+            // Unexpected: since we hold the write guard, nobody else should be writing to this layer, so
+            // read lock to get size should always succeed.
+            tracing::warn!("Lock conflict while reading size of open layer");
+            return;
+        };
+
+        let current_lsn = self.get_last_record_lsn();
+
+        let checkpoint_distance_override = open_layer.tick().await;
+
+        if let Some(size_override) = checkpoint_distance_override {
+            if current_size > size_override {
+                // This is not harmful, but it only happens in relatively rare cases where
+                // time-based checkpoints are not happening fast enough to keep the amount of
+                // ephemeral data within configured limits. It's a sign of stress on the system.
+                tracing::info!("Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure");
+            }
+        }
+
+        let checkpoint_distance =
+            checkpoint_distance_override.unwrap_or(self.get_checkpoint_distance());
+
+        if self.should_roll(
+            current_size,
+            current_size,
+            checkpoint_distance,
+            self.get_last_record_lsn(),
+            self.last_freeze_at.load(),
+            open_layer.get_opened_at(),
+        ) {
+            match open_layer.info() {
+                InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
+                    // We may reach this point if the layer was already frozen but not yet flushed: flushing
+                    // happens asynchronously in the background.
+                    tracing::debug!(
+                        "Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})"
+                    );
+                }
+                InMemoryLayerInfo::Open { .. } => {
+                    // Upgrade to a write lock and freeze the layer
+                    drop(layers_guard);
+                    let res = self
+                        .freeze_inmem_layer_at(current_lsn, &mut write_guard)
+                        .await;
+
+                    if let Err(e) = res {
+                        tracing::info!(
+                            "failed to flush frozen layer after background freeze: {e:#}"
+                        );
+                    }
+                }
+            }
+        }
+    }
+
+    /// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending
+    /// compaction tasks.
     pub(crate) async fn compact(
         self: &Arc<Self>,
         cancel: &CancellationToken,
         flags: EnumSet<CompactFlags>,
         ctx: &RequestContext,
-    ) -> Result<(), CompactionError> {
+    ) -> Result<bool, CompactionError> {
        // most likely the cancellation token is from background task, but in tests it could be the
        // request task as well.
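The `maybe_freeze_ephemeral_layer` path above is deliberately contention-averse: it only acts when it can take both locks without waiting, deferring to the WAL receiver (which owns rolling decisions while it holds the write lock) and to whoever holds the layer map. A minimal sketch of that try-lock-or-bail shape, with stand-in state types:

```rust
use tokio::sync::{Mutex, RwLock};

// Stand-ins for the write-lock state and the layer map.
struct WriterState;
struct LayerMap;

async fn background_housekeeping(
    write_lock: &Mutex<Option<WriterState>>,
    layers: &RwLock<LayerMap>,
) {
    let Ok(_write_guard) = write_lock.try_lock() else {
        // An active writer owns layer-rolling decisions while it holds this lock.
        return;
    };
    let Ok(_layers_guard) = layers.try_read() else {
        // Don't queue behind compaction/flush; try again on the next tick.
        return;
    };
    // ... inspect the open layer and decide whether to freeze it ...
}

#[tokio::main(flavor = "current_thread")]
async fn main() {
    let write_lock = Mutex::new(None::<WriterState>);
    let layers = RwLock::new(LayerMap);
    background_housekeeping(&write_lock, &layers).await;
}
```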
@@ -877,8 +1546,8 @@ impl Timeline { // compaction task goes over it's period (20s) which is quite often in production. let (_guard, _permit) = tokio::select! { tuple = prepare => { tuple }, - _ = self.cancel.cancelled() => return Ok(()), - _ = cancel.cancelled() => return Ok(()), + _ = self.cancel.cancelled() => return Ok(false), + _ = cancel.cancelled() => return Ok(false), }; let last_record_lsn = self.get_last_record_lsn(); @@ -886,198 +1555,123 @@ impl Timeline { // Last record Lsn could be zero in case the timeline was just created if !last_record_lsn.is_valid() { warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); - return Ok(()); + return Ok(false); } - // High level strategy for compaction / image creation: - // - // 1. First, calculate the desired "partitioning" of the - // currently in-use key space. The goal is to partition the - // key space into roughly fixed-size chunks, but also take into - // account any existing image layers, and try to align the - // chunk boundaries with the existing image layers to avoid - // too much churn. Also try to align chunk boundaries with - // relation boundaries. In principle, we don't know about - // relation boundaries here, we just deal with key-value - // pairs, and the code in pgdatadir_mapping.rs knows how to - // map relations into key-value pairs. But in practice we know - // that 'field6' is the block number, and the fields 1-5 - // identify a relation. This is just an optimization, - // though. - // - // 2. Once we know the partitioning, for each partition, - // decide if it's time to create a new image layer. The - // criteria is: there has been too much "churn" since the last - // image layer? The "churn" is fuzzy concept, it's a - // combination of too many delta files, or too much WAL in - // total in the delta file. Or perhaps: if creating an image - // file would allow to delete some older files. - // - // 3. After that, we compact all level0 delta files if there - // are too many of them. While compacting, we also garbage - // collect any page versions that are no longer needed because - // of the new image layers we created in step 2. - // - // TODO: This high level strategy hasn't been implemented yet. - // Below are functions compact_level0() and create_image_layers() - // but they are a bit ad hoc and don't quite work like it's explained - // above. Rewrite it. - - // Is the timeline being deleted? - if self.is_stopping() { - trace!("Dropping out of compaction on timeline shutdown"); - return Err(CompactionError::ShuttingDown); + match self.get_compaction_algorithm_settings().kind { + CompactionAlgorithm::Tiered => { + self.compact_tiered(cancel, ctx).await?; + Ok(false) + } + CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await, } - - let target_file_size = self.get_checkpoint_distance(); - - // Define partitioning schema if needed - - // FIXME: the match should only cover repartitioning, not the next steps - match self - .repartition( - self.get_last_record_lsn(), - self.get_compaction_target_size(), - flags, - ctx, - ) - .await - { - Ok((partitioning, lsn)) => { - // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them - let image_ctx = RequestContextBuilder::extend(ctx) - .access_stats_behavior(AccessStatsBehavior::Skip) - .build(); - - // 2. 
Compact - let timer = self.metrics.compact_time_histo.start_timer(); - self.compact_level0(target_file_size, ctx).await?; - timer.stop_and_record(); - - // 3. Create new image layers for partitions that have been modified - // "enough". - let layers = self - .create_image_layers(&partitioning, lsn, false, &image_ctx) - .await - .map_err(anyhow::Error::from)?; - if let Some(remote_client) = &self.remote_client { - for layer in layers { - remote_client.schedule_layer_file_upload(layer)?; - } - } - - if let Some(remote_client) = &self.remote_client { - // should any new image layer been created, not uploading index_part will - // result in a mismatch between remote_physical_size and layermap calculated - // size, which will fail some tests, but should not be an issue otherwise. - remote_client.schedule_index_upload_for_file_changes()?; - } - } - Err(err) => { - // no partitioning? This is normal, if the timeline was just created - // as an empty timeline. Also in unit tests, when we use the timeline - // as a simple key-value store, ignoring the datadir layout. Log the - // error but continue. - // - // Suppress error when it's due to cancellation - if !self.cancel.is_cancelled() { - error!("could not compact, repartitioning keyspace failed: {err:?}"); - } - } - }; - - Ok(()) } /// Mutate the timeline with a [`TimelineWriter`]. - pub async fn writer(&self) -> TimelineWriter<'_> { + pub(crate) async fn writer(&self) -> TimelineWriter<'_> { TimelineWriter { tl: self, - _write_guard: self.write_lock.lock().await, + write_guard: self.write_lock.lock().await, } } - /// Check if more than 'checkpoint_distance' of WAL has been accumulated in - /// the in-memory layer, and initiate flushing it if so. - /// - /// Also flush after a period of time without new data -- it helps - /// safekeepers to regard pageserver as caught up and suspend activity. - pub async fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { - let last_lsn = self.get_last_record_lsn(); - let open_layer_size = { - let guard = self.layers.read().await; - let layers = guard.layer_map(); - let Some(open_layer) = layers.open_layer.as_ref() else { - return Ok(()); - }; - open_layer.size().await? - }; - let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); - let distance = last_lsn.widening_sub(last_freeze_at); - // Checkpointing the open layer can be triggered by layer size or LSN range. - // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and - // we want to stay below that with a big margin. The LSN distance determines how - // much WAL the safekeepers need to store. 
- if distance >= self.get_checkpoint_distance().into() - || open_layer_size > self.get_checkpoint_distance() - || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()) - { - info!( - "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}", - distance, - open_layer_size, - last_freeze_ts.elapsed() - ); - - self.freeze_inmem_layer(true).await; - self.last_freeze_at.store(last_lsn); - *(self.last_freeze_ts.write().unwrap()) = Instant::now(); - - // Wake up the layer flusher - self.flush_frozen_layers(); - } - Ok(()) - } - - pub fn activate( + pub(crate) fn activate( self: &Arc<Self>, + parent: Arc<Tenant>, broker_client: BrokerClientChannel, background_jobs_can_start: Option<&completion::Barrier>, ctx: &RequestContext, ) { - self.spawn_initial_logical_size_computation_task(ctx); + if self.tenant_shard_id.is_shard_zero() { + // Logical size is only maintained accurately on shard zero. + self.spawn_initial_logical_size_computation_task(ctx); + } self.launch_wal_receiver(ctx, broker_client); self.set_state(TimelineState::Active); - self.launch_eviction_task(background_jobs_can_start); + self.launch_eviction_task(parent, background_jobs_can_start); } - /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then - /// also to remote storage. This method can easily take multiple seconds for a busy timeline. + /// After this function returns, there are no timeline-scoped tasks left running. /// - /// While we are flushing, we continue to accept read I/O. - #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] - pub(crate) async fn flush_and_shutdown(&self) { + /// The preferred pattern is: + /// - in any spawned tasks, keep Timeline::guard open + Timeline::cancel / child token + /// - if early shutdown (not just cancellation) of a sub-tree of tasks is required, + /// go the extra mile and keep track of JoinHandles + /// - Keep track of JoinHandles using a passed-down `Arc<Mutex<Vec<JoinHandle<()>>>>` or similar, + /// instead of spawning directly on a runtime (see the sketch further down this doc comment). + /// It is a more composable / testable pattern. + /// + /// For legacy reasons, we still have multiple tasks spawned using + /// `task_mgr::spawn(X, Some(tenant_id), Some(timeline_id))`. + /// We refer to these as "timeline-scoped task_mgr tasks". + /// Some of these tasks are already sensitive to Timeline::cancel while others are + /// not sensitive to Timeline::cancel and instead respect [`task_mgr::shutdown_token`] + /// or [`task_mgr::shutdown_watcher`]. + /// We want to gradually convert the code base away from these. + /// + /// Here is an inventory of timeline-scoped task_mgr tasks that are still sensitive to + /// `task_mgr::shutdown_{token,watcher}` (there are also tenant-scoped and global-scoped + /// ones that aren't mentioned here): + /// - [`TaskKind::TimelineDeletionWorker`] + /// - NB: also used for tenant deletion + /// - [`TaskKind::RemoteUploadTask`] + /// - [`TaskKind::InitialLogicalSizeCalculation`] + /// - [`TaskKind::DownloadAllRemoteLayers`] (can we get rid of it?)
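+ /// A minimal sketch of the JoinHandle-tracking pattern described above (hypothetical
+ /// names, shown for illustration only and marked `ignore` because it is not part of
+ /// this change):
+ /// ```ignore
+ /// struct TaskSubtree {
+ ///     cancel: CancellationToken,
+ ///     tasks: Mutex<Vec<tokio::task::JoinHandle<()>>>,
+ /// }
+ ///
+ /// impl TaskSubtree {
+ ///     fn spawn(&self, fut: impl std::future::Future<Output = ()> + Send + 'static) {
+ ///         self.tasks.lock().unwrap().push(tokio::spawn(fut));
+ ///     }
+ ///
+ ///     async fn shutdown(&self) {
+ ///         self.cancel.cancel(); // ask every task to stop ...
+ ///         let handles: Vec<_> = self.tasks.lock().unwrap().drain(..).collect();
+ ///         for handle in handles {
+ ///             let _ = handle.await; // ... and wait until each one is actually gone
+ ///         }
+ ///     }
+ /// }
+ /// ```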
+ /// Inventory of timeline-scoped task_mgr tasks that use spawn but aren't sensitive to `task_mgr::shutdown_{token,watcher}`: /// - [`TaskKind::Eviction`] + /// - [`TaskKind::LayerFlushTask`] + /// - [`TaskKind::OndemandLogicalSizeCalculation`] + /// - [`TaskKind::GarbageCollector`] (immediate_gc is timeline-scoped) pub(crate) async fn shutdown(&self, mode: ShutdownMode) { debug_assert_current_span_has_tenant_and_timeline_id(); - // Stop ingesting data, so that we are not still writing to an InMemoryLayer while - // trying to flush tracing::debug!("Waiting for WalReceiverManager..."); - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(self.tenant_shard_id), - Some(self.timeline_id), - ) - .await; + let try_freeze_and_flush = match mode { + ShutdownMode::FreezeAndFlush => true, + ShutdownMode::Hard => false, + }; - // Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance + // Regardless of whether we're going to try_freeze_and_flush + // or not, stop ingesting any more data. Walreceiver only provides + // cancellation but no "wait until gone", because it uses the Timeline::gate. + // So, only after the self.gate.close() below will we know for sure that + // no walreceiver tasks are left. + // For `try_freeze_and_flush=true`, this means that we might still be ingesting + // data during the call to `self.freeze_and_flush()` below. + // That's not ideal, but we don't have the concept of a ChildGuard, + // which is what we'd need to properly model early shutdown of the walreceiver + // task sub-tree before the other Timeline task sub-trees. + let walreceiver = self.walreceiver.lock().unwrap().take(); + tracing::debug!( + is_some = walreceiver.is_some(), + "Waiting for WalReceiverManager..." + ); + if let Some(walreceiver) = walreceiver { + walreceiver.cancel(); + } + // ... and inform any waiters for newer LSNs that there won't be any. self.last_record_lsn.shutdown(); - // now all writers to InMemory layer are gone, do the final flush if requested - match self.freeze_and_flush().await { - Ok(_) => { - // drain the upload queue - if let Some(client) = self.remote_client.as_ref() { + if try_freeze_and_flush { + if let Some((open, frozen)) = self + .layers + .read() + .await + .layer_map() + .map(|lm| (lm.open_layer.is_some(), lm.frozen_layers.len())) + .ok() + .filter(|(open, frozen)| *open || *frozen > 0) + { + tracing::info!(?open, frozen, "flushing and freezing on shutdown"); + } else { + // this is a double shutdown, ignore it + } + + // we shut down walreceiver above, so we won't add anything more + // to the InMemoryLayer; freeze it and wait for all frozen layers + // to reach the disk & upload queue, then shut the upload queue and + // wait for it to drain. + match self.freeze_and_flush().await { + Ok(_) => { + // drain the upload queue // if we did not wait for completion here, it might be that our shutdown process // didn't wait for remote uploads to complete at all, as new tasks can forever // be spawned. @@ -1085,62 +1679,63 @@ impl Timeline { // what is problematic is the shutting down of RemoteTimelineClient, because // obviously it does not make sense to stop while we wait for it, but what // about corner cases like s3 suddenly hanging up? - if let Err(e) = client.shutdown().await { - // Non-fatal. Shutdown is infallible. Failures to flush just mean that - // we have some extra WAL replay to do next time the timeline starts.
- warn!("failed to flush to remote storage: {e:#}"); - } + self.remote_client.shutdown().await; + } + Err(FlushLayerError::Cancelled) => { + // this is likely the second shutdown, ignore silently. + // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 + debug_assert!(self.cancel.is_cancelled()); + } + Err(e) => { + // Non-fatal. Shutdown is infallible. Failures to flush just mean that + // we have some extra WAL replay to do next time the timeline starts. + warn!("failed to freeze and flush: {e:#}"); } - } - Err(e) => { - // Non-fatal. Shutdown is infallible. Failures to flush just mean that - // we have some extra WAL replay to do next time the timeline starts. - warn!("failed to freeze and flush: {e:#}"); } } - self.shutdown().await; - } - - /// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of - /// the graceful [`Timeline::flush_and_shutdown`] function. - pub(crate) async fn shutdown(&self) { // Signal any subscribers to our cancellation token to drop out tracing::debug!("Cancelling CancellationToken"); self.cancel.cancel(); - // Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel - // while doing so. - self.last_record_lsn.shutdown(); + // Ensure Prevent new page service requests from starting. + self.handles.shutdown(); - // Shut down the layer flush task before the remote client, as one depends on the other + // Transition the remote_client into a state where it's only useful for timeline deletion. + // (The deletion use case is why we can't just hook up remote_client to Self::cancel).) + self.remote_client.stop(); + + // As documented in remote_client.stop()'s doc comment, it's our responsibility + // to shut down the upload queue tasks. + // TODO: fix that, task management should be encapsulated inside remote_client. task_mgr::shutdown_tasks( - Some(TaskKind::LayerFlushTask), + Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(self.timeline_id), ) .await; - // Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in - // case our caller wants to use that for a deletion - if let Some(remote_client) = self.remote_client.as_ref() { - match remote_client.stop() { - Ok(()) => {} - Err(StopError::QueueUninitialized) => { - // Shutting down during initialization is legal - } - } - } - + // TODO: work toward making this a no-op. See this function's doc comment for more context. tracing::debug!("Waiting for tasks..."); - task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await; - // Finally wait until any gate-holders are complete + { + // Allow any remaining in-memory layers to do cleanup -- until that, they hold the gate + // open. + let mut write_guard = self.write_lock.lock().await; + self.layers.write().await.shutdown(&mut write_guard); + } + + // Finally wait until any gate-holders are complete. + // + // TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks + // and use a TBD variant of shutdown_tasks that asserts that there were no tasks left. 
self.gate.close().await; + + self.metrics.shutdown(); } - pub fn set_state(&self, new_state: TimelineState) { + pub(crate) fn set_state(&self, new_state: TimelineState) { match (self.current_state(), new_state) { (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { info!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); @@ -1160,7 +1755,7 @@ impl Timeline { } } - pub fn set_broken(&self, reason: String) { + pub(crate) fn set_broken(&self, reason: String) { let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture()); let broken_state = TimelineState::Broken { reason, @@ -1174,27 +1769,32 @@ impl Timeline { self.cancel.cancel(); } - pub fn current_state(&self) -> TimelineState { + pub(crate) fn current_state(&self) -> TimelineState { self.state.borrow().clone() } - pub fn is_broken(&self) -> bool { + pub(crate) fn is_broken(&self) -> bool { matches!(&*self.state.borrow(), TimelineState::Broken { .. }) } - pub fn is_active(&self) -> bool { + pub(crate) fn is_active(&self) -> bool { self.current_state() == TimelineState::Active } - pub fn is_stopping(&self) -> bool { + #[allow(unused)] + pub(crate) fn is_archived(&self) -> Option<bool> { + self.remote_client.is_archived() + } + + pub(crate) fn is_stopping(&self) -> bool { self.current_state() == TimelineState::Stopping } - pub fn subscribe_for_state_updates(&self) -> watch::Receiver<TimelineState> { + pub(crate) fn subscribe_for_state_updates(&self) -> watch::Receiver<TimelineState> { self.state.subscribe() } - pub async fn wait_to_become_active( + pub(crate) async fn wait_to_become_active( &self, _ctx: &RequestContext, // Prepare for use by cancellation ) -> Result<(), TimelineState> { @@ -1219,9 +1819,12 @@ impl Timeline { } } - pub async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo { + pub(crate) async fn layer_map_info( + &self, + reset: LayerAccessStatsReset, + ) -> Result<LayerMapInfo, layer_manager::Shutdown> { let guard = self.layers.read().await; - let layer_map = guard.layer_map(); + let layer_map = guard.layer_map()?; let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1); if let Some(open_layer) = &layer_map.open_layer { in_memory_layers.push(open_layer.info()); @@ -1230,28 +1833,26 @@ impl Timeline { in_memory_layers.push(frozen_layer.info()); } - let mut historic_layers = Vec::new(); - for historic_layer in layer_map.iter_historic_layers() { - let historic_layer = guard.get_from_desc(&historic_layer); - historic_layers.push(historic_layer.info(reset)); - } + let historic_layers = layer_map + .iter_historic_layers() + .map(|desc| guard.get_from_desc(&desc).info(reset)) + .collect(); - LayerMapInfo { + Ok(LayerMapInfo { in_memory_layers, historic_layers, - } + }) } #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] - pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> { - let Some(layer) = self.find_layer(layer_file_name).await else { + pub(crate) async fn download_layer( + &self, + layer_file_name: &LayerName, + ) -> anyhow::Result<Option<bool>> { + let Some(layer) = self.find_layer(layer_file_name).await? else { return Ok(None); }; - if self.remote_client.is_none() { - return Ok(Some(false)); - } - layer.download().await?; Ok(Some(true)) @@ -1260,20 +1861,76 @@ impl Timeline { /// Evict just one layer. /// /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`.
- pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> { + pub(crate) async fn evict_layer( + &self, + layer_file_name: &LayerName, + ) -> anyhow::Result<Option<bool>> { let _gate = self .gate .enter() .map_err(|_| anyhow::anyhow!("Shutting down"))?; - let Some(local_layer) = self.find_layer(layer_file_name).await else { + let Some(local_layer) = self.find_layer(layer_file_name).await? else { return Ok(None); }; - match local_layer.evict_and_wait().await { + // curl has this by default + let timeout = std::time::Duration::from_secs(120); + + match local_layer.evict_and_wait(timeout).await { Ok(()) => Ok(Some(true)), Err(EvictionError::NotFound) => Ok(Some(false)), Err(EvictionError::Downloaded) => Ok(Some(false)), + Err(EvictionError::Timeout) => Ok(Some(false)), + } + } + + fn should_roll( + &self, + layer_size: u64, + projected_layer_size: u64, + checkpoint_distance: u64, + projected_lsn: Lsn, + last_freeze_at: Lsn, + opened_at: Instant, + ) -> bool { + let distance = projected_lsn.widening_sub(last_freeze_at); + + // Rolling the open layer can be triggered by: + // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that + // the safekeepers need to store. For sharded tenants, we multiply by shard count to + // account for how writes are distributed across shards: we expect each node to consume + // 1/count of the LSN on average. + // 2. The size of the currently open layer. + // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught + // up and suspend activity. + if distance >= checkpoint_distance as i128 * self.shard_identity.count.count() as i128 { + info!( + "Will roll layer at {} with layer size {} due to LSN distance ({})", + projected_lsn, layer_size, distance + ); + + true + } else if projected_layer_size >= checkpoint_distance { + // NB: this check is relied upon by: + let _ = IndexEntry::validate_checkpoint_distance; + info!( + "Will roll layer at {} with layer size {} due to layer size ({})", + projected_lsn, layer_size, projected_layer_size + ); + + true + } else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() { + info!( + "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})", + projected_lsn, + layer_size, + opened_at.elapsed() + ); + + true + } else { + false } } } @@ -1283,44 +1940,94 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; // Private functions impl Timeline { - fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + pub(crate) fn get_lsn_lease_length(&self) -> Duration { + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf + .lsn_lease_length + .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length) + } + + // TODO(yuchen): remove unused flag after implementing https://github.com/neondatabase/neon/issues/8072 + #[allow(unused)] + pub(crate) fn get_lsn_lease_length_for_ts(&self) -> Duration { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .lsn_lease_length_for_ts + .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts) + } + + pub(crate) fn get_switch_aux_file_policy(&self) -> AuxFilePolicy { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .switch_aux_file_policy + .unwrap_or(self.conf.default_tenant_conf.switch_aux_file_policy) + } + + pub(crate) fn get_lazy_slru_download(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .lazy_slru_download
+ .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download) + } + + fn get_checkpoint_distance(&self) -> u64 { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf .checkpoint_distance .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) } fn get_checkpoint_timeout(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .checkpoint_timeout .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .compaction_target_size .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) } fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .compaction_threshold .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .image_creation_threshold .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } - fn get_eviction_policy(&self) -> EvictionPolicy { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings { + let tenant_conf = &self.tenant_conf.load(); tenant_conf + .tenant_conf + .compaction_algorithm + .as_ref() + .unwrap_or(&self.conf.default_tenant_conf.compaction_algorithm) + .clone() + } + + fn get_eviction_policy(&self) -> EvictionPolicy { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf .eviction_policy .unwrap_or(self.conf.default_tenant_conf.eviction_policy) } @@ -1334,21 +2041,26 @@ impl Timeline { .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold) } - fn get_gc_feedback(&self) -> bool { - let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf; + fn get_image_layer_creation_check_threshold(&self) -> u8 { + let tenant_conf = self.tenant_conf.load(); tenant_conf - .gc_feedback - .unwrap_or(self.conf.default_tenant_conf.gc_feedback) + .tenant_conf + .image_layer_creation_check_threshold + .unwrap_or( + self.conf + .default_tenant_conf + .image_layer_creation_check_threshold, + ) } - pub(super) fn tenant_conf_updated(&self) { + pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { // NB: Most tenant conf options are read by background loops, so, // changes will automatically be picked up. // The threshold is embedded in the metric. So, we need to update it. 
{ let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold( - &self.tenant_conf.read().unwrap().tenant_conf, + new_conf, &self.conf.default_tenant_conf, ); @@ -1375,35 +2087,50 @@ impl Timeline { #[allow(clippy::too_many_arguments)] pub(super) fn new( conf: &'static PageServerConf, - tenant_conf: Arc>, + tenant_conf: Arc>, metadata: &TimelineMetadata, ancestor: Option>, timeline_id: TimelineId, tenant_shard_id: TenantShardId, generation: Generation, shard_identity: ShardIdentity, - walredo_mgr: Arc, + walredo_mgr: Option>, resources: TimelineResources, pg_version: u32, state: TimelineState, + aux_file_policy: Option, cancel: CancellationToken, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); let (state, _) = watch::channel(state); - let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); + let (layer_flush_start_tx, _) = tokio::sync::watch::channel((0, disk_consistent_lsn)); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); - let tenant_conf_guard = tenant_conf.read().unwrap(); - - let evictions_low_residence_duration_metric_threshold = + let evictions_low_residence_duration_metric_threshold = { + let loaded_tenant_conf = tenant_conf.load(); Self::get_evictions_low_residence_duration_metric_threshold( - &tenant_conf_guard.tenant_conf, + &loaded_tenant_conf.tenant_conf, &conf.default_tenant_conf, - ); - drop(tenant_conf_guard); + ) + }; + + if let Some(ancestor) = &ancestor { + let mut ancestor_gc_info = ancestor.gc_info.write().unwrap(); + ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn()); + } Arc::new_cyclic(|myself| { + let metrics = TimelineMetrics::new( + &tenant_shard_id, + &timeline_id, + crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( + "mtime", + evictions_low_residence_duration_metric_threshold, + ), + ); + let aux_file_metrics = metrics.aux_file_size_gauge.clone(); + let mut result = Timeline { conf, tenant_conf, @@ -1413,13 +2140,12 @@ impl Timeline { generation, shard_identity, pg_version, - layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())), - wanted_image_layers: Mutex::new(None), + layers: Default::default(), walredo_mgr, walreceiver: Mutex::new(None), - remote_client: resources.remote_client.map(Arc::new), + remote_client: Arc::new(resources.remote_client), // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. last_record_lsn: SeqWait::new(RecordLsn { @@ -1436,32 +2162,23 @@ impl Timeline { ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), - metrics: TimelineMetrics::new( - &tenant_shard_id, - &timeline_id, - crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( - "mtime", - evictions_low_residence_duration_metric_threshold, - ), - ), + metrics, query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new( &tenant_shard_id, &timeline_id, ), + directory_metrics: array::from_fn(|_| AtomicU64::new(0)), + flush_loop_state: Mutex::new(FlushLoopState::NotStarted), layer_flush_start_tx, layer_flush_done_tx, - write_lock: tokio::sync::Mutex::new(()), + write_lock: tokio::sync::Mutex::new(None), - gc_info: std::sync::RwLock::new(GcInfo { - retain_lsns: Vec::new(), - horizon_cutoff: Lsn(0), - pitr_cutoff: Lsn(0), - }), + gc_info: std::sync::RwLock::new(GcInfo::default()), latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), @@ -1475,11 +2192,19 @@ impl Timeline { // initial logical size is 0. 
LogicalSize::empty_initial() }, - partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), + partitioning: tokio::sync::Mutex::new(( + (KeyPartitioning::new(), KeyPartitioning::new().into_sparse()), + Lsn(0), + )), repartition_threshold: 0, + last_image_layer_creation_check_at: AtomicLsn::new(0), + last_image_layer_creation_check_instant: Mutex::new(None), last_received_wal: Mutex::new(None), - rel_size_cache: RwLock::new(HashMap::new()), + rel_size_cache: RwLock::new(RelSizeCache { + complete_as_of: disk_consistent_lsn, + map: HashMap::new(), + }), download_all_remote_layers_task_info: RwLock::new(None), @@ -1491,13 +2216,39 @@ impl Timeline { delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())), cancel, - gate: Gate::new(format!("Timeline<{tenant_shard_id}/{timeline_id}>")), + gate: Gate::default(), compaction_lock: tokio::sync::Mutex::default(), gc_lock: tokio::sync::Mutex::default(), + + standby_horizon: AtomicLsn::new(0), + + timeline_get_throttle: resources.timeline_get_throttle, + + aux_files: tokio::sync::Mutex::new(AuxFilesState { + dir: None, + n_deltas: 0, + }), + + aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics), + + last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy), + + #[cfg(test)] + extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())), + + l0_flush_global_state: resources.l0_flush_global_state, + + handles: Default::default(), }; + + if aux_file_policy == Some(AuxFilePolicy::V1) { + warn!("this timeline is using deprecated aux file policy V1 (when loading the timeline)"); + } + result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; + result .metrics .last_record_gauge @@ -1543,16 +2294,15 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::LayerFlushTask, - Some(self.tenant_shard_id), + self.tenant_shard_id, Some(self.timeline_id), "layer flush task", - false, async move { let _guard = guard; let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error); self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await; let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap(); - assert!(matches!(*flush_loop_state, FlushLoopState::Running{ ..})); + assert!(matches!(*flush_loop_state, FlushLoopState::Running{..})); *flush_loop_state = FlushLoopState::Exited; Ok(()) } @@ -1574,20 +2324,19 @@ impl Timeline { self.timeline_id, self.tenant_shard_id ); - let tenant_conf_guard = self.tenant_conf.read().unwrap(); - let wal_connect_timeout = tenant_conf_guard + let tenant_conf = self.tenant_conf.load(); + let wal_connect_timeout = tenant_conf .tenant_conf .walreceiver_connect_timeout .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout); - let lagging_wal_timeout = tenant_conf_guard + let lagging_wal_timeout = tenant_conf .tenant_conf .lagging_wal_timeout .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout); - let max_lsn_wal_lag = tenant_conf_guard + let max_lsn_wal_lag = tenant_conf .tenant_conf .max_lsn_wal_lag .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); - drop(tenant_conf_guard); let mut guard = self.walreceiver.lock().unwrap(); assert!( @@ -1614,7 +2363,10 @@ impl Timeline { let mut layers = self.layers.try_write().expect( "in the context where we call this function, no other task has access to the object", ); - layers.initialize_empty(Lsn(start_lsn.0)); + layers + .open_mut() + .expect("in this context the 
LayerManager must still be open") + .initialize_empty(Lsn(start_lsn.0)); } /// Scan the timeline directory, cleanup, populate the layer map, and schedule uploads for local-only @@ -1625,13 +2377,13 @@ impl Timeline { index_part: Option, ) -> anyhow::Result<()> { use init::{Decision::*, Discovered, DismissedLayer}; - use LayerFileName::*; + use LayerName::*; let mut guard = self.layers.write().await; let timer = self.metrics.load_layer_map_histo.start_timer(); - // Scan timeline directory and create ImageFileName and DeltaFilename + // Scan timeline directory and create ImageLayerName and DeltaFilename // structs representing all files on disk let timeline_path = self .conf @@ -1640,8 +2392,6 @@ impl Timeline { let span = tracing::Span::current(); // Copy to move into the task we're about to spawn - let generation = self.generation; - let shard = self.get_shard_index(); let this = self.myself.upgrade().expect("&self method holds the arc"); let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({ @@ -1655,11 +2405,14 @@ impl Timeline { for discovered in discovered { let (name, kind) = match discovered { - Discovered::Layer(file_name, file_size) => { - discovered_layers.push((file_name, file_size)); + Discovered::Layer(layer_file_name, local_metadata) => { + discovered_layers.push((layer_file_name, local_metadata)); continue; } - Discovered::Metadata | Discovered::IgnoredBackup => { + Discovered::IgnoredBackup(path) => { + std::fs::remove_file(path) + .or_else(fs_ext::ignore_not_found) + .fatal_err("Removing .old file"); continue; } Discovered::Unknown(file_name) => { @@ -1685,13 +2438,8 @@ impl Timeline { ); } - let decided = init::reconcile( - discovered_layers, - index_part.as_ref(), - disk_consistent_lsn, - generation, - shard, - ); + let decided = + init::reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn); let mut loaded_layers = Vec::new(); let mut needs_cleanup = Vec::new(); @@ -1699,34 +2447,25 @@ impl Timeline { for (name, decision) in decided { let decision = match decision { - Ok(UseRemote { local, remote }) => { - // Remote is authoritative, but we may still choose to retain - // the local file if the contents appear to match - if local.file_size() == remote.file_size() { - // Use the local file, but take the remote metadata so that we pick up - // the correct generation. 
- UseLocal(remote) - } else { - path.push(name.file_name()); - init::cleanup_local_file_for_remote(&path, &local, &remote)?; - path.pop(); - UseRemote { local, remote } - } - } Ok(decision) => decision, Err(DismissedLayer::Future { local }) => { - if local.is_some() { - path.push(name.file_name()); - init::cleanup_future_layer(&path, &name, disk_consistent_lsn)?; - path.pop(); + if let Some(local) = local { + init::cleanup_future_layer( + &local.local_path, + &name, + disk_consistent_lsn, + )?; } needs_cleanup.push(name); continue; } Err(DismissedLayer::LocalOnly(local)) => { - path.push(name.file_name()); - init::cleanup_local_only_file(&path, &name, &local)?; - path.pop(); + init::cleanup_local_only_file(&name, &local)?; + // this file never existed remotely, we will have to do rework + continue; + } + Err(DismissedLayer::BadMetadata(local)) => { + init::cleanup_local_file_for_remote(&local)?; // this file never existed remotely, we will have to do rework continue; } @@ -1740,13 +2479,12 @@ impl Timeline { tracing::debug!(layer=%name, ?decision, "applied"); let layer = match decision { - UseLocal(m) => { - total_physical_size += m.file_size(); - Layer::for_resident(conf, &this, name, m).drop_eviction_guard() - } - Evicted(remote) | UseRemote { remote, .. } => { - Layer::for_evicted(conf, &this, name, remote) + Resident { local, remote } => { + total_physical_size += local.file_size; + Layer::for_resident(conf, &this, local.local_path, name, remote) + .drop_eviction_guard() } + Evicted(remote) => Layer::for_evicted(conf, &this, name, remote), }; loaded_layers.push(layer); @@ -1760,38 +2498,45 @@ impl Timeline { let num_layers = loaded_layers.len(); - guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1); + guard + .open_mut() + .expect("layermanager must be open during init") + .initialize_local_layers(loaded_layers, disk_consistent_lsn + 1); - if let Some(rtc) = self.remote_client.as_ref() { - rtc.schedule_layer_file_deletion(&needs_cleanup)?; - rtc.schedule_index_upload_for_file_changes()?; - // This barrier orders above DELETEs before any later operations. - // This is critical because code executing after the barrier might - // create again objects with the same key that we just scheduled for deletion. - // For example, if we just scheduled deletion of an image layer "from the future", - // later compaction might run again and re-create the same image layer. - // "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn. - // "same" here means same key range and LSN. - // - // Without a barrier between above DELETEs and the re-creation's PUTs, - // the upload queue may execute the PUT first, then the DELETE. - // In our example, we will end up with an IndexPart referencing a non-existent object. - // - // 1. a future image layer is created and uploaded - // 2. ps restart - // 3. the future layer from (1) is deleted during load layer map - // 4. image layer is re-created and uploaded - // 5. deletion queue would like to delete (1) but actually deletes (4) - // 6. delete by name works as expected, but it now deletes the wrong (later) version - // - // See https://github.com/neondatabase/neon/issues/5878 - // - // NB: generation numbers naturally protect against this because they disambiguate - // (1) and (4) - rtc.schedule_barrier()?; - // Tenant::create_timeline will wait for these uploads to happen before returning, or - // on retry. 
- } + self.remote_client + .schedule_layer_file_deletion(&needs_cleanup)?; + self.remote_client + .schedule_index_upload_for_file_changes()?; + // This barrier orders above DELETEs before any later operations. + // This is critical because code executing after the barrier might + // create again objects with the same key that we just scheduled for deletion. + // For example, if we just scheduled deletion of an image layer "from the future", + // later compaction might run again and re-create the same image layer. + // "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn. + // "same" here means same key range and LSN. + // + // Without a barrier between above DELETEs and the re-creation's PUTs, + // the upload queue may execute the PUT first, then the DELETE. + // In our example, we will end up with an IndexPart referencing a non-existent object. + // + // 1. a future image layer is created and uploaded + // 2. ps restart + // 3. the future layer from (1) is deleted during load layer map + // 4. image layer is re-created and uploaded + // 5. deletion queue would like to delete (1) but actually deletes (4) + // 6. delete by name works as expected, but it now deletes the wrong (later) version + // + // See https://github.com/neondatabase/neon/issues/5878 + // + // NB: generation numbers naturally protect against this because they disambiguate + // (1) and (4) + self.remote_client.schedule_barrier()?; + // Tenant::create_timeline will wait for these uploads to happen before returning, or + // on retry. + + // Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan) + drop(guard); // drop write lock, update_layer_visibility will take a read lock. + self.update_layer_visibility().await?; info!( "loaded layer map with {} layers at {}, total physical size: {}", @@ -1813,6 +2558,12 @@ impl Timeline { priority: GetLogicalSizePriority, ctx: &RequestContext, ) -> logical_size::CurrentLogicalSize { + if !self.tenant_shard_id.is_shard_zero() { + // Logical size is only accurately maintained on shard zero: when called elsewhere, for example + // when HTTP API is serving a GET for timeline zero, return zero + return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero()); + } + let current_size = self.current_logical_size.current_size(); debug!("Current size: {current_size:?}"); @@ -1844,6 +2595,7 @@ impl Timeline { // Don't make noise. } else { warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work"); + debug_assert!(false); } } }; @@ -1896,10 +2648,9 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::InitialLogicalSizeCalculation, - Some(self.tenant_shard_id), + self.tenant_shard_id, Some(self.timeline_id), "initial size calculation", - false, // NB: don't log errors here, task_mgr will do that. 
async move { let cancel = task_mgr::shutdown_token(); @@ -1913,7 +2664,7 @@ impl Timeline { .await; Ok(()) } - .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, timeline_id=%self.timeline_id)), + .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id)), ); } @@ -1929,11 +2680,6 @@ impl Timeline { self.current_logical_size.initialized.add_permits(1); } - enum BackgroundCalculationError { - Cancelled, - Other(anyhow::Error), - } - let try_once = |attempt: usize| { let background_ctx = &background_ctx; let self_ref = &self; @@ -1951,10 +2697,10 @@ impl Timeline { (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit) } _ = self_ref.cancel.cancelled() => { - return Err(BackgroundCalculationError::Cancelled); + return Err(CalculateLogicalSizeError::Cancelled); } _ = cancel.cancelled() => { - return Err(BackgroundCalculationError::Cancelled); + return Err(CalculateLogicalSizeError::Cancelled); }, () = skip_concurrency_limiter.cancelled() => { // Some action that is part of a end user interaction requested logical size @@ -1972,28 +2718,21 @@ impl Timeline { crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances) }; - match self_ref + let calculated_size = self_ref .logical_size_calculation_task( initial_part_end, LogicalSizeCalculationCause::Initial, background_ctx, ) - .await - { - Ok(calculated_size) => Ok((calculated_size, metrics_guard)), - Err(CalculateLogicalSizeError::Cancelled) => { - Err(BackgroundCalculationError::Cancelled) - } - Err(CalculateLogicalSizeError::Other(err)) => { - if let Some(PageReconstructError::AncestorStopping(_)) = - err.root_cause().downcast_ref() - { - Err(BackgroundCalculationError::Cancelled) - } else { - Err(BackgroundCalculationError::Other(err)) - } - } - } + .await?; + + self_ref + .trigger_aux_file_size_computation(initial_part_end, background_ctx) + .await?; + + // TODO: add aux file size to logical size + + Ok((calculated_size, metrics_guard)) } }; @@ -2004,8 +2743,11 @@ impl Timeline { match try_once(attempt).await { Ok(res) => return ControlFlow::Continue(res), - Err(BackgroundCalculationError::Cancelled) => return ControlFlow::Break(()), - Err(BackgroundCalculationError::Other(e)) => { + Err(CalculateLogicalSizeError::Cancelled) => return ControlFlow::Break(()), + Err( + e @ (CalculateLogicalSizeError::Decode(_) + | CalculateLogicalSizeError::PageRead(_)), + ) => { warn!(attempt, "initial size calculation failed: {e:?}"); // exponential back-off doesn't make sense at these long intervals; // use fixed retry interval with generous jitter instead @@ -2055,7 +2797,7 @@ impl Timeline { .expect("only this task sets it"); } - pub fn spawn_ondemand_logical_size_calculation( + pub(crate) fn spawn_ondemand_logical_size_calculation( self: &Arc, lsn: Lsn, cause: LogicalSizeCalculationCause, @@ -2074,10 +2816,9 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::OndemandLogicalSizeCalculation, - Some(self.tenant_shard_id), + self.tenant_shard_id, Some(self.timeline_id), "ondemand logical size calculation", - false, async move { let res = self_clone .logical_size_calculation_task(lsn, cause, &ctx) @@ -2100,16 +2841,22 @@ impl Timeline { cause: LogicalSizeCalculationCause, ctx: &RequestContext, ) -> Result { - span::debug_assert_current_span_has_tenant_and_timeline_id(); + 
crate::span::debug_assert_current_span_has_tenant_and_timeline_id(); + // We should never be calculating logical sizes on shard !=0, because these shards do not have + // accurate relation sizes, and they do not emit consumption metrics. + debug_assert!(self.tenant_shard_id.is_shard_zero()); - let _guard = self.gate.enter(); + let guard = self + .gate + .enter() + .map_err(|_| CalculateLogicalSizeError::Cancelled)?; let self_calculation = Arc::clone(self); let mut calculation = pin!(async { let ctx = ctx.attached_child(); self_calculation - .calculate_logical_size(lsn, cause, &ctx) + .calculate_logical_size(lsn, cause, &guard, &ctx) .await }); @@ -2119,10 +2866,6 @@ impl Timeline { debug!("cancelling logical size calculation for timeline shutdown"); calculation.await } - _ = task_mgr::shutdown_watcher() => { - debug!("cancelling logical size calculation for task shutdown"); - calculation.await - } } } @@ -2134,37 +2877,20 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. - pub async fn calculate_logical_size( + async fn calculate_logical_size( &self, up_to_lsn: Lsn, cause: LogicalSizeCalculationCause, + _guard: &GateGuard, ctx: &RequestContext, ) -> Result { info!( "Calculating logical size for timeline {} at {}", self.timeline_id, up_to_lsn ); - // These failpoints are used by python tests to ensure that we don't delete - // the timeline while the logical size computation is ongoing. - // The first failpoint is used to make this function pause. - // Then the python test initiates timeline delete operation in a thread. - // It waits for a few seconds, then arms the second failpoint and disables - // the first failpoint. The second failpoint prints an error if the timeline - // delete code has deleted the on-disk state while we're still running here. - // It shouldn't do that. If it does it anyway, the error will be caught - // by the test suite, highlighting the problem. - fail::fail_point!("timeline-calculate-logical-size-pause"); - fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| { - if !self - .conf - .metadata_path(&self.tenant_shard_id, &self.timeline_id) - .exists() - { - error!("timeline-calculate-logical-size-pre metadata file does not exist") - } - // need to return something - Ok(0) - }); + + pausable_failpoint!("timeline-calculate-logical-size-pause"); + // See if we've already done the work for initial size calculation. // This is a short-cut for timelines that are mostly unused. if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) { @@ -2209,16 +2935,40 @@ impl Timeline { } } - async fn find_layer(&self, layer_file_name: &str) -> Option { - let guard = self.layers.read().await; - for historic_layer in guard.layer_map().iter_historic_layers() { - let historic_layer_name = historic_layer.filename().file_name(); - if layer_file_name == historic_layer_name { - return Some(guard.get_from_desc(&historic_layer)); - } - } + pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: u64) { + self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed); + let aux_metric = + self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed); - None + let sum_of_entries = self + .directory_metrics + .iter() + .map(|v| v.load(AtomicOrdering::Relaxed)) + .sum(); + // Set a high general threshold and a lower threshold for the auxiliary files, + // as we can have large numbers of relations in the db directory. 
+ const SUM_THRESHOLD: u64 = 5000; + const AUX_THRESHOLD: u64 = 1000; + if sum_of_entries >= SUM_THRESHOLD || aux_metric >= AUX_THRESHOLD { + self.metrics + .directory_entries_count_gauge + .set(sum_of_entries); + } else if let Some(metric) = Lazy::get(&self.metrics.directory_entries_count_gauge) { + metric.set(sum_of_entries); + } + } + + async fn find_layer( + &self, + layer_name: &LayerName, + ) -> Result<Option<Layer>, layer_manager::Shutdown> { + let guard = self.layers.read().await; + let layer = guard + .layer_map()? + .iter_historic_layers() + .find(|l| &l.layer_name() == layer_name) + .map(|found| guard.get_from_desc(&found)); + Ok(layer) } /// The timeline heatmap is a hint to secondary locations from the primary location, @@ -2229,417 +2979,395 @@ impl Timeline { /// should treat this as a cue to simply skip doing any heatmap uploading /// for this timeline. pub(crate) async fn generate_heatmap(&self) -> Option<HeatMapTimeline> { - let eviction_info = self.get_local_layers_for_disk_usage_eviction().await; + if !self.is_active() { + return None; + } - let remote_client = match &self.remote_client { - Some(c) => c, - None => return None, - }; + let guard = self.layers.read().await; - let layer_file_names = eviction_info - .resident_layers - .iter() - .map(|l| l.layer.get_name()) - .collect::<Vec<_>>(); - - let decorated = match remote_client.get_layers_metadata(layer_file_names) { - Ok(d) => d, - Err(_) => { - // Getting metadata only fails on Timeline in bad state. - return None; + let resident = guard.likely_resident_layers().filter_map(|layer| { + match layer.visibility() { + LayerVisibilityHint::Visible => { + // Layer is visible to one or more read LSNs: eligible for inclusion in layer map + let last_activity_ts = layer.latest_activity(); + Some((layer.layer_desc(), layer.metadata(), last_activity_ts)) + } + LayerVisibilityHint::Covered => { + // Layer is resident but unlikely to be read: not eligible for inclusion in heatmap. + None + } } - }; - - let heatmap_layers = std::iter::zip( - eviction_info.resident_layers.into_iter(), - decorated.into_iter(), - ) - .filter_map(|(layer, remote_info)| { - remote_info.map(|remote_info| { - HeatMapLayer::new( - layer.layer.get_name(), - IndexLayerMetadata::from(remote_info), - layer.last_activity_ts, - ) - }) }); - Some(HeatMapTimeline::new( - self.timeline_id, - heatmap_layers.collect(), - )) + let mut layers = resident.collect::<Vec<_>>(); + + // Sort layers in order of which to download first. For a large set of layers to download, we + // want to prioritize those layers which are most likely to still be resident many minutes + // or hours later: + // - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might + // only exist for a few minutes before being compacted into L1s. + // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner + // the layer is likely to be covered by an image layer during compaction.
+ layers.sort_by_key(|(desc, _meta, _atime)| { + std::cmp::Reverse(( + !LayerMap::is_l0(&desc.key_range, desc.is_delta), + desc.lsn_range.end, + )) + }); + + let layers = layers + .into_iter() + .map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime)) + .collect(); + + Some(HeatMapTimeline::new(self.timeline_id, layers)) } -type TraversalId = String; - -trait TraversalLayerExt { - fn traversal_id(&self) -> TraversalId; -} - -impl TraversalLayerExt for Layer { - fn traversal_id(&self) -> TraversalId { - self.local_path().to_string() - } -} - -impl TraversalLayerExt for Arc<InMemoryLayer> { - fn traversal_id(&self) -> TraversalId { - format!("timeline {} in-memory {self}", self.get_timeline_id()) + /// Returns true if the given lsn is or was an ancestor branchpoint. + pub(crate) fn is_ancestor_lsn(&self, lsn: Lsn) -> bool { + // upon timeline detach, we set the ancestor_lsn to Lsn::INVALID and store the original + // branchpoint in IndexPart::lineage + self.ancestor_lsn == lsn + || (self.ancestor_lsn == Lsn::INVALID + && self.remote_client.is_previous_ancestor_lsn(lsn)) + } } impl Timeline { + #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint + #[allow(clippy::doc_lazy_continuation)] + /// Get the data needed to reconstruct all keys in the provided keyspace + /// - /// Get a handle to a Layer for reading. - /// - /// The returned Layer might be from an ancestor timeline, if the - /// segment hasn't been updated on this timeline yet. - /// - /// This function takes the current timeline's locked LayerMap as an argument, - /// so callers can avoid potential race conditions. - /// - /// # Cancel-Safety - /// - /// This method is cancellation-safe. - async fn get_reconstruct_data( + /// The algorithm is as follows: + /// 1. While some keys are still not done and there's a timeline to visit: + /// 2. Visit the timeline (see [`Timeline::get_vectored_reconstruct_data_timeline`]): + /// 2.1. Build the fringe for the current keyspace + /// 2.2. Visit the newest layer from the fringe to collect all values for the range it + /// intersects + /// 2.3. Pop the timeline from the fringe + /// 2.4. If the fringe is empty, go back to 1 + async fn get_vectored_reconstruct_data( &self, - key: Key, + mut keyspace: KeySpace, request_lsn: Lsn, - reconstruct_state: &mut ValueReconstructState, + reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, - ) -> Result, PageReconstructError> { - // Start from the current timeline. - let mut timeline_owned; + ) -> Result<(), GetVectoredError> { + let mut timeline_owned: Arc<Timeline>; let mut timeline = self; - let mut read_count = scopeguard::guard(0, |cnt| { - crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64) - }); - - // For debugging purposes, collect the path of layers that we traversed - // through. It's included in the error message if we fail to find the key. - let mut traversal_path = Vec::::new(); - - let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { - *cached_lsn - } else { - Lsn(0) - }; - - // 'prev_lsn' tracks the last LSN that we were at in our search. It's used - // to check that each iteration make some progress, to break infinite - // looping if something goes wrong.
- let mut prev_lsn = Lsn(u64::MAX); - - let mut result = ValueReconstructResult::Continue; let mut cont_lsn = Lsn(request_lsn.0 + 1); - 'outer: loop { + let missing_keyspace = loop { if self.cancel.is_cancelled() { - return Err(PageReconstructError::Cancelled); + return Err(GetVectoredError::Cancelled); } - // The function should have updated 'state' - //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn); - match result { - ValueReconstructResult::Complete => return Ok(traversal_path), - ValueReconstructResult::Continue => { - // If we reached an earlier cached page image, we're done. - if cont_lsn == cached_lsn + 1 { - MATERIALIZED_PAGE_CACHE_HIT.inc_by(1); - return Ok(traversal_path); - } - if prev_lsn <= cont_lsn { - // Didn't make any progress in last iteration. Error out to avoid - // getting stuck in the loop. - return Err(layer_traversal_error(format!( - "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", - key, - Lsn(cont_lsn.0 - 1), - request_lsn, - timeline.ancestor_lsn - ), traversal_path)); - } - prev_lsn = cont_lsn; - } - ValueReconstructResult::Missing => { - return Err(layer_traversal_error( - if cfg!(test) { - format!( - "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}", - key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(), - ) - } else { - format!( - "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}", - key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn - ) - }, - traversal_path, - )); - } + let TimelineVisitOutcome { + completed_keyspace: completed, + image_covered_keyspace, + } = Self::get_vectored_reconstruct_data_timeline( + timeline, + keyspace.clone(), + cont_lsn, + reconstruct_state, + &self.cancel, + ctx, + ) + .await?; + + keyspace.remove_overlapping_with(&completed); + + // Do not descend into the ancestor timeline for aux files. + // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid + // stalling compaction. + keyspace.remove_overlapping_with(&KeySpace { + ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE], + }); + + // Keyspace is fully retrieved + if keyspace.is_empty() { + break None; } - // Recurse into ancestor if needed - if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { - trace!( - "going into ancestor {}, cont_lsn is {}", - timeline.ancestor_lsn, - cont_lsn - ); - let ancestor = match timeline.get_ancestor_timeline() { - Ok(timeline) => timeline, - Err(e) => return Err(PageReconstructError::from(e)), - }; + let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else { + // Not fully retrieved but no ancestor timeline. + break Some(keyspace); + }; - // It's possible that the ancestor timeline isn't active yet, or - // is active but hasn't yet caught up to the branch point. Wait - // for it. - // - // This cannot happen while the pageserver is running normally, - // because you cannot create a branch from a point that isn't - // present in the pageserver yet. However, we don't wait for the - // branch point to be uploaded to cloud storage before creating - // a branch. I.e., the branch LSN need not be remote consistent - // for the branching operation to succeed. - // - // Hence, if we try to load a tenant in such a state where - // 1. the existence of the branch was persisted (in IndexPart and/or locally) - // 2. 
but the ancestor state is behind branch_lsn because it was not yet persisted - // then we will need to wait for the ancestor timeline to - // re-stream WAL up to branch_lsn before we access it. - // - // How can a tenant get in such a state? - // - ungraceful pageserver process exit - // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219 - // - // NB: this could be avoided by requiring - // branch_lsn >= remote_consistent_lsn - // during branch creation. - match ancestor.wait_to_become_active(ctx).await { - Ok(()) => {} - Err(TimelineState::Stopping) => { - return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id)); + // Now we check whether there are keys that are covered by an image layer but do not + // exist in that image layer, which means those keys do not exist. + + // The block below will stop the vectored search if any of the keys encountered an image layer + // which did not contain a snapshot for said key. Since we have already removed all completed + // keys from `keyspace`, we expect there to be no overlap between it and the image covered key + // space. If that's not the case, we had at least one key encounter a gap in the image layer + // and stop the search as a result of that. + let removed = keyspace.remove_overlapping_with(&image_covered_keyspace); + if !removed.is_empty() { + break Some(removed); + } + // If we reached this point, `remove_overlapping_with` should not have made any change to the + // keyspace. + + // Take the min to avoid reconstructing a page with data newer than request Lsn. + cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); + timeline_owned = timeline + .get_ready_ancestor_timeline(ancestor_timeline, ctx) + .await?; + timeline = &*timeline_owned; }; + if let Some(missing_keyspace) = missing_keyspace { + return Err(GetVectoredError::MissingKey(MissingKeyError { + key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */ + shard: self + .shard_identity + .get_shard_number(&missing_keyspace.start().unwrap()), + cont_lsn, + request_lsn, + ancestor_lsn: Some(timeline.ancestor_lsn), + backtrace: None, + })); + } + + Ok(()) + } + + /// Collect the reconstruct data for a keyspace from the specified timeline. + /// + /// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect + /// the current keyspace. The current keyspace of the search at any given timeline + /// is the original keyspace minus all the keys that have been completed minus + /// any keys for which we couldn't find an intersecting layer. It's not tracked explicitly, + /// but if you merge all the keyspaces in the fringe, you get the "current keyspace". + /// + /// This is basically a depth-first search visitor implementation where a vertex + /// is the (layer, lsn range, key space) tuple. The fringe acts as the stack. + /// + /// At each iteration pop the top of the fringe (the layer with the highest Lsn) + /// and get all the required reconstruct data from the layer in one go. + /// + /// Returns the completed keyspace and the keyspaces with image coverage. The caller + /// decides how to deal with these two keyspaces.
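+ /// A self-contained model of the fringe idea (hypothetical names, marked `ignore`
+ /// since it is illustrative only and not part of this change): order candidate
+ /// layers by their top LSN so the newest intersecting layer is always visited first.
+ /// ```ignore
+ /// use std::collections::BinaryHeap;
+ ///
+ /// #[derive(PartialEq, Eq, PartialOrd, Ord)]
+ /// struct FringeEntry {
+ ///     lsn_end: u64,   // compared first: BinaryHeap pops the max, i.e. the newest layer
+ ///     layer_id: u32,  // stand-in for a readable layer handle
+ /// }
+ ///
+ /// fn visit_all(mut fringe: BinaryHeap<FringeEntry>) {
+ ///     while let Some(entry) = fringe.pop() {
+ ///         // Read all values this layer can answer in one go, then push any
+ ///         // still-unresolved keys back with a continuation LSN below `entry.lsn_end`.
+ ///         println!("visiting layer {} up to lsn {}", entry.layer_id, entry.lsn_end);
+ ///     }
+ /// }
+ /// ```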
+    async fn get_vectored_reconstruct_data_timeline(
+        timeline: &Timeline,
+        keyspace: KeySpace,
+        mut cont_lsn: Lsn,
+        reconstruct_state: &mut ValuesReconstructState,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> Result<TimelineVisitOutcome, GetVectoredError> {
+        let mut unmapped_keyspace = keyspace.clone();
+        let mut fringe = LayerFringe::new();
+
+        let mut completed_keyspace = KeySpace::default();
+        let mut image_covered_keyspace = KeySpaceRandomAccum::new();
+
+        loop {
+            if cancel.is_cancelled() {
+                return Err(GetVectoredError::Cancelled);
+            }
+
+            let (keys_done_last_step, keys_with_image_coverage) =
+                reconstruct_state.consume_done_keys();
+            unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
+            completed_keyspace.merge(&keys_done_last_step);
+            if let Some(keys_with_image_coverage) = keys_with_image_coverage {
+                unmapped_keyspace
+                    .remove_overlapping_with(&KeySpace::single(keys_with_image_coverage.clone()));
+                image_covered_keyspace.add_range(keys_with_image_coverage);
+            }
+
+            // Do not descend any further if the last layer we visited
+            // completed all keys in the keyspace it inspected. This is not
+            // required for correctness, but avoids visiting extra layers
+            // which turns out to be a perf bottleneck in some cases.
+            if !unmapped_keyspace.is_empty() {
+                let guard = timeline.layers.read().await;
+                let layers = guard.layer_map()?;
+
+                let in_memory_layer = layers.find_in_memory_layer(|l| {
+                    let start_lsn = l.get_lsn_range().start;
+                    cont_lsn > start_lsn
+                });
+
+                match in_memory_layer {
+                    Some(l) => {
+                        let lsn_range = l.get_lsn_range().start..cont_lsn;
+                        fringe.update(
+                            ReadableLayer::InMemoryLayer(l),
+                            unmapped_keyspace.clone(),
+                            lsn_range,
+                        );
                     }
-                Err(state) => {
-                    return Err(PageReconstructError::Other(anyhow::anyhow!(
-                        "Timeline {} will not become active. Current state: {:?}",
-                        ancestor.timeline_id,
-                        &state,
-                    )));
-                }
-            }
-            ancestor
-                .wait_lsn(timeline.ancestor_lsn, ctx)
-                .await
-                .map_err(|e| match e {
-                    e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e),
-                    WaitLsnError::Shutdown => PageReconstructError::Cancelled,
-                    e @ WaitLsnError::BadState => {
-                        PageReconstructError::Other(anyhow::anyhow!(e))
+                    None => {
+                        for range in unmapped_keyspace.ranges.iter() {
+                            let results = layers.range_search(range.clone(), cont_lsn);
+
+                            results
+                                .found
+                                .into_iter()
+                                .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
+                                    (
+                                        ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
+                                        keyspace_accum.to_keyspace(),
+                                        lsn_floor..cont_lsn,
+                                    )
+                                })
+                                .for_each(|(layer, keyspace, lsn_range)| {
+                                    fringe.update(layer, keyspace, lsn_range)
+                                });
                         }
-            })?;
-
-            timeline_owned = ancestor;
-            timeline = &*timeline_owned;
-            prev_lsn = Lsn(u64::MAX);
-            continue 'outer;
-        }
-
-        let guard = timeline.layers.read().await;
-        let layers = guard.layer_map();
-
-        // Check the open and frozen in-memory layers first, in order from newest
-        // to oldest.
-        if let Some(open_layer) = &layers.open_layer {
-            let start_lsn = open_layer.get_lsn_range().start;
-            if cont_lsn > start_lsn {
-                //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display());
-                // Get all the data needed to reconstruct the page version from this layer.
-                // But if we have an older cached page image, no need to go past that.
- let lsn_floor = max(cached_lsn + 1, start_lsn); - result = match open_layer - .get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - ctx, - ) - .await - { - Ok(result) => result, - Err(e) => return Err(PageReconstructError::from(e)), - }; - cont_lsn = lsn_floor; - // metrics: open_layer does not count as fs access, so we are not updating `read_count` - traversal_path.push(( - result, - cont_lsn, - Box::new({ - let open_layer = Arc::clone(open_layer); - move || open_layer.traversal_id() - }), - )); - continue 'outer; - } - } - for frozen_layer in layers.frozen_layers.iter().rev() { - let start_lsn = frozen_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); - let lsn_floor = max(cached_lsn + 1, start_lsn); - result = match frozen_layer - .get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - ctx, - ) - .await - { - Ok(result) => result, - Err(e) => return Err(PageReconstructError::from(e)), - }; - cont_lsn = lsn_floor; - // metrics: open_layer does not count as fs access, so we are not updating `read_count` - traversal_path.push(( - result, - cont_lsn, - Box::new({ - let frozen_layer = Arc::clone(frozen_layer); - move || frozen_layer.traversal_id() - }), - )); - continue 'outer; + } } + + // It's safe to drop the layer map lock after planning the next round of reads. + // The fringe keeps readable handles for the layers which are safe to read even + // if layers were compacted or flushed. + // + // The more interesting consideration is: "Why is the read algorithm still correct + // if the layer map changes while it is operating?". Doing a vectored read on a + // timeline boils down to pushing an imaginary lsn boundary downwards for each range + // covered by the read. The layer map tells us how to move the lsn downwards for a + // range at *a particular point in time*. It is fine for the answer to be different + // at two different time points. + drop(guard); } - if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { - let layer = guard.get_from_desc(&layer); - // Get all the data needed to reconstruct the page version from this layer. - // But if we have an older cached page image, no need to go past that. - let lsn_floor = max(cached_lsn + 1, lsn_floor); - result = match layer - .get_value_reconstruct_data(key, lsn_floor..cont_lsn, reconstruct_state, ctx) - .await - { - Ok(result) => result, - Err(e) => return Err(PageReconstructError::from(e)), - }; - cont_lsn = lsn_floor; - *read_count += 1; - traversal_path.push(( - result, - cont_lsn, - Box::new({ - let layer = layer.to_owned(); - move || layer.traversal_id() - }), - )); - continue 'outer; - } else if timeline.ancestor_timeline.is_some() { - // Nothing on this timeline. 
Traverse to parent - result = ValueReconstructResult::Continue; - cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); - continue 'outer; + if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() { + let next_cont_lsn = lsn_range.start; + layer_to_read + .get_values_reconstruct_data( + keyspace_to_read.clone(), + lsn_range, + reconstruct_state, + ctx, + ) + .await?; + + unmapped_keyspace = keyspace_to_read; + cont_lsn = next_cont_lsn; + + reconstruct_state.on_layer_visited(&layer_to_read); } else { - // Nothing found - result = ValueReconstructResult::Missing; - continue 'outer; + break; } } + + Ok(TimelineVisitOutcome { + completed_keyspace, + image_covered_keyspace: image_covered_keyspace.consume_keyspace(), + }) } - /// # Cancel-safety - /// - /// This method is cancellation-safe. - async fn lookup_cached_page( + async fn get_ready_ancestor_timeline( &self, - key: &Key, - lsn: Lsn, + ancestor: &Arc, ctx: &RequestContext, - ) -> Option<(Lsn, Bytes)> { - let cache = page_cache::get(); + ) -> Result, GetReadyAncestorError> { + // It's possible that the ancestor timeline isn't active yet, or + // is active but hasn't yet caught up to the branch point. Wait + // for it. + // + // This cannot happen while the pageserver is running normally, + // because you cannot create a branch from a point that isn't + // present in the pageserver yet. However, we don't wait for the + // branch point to be uploaded to cloud storage before creating + // a branch. I.e., the branch LSN need not be remote consistent + // for the branching operation to succeed. + // + // Hence, if we try to load a tenant in such a state where + // 1. the existence of the branch was persisted (in IndexPart and/or locally) + // 2. but the ancestor state is behind branch_lsn because it was not yet persisted + // then we will need to wait for the ancestor timeline to + // re-stream WAL up to branch_lsn before we access it. + // + // How can a tenant get in such a state? + // - ungraceful pageserver process exit + // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219 + // + // NB: this could be avoided by requiring + // branch_lsn >= remote_consistent_lsn + // during branch creation. + match ancestor.wait_to_become_active(ctx).await { + Ok(()) => {} + Err(TimelineState::Stopping) => { + // If an ancestor is stopping, it means the tenant is stopping: handle this the same as if this timeline was stopping. + return Err(GetReadyAncestorError::Cancelled); + } + Err(state) => { + return Err(GetReadyAncestorError::BadState { + timeline_id: ancestor.timeline_id, + state, + }); + } + } + ancestor + .wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx) + .await + .map_err(|e| match e { + e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e), + WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled, + WaitLsnError::BadState(state) => GetReadyAncestorError::BadState { + timeline_id: ancestor.timeline_id, + state, + }, + })?; - // FIXME: It's pointless to check the cache for things that are not 8kB pages. - // We should look at the key to determine if it's a cacheable object - let (lsn, read_guard) = cache - .lookup_materialized_page(self.tenant_shard_id, self.timeline_id, key, lsn, ctx) - .await?; - let img = Bytes::from(read_guard.to_vec()); - Some((lsn, img)) - } - - fn get_ancestor_timeline(&self) -> anyhow::Result> { - let ancestor = self.ancestor_timeline.as_ref().with_context(|| { - format!( - "Ancestor is missing. 
Timeline id: {} Ancestor id {:?}", - self.timeline_id, - self.get_ancestor_timeline_id(), - ) - })?; - Ok(Arc::clone(ancestor)) + Ok(ancestor.clone()) } pub(crate) fn get_shard_identity(&self) -> &ShardIdentity { &self.shard_identity } + #[inline(always)] + pub(crate) fn shard_timeline_id(&self) -> ShardTimelineId { + ShardTimelineId { + shard_index: ShardIndex { + shard_number: self.shard_identity.number, + shard_count: self.shard_identity.count, + }, + timeline_id: self.timeline_id, + } + } + + /// Returns a non-frozen open in-memory layer for ingestion. /// - /// Get a handle to the latest layer for appending. - /// - async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { + /// Takes a witness of timeline writer state lock being held, because it makes no sense to call + /// this function without holding the mutex. + async fn get_layer_for_write( + &self, + lsn: Lsn, + _guard: &tokio::sync::MutexGuard<'_, Option>, + ctx: &RequestContext, + ) -> anyhow::Result> { let mut guard = self.layers.write().await; + let gate_guard = self.gate.enter().context("enter gate for inmem layer")?; + + let last_record_lsn = self.get_last_record_lsn(); + ensure!( + lsn > last_record_lsn, + "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", + lsn, + last_record_lsn, + ); + let layer = guard + .open_mut()? .get_layer_for_write( lsn, - self.get_last_record_lsn(), self.conf, self.timeline_id, self.tenant_shard_id, + gate_guard, + ctx, ) .await?; Ok(layer) } - async fn put_value( - &self, - key: Key, - lsn: Lsn, - val: &Value, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - //info!("PUT: key {} at {}", key, lsn); - let layer = self.get_layer_for_write(lsn).await?; - layer.put_value(key, lsn, val, ctx).await?; - Ok(()) - } - - async fn put_values( - &self, - values: &HashMap>, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - // Pick the first LSN in the batch to get the layer to write to. - for lsns in values.values() { - if let Some((lsn, _)) = lsns.first() { - let layer = self.get_layer_for_write(*lsn).await?; - layer.put_values(values, ctx).await?; - break; - } - } - Ok(()) - } - - async fn put_tombstones(&self, tombstones: &[(Range, Lsn)]) -> anyhow::Result<()> { - if let Some((_, lsn)) = tombstones.first() { - let layer = self.get_layer_for_write(*lsn).await?; - layer.put_tombstones(tombstones).await?; - } - Ok(()) - } - pub(crate) fn finish_write(&self, new_lsn: Lsn) { assert!(new_lsn.is_aligned()); @@ -2647,85 +3375,27 @@ impl Timeline { self.last_record_lsn.advance(new_lsn); } - async fn freeze_inmem_layer(&self, write_lock_held: bool) { - // Freeze the current open in-memory layer. It will be written to disk on next - // iteration. - let _write_guard = if write_lock_held { - None - } else { - Some(self.write_lock.lock().await) + /// Freeze any existing open in-memory layer and unconditionally notify the flush loop. + /// + /// Unconditional flush loop notification is given because in sharded cases we will want to + /// leave an Lsn gap. Unsharded tenants do not have Lsn gaps. + async fn freeze_inmem_layer_at( + &self, + at: Lsn, + write_lock: &mut tokio::sync::MutexGuard<'_, Option>, + ) -> Result { + let frozen = { + let mut guard = self.layers.write().await; + guard + .open_mut()? 
+ .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock) + .await }; - let mut guard = self.layers.write().await; - guard - .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at) - .await; - } - /// Layer flusher task's main loop. - async fn flush_loop( - self: &Arc, - mut layer_flush_start_rx: tokio::sync::watch::Receiver, - ctx: &RequestContext, - ) { - info!("started flush loop"); - loop { - tokio::select! { - _ = self.cancel.cancelled() => { - info!("shutting down layer flush task"); - break; - }, - _ = task_mgr::shutdown_watcher() => { - info!("shutting down layer flush task"); - break; - }, - _ = layer_flush_start_rx.changed() => {} - } - - trace!("waking up"); - let timer = self.metrics.flush_time_histo.start_timer(); - let flush_counter = *layer_flush_start_rx.borrow(); - let result = loop { - if self.cancel.is_cancelled() { - info!("dropping out of flush loop for timeline shutdown"); - // Note: we do not bother transmitting into [`layer_flush_done_tx`], because - // anyone waiting on that will respect self.cancel as well: they will stop - // waiting at the same time we as drop out of this loop. - return; - } - - let layer_to_flush = { - let guard = self.layers.read().await; - guard.layer_map().frozen_layers.front().cloned() - // drop 'layers' lock to allow concurrent reads and writes - }; - let Some(layer_to_flush) = layer_to_flush else { - break Ok(()); - }; - match self.flush_frozen_layer(layer_to_flush, ctx).await { - Ok(()) => {} - Err(FlushLayerError::Cancelled) => { - info!("dropping out of flush loop for timeline shutdown"); - return; - } - err @ Err( - FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_), - ) => { - error!("could not flush frozen layer: {err:?}"); - break err; - } - } - }; - // Notify any listeners that we're done - let _ = self - .layer_flush_done_tx - .send_replace((flush_counter, result)); - - timer.stop_and_record(); + if frozen { + let now = Instant::now(); + *(self.last_freeze_ts.write().unwrap()) = now; } - } - - async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> { - let mut rx = self.layer_flush_done_tx.subscribe(); // Increment the flush cycle counter and wake up the flush task. // Remember the new value, so that when we listen for the flush @@ -2735,26 +3405,127 @@ impl Timeline { let flush_loop_state = { *self.flush_loop_state.lock().unwrap() }; if !matches!(flush_loop_state, FlushLoopState::Running { .. }) { - anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}") + return Err(FlushLayerError::NotRunning(flush_loop_state)); } - self.layer_flush_start_tx.send_modify(|counter| { + self.layer_flush_start_tx.send_modify(|(counter, lsn)| { my_flush_request = *counter + 1; *counter = my_flush_request; + *lsn = std::cmp::max(at, *lsn); }); + assert_ne!(my_flush_request, 0); + + Ok(my_flush_request) + } + + /// Layer flusher task's main loop. + async fn flush_loop( + self: &Arc, + mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>, + ctx: &RequestContext, + ) { + info!("started flush loop"); + loop { + tokio::select! 
{
+                _ = self.cancel.cancelled() => {
+                    info!("shutting down layer flush task due to Timeline::cancel");
+                    break;
+                },
+                _ = layer_flush_start_rx.changed() => {}
+            }
+            trace!("waking up");
+            let (flush_counter, frozen_to_lsn) = *layer_flush_start_rx.borrow();
+
+            // The highest LSN to which we flushed in the loop over frozen layers
+            let mut flushed_to_lsn = Lsn(0);
+
+            let result = loop {
+                if self.cancel.is_cancelled() {
+                    info!("dropping out of flush loop for timeline shutdown");
+                    // Note: we do not bother transmitting into [`layer_flush_done_tx`], because
+                    // anyone waiting on that will respect self.cancel as well: they will stop
+                    // waiting at the same time as we drop out of this loop.
+                    return;
+                }
+
+                let timer = self.metrics.flush_time_histo.start_timer();
+
+                let layer_to_flush = {
+                    let guard = self.layers.read().await;
+                    let Ok(lm) = guard.layer_map() else {
+                        info!("dropping out of flush loop for timeline shutdown");
+                        return;
+                    };
+                    lm.frozen_layers.front().cloned()
+                    // drop 'layers' lock to allow concurrent reads and writes
+                };
+                let Some(layer_to_flush) = layer_to_flush else {
+                    break Ok(());
+                };
+                match self.flush_frozen_layer(layer_to_flush, ctx).await {
+                    Ok(this_layer_to_lsn) => {
+                        flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn);
+                    }
+                    Err(FlushLayerError::Cancelled) => {
+                        info!("dropping out of flush loop for timeline shutdown");
+                        return;
+                    }
+                    err @ Err(
+                        FlushLayerError::NotRunning(_)
+                        | FlushLayerError::Other(_)
+                        | FlushLayerError::CreateImageLayersError(_),
+                    ) => {
+                        error!("could not flush frozen layer: {err:?}");
+                        break err.map(|_| ());
+                    }
+                }
+                timer.stop_and_record();
+            };
+
+            // Unsharded tenants should never advance their LSN beyond the end of the
+            // highest layer they write: such gaps between layer data and the frozen LSN
+            // are only legal on sharded tenants.
+            debug_assert!(
+                self.shard_identity.count.count() > 1
+                    || flushed_to_lsn >= frozen_to_lsn
+                    || !flushed_to_lsn.is_valid()
+            );
+
+            if flushed_to_lsn < frozen_to_lsn && self.shard_identity.count.count() > 1 {
+                // If our layer flushes didn't carry disk_consistent_lsn up to the `to_lsn` advertised
+                // to us via layer_flush_start_rx, then advance it here.
+                //
+                // This path is only taken for tenants with multiple shards: single-sharded tenants
+                // should never encounter a gap in the WAL.
+                let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
+                tracing::debug!("Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}");
+                if self.set_disk_consistent_lsn(frozen_to_lsn) {
+                    if let Err(e) = self.schedule_uploads(frozen_to_lsn, vec![]) {
+                        tracing::warn!("Failed to schedule metadata upload after updating disk_consistent_lsn: {e}");
+                    }
+                }
+            }
+
+            // Notify any listeners that we're done
+            let _ = self
+                .layer_flush_done_tx
+                .send_replace((flush_counter, result));
+        }
+    }
+
+    /// Waits for any flush request created by [`Self::freeze_inmem_layer_at`] to complete.
+    async fn wait_flush_completion(&self, request: u64) -> Result<(), FlushLayerError> {
+        let mut rx = self.layer_flush_done_tx.subscribe();
         loop {
             {
                 let (last_result_counter, last_result) = &*rx.borrow();
-                if *last_result_counter >= my_flush_request {
-                    if let Err(_err) = last_result {
                         // We already logged the original error in
                         // flush_loop. We cannot propagate it to the caller
                         // here, because it might not be Cloneable
-                        anyhow::bail!(
-                            "Could not flush frozen layer.
Request id: {}", - my_flush_request - ); + return Err(err.clone()); } else { return Ok(()); } @@ -2763,7 +3534,7 @@ impl Timeline { trace!("waiting for flush to complete"); tokio::select! { rx_e = rx.changed() => { - rx_e?; + rx_e.map_err(|_| FlushLayerError::NotRunning(*self.flush_loop_state.lock().unwrap()))?; }, // Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring // the notification from [`flush_loop`] that it completed. @@ -2776,81 +3547,113 @@ impl Timeline { } } - fn flush_frozen_layers(&self) { - self.layer_flush_start_tx.send_modify(|val| *val += 1); - } - /// Flush one frozen in-memory layer to disk, as a new delta layer. - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id, layer=%frozen_layer))] + /// + /// Return value is the last lsn (inclusive) of the layer that was frozen. + #[instrument(skip_all, fields(layer=%frozen_layer))] async fn flush_frozen_layer( self: &Arc, frozen_layer: Arc, ctx: &RequestContext, - ) -> Result<(), FlushLayerError> { + ) -> Result { + debug_assert_current_span_has_tenant_and_timeline_id(); + // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. This is possible as long as *all* the data imported into the // repository have the same LSN. let lsn_range = frozen_layer.get_lsn_range(); - let (layers_to_upload, delta_layer_to_add) = - if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { - #[cfg(test)] - match &mut *self.flush_loop_state.lock().unwrap() { - FlushLoopState::NotStarted | FlushLoopState::Exited => { - panic!("flush loop not running") - } - FlushLoopState::Running { - initdb_optimization_count, - .. - } => { + + // Whether to directly create image layers for this flush, or flush them as delta layers + let create_image_layer = + lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1); + + #[cfg(test)] + { + match &mut *self.flush_loop_state.lock().unwrap() { + FlushLoopState::NotStarted | FlushLoopState::Exited => { + panic!("flush loop not running") + } + FlushLoopState::Running { + expect_initdb_optimization, + initdb_optimization_count, + .. + } => { + if create_image_layer { *initdb_optimization_count += 1; - } - } - // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not - // require downloading anything during initial import. - let (partitioning, _lsn) = self - .repartition( - self.initdb_lsn, - self.get_compaction_target_size(), - EnumSet::empty(), - ctx, - ) - .await?; - - if self.cancel.is_cancelled() { - return Err(FlushLayerError::Cancelled); - } - - // For image layers, we add them immediately into the layer map. - ( - self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx) - .await?, - None, - ) - } else { - #[cfg(test)] - match &mut *self.flush_loop_state.lock().unwrap() { - FlushLoopState::NotStarted | FlushLoopState::Exited => { - panic!("flush loop not running") - } - FlushLoopState::Running { - expect_initdb_optimization, - .. - } => { + } else { assert!(!*expect_initdb_optimization, "expected initdb optimization"); } } - // Normal case, write out a L0 delta layer file. - // `create_delta_layer` will not modify the layer map. - // We will remove frozen layer and add delta layer in one atomic operation later. 
-            let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
-            (
-                // FIXME: even though we have a single image and single delta layer assumption
-                // we push them to vec
-                vec![layer.clone()],
-                Some(layer),
+            }
+        }
+
+        let (layers_to_upload, delta_layer_to_add) = if create_image_layer {
+            // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
+            // require downloading anything during initial import.
+            let ((rel_partition, metadata_partition), _lsn) = self
+                .repartition(
+                    self.initdb_lsn,
+                    self.get_compaction_target_size(),
+                    EnumSet::empty(),
+                    ctx,
                 )
+                .await
+                .map_err(|e| FlushLayerError::from_anyhow(self, e))?;
+
+            if self.cancel.is_cancelled() {
+                return Err(FlushLayerError::Cancelled);
+            }
+
+            let mut layers_to_upload = Vec::new();
+            layers_to_upload.extend(
+                self.create_image_layers(
+                    &rel_partition,
+                    self.initdb_lsn,
+                    ImageLayerCreationMode::Initial,
+                    ctx,
+                )
+                .await?,
+            );
+            if !metadata_partition.parts.is_empty() {
+                assert_eq!(
+                    metadata_partition.parts.len(),
+                    1,
+                    "currently sparse keyspace should only contain a single metadata keyspace"
+                );
+                layers_to_upload.extend(
+                    self.create_image_layers(
+                        // Safety: create_image_layers treats sparse keyspaces differently, in that it does not scan
+                        // every single key within the keyspace; therefore, it's safe to force-convert it
+                        // into a dense keyspace before calling this function.
+                        &metadata_partition.into_dense(),
+                        self.initdb_lsn,
+                        ImageLayerCreationMode::Initial,
+                        ctx,
+                    )
+                    .await?,
+                );
+            }
+
+            (layers_to_upload, None)
+        } else {
+            // Normal case, write out a L0 delta layer file.
+            // `create_delta_layer` will not modify the layer map.
+            // We will remove frozen layer and add delta layer in one atomic operation later.
+            let Some(layer) = self
+                .create_delta_layer(&frozen_layer, None, ctx)
+                .await
+                .map_err(|e| FlushLayerError::from_anyhow(self, e))?
+            else {
+                panic!("delta layer cannot be empty if no filter is applied");
             };
+            (
+                // FIXME: even though we have a single image and single delta layer assumption
+                // we push them to vec
+                vec![layer.clone()],
+                Some(layer),
+            )
+        };

         pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable");

@@ -2859,32 +3662,42 @@
         }

         let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
-        let old_disk_consistent_lsn = self.disk_consistent_lsn.load();

         // The new on-disk layers are now in the layer map. We can remove the
         // in-memory layer from the map now. The flushed layer is stored in
         // the mapping in `create_delta_layer`.
-        let metadata = {
+        {
             let mut guard = self.layers.write().await;
-            if self.cancel.is_cancelled() {
-                return Err(FlushLayerError::Cancelled);
-            }
-
-            guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);
-
-            if disk_consistent_lsn != old_disk_consistent_lsn {
-                assert!(disk_consistent_lsn > old_disk_consistent_lsn);
-                self.disk_consistent_lsn.store(disk_consistent_lsn);
+            guard.open_mut()?.finish_flush_l0_layer(
+                delta_layer_to_add.as_ref(),
+                &frozen_layer,
+                &self.metrics,
+            );
+            if self.set_disk_consistent_lsn(disk_consistent_lsn) {
                 // Schedule remote uploads that will reflect our new disk_consistent_lsn
-                Some(self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?)
-            } else {
-                None
+                self.schedule_uploads(disk_consistent_lsn, layers_to_upload)
+                    .map_err(|e| FlushLayerError::from_anyhow(self, e))?;
             }
             // release lock on 'layers'
         };

+        // Backpressure mechanism: wait before continuing the flush loop until we have uploaded all layer files.
+        // This makes us refuse ingest until the new layers have been persisted to the remote.
+        self.remote_client
+            .wait_completion()
+            .await
+            .map_err(|e| match e {
+                WaitCompletionError::UploadQueueShutDownOrStopped
+                | WaitCompletionError::NotInitialized(
+                    NotInitialized::ShuttingDown | NotInitialized::Stopped,
+                ) => FlushLayerError::Cancelled,
+                WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => {
+                    FlushLayerError::Other(anyhow!(e).into())
+                }
+            })?;
+
         // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
         // a compaction can delete the file and then it won't be available for uploads any more.
         // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this
@@ -2895,23 +3708,16 @@
         // This failpoint is used by another test case `test_pageserver_recovery`.
         fail_point!("flush-frozen-exit");

-        // Update the metadata file, with new 'disk_consistent_lsn'
-        //
-        // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing
-        // *all* the layers, to avoid fsyncing the file multiple times.
+        Ok(Lsn(lsn_range.end.0 - 1))
+    }

-        // If we updated our disk_consistent_lsn, persist the updated metadata to local disk.
-        if let Some(metadata) = metadata {
-            save_metadata(
-                self.conf,
-                &self.tenant_shard_id,
-                &self.timeline_id,
-                &metadata,
-            )
-            .await
-            .context("save_metadata")?;
-        }
-        Ok(())
+    /// Return true if the value changed
+    ///
+    /// This function must only be used from the layer flush task.
+    fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
+        let old_value = self.disk_consistent_lsn.fetch_max(new_value);
+        assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonically at runtime; current {old_value}, offered {new_value}");
+        new_value != old_value
     }

     /// Update metadata file
@@ -2919,7 +3725,7 @@
         &self,
         disk_consistent_lsn: Lsn,
         layers_to_upload: impl IntoIterator<Item = ResidentLayer>,
-    ) -> anyhow::Result<TimelineMetadata> {
+    ) -> anyhow::Result<()> {
         // We can only save a valid 'prev_record_lsn' value on disk if we
         // flushed *all* in-memory changes to disk.
We only track // 'prev_record_lsn' in memory for the latest processed record, so we @@ -2936,19 +3742,10 @@ impl Timeline { None }; - let ancestor_timeline_id = self - .ancestor_timeline - .as_ref() - .map(|ancestor| ancestor.timeline_id); - - let metadata = TimelineMetadata::new( + let update = crate::tenant::metadata::MetadataUpdate::new( disk_consistent_lsn, ondisk_prev_record_lsn, - ancestor_timeline_id, - self.ancestor_lsn, *self.latest_gc_cutoff_lsn.read(), - self.initdb_lsn, - self.pg_version, ); fail_point!("checkpoint-before-saving-metadata", |x| bail!( @@ -2956,48 +3753,23 @@ impl Timeline { x.unwrap() )); - if let Some(remote_client) = &self.remote_client { - for layer in layers_to_upload { - remote_client.schedule_layer_file_upload(layer)?; - } - remote_client.schedule_index_upload_for_metadata_update(&metadata)?; + for layer in layers_to_upload { + self.remote_client.schedule_layer_file_upload(layer)?; } - - Ok(metadata) - } - - async fn update_metadata_file( - &self, - disk_consistent_lsn: Lsn, - layers_to_upload: impl IntoIterator, - ) -> anyhow::Result<()> { - let metadata = self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?; - - save_metadata( - self.conf, - &self.tenant_shard_id, - &self.timeline_id, - &metadata, - ) - .await - .context("save_metadata")?; + self.remote_client + .schedule_index_upload_for_metadata_update(&update)?; Ok(()) } pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> { - if let Some(remote_client) = &self.remote_client { - remote_client - .preserve_initdb_archive( - &self.tenant_shard_id.tenant_id, - &self.timeline_id, - &self.cancel, - ) - .await?; - } else { - bail!("No remote storage configured, but was asked to backup the initdb archive for {} / {}", self.tenant_shard_id.tenant_id, self.timeline_id); - } - Ok(()) + self.remote_client + .preserve_initdb_archive( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + &self.cancel, + ) + .await } // Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked @@ -3005,55 +3777,59 @@ impl Timeline { async fn create_delta_layer( self: &Arc, frozen_layer: &Arc, + key_range: Option>, ctx: &RequestContext, - ) -> anyhow::Result { - let span = tracing::info_span!("blocking"); - let new_delta: ResidentLayer = tokio::task::spawn_blocking({ - let self_clone = Arc::clone(self); - let frozen_layer = Arc::clone(frozen_layer); - let ctx = ctx.attached_child(); - move || { - // Write it out - // Keep this inside `spawn_blocking` and `Handle::current` - // as long as the write path is still sync and the read impl - // is still not fully async. Otherwise executor threads would - // be blocked. - let _g = span.entered(); - let new_delta = - Handle::current().block_on(frozen_layer.write_to_disk(&self_clone, &ctx))?; - let new_delta_path = new_delta.local_path().to_owned(); + ) -> anyhow::Result> { + let self_clone = Arc::clone(self); + let frozen_layer = Arc::clone(frozen_layer); + let ctx = ctx.attached_child(); + let work = async move { + let Some((desc, path)) = frozen_layer + .write_to_disk(&ctx, key_range, self_clone.l0_flush_global_state.inner()) + .await? + else { + return Ok(None); + }; + let new_delta = Layer::finish_creating(self_clone.conf, &self_clone, desc, &path)?; - // Sync it to disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable. - // - // NB: timeline dir must be synced _after_ the file contents are durable. 
-            // So, two separate fsyncs are required, they mustn't be batched.
-            //
-            // TODO: If we're running inside 'flush_frozen_layers' and there are multiple
-            // files to flush, the fsync overhead can be reduces as follows:
-            // 1. write them all to temporary file names
-            // 2. fsync them
-            // 3. rename to the final name
-            // 4. fsync the parent directory.
-            // Note that (1),(2),(3) today happen inside write_to_disk().
-            //
-            // FIXME: the writer already fsyncs all data, only rename needs to be fsynced here
-            par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?;
-            par_fsync::par_fsync(&[self_clone
+            // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
+            // We just need to fsync the directory in which these inodes are linked,
+            // which we know to be the timeline directory.
+            //
+            // We use fatal_err() below because after write_to_disk returns with success,
+            // the in-memory state of the filesystem already has the layer file in its final place,
+            // and subsequent pageserver code could think it's durable while it really isn't.
+            let timeline_dir = VirtualFile::open(
+                &self_clone
                     .conf
-                .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)])
-            .context("fsync of timeline dir")?;
-
-            anyhow::Ok(new_delta)
+                    .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id),
+                &ctx,
+            )
+            .await
+            .fatal_err("VirtualFile::open for timeline dir fsync");
+            timeline_dir
+                .sync_all()
+                .await
+                .fatal_err("VirtualFile::sync_all timeline dir");
+            anyhow::Ok(Some(new_delta))
+        };
+        // Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking.
+        // Preserve that behavior for `virtual_file_io_engine=std-fs`.
+        use crate::virtual_file::io_engine::IoEngine;
+        match crate::virtual_file::io_engine::get() {
+            IoEngine::NotSet => panic!("io engine not set"),
+            IoEngine::StdFs => {
+                let span = tracing::info_span!("blocking");
+                tokio::task::spawn_blocking({
+                    move || Handle::current().block_on(work.instrument(span))
+                })
+                .await
+                .context("spawn_blocking")
+                .and_then(|x| x)
             }
-        })
-        .await
-        .context("spawn_blocking")
-        .and_then(|x| x)?;
-
-        Ok(new_delta)
+            #[cfg(target_os = "linux")]
+            IoEngine::TokioEpollUring => work.await,
+        }
     }

     async fn repartition(
@@ -3062,31 +3838,41 @@
         partition_size: u64,
         flags: EnumSet<CompactFlags>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<(KeyPartitioning, Lsn)> {
-        {
-            let partitioning_guard = self.partitioning.lock().unwrap();
-            let distance = lsn.0 - partitioning_guard.1 .0;
-            if partitioning_guard.1 != Lsn(0)
-                && distance <= self.repartition_threshold
-                && !flags.contains(CompactFlags::ForceRepartition)
-            {
-                debug!(
-                    distance,
-                    threshold = self.repartition_threshold,
-                    "no repartitioning needed"
-                );
-                return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
-            }
+    ) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> {
+        let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
+            // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
+            // The other is the initdb optimization in flush_frozen_layer, used by `bootstrap_timeline`, which runs before `.activate()`
+            // and hence before the compaction task starts.
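+            //
+            // try_lock() therefore only fails if those callers unexpectedly overlap,
+            // which we surface as an error below rather than silently serializing them.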
+ anyhow::bail!("repartition() called concurrently, this should not happen"); + }; + let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard; + if lsn < *partition_lsn { + anyhow::bail!("repartition() called with LSN going backwards, this should not happen"); } - let keyspace = self.collect_keyspace(lsn, ctx).await?; - let partitioning = keyspace.partition(partition_size); - let mut partitioning_guard = self.partitioning.lock().unwrap(); - if lsn > partitioning_guard.1 { - *partitioning_guard = (partitioning, lsn); - } else { - warn!("Concurrent repartitioning of keyspace. This unexpected, but probably harmless"); + let distance = lsn.0 - partition_lsn.0; + if *partition_lsn != Lsn(0) + && distance <= self.repartition_threshold + && !flags.contains(CompactFlags::ForceRepartition) + { + debug!( + distance, + threshold = self.repartition_threshold, + "no repartitioning needed" + ); + return Ok(( + (dense_partition.clone(), sparse_partition.clone()), + *partition_lsn, + )); } + + let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; + let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size); + let sparse_partitioning = SparseKeyPartitioning { + parts: vec![sparse_ks], + }; // no partitioning for metadata keys for now + *partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn); + Ok((partitioning_guard.0.clone(), partitioning_guard.1)) } @@ -3095,34 +3881,11 @@ impl Timeline { let threshold = self.get_image_creation_threshold(); let guard = self.layers.read().await; - let layers = guard.layer_map(); + let Ok(layers) = guard.layer_map() else { + return false; + }; let mut max_deltas = 0; - { - let wanted_image_layers = self.wanted_image_layers.lock().unwrap(); - if let Some((cutoff_lsn, wanted)) = &*wanted_image_layers { - let img_range = - partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; - if wanted.overlaps(&img_range) { - // - // gc_timeline only pays attention to image layers that are older than the GC cutoff, - // but create_image_layers creates image layers at last-record-lsn. - // So it's possible that gc_timeline wants a new image layer to be created for a key range, - // but the range is already covered by image layers at more recent LSNs. Before we - // create a new image layer, check if the range is already covered at more recent LSNs. - if !layers - .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1)) - { - debug!( - "Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})", - img_range.start, img_range.end, cutoff_lsn, lsn - ); - return true; - } - } - } - } - for part_range in &partition.ranges { let image_coverage = layers.image_coverage(part_range, lsn); for (img_range, last_img) in image_coverage { @@ -3166,12 +3929,254 @@ impl Timeline { false } - #[tracing::instrument(skip_all, fields(%lsn, %force))] + /// Create image layers for Postgres data. Assumes the caller passes a partition that is not too large, + /// so that at most one image layer will be produced from this function. 
+    async fn create_image_layer_for_rel_blocks(
+        self: &Arc<Self>,
+        partition: &KeySpace,
+        mut image_layer_writer: ImageLayerWriter,
+        lsn: Lsn,
+        ctx: &RequestContext,
+        img_range: Range<Key>,
+        start: Key,
+    ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
+        let mut wrote_keys = false;
+
+        let mut key_request_accum = KeySpaceAccum::new();
+        for range in &partition.ranges {
+            let mut key = range.start;
+            while key < range.end {
+                // Decide whether to retain this key: usually we do, but sharded tenants may
+                // need to drop keys that don't belong to them. If we retain the key, add it
+                // to `key_request_accum` for later issuing a vectored get
+                if self.shard_identity.is_key_disposable(&key) {
+                    debug!(
+                        "Dropping key {} during compaction (it belongs on shard {:?})",
+                        key,
+                        self.shard_identity.get_shard_number(&key)
+                    );
+                } else {
+                    key_request_accum.add_key(key);
+                }
+
+                let last_key_in_range = key.next() == range.end;
+                key = key.next();
+
+                // Maybe flush `key_request_accum`
+                if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
+                    || (last_key_in_range && key_request_accum.raw_size() > 0)
+                {
+                    let results = self
+                        .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
+                        .await?;
+
+                    if self.cancel.is_cancelled() {
+                        return Err(CreateImageLayersError::Cancelled);
+                    }
+
+                    for (img_key, img) in results {
+                        let img = match img {
+                            Ok(img) => img,
+                            Err(err) => {
+                                // If we fail to reconstruct a VM or FSM page, we can zero the
+                                // page without losing any actual user data. That seems better
+                                // than failing repeatedly and getting stuck.
+                                //
+                                // We had a bug at one point, where we truncated the FSM and VM
+                                // in the pageserver, but the Postgres didn't know about that
+                                // and continued to generate incremental WAL records for pages
+                                // that didn't exist in the pageserver. Trying to replay those
+                                // WAL records failed to find the previous image of the page.
+                                // This special case allows us to recover from that situation.
+                                // See https://github.com/neondatabase/neon/issues/2601.
+                                //
+                                // Unfortunately we cannot do this for the main fork, or for
+                                // any metadata keys, as that would lead to actual data
+                                // loss.
+                                if img_key.is_rel_fsm_block_key() || img_key.is_rel_vm_block_key() {
+                                    warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
+                                    ZERO_PAGE.clone()
+                                } else {
+                                    return Err(CreateImageLayersError::from(err));
+                                }
+                            }
+                        };
+
+                        // Write all the keys we just read into our new image layer.
+                        image_layer_writer.put_image(img_key, img, ctx).await?;
+                        wrote_keys = true;
+                    }
+                }
+            }
+        }
+
+        if wrote_keys {
+            // Normal path: we have written some data into the new image layer for this
+            // partition, so flush it to disk.
+            let image_layer = image_layer_writer.finish(self, ctx).await?;
+            Ok(ImageLayerCreationOutcome {
+                image: Some(image_layer),
+                next_start_key: img_range.end,
+            })
+        } else {
+            // Special case: the image layer may be empty if this is a sharded tenant and the
+            // partition does not cover any keys owned by this shard. In this case, to ensure
+            // we don't leave gaps between image layers, leave `start` where it is, so that the next
+            // layer we write will cover the key range that we just scanned.
+            tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
+            Ok(ImageLayerCreationOutcome {
+                image: None,
+                next_start_key: start,
+            })
+        }
+    }
+
+    /// Create an image layer for metadata keys. This function produces one image layer for all metadata
+    /// keys for now.
Because metadata keys cannot exceed basebackup size limit, the image layer for it + /// would not be too large to fit in a single image layer. + #[allow(clippy::too_many_arguments)] + async fn create_image_layer_for_metadata_keys( + self: &Arc, + partition: &KeySpace, + mut image_layer_writer: ImageLayerWriter, + lsn: Lsn, + ctx: &RequestContext, + img_range: Range, + mode: ImageLayerCreationMode, + start: Key, + ) -> Result { + // Metadata keys image layer creation. + let mut reconstruct_state = ValuesReconstructState::default(); + let data = self + .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) + .await?; + let (data, total_kb_retrieved, total_keys_retrieved) = { + let mut new_data = BTreeMap::new(); + let mut total_kb_retrieved = 0; + let mut total_keys_retrieved = 0; + for (k, v) in data { + let v = v?; + total_kb_retrieved += KEY_SIZE + v.len(); + total_keys_retrieved += 1; + new_data.insert(k, v); + } + (new_data, total_kb_retrieved / 1024, total_keys_retrieved) + }; + let delta_files_accessed = reconstruct_state.get_delta_layers_visited(); + + let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS; + debug!( + trigger_generation, + delta_files_accessed, + total_kb_retrieved, + total_keys_retrieved, + "generate metadata images" + ); + + if !trigger_generation && mode == ImageLayerCreationMode::Try { + return Ok(ImageLayerCreationOutcome { + image: None, + next_start_key: img_range.end, + }); + } + if self.cancel.is_cancelled() { + return Err(CreateImageLayersError::Cancelled); + } + let mut wrote_any_image = false; + for (k, v) in data { + if v.is_empty() { + // the key has been deleted, it does not need an image + // in metadata keyspace, an empty image == tombstone + continue; + } + wrote_any_image = true; + + // No need to handle sharding b/c metadata keys are always on the 0-th shard. + + // TODO: split image layers to avoid too large layer files. Too large image files are not handled + // on the normal data path either. + image_layer_writer.put_image(k, v, ctx).await?; + } + + if wrote_any_image { + // Normal path: we have written some data into the new image layer for this + // partition, so flush it to disk. + let image_layer = image_layer_writer.finish(self, ctx).await?; + Ok(ImageLayerCreationOutcome { + image: Some(image_layer), + next_start_key: img_range.end, + }) + } else { + // Special case: the image layer may be empty if this is a sharded tenant and the + // partition does not cover any keys owned by this shard. In this case, to ensure + // we don't leave gaps between image layers, leave `start` where it is, so that the next + // layer we write will cover the key range that we just scanned. + tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); + Ok(ImageLayerCreationOutcome { + image: None, + next_start_key: start, + }) + } + } + + /// Predicate function which indicates whether we should check if new image layers + /// are required. Since checking if new image layers are required is expensive in + /// terms of CPU, we only do it in the following cases: + /// 1. If the timeline has ingested sufficient WAL to justify the cost + /// 2. If enough time has passed since the last check: + /// 1. For large tenants, we wish to perform the check more often since they + /// suffer from the lack of image layers + /// 2. 
For small tenants (that can mostly fit in RAM), we use a much longer interval + fn should_check_if_image_layers_required(self: &Arc, lsn: Lsn) -> bool { + const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024; + + let last_checks_at = self.last_image_layer_creation_check_at.load(); + let distance = lsn + .checked_sub(last_checks_at) + .expect("Attempt to compact with LSN going backwards"); + let min_distance = + self.get_image_layer_creation_check_threshold() as u64 * self.get_checkpoint_distance(); + + let distance_based_decision = distance.0 >= min_distance; + + let mut time_based_decision = false; + let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap(); + if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() { + let check_required_after = if Into::::into(&logical_size) >= LARGE_TENANT_THRESHOLD + { + self.get_checkpoint_timeout() + } else { + Duration::from_secs(3600 * 48) + }; + + time_based_decision = match *last_check_instant { + Some(last_check) => { + let elapsed = last_check.elapsed(); + elapsed >= check_required_after + } + None => true, + }; + } + + // Do the expensive delta layer counting only if this timeline has ingested sufficient + // WAL since the last check or a checkpoint timeout interval has elapsed since the last + // check. + let decision = distance_based_decision || time_based_decision; + + if decision { + self.last_image_layer_creation_check_at.store(lsn); + *last_check_instant = Some(Instant::now()); + } + + decision + } + + #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, partitioning: &KeyPartitioning, lsn: Lsn, - force: bool, + mode: ImageLayerCreationMode, ctx: &RequestContext, ) -> Result, CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); @@ -3188,135 +4193,124 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. let mut start = Key::MIN; + let check_for_image_layers = self.should_check_if_image_layers_required(lsn); + for partition in partitioning.parts.iter() { - let img_range = start..partition.ranges.last().unwrap().end; - start = img_range.end; - if force || self.time_for_new_image_layer(partition, lsn).await { - let mut image_layer_writer = ImageLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - &img_range, - lsn, - ) - .await?; - - fail_point!("image-layer-writer-fail-before-finish", |_| { - Err(CreateImageLayersError::Other(anyhow::anyhow!( - "failpoint image-layer-writer-fail-before-finish" - ))) - }); - - let mut key_request_accum = KeySpaceAccum::new(); - for range in &partition.ranges { - let mut key = range.start; - while key < range.end { - if self.shard_identity.is_key_disposable(&key) { - debug!( - "Dropping key {} during compaction (it belongs on shard {:?})", - key, - self.shard_identity.get_shard_number(&key) - ); - key = key.next(); - continue; - } - - key_request_accum.add_key(key); - if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS - || key.next() == range.end - { - let results = self - .get_vectored( - &key_request_accum.consume_keyspace().ranges, - lsn, - ctx, - ) - .await?; - - for (img_key, img) in results { - let img = match img { - Ok(img) => img, - Err(err) => { - // If we fail to reconstruct a VM or FSM page, we can zero the - // page without losing any actual user data. That seems better - // than failing repeatedly and getting stuck. 
- // - // We had a bug at one point, where we truncated the FSM and VM - // in the pageserver, but the Postgres didn't know about that - // and continued to generate incremental WAL records for pages - // that didn't exist in the pageserver. Trying to replay those - // WAL records failed to find the previous image of the page. - // This special case allows us to recover from that situation. - // See https://github.com/neondatabase/neon/issues/2601. - // - // Unfortunately we cannot do this for the main fork, or for - // any metadata keys, keys, as that would lead to actual data - // loss. - if is_rel_fsm_block_key(img_key) - || is_rel_vm_block_key(img_key) - { - warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); - ZERO_PAGE.clone() - } else { - return Err( - CreateImageLayersError::PageReconstructError(err), - ); - } - } - }; - - image_layer_writer.put_image(img_key, &img).await?; - } - } - - key = key.next(); - } - } - let image_layer = image_layer_writer.finish(self).await?; - image_layers.push(image_layer); + if self.cancel.is_cancelled() { + return Err(CreateImageLayersError::Cancelled); } - } - // All layers that the GC wanted us to create have now been created. - // - // It's possible that another GC cycle happened while we were compacting, and added - // something new to wanted_image_layers, and we now clear that before processing it. - // That's OK, because the next GC iteration will put it back in. - *self.wanted_image_layers.lock().unwrap() = None; - // Sync the new layer to disk before adding it to the layer map, to make sure - // we don't garbage collect something based on the new layer, before it has - // reached the disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable - // - // Compaction creates multiple image layers. It would be better to create them all - // and fsync them all in parallel. - let all_paths = image_layers - .iter() - .map(|layer| layer.local_path().to_owned()) - .collect::>(); + let img_range = start..partition.ranges.last().unwrap().end; + let compact_metadata = partition.overlaps(&Key::metadata_key_range()); + if compact_metadata { + for range in &partition.ranges { + assert!( + range.start.field1 >= METADATA_KEY_BEGIN_PREFIX + && range.end.field1 <= METADATA_KEY_END_PREFIX, + "metadata keys must be partitioned separately" + ); + } + if mode == ImageLayerCreationMode::Try && !check_for_image_layers { + // Skip compaction if there are not enough updates. Metadata compaction will do a scan and + // might mess up with evictions. + start = img_range.end; + continue; + } + // For initial and force modes, we always generate image layers for metadata keys. + } else if let ImageLayerCreationMode::Try = mode { + // check_for_image_layers = false -> skip + // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate + if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await { + start = img_range.end; + continue; + } + } + if let ImageLayerCreationMode::Force = mode { + // When forced to create image layers, we might try and create them where they already + // exist. This mode is only used in tests/debug. 
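+                // Probing the layer map for an identical `PersistentLayerKey` below
+                // lets us skip regeneration instead of writing a duplicate layer.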
+ let layers = self.layers.read().await; + if layers.contains_key(&PersistentLayerKey { + key_range: img_range.clone(), + lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn), + is_delta: false, + }) { + tracing::info!( + "Skipping image layer at {lsn} {}..{}, already exists", + img_range.start, + img_range.end + ); + start = img_range.end; + continue; + } + } - par_fsync::par_fsync_async(&all_paths) - .await - .context("fsync of newly created layer files")?; + let image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &img_range, + lsn, + ctx, + ) + .await?; - if !all_paths.is_empty() { - par_fsync::par_fsync_async(&[self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id)]) - .await - .context("fsync of timeline dir")?; + fail_point!("image-layer-writer-fail-before-finish", |_| { + Err(CreateImageLayersError::Other(anyhow::anyhow!( + "failpoint image-layer-writer-fail-before-finish" + ))) + }); + + if !compact_metadata { + let ImageLayerCreationOutcome { + image, + next_start_key, + } = self + .create_image_layer_for_rel_blocks( + partition, + image_layer_writer, + lsn, + ctx, + img_range, + start, + ) + .await?; + + start = next_start_key; + image_layers.extend(image); + } else { + let ImageLayerCreationOutcome { + image, + next_start_key, + } = self + .create_image_layer_for_metadata_keys( + partition, + image_layer_writer, + lsn, + ctx, + img_range, + mode, + start, + ) + .await?; + start = next_start_key; + image_layers.extend(image); + } } let mut guard = self.layers.write().await; // FIXME: we could add the images to be uploaded *before* returning from here, but right - // now they are being scheduled outside of write lock - guard.track_new_image_layers(&image_layers, &self.metrics); + // now they are being scheduled outside of write lock; current way is inconsistent with + // compaction lock order. + guard + .open_mut()? + .track_new_image_layers(&image_layers, &self.metrics); drop_wlock(guard); timer.stop_and_record(); + // Creating image layers may have caused some previously visible layers to be covered + self.update_layer_visibility().await?; + Ok(image_layers) } @@ -3324,6 +4318,22 @@ impl Timeline { /// this Timeline is shut down. Calling this function will cause the initial /// logical size calculation to skip waiting for the background jobs barrier. pub(crate) async fn await_initial_logical_size(self: Arc) { + if !self.shard_identity.is_shard_zero() { + // We don't populate logical size on shard >0: skip waiting for it. + return; + } + + if self.remote_client.is_deleting() { + // The timeline was created in a deletion-resume state, we don't expect logical size to be populated + return; + } + + if self.current_logical_size.current_size().is_exact() { + // root timelines are initialized with exact count, but never start the background + // calculation + return; + } + if let Some(await_bg_cancel) = self .current_logical_size .cancel_wait_for_background_loop_concurrency_limit_semaphore @@ -3335,9 +4345,10 @@ impl Timeline { // the logical size cancellation to skip the concurrency limit semaphore. // TODO: this is an unexpected case. We should restructure so that it // can't happen. 
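+            // In debug builds, the debug_assert below turns this supposedly
+            // impossible path into a loud failure.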
-            tracing::info!(
+            tracing::warn!(
                 "await_initial_logical_size: can't get semaphore cancel token, skipping"
             );
+            debug_assert!(false);
         }

         tokio::select!(
             _ = await_bg_cancel.cancelled() => {},
             _ = self.cancel.cancelled() => {}
         )
     }
+
+    /// Detach this timeline from its ancestor by copying all of the ancestor's layers as this
+    /// Timeline's layers up to the ancestor_lsn.
+    ///
+    /// Requires a timeline that:
+    /// - has an ancestor to detach from
+    /// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not
+    ///   a technical requirement
+    ///
+    /// After the operation has been started, it cannot be canceled. Upon restart it needs to be
+    /// polled again until completion.
+    ///
+    /// During the operation all timelines sharing the data with this timeline will be reparented
+    /// from our ancestor to be branches of this timeline.
+    pub(crate) async fn prepare_to_detach_from_ancestor(
+        self: &Arc,
+        tenant: &crate::tenant::Tenant,
+        options: detach_ancestor::Options,
+        ctx: &RequestContext,
+    ) -> Result {
+        detach_ancestor::prepare(self, tenant, options, ctx).await
+    }
+
+    /// Second step of detach from ancestor; detaches `self` from its current ancestor and
+    /// reparents any reparentable children of the previous ancestor.
+    ///
+    /// This method is to be called while holding the TenantManager's tenant slot, so during this
+    /// method we cannot be deleted nor can any timeline be deleted. After this method returns
+    /// successfully, the tenant must be reloaded.
+    ///
+    /// Final step will be to [`Self::complete_detaching_timeline_ancestor`] after optionally
+    /// resetting the tenant.
+    pub(crate) async fn detach_from_ancestor_and_reparent(
+        self: &Arc,
+        tenant: &crate::tenant::Tenant,
+        prepared: detach_ancestor::PreparedTimelineDetach,
+        ctx: &RequestContext,
+    ) -> Result {
+        detach_ancestor::detach_and_reparent(self, tenant, prepared, ctx).await
+    }
+
+    /// Final step which unblocks the GC.
+    ///
+    /// The tenant must've been reset if ancestry was modified previously (in tenant manager).
+    pub(crate) async fn complete_detaching_timeline_ancestor(
+        self: &Arc,
+        tenant: &crate::tenant::Tenant,
+        attempt: detach_ancestor::Attempt,
+        ctx: &RequestContext,
+    ) -> Result<(), detach_ancestor::Error> {
+        detach_ancestor::complete(self, tenant, attempt, ctx).await
+    }
+
+    /// Switch aux file policy and schedule upload to the index part.
+    pub(crate) fn do_switch_aux_policy(&self, policy: AuxFilePolicy) -> anyhow::Result<()> {
+        self.last_aux_file_policy.store(Some(policy));
+        self.remote_client
+            .schedule_index_upload_for_aux_file_policy_update(Some(policy))?;
+        Ok(())
+    }
 }

-#[derive(Default)]
-struct CompactLevel0Phase1Result {
-    new_layers: Vec,
-    deltas_to_compact: Vec,
+impl Drop for Timeline {
+    fn drop(&mut self) {
+        if let Some(ancestor) = &self.ancestor_timeline {
+            // This lock should never be poisoned, but in case it is we tolerate the
+            // error instead of unwrapping, to avoid panicking in a destructor and
+            // thereby aborting the process.
+            if let Ok(mut gc_info) = ancestor.gc_info.write() {
+                gc_info.remove_child(self.timeline_id)
+            }
+        }
+    }
 }

 /// Top-level failure to compact.
@@ -3360,7 +4437,59 @@ pub(crate) enum CompactionError {
     ShuttingDown,
     /// Compaction cannot be done right now; page reconstruction and so on.
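+    /// Without `#[from]`, callers must classify errors explicitly via the
+    /// dedicated `From` impls below instead of blanket-converting `anyhow::Error`.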
#[error(transparent)] - Other(#[from] anyhow::Error), + Other(anyhow::Error), +} + +impl From for CompactionError { + fn from(err: CollectKeySpaceError) -> Self { + match err { + CollectKeySpaceError::Cancelled + | CollectKeySpaceError::PageRead(PageReconstructError::Cancelled) => { + CompactionError::ShuttingDown + } + e => CompactionError::Other(e.into()), + } + } +} + +impl From for CompactionError { + fn from(value: super::upload_queue::NotInitialized) -> Self { + match value { + super::upload_queue::NotInitialized::Uninitialized => { + CompactionError::Other(anyhow::anyhow!(value)) + } + super::upload_queue::NotInitialized::ShuttingDown + | super::upload_queue::NotInitialized::Stopped => CompactionError::ShuttingDown, + } + } +} + +impl From for CompactionError { + fn from(e: super::storage_layer::layer::DownloadError) -> Self { + match e { + super::storage_layer::layer::DownloadError::TimelineShutdown + | super::storage_layer::layer::DownloadError::DownloadCancelled => { + CompactionError::ShuttingDown + } + super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads + | super::storage_layer::layer::DownloadError::DownloadRequired + | super::storage_layer::layer::DownloadError::NotFile(_) + | super::storage_layer::layer::DownloadError::DownloadFailed + | super::storage_layer::layer::DownloadError::PreStatFailed(_) => { + CompactionError::Other(anyhow::anyhow!(e)) + } + #[cfg(test)] + super::storage_layer::layer::DownloadError::Failpoint(_) => { + CompactionError::Other(anyhow::anyhow!(e)) + } + } + } +} + +impl From for CompactionError { + fn from(_: layer_manager::Shutdown) -> Self { + CompactionError::ShuttingDown + } } #[serde_as] @@ -3375,7 +4504,7 @@ enum DurationRecorder { } impl DurationRecorder { - pub fn till_now(&self) -> DurationRecorder { + fn till_now(&self) -> DurationRecorder { match self { DurationRecorder::NotStarted => { panic!("must only call on recorded measurements") @@ -3386,7 +4515,7 @@ impl DurationRecorder { } } } - pub fn into_recorded(self) -> Option { + fn into_recorded(self) -> Option { match self { DurationRecorder::NotStarted => None, DurationRecorder::Recorded(recorded, _) => Some(recorded), @@ -3394,580 +4523,68 @@ impl DurationRecorder { } } -#[derive(Default)] -struct CompactLevel0Phase1StatsBuilder { - version: Option, - tenant_id: Option, - timeline_id: Option, - read_lock_acquisition_micros: DurationRecorder, - read_lock_held_spawn_blocking_startup_micros: DurationRecorder, - read_lock_held_key_sort_micros: DurationRecorder, - read_lock_held_prerequisites_micros: DurationRecorder, - read_lock_held_compute_holes_micros: DurationRecorder, - read_lock_drop_micros: DurationRecorder, - write_layer_files_micros: DurationRecorder, - level0_deltas_count: Option, - new_deltas_count: Option, - new_deltas_size: Option, +/// Descriptor for a delta layer used in testing infra. The start/end key/lsn range of the +/// delta layer might be different from the min/max key/lsn in the delta layer. Therefore, +/// the layer descriptor requires the user to provide the ranges, which should cover all +/// keys specified in the `data` field. 
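// Illustrative sketch (not part of the original change): for the test descriptor
// defined below, new_with_inferred_key_range() derives an end-exclusive key range
// from the data via Key::next(); `key_a`, `key_b` (with key_a <= key_b) and the
// values are assumed test fixtures:
//
//     let desc = DeltaLayerTestDesc::new_with_inferred_key_range(
//         Lsn(0x10)..Lsn(0x20),
//         vec![(key_a, Lsn(0x10), value_a), (key_b, Lsn(0x18), value_b)],
//     );
//     assert_eq!(desc.key_range, key_a..key_b.next());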
+#[cfg(test)] +#[derive(Clone)] +pub struct DeltaLayerTestDesc { + pub lsn_range: Range, + pub key_range: Range, + pub data: Vec<(Key, Lsn, Value)>, } -#[derive(serde::Serialize)] -struct CompactLevel0Phase1Stats { - version: u64, - tenant_id: TenantShardId, - timeline_id: TimelineId, - read_lock_acquisition_micros: RecordedDuration, - read_lock_held_spawn_blocking_startup_micros: RecordedDuration, - read_lock_held_key_sort_micros: RecordedDuration, - read_lock_held_prerequisites_micros: RecordedDuration, - read_lock_held_compute_holes_micros: RecordedDuration, - read_lock_drop_micros: RecordedDuration, - write_layer_files_micros: RecordedDuration, - level0_deltas_count: usize, - new_deltas_count: usize, - new_deltas_size: u64, -} +#[cfg(test)] +impl DeltaLayerTestDesc { + pub fn new(lsn_range: Range, key_range: Range, data: Vec<(Key, Lsn, Value)>) -> Self { + Self { + lsn_range, + key_range, + data, + } + } -impl TryFrom for CompactLevel0Phase1Stats { - type Error = anyhow::Error; + pub fn new_with_inferred_key_range( + lsn_range: Range, + data: Vec<(Key, Lsn, Value)>, + ) -> Self { + let key_min = data.iter().map(|(key, _, _)| key).min().unwrap(); + let key_max = data.iter().map(|(key, _, _)| key).max().unwrap(); + Self { + key_range: (*key_min)..(key_max.next()), + lsn_range, + data, + } + } - fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result { - Ok(Self { - version: value.version.ok_or_else(|| anyhow!("version not set"))?, - tenant_id: value - .tenant_id - .ok_or_else(|| anyhow!("tenant_id not set"))?, - timeline_id: value - .timeline_id - .ok_or_else(|| anyhow!("timeline_id not set"))?, - read_lock_acquisition_micros: value - .read_lock_acquisition_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?, - read_lock_held_spawn_blocking_startup_micros: value - .read_lock_held_spawn_blocking_startup_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?, - read_lock_held_key_sort_micros: value - .read_lock_held_key_sort_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?, - read_lock_held_prerequisites_micros: value - .read_lock_held_prerequisites_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?, - read_lock_held_compute_holes_micros: value - .read_lock_held_compute_holes_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?, - read_lock_drop_micros: value - .read_lock_drop_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?, - write_layer_files_micros: value - .write_layer_files_micros - .into_recorded() - .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?, - level0_deltas_count: value - .level0_deltas_count - .ok_or_else(|| anyhow!("level0_deltas_count not set"))?, - new_deltas_count: value - .new_deltas_count - .ok_or_else(|| anyhow!("new_deltas_count not set"))?, - new_deltas_size: value - .new_deltas_size - .ok_or_else(|| anyhow!("new_deltas_size not set"))?, + pub(crate) fn layer_name(&self) -> LayerName { + LayerName::Delta(super::storage_layer::DeltaLayerName { + key_range: self.key_range.clone(), + lsn_range: self.lsn_range.clone(), }) } } impl Timeline { - /// Level0 files first phase of compaction, explained in the [`Self::compact`] comment. 
- async fn compact_level0_phase1( + async fn finish_compact_batch( self: &Arc, - guard: tokio::sync::OwnedRwLockReadGuard, - mut stats: CompactLevel0Phase1StatsBuilder, - target_file_size: u64, - ctx: &RequestContext, - ) -> Result { - stats.read_lock_held_spawn_blocking_startup_micros = - stats.read_lock_acquisition_micros.till_now(); // set by caller - let layers = guard.layer_map(); - let level0_deltas = layers.get_level0_deltas()?; - let mut level0_deltas = level0_deltas - .into_iter() - .map(|x| guard.get_from_desc(&x)) - .collect_vec(); - stats.level0_deltas_count = Some(level0_deltas.len()); - // Only compact if enough layers have accumulated. - let threshold = self.get_compaction_threshold(); - if level0_deltas.is_empty() || level0_deltas.len() < threshold { - debug!( - level0_deltas = level0_deltas.len(), - threshold, "too few deltas to compact" - ); - return Ok(CompactLevel0Phase1Result::default()); - } - - // This failpoint is used together with `test_duplicate_layers` integration test. - // It returns the compaction result exactly the same layers as input to compaction. - // We want to ensure that this will not cause any problem when updating the layer map - // after the compaction is finished. - // - // Currently, there are two rare edge cases that will cause duplicated layers being - // inserted. - // 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which - // is compacted to 5, but the page server is shut down, next time we start page server we will get a layer - // map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this - // point again, it is likely that we will get a file 6 which has the same content and the key range as 5, - // and this causes an overwrite. This is acceptable because the content is the same, and we should do a - // layer replace instead of the normal remove / upload process. - // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file - // size length. Compaction will likely create the same set of n files afterwards. - // - // This failpoint is a superset of both of the cases. - if cfg!(feature = "testing") { - let active = (|| { - ::fail::fail_point!("compact-level0-phase1-return-same", |_| true); - false - })(); - - if active { - let mut new_layers = Vec::with_capacity(level0_deltas.len()); - for delta in &level0_deltas { - // we are just faking these layers as being produced again for this failpoint - new_layers.push( - delta - .download_and_keep_resident() - .await - .context("download layer for failpoint")?, - ); - } - tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint - return Ok(CompactLevel0Phase1Result { - new_layers, - deltas_to_compact: level0_deltas, - }); - } - } - - // Gather the files to compact in this iteration. - // - // Start with the oldest Level 0 delta file, and collect any other - // level 0 files that form a contiguous sequence, such that the end - // LSN of previous file matches the start LSN of the next file. - // - // Note that if the files don't form such a sequence, we might - // "compact" just a single file. That's a bit pointless, but it allows - // us to get rid of the level 0 file, and compact the other files on - // the next iteration. 
This could probably made smarter, but such - // "gaps" in the sequence of level 0 files should only happen in case - // of a crash, partial download from cloud storage, or something like - // that, so it's not a big deal in practice. - level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start); - let mut level0_deltas_iter = level0_deltas.iter(); - - let first_level0_delta = level0_deltas_iter.next().unwrap(); - let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end; - let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len()); - - deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); - for l in level0_deltas_iter { - let lsn_range = &l.layer_desc().lsn_range; - - if lsn_range.start != prev_lsn_end { - break; - } - deltas_to_compact.push(l.download_and_keep_resident().await?); - prev_lsn_end = lsn_range.end; - } - let lsn_range = Range { - start: deltas_to_compact - .first() - .unwrap() - .layer_desc() - .lsn_range - .start, - end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end, - }; - - info!( - "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", - lsn_range.start, - lsn_range.end, - deltas_to_compact.len(), - level0_deltas.len() - ); - - for l in deltas_to_compact.iter() { - info!("compact includes {l}"); - } - - // We don't need the original list of layers anymore. Drop it so that - // we don't accidentally use it later in the function. - drop(level0_deltas); - - stats.read_lock_held_prerequisites_micros = stats - .read_lock_held_spawn_blocking_startup_micros - .till_now(); - - // Determine N largest holes where N is number of compacted layers. - let max_holes = deltas_to_compact.len(); - let last_record_lsn = self.get_last_record_lsn(); - let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; - let min_hole_coverage_size = 3; // TODO: something more flexible? - - // min-heap (reserve space for one more element added before eviction) - let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); - let mut prev: Option = None; - - let mut all_keys = Vec::new(); - - for l in deltas_to_compact.iter() { - all_keys.extend(l.load_keys(ctx).await?); - } - - // FIXME: should spawn_blocking the rest of this function - - // The current stdlib sorting implementation is designed in a way where it is - // particularly fast where the slice is made up of sorted sub-ranges. - all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); - - stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now(); - - for &DeltaEntry { key: next_key, .. } in all_keys.iter() { - if let Some(prev_key) = prev { - // just first fast filter - if next_key.to_i128() - prev_key.to_i128() >= min_hole_range { - let key_range = prev_key..next_key; - // Measuring hole by just subtraction of i128 representation of key range boundaries - // has not so much sense, because largest holes will corresponds field1/field2 changes. - // But we are mostly interested to eliminate holes which cause generation of excessive image layers. - // That is why it is better to measure size of hole as number of covering image layers. 
- let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len(); - if coverage_size >= min_hole_coverage_size { - heap.push(Hole { - key_range, - coverage_size, - }); - if heap.len() > max_holes { - heap.pop(); // remove smallest hole - } - } - } - } - prev = Some(next_key.next()); - } - stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); - drop_rlock(guard); - stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); - let mut holes = heap.into_vec(); - holes.sort_unstable_by_key(|hole| hole.key_range.start); - let mut next_hole = 0; // index of next hole in holes vector - - // This iterator walks through all key-value pairs from all the layers - // we're compacting, in key, LSN order. - let all_values_iter = all_keys.iter(); - - // This iterator walks through all keys and is needed to calculate size used by each key - let mut all_keys_iter = all_keys - .iter() - .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size)) - .coalesce(|mut prev, cur| { - // Coalesce keys that belong to the same key pair. - // This ensures that compaction doesn't put them - // into different layer files. - // Still limit this by the target file size, - // so that we keep the size of the files in - // check. - if prev.0 == cur.0 && prev.2 < target_file_size { - prev.2 += cur.2; - Ok(prev) - } else { - Err((prev, cur)) - } - }); - - // Merge the contents of all the input delta layers into a new set - // of delta layers, based on the current partitioning. - // - // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one. - // It's possible that there is a single key with so many page versions that storing all of them in a single layer file - // would be too large. In that case, we also split on the LSN dimension. - // - // LSN - // ^ - // | - // | +-----------+ +--+--+--+--+ - // | | | | | | | | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ ==> | | | | | - // | | | | | | | | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ +--+--+--+--+ - // | - // +--------------> key - // - // - // If one key (X) has a lot of page versions: - // - // LSN - // ^ - // | (X) - // | +-----------+ +--+--+--+--+ - // | | | | | | | | - // | +-----------+ | | +--+ | - // | | | | | | | | - // | +-----------+ ==> | | | | | - // | | | | | +--+ | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ +--+--+--+--+ - // | - // +--------------> key - // TODO: this actually divides the layers into fixed-size chunks, not - // based on the partitioning. - // - // TODO: we should also opportunistically materialize and - // garbage collect what we can. - let mut new_layers = Vec::new(); - let mut prev_key: Option = None; - let mut writer: Option = None; - let mut key_values_total_size = 0u64; - let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key - let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key - - for &DeltaEntry { - key, lsn, ref val, .. 
- } in all_values_iter - { - let value = val.load(ctx).await?; - let same_key = prev_key.map_or(false, |prev_key| prev_key == key); - // We need to check key boundaries once we reach next key or end of layer with the same key - if !same_key || lsn == dup_end_lsn { - let mut next_key_size = 0u64; - let is_dup_layer = dup_end_lsn.is_valid(); - dup_start_lsn = Lsn::INVALID; - if !same_key { - dup_end_lsn = Lsn::INVALID; - } - // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size - for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { - next_key_size = next_size; - if key != next_key { - if dup_end_lsn.is_valid() { - // We are writting segment with duplicates: - // place all remaining values of this key in separate segment - dup_start_lsn = dup_end_lsn; // new segments starts where old stops - dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range - } - break; - } - key_values_total_size += next_size; - // Check if it is time to split segment: if total keys size is larger than target file size. - // We need to avoid generation of empty segments if next_size > target_file_size. - if key_values_total_size > target_file_size && lsn != next_lsn { - // Split key between multiple layers: such layer can contain only single key - dup_start_lsn = if dup_end_lsn.is_valid() { - dup_end_lsn // new segment with duplicates starts where old one stops - } else { - lsn // start with the first LSN for this key - }; - dup_end_lsn = next_lsn; // upper LSN boundary is exclusive - break; - } - } - // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. - if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { - dup_start_lsn = dup_end_lsn; - dup_end_lsn = lsn_range.end; - } - if writer.is_some() { - let written_size = writer.as_mut().unwrap().size(); - let contains_hole = - next_hole < holes.len() && key >= holes[next_hole].key_range.end; - // check if key cause layer overflow or contains hole... - if is_dup_layer - || dup_end_lsn.is_valid() - || written_size + key_values_total_size > target_file_size - || contains_hole - { - // ... 
if so, flush previous layer and prepare to write new one - new_layers.push( - writer - .take() - .unwrap() - .finish(prev_key.unwrap().next(), self) - .await?, - ); - writer = None; - - if contains_hole { - // skip hole - next_hole += 1; - } - } - } - // Remember size of key value because at next iteration we will access next item - key_values_total_size = next_key_size; - } - if writer.is_none() { - // Create writer if not initiaized yet - writer = Some( - DeltaLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - key, - if dup_end_lsn.is_valid() { - // this is a layer containing slice of values of the same key - debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); - dup_start_lsn..dup_end_lsn - } else { - debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); - lsn_range.clone() - }, - ) - .await?, - ); - } - - fail_point!("delta-layer-writer-fail-before-finish", |_| { - Err(CompactionError::Other(anyhow::anyhow!( - "failpoint delta-layer-writer-fail-before-finish" - ))) - }); - - if !self.shard_identity.is_key_disposable(&key) { - writer.as_mut().unwrap().put_value(key, lsn, value).await?; - } else { - debug!( - "Dropping key {} during compaction (it belongs on shard {:?})", - key, - self.shard_identity.get_shard_number(&key) - ); - } - - if !new_layers.is_empty() { - fail_point!("after-timeline-compacted-first-L1"); - } - - prev_key = Some(key); - } - if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?); - } - - // Sync layers - if !new_layers.is_empty() { - // Print a warning if the created layer is larger than double the target size - // Add two pages for potential overhead. This should in theory be already - // accounted for in the target calculation, but for very small targets, - // we still might easily hit the limit otherwise. - let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2; - for layer in new_layers.iter() { - if layer.layer_desc().file_size > warn_limit { - warn!( - %layer, - "created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size - ); - } - } - - // FIXME: the writer already fsyncs all data, only rename needs to be fsynced here - let layer_paths: Vec = new_layers - .iter() - .map(|l| l.local_path().to_owned()) - .collect(); - - // Fsync all the layer files and directory using multiple threads to - // minimize latency. 
- par_fsync::par_fsync_async(&layer_paths) - .await - .context("fsync all new layers")?; - - let timeline_dir = self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id); - - par_fsync::par_fsync_async(&[timeline_dir]) - .await - .context("fsync of timeline dir")?; - } - - stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now(); - stats.new_deltas_count = Some(new_layers.len()); - stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum()); - - match TryInto::::try_into(stats) - .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string")) - { - Ok(stats_json) => { - info!( - stats_json = stats_json.as_str(), - "compact_level0_phase1 stats available" - ) - } - Err(e) => { - warn!("compact_level0_phase1 stats failed to serialize: {:#}", e); - } - } - - Ok(CompactLevel0Phase1Result { - new_layers, - deltas_to_compact: deltas_to_compact - .into_iter() - .map(|x| x.drop_eviction_guard()) - .collect::>(), - }) - } - - /// - /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as - /// as Level 1 files. - /// - async fn compact_level0( - self: &Arc, - target_file_size: u64, - ctx: &RequestContext, + new_deltas: &[ResidentLayer], + new_images: &[ResidentLayer], + layers_to_remove: &[Layer], ) -> Result<(), CompactionError> { - let CompactLevel0Phase1Result { - new_layers, - deltas_to_compact, - } = { - let phase1_span = info_span!("compact_level0_phase1"); - let ctx = ctx.attached_child(); - let mut stats = CompactLevel0Phase1StatsBuilder { - version: Some(2), - tenant_id: Some(self.tenant_shard_id), - timeline_id: Some(self.timeline_id), - ..Default::default() - }; - - let begin = tokio::time::Instant::now(); - let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await; - let now = tokio::time::Instant::now(); - stats.read_lock_acquisition_micros = - DurationRecorder::Recorded(RecordedDuration(now - begin), now); - self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx) - .instrument(phase1_span) - .await? + let mut guard = tokio::select! { + guard = self.layers.write() => guard, + _ = self.cancel.cancelled() => { + return Err(CompactionError::ShuttingDown); + } }; - if new_layers.is_empty() && deltas_to_compact.is_empty() { - // nothing to do - return Ok(()); - } - - let mut guard = self.layers.write().await; - let mut duplicated_layers = HashSet::new(); - let mut insert_layers = Vec::with_capacity(new_layers.len()); + let mut insert_layers = Vec::with_capacity(new_deltas.len()); - for l in &new_layers { + for l in new_deltas { if guard.contains(l.as_ref()) { // expected in tests tracing::error!(layer=%l, "duplicated L1 layer"); @@ -3977,130 +4594,188 @@ impl Timeline { // for compact_level0_phase1 creating an L0, which does not happen in practice // because we have not implemented L0 => L0 compaction. 
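// Note (not part of the original change): the tokio::select! at the top of
// finish_compact_batch() above is the generic "take a lock unless shutting down"
// pattern; a self-contained analogue, assuming only tokio and tokio-util:
//
//     async fn write_or_cancel<'a, T>(
//         lock: &'a tokio::sync::RwLock<T>,
//         cancel: &tokio_util::sync::CancellationToken,
//     ) -> Option<tokio::sync::RwLockWriteGuard<'a, T>> {
//         tokio::select! {
//             guard = lock.write() => Some(guard),
//             _ = cancel.cancelled() => None,
//         }
//     }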
            duplicated_layers.insert(l.layer_desc().key());
-        } else if LayerMap::is_l0(l.layer_desc()) {
-            return Err(CompactionError::Other(anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction.")));
+        } else if LayerMap::is_l0(&l.layer_desc().key_range, l.layer_desc().is_delta) {
+            return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction.")));
         } else {
             insert_layers.push(l.clone());
         }
     }

-        let remove_layers = {
-            let mut deltas_to_compact = deltas_to_compact;
-            // only remove those inputs which were not outputs
-            deltas_to_compact.retain(|l| !duplicated_layers.contains(&l.layer_desc().key()));
-            deltas_to_compact
-        };
+        // only remove those inputs which were not outputs
+        let remove_layers: Vec = layers_to_remove
+            .iter()
+            .filter(|l| !duplicated_layers.contains(&l.layer_desc().key()))
+            .cloned()
+            .collect();

-        // deletion will happen later, the layer file manager calls garbage_collect_on_drop
-        guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics);
-
-        if let Some(remote_client) = self.remote_client.as_ref() {
-            remote_client.schedule_compaction_update(&remove_layers, &new_layers)?;
+        if !new_images.is_empty() {
+            guard
+                .open_mut()?
+                .track_new_image_layers(new_images, &self.metrics);
         }

+        guard
+            .open_mut()?
+            .finish_compact_l0(&remove_layers, &insert_layers, &self.metrics);
+
+        self.remote_client
+            .schedule_compaction_update(&remove_layers, new_deltas)?;
+
         drop_wlock(guard);

         Ok(())
     }

-    /// Update information about which layer files need to be retained on
-    /// garbage collection. This is separate from actually performing the GC,
-    /// and is updated more frequently, so that compaction can remove obsolete
-    /// page versions more aggressively.
+    async fn rewrite_layers(
+        self: &Arc,
+        mut replace_layers: Vec<(Layer, ResidentLayer)>,
+        mut drop_layers: Vec,
+    ) -> Result<(), CompactionError> {
+        let mut guard = self.layers.write().await;
+
+        // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want
+        // to avoid double-removing, and avoid rewriting something that was removed.
+        replace_layers.retain(|(l, _)| guard.contains(l));
+        drop_layers.retain(|l| guard.contains(l));
+
+        guard
+            .open_mut()?
+            .rewrite_layers(&replace_layers, &drop_layers, &self.metrics);
+
+        let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect();
+
+        self.remote_client
+            .schedule_compaction_update(&drop_layers, &upload_layers)?;
+
+        Ok(())
+    }
+
+    /// Schedules the uploads of the given image layers
+    fn upload_new_image_layers(
+        self: &Arc,
+        new_images: impl IntoIterator,
+    ) -> Result<(), super::upload_queue::NotInitialized> {
+        for layer in new_images {
+            self.remote_client.schedule_layer_file_upload(layer)?;
+        }
+        // should any new image layer have been created, not uploading index_part will
+        // result in a mismatch between remote_physical_size and layermap calculated
+        // size, which will fail some tests, but should not be an issue otherwise.
+        self.remote_client
+            .schedule_index_upload_for_file_changes()?;
+        Ok(())
+    }
+
+    /// Find the Lsns above which layer files need to be retained on
+    /// garbage collection.
     ///
-    /// TODO: that's wishful thinking, compaction doesn't actually do that
-    /// currently.
-    ///
-    /// The caller specifies how much history is needed with the 3 arguments:
-    ///
-    /// retain_lsns: keep a version of each page at these LSNs
-    /// cutoff_horizon: also keep everything newer than this LSN
-    /// pitr: the time duration required to keep data for PITR
-    ///
-    /// The 'retain_lsns' list is currently used to prevent removing files that
-    /// are needed by child timelines. In the future, the user might be able to
-    /// name additional points in time to retain. The caller is responsible for
-    /// collecting that information.
-    ///
-    /// The 'cutoff_horizon' point is used to retain recent versions that might still be
-    /// needed by read-only nodes. (As of this writing, the caller just passes
-    /// the latest LSN subtracted by a constant, and doesn't do anything smart
-    /// to figure out what read-only nodes might actually need.)
-    ///
-    /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine
-    /// whether a record is needed for PITR.
-    ///
-    /// NOTE: This function holds a short-lived lock to protect the 'gc_info'
-    /// field, so that the three values passed as argument are stored
-    /// atomically. But the caller is responsible for ensuring that no new
-    /// branches are created that would need to be included in 'retain_lsns',
-    /// for example. The caller should hold `Tenant::gc_cs` lock to ensure
-    /// that.
+    /// We calculate two cutoffs, one based on time and one based on WAL size. `pitr`
+    /// controls the time cutoff (or ZERO to disable time-based retention), and `space_cutoff` controls
+    /// the space-based retention.
     ///
+    /// This function doesn't simply calculate time & space based retention: it treats time-based
+    /// retention as authoritative if enabled, and falls back to space-based retention if calculating
+    /// the LSN for a time point isn't possible. Therefore the GcCutoffs::space in the response might
+    /// differ from the `space_cutoff` input. Callers should treat the min() of the two cutoffs
+    /// in the response as the GC cutoff point for the timeline.
     #[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
-    pub(super) async fn update_gc_info(
+    pub(super) async fn find_gc_cutoffs(
         &self,
-        retain_lsns: Vec,
-        cutoff_horizon: Lsn,
+        space_cutoff: Lsn,
         pitr: Duration,
         cancel: &CancellationToken,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        // First, calculate pitr_cutoff_timestamp and then convert it to LSN.
-        //
-        // Some unit tests depend on garbage-collection working even when
-        // CLOG data is missing, so that find_lsn_for_timestamp() doesn't
-        // work, so avoid calling it altogether if time-based retention is not
-        // configured. It would be pointless anyway.
-        let pitr_cutoff = if pitr != Duration::ZERO {
-            let now = SystemTime::now();
-            if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
-                let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
+    ) -> Result {
+        let _timer = self
+            .metrics
+            .find_gc_cutoffs_histo
+            .start_timer()
+            .record_on_drop();

-            match self
-                .find_lsn_for_timestamp(pitr_timestamp, cancel, ctx)
-                .await?
-            {
-                LsnForTimestamp::Present(lsn) => lsn,
-                LsnForTimestamp::Future(lsn) => {
-                    // The timestamp is in the future. That sounds impossible,
-                    // but what it really means is that there hasn't been
-                    // any commits since the cutoff timestamp.
- debug!("future({})", lsn); - cutoff_horizon - } - LsnForTimestamp::Past(lsn) => { - debug!("past({})", lsn); - // conservative, safe default is to remove nothing, when we - // have no commit timestamp data available - *self.get_latest_gc_cutoff_lsn() - } - LsnForTimestamp::NoData(lsn) => { - debug!("nodata({})", lsn); - // conservative, safe default is to remove nothing, when we - // have no commit timestamp data available - *self.get_latest_gc_cutoff_lsn() - } - } - } else { - // If we don't have enough data to convert to LSN, - // play safe and don't remove any layers. - *self.get_latest_gc_cutoff_lsn() + pausable_failpoint!("Timeline::find_gc_cutoffs-pausable"); + + if cfg!(test) { + // Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup + if pitr == Duration::ZERO { + return Ok(GcCutoffs { + time: self.get_last_record_lsn(), + space: space_cutoff, + }); + } + } + + // Calculate a time-based limit on how much to retain: + // - if PITR interval is set, then this is our cutoff. + // - if PITR interval is not set, then we do a lookup + // based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases. + let time_cutoff = { + let now = SystemTime::now(); + let time_range = if pitr == Duration::ZERO { + humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid") + } else { + pitr + }; + + // If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case) + let time_cutoff = now.checked_sub(time_range).unwrap_or(now); + let timestamp = to_pg_timestamp(time_cutoff); + + match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? { + LsnForTimestamp::Present(lsn) => Some(lsn), + LsnForTimestamp::Future(lsn) => { + // The timestamp is in the future. That sounds impossible, + // but what it really means is that there hasn't been + // any commits since the cutoff timestamp. + // + // In this case we should use the LSN of the most recent commit, + // which is implicitly the last LSN in the log. + debug!("future({})", lsn); + Some(self.get_last_record_lsn()) + } + LsnForTimestamp::Past(lsn) => { + debug!("past({})", lsn); + None + } + LsnForTimestamp::NoData(lsn) => { + debug!("nodata({})", lsn); + None + } } - } else { - // No time-based retention was configured. Set time-based cutoff to - // same as LSN based. - cutoff_horizon }; - // Grab the lock and update the values - *self.gc_info.write().unwrap() = GcInfo { - retain_lsns, - horizon_cutoff: cutoff_horizon, - pitr_cutoff, - }; - - Ok(()) + Ok(match (pitr, time_cutoff) { + (Duration::ZERO, Some(time_cutoff)) => { + // PITR is not set. Retain the size-based limit, or the default time retention, + // whichever requires less data. + GcCutoffs { + time: self.get_last_record_lsn(), + space: std::cmp::max(time_cutoff, space_cutoff), + } + } + (Duration::ZERO, None) => { + // PITR is not set, and time lookup failed + GcCutoffs { + time: self.get_last_record_lsn(), + space: space_cutoff, + } + } + (_, None) => { + // PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR + // cannot advance beyond what was already GC'd, and respect space-based retention + GcCutoffs { + time: *self.get_latest_gc_cutoff_lsn(), + space: space_cutoff, + } + } + (_, Some(time_cutoff)) => { + // PITR interval is set and we looked up timestamp successfully. 
Ignore
+                // size-based retention and make the time cutoff authoritative
+                GcCutoffs {
+                    time: time_cutoff,
+                    space: time_cutoff,
+                }
+            }
+        })
     }

     /// Garbage collect layer files on a timeline that are no longer needed.
@@ -4108,14 +4783,12 @@ impl Timeline {
     /// Currently, we don't make any attempt at removing unneeded page versions
     /// within a layer file. We can only remove the whole file if it's fully
     /// obsolete.
-    pub(super) async fn gc(&self) -> anyhow::Result {
+    pub(super) async fn gc(&self) -> Result {
         // this is most likely the background tasks, but it might be the spawned task from
         // immediate_gc
-        let cancel = crate::task_mgr::shutdown_token();
         let _g = tokio::select! {
             guard = self.gc_lock.lock() => guard,
             _ = self.cancel.cancelled() => return Ok(GcResult::default()),
-            _ = cancel.cancelled() => return Ok(GcResult::default()),
         };

         let timer = self.metrics.garbage_collect_histo.start_timer();
@@ -4123,22 +4796,69 @@

         // Is the timeline being deleted?
         if self.is_stopping() {
-            anyhow::bail!("timeline is Stopping");
+            return Err(GcError::TimelineCancelled);
         }

-        let (horizon_cutoff, pitr_cutoff, retain_lsns) = {
+        let (space_cutoff, time_cutoff, retain_lsns, max_lsn_with_valid_lease) = {
             let gc_info = self.gc_info.read().unwrap();
-            let horizon_cutoff = min(gc_info.horizon_cutoff, self.get_disk_consistent_lsn());
-            let pitr_cutoff = gc_info.pitr_cutoff;
-            let retain_lsns = gc_info.retain_lsns.clone();
-            (horizon_cutoff, pitr_cutoff, retain_lsns)
+            let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn());
+            let time_cutoff = gc_info.cutoffs.time;
+            let retain_lsns = gc_info
+                .retain_lsns
+                .iter()
+                .map(|(lsn, _child_id)| *lsn)
+                .collect();
+
+            // Gets the maximum LSN that holds the valid lease.
+            //
+            // Caveat: `refresh_gc_info` is in charge of updating the lease map.
+            // Here, we do not check for stale leases again.
+            let max_lsn_with_valid_lease = gc_info.leases.last_key_value().map(|(lsn, _)| *lsn);
+
+            (
+                space_cutoff,
+                time_cutoff,
+                retain_lsns,
+                max_lsn_with_valid_lease,
+            )
         };

-        let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
+        let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff);
+        let standby_horizon = self.standby_horizon.load();
+        // Hold GC for the standby, but as a safety guard do it only within some
+        // reasonable lag.
+        if standby_horizon != Lsn::INVALID {
+            if let Some(standby_lag) = new_gc_cutoff.checked_sub(standby_horizon) {
+                const MAX_ALLOWED_STANDBY_LAG: u64 = 10u64 << 30; // 10 GB
+                if standby_lag.0 < MAX_ALLOWED_STANDBY_LAG {
+                    new_gc_cutoff = Lsn::min(standby_horizon, new_gc_cutoff);
+                    trace!("holding off GC for standby apply LSN {}", standby_horizon);
+                } else {
+                    warn!(
+                        "standby is lagging for more than {}MB, not holding gc for it",
+                        MAX_ALLOWED_STANDBY_LAG / 1024 / 1024
+                    )
+                }
+            }
+        }
+
+        // Reset the standby horizon to ignore it if it is not updated until the next GC.
+        // It is an easy way to unset it when the standby disappears without adding
+        // more conf options.
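// Illustrative sketch (not part of the original change): the cutoff arithmetic
// above, condensed over plain u64 LSNs (the real code uses Lsn and its checked_sub):
//
//     fn effective_gc_cutoff(space: u64, time: u64, standby: Option<u64>) -> u64 {
//         const MAX_ALLOWED_STANDBY_LAG: u64 = 10 << 30; // 10 GB, as above
//         let mut cutoff = space.min(time);
//         if let Some(standby) = standby {
//             // hold GC back for the standby only while its lag stays reasonable
//             if cutoff.saturating_sub(standby) < MAX_ALLOWED_STANDBY_LAG {
//                 cutoff = cutoff.min(standby);
//             }
//         }
//         cutoff
//     }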
+ self.standby_horizon.store(Lsn::INVALID); + self.metrics + .standby_horizon_gauge + .set(Lsn::INVALID.0 as i64); let res = self - .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff) + .gc_timeline( + space_cutoff, + time_cutoff, + retain_lsns, + max_lsn_with_valid_lease, + new_gc_cutoff, + ) .instrument( info_span!("gc_timeline", timeline_id = %self.timeline_id, cutoff = %new_gc_cutoff), ) @@ -4152,11 +4872,14 @@ impl Timeline { async fn gc_timeline( &self, - horizon_cutoff: Lsn, - pitr_cutoff: Lsn, + space_cutoff: Lsn, + time_cutoff: Lsn, retain_lsns: Vec, + max_lsn_with_valid_lease: Option, new_gc_cutoff: Lsn, - ) -> anyhow::Result { + ) -> Result { + // FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc + let now = SystemTime::now(); let mut result: GcResult = GcResult::default(); @@ -4176,12 +4899,15 @@ impl Timeline { // The GC cutoff should only ever move forwards. let waitlist = { let write_guard = self.latest_gc_cutoff_lsn.lock_for_write(); - ensure!( - *write_guard <= new_gc_cutoff, - "Cannot move GC cutoff LSN backwards (was {}, new {})", - *write_guard, - new_gc_cutoff - ); + if *write_guard > new_gc_cutoff { + return Err(GcError::BadLsn { + why: format!( + "Cannot move GC cutoff LSN backwards (was {}, new {})", + *write_guard, new_gc_cutoff + ), + }); + } + write_guard.store_and_unlock(new_gc_cutoff) }; waitlist.wait().await; @@ -4191,7 +4917,6 @@ impl Timeline { debug!("retain_lsns: {:?}", retain_lsns); let mut layers_to_remove = Vec::new(); - let mut wanted_image_layers = KeySpaceRandomAccum::default(); // Scan all layers in the timeline (remote or on-disk). // @@ -4199,31 +4924,32 @@ impl Timeline { // 1. it is older than cutoff LSN; // 2. it is older than PITR interval; // 3. it doesn't need to be retained for 'retain_lsns'; - // 4. newer on-disk image layers cover the layer's whole key range + // 4. it does not need to be kept for LSNs holding valid leases. + // 5. newer on-disk image layers cover the layer's whole key range // // TODO holding a write lock is too agressive and avoidable let mut guard = self.layers.write().await; - let layers = guard.layer_map(); + let layers = guard.layer_map()?; 'outer: for l in layers.iter_historic_layers() { result.layers_total += 1; // 1. Is it newer than GC horizon cutoff point? - if l.get_lsn_range().end > horizon_cutoff { + if l.get_lsn_range().end > space_cutoff { debug!( - "keeping {} because it's newer than horizon_cutoff {}", - l.filename(), - horizon_cutoff, + "keeping {} because it's newer than space_cutoff {}", + l.layer_name(), + space_cutoff, ); result.layers_needed_by_cutoff += 1; continue 'outer; } // 2. It is newer than PiTR cutoff point? - if l.get_lsn_range().end > pitr_cutoff { + if l.get_lsn_range().end > time_cutoff { debug!( - "keeping {} because it's newer than pitr_cutoff {}", - l.filename(), - pitr_cutoff, + "keeping {} because it's newer than time_cutoff {}", + l.layer_name(), + time_cutoff, ); result.layers_needed_by_pitr += 1; continue 'outer; @@ -4241,7 +4967,7 @@ impl Timeline { if &l.get_lsn_range().start <= retain_lsn { debug!( "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", - l.filename(), + l.layer_name(), retain_lsn, l.is_incremental(), ); @@ -4250,7 +4976,21 @@ impl Timeline { } } - // 4. Is there a later on-disk layer for this relation? + // 4. Is there a valid lease that requires us to keep this layer? 
+            if let Some(lsn) = &max_lsn_with_valid_lease {
+                // keep the layer if its start LSN is <= the latest valid lease LSN
+                if &l.get_lsn_range().start <= lsn {
+                    debug!(
+                        "keeping {} because there is a valid lease preventing GC at {}",
+                        l.layer_name(),
+                        lsn,
+                    );
+                    result.layers_needed_by_leases += 1;
+                    continue 'outer;
+                }
+            }
+
+            // 5. Is there a later on-disk layer for this relation?
             //
             // The end-LSN is exclusive, while disk_consistent_lsn is
             // inclusive. For example, if disk_consistent_lsn is 100, it is
@@ -4272,16 +5012,7 @@ impl Timeline {
             if !layers
                 .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
             {
-                debug!("keeping {} because it is the latest layer", l.filename());
-                // Collect delta key ranges that need image layers to allow garbage
-                // collecting the layers.
-                // It is not so obvious whether we need to propagate information only about
-                // delta layers. Image layers can form "stairs" preventing old image from been deleted.
-                // But image layers are in any case less sparse than delta layers. Also we need some
-                // protection from replacing recent image layers with new one after each GC iteration.
-                if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&l) {
-                    wanted_image_layers.add_range(l.get_key_range());
-                }
+                debug!("keeping {} because it is the latest layer", l.layer_name());
                 result.layers_not_updated += 1;
                 continue 'outer;
             }
@@ -4289,29 +5020,25 @@ impl Timeline {
             // We didn't find any reason to keep this file, so remove it.
             debug!(
                 "garbage collecting {} is_dropped: xx is_incremental: {}",
-                l.filename(),
+                l.layer_name(),
                 l.is_incremental(),
             );
             layers_to_remove.push(l);
         }

-        self.wanted_image_layers
-            .lock()
-            .unwrap()
-            .replace((new_gc_cutoff, wanted_image_layers.to_keyspace()));

         if !layers_to_remove.is_empty() {
-            // Persist the new GC cutoff value in the metadata file, before
-            // we actually remove anything.
-            //
-            // This does not in fact have any effect as we no longer consider local metadata unless
-            // running without remote storage.
-            //
+            // Persist the new GC cutoff value before we actually remove anything.
             // This unconditionally schedules also an index_part.json update, even though, we will
             // be doing one a bit later with the unlinked gc'd layers.
-            //
-            // TODO: remove when implementing .
- self.update_metadata_file(self.disk_consistent_lsn.load(), None) - .await?; + let disk_consistent_lsn = self.disk_consistent_lsn.load(); + self.schedule_uploads(disk_consistent_lsn, None) + .map_err(|e| { + if self.cancel.is_cancelled() { + GcError::TimelineCancelled + } else { + GcError::Remote(e) + } + })?; let gc_layers = layers_to_remove .iter() @@ -4320,15 +5047,9 @@ impl Timeline { result.layers_removed = gc_layers.len() as u64; - if let Some(remote_client) = self.remote_client.as_ref() { - remote_client.schedule_gc_update(&gc_layers)?; - } + self.remote_client.schedule_gc_update(&gc_layers)?; - guard.finish_gc_timeline(&gc_layers); - - if result.layers_removed != 0 { - fail_point!("after-timeline-gc-removed-layers"); - } + guard.open_mut()?.finish_gc_timeline(&gc_layers); #[cfg(feature = "testing")] { @@ -4341,7 +5062,7 @@ impl Timeline { result.layers_removed, new_gc_cutoff ); - result.elapsed = now.elapsed()?; + result.elapsed = now.elapsed().unwrap_or(Duration::ZERO); Ok(result) } @@ -4393,36 +5114,22 @@ impl Timeline { } else { trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); }; - - let last_rec_lsn = data.records.last().unwrap().0; - - let img = match self + let res = self .walredo_mgr + .as_ref() + .context("timeline has no walredo manager") + .map_err(PageReconstructError::WalRedo)? .request_redo(key, request_lsn, data.img, data.records, self.pg_version) - .await - .context("reconstruct a page image") - { + .await; + let img = match res { Ok(img) => img, - Err(e) => return Err(PageReconstructError::WalRedo(e)), - }; - - if img.len() == page_cache::PAGE_SZ { - let cache = page_cache::get(); - if let Err(e) = cache - .memorize_materialized_page( - self.tenant_shard_id, - self.timeline_id, - key, - last_rec_lsn, - &img, - ) - .await - .context("Materialized page memoization failed") - { - return Err(PageReconstructError::from(e)); + Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled), + Err(walredo::Error::Other(e)) => { + return Err(PageReconstructError::WalRedo( + e.context("reconstruct a page image"), + )) } - } - + }; Ok(img) } } @@ -4454,10 +5161,9 @@ impl Timeline { let task_id = task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::DownloadAllRemoteLayers, - Some(self.tenant_shard_id), + self.tenant_shard_id, Some(self.timeline_id), "download all remote layers task", - false, async move { self_clone.download_all_remote_layers(request).await; let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap(); @@ -4499,9 +5205,13 @@ impl Timeline { let remaining = { let guard = self.layers.read().await; - guard - .layer_map() - .iter_historic_layers() + let Ok(lm) = guard.layer_map() else { + // technically here we could look into iterating accessible layers, but downloading + // all layers of a shutdown timeline makes no sense regardless. + tracing::info!("attempted to download all layers of shutdown timeline"); + return; + }; + lm.iter_historic_layers() .map(|desc| guard.get_from_desc(&desc)) .collect::>() }; @@ -4586,7 +5296,9 @@ impl Timeline { } } - pub fn get_download_all_remote_layers_task_info(&self) -> Option { + pub(crate) fn get_download_all_remote_layers_task_info( + &self, + ) -> Option { self.download_all_remote_layers_task_info .read() .unwrap() @@ -4598,41 +5310,24 @@ impl Timeline { /// Returns non-remote layers for eviction. 
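// Note (not part of the original change): the map_err above deliberately reports
// an upload-scheduling failure during shutdown as GcError::TimelineCancelled
// rather than GcError::Remote, so teardown does not surface spurious
// remote-storage errors.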
pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { let guard = self.layers.read().await; - let layers = guard.layer_map(); - let mut max_layer_size: Option = None; - let mut resident_layers = Vec::new(); - for l in layers.iter_historic_layers() { - let file_size = l.file_size(); - max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); + let resident_layers = guard + .likely_resident_layers() + .map(|layer| { + let file_size = layer.layer_desc().file_size; + max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); - let l = guard.get_from_desc(&l); + let last_activity_ts = layer.latest_activity(); - let l = match l.keep_resident().await { - Ok(Some(l)) => l, - Ok(None) => continue, - Err(e) => { - // these should not happen, but we cannot make them statically impossible right - // now. - tracing::warn!(layer=%l, "failed to keep the layer resident: {e:#}"); - continue; + EvictionCandidate { + layer: layer.to_owned().into(), + last_activity_ts, + relative_last_activity: finite_f32::FiniteF32::ZERO, + visibility: layer.visibility(), } - }; - - let last_activity_ts = l.access_stats().latest_activity().unwrap_or_else(|| { - // We only use this fallback if there's an implementation error. - // `latest_activity` already does rate-limited warn!() log. - debug!(layer=%l, "last_activity returns None, using SystemTime::now"); - SystemTime::now() - }); - - resident_layers.push(EvictionCandidate { - layer: l.drop_eviction_guard().into(), - last_activity_ts, - relative_last_activity: finite_f32::FiniteF32::ZERO, - }); - } + }) + .collect(); DiskUsageEvictionInfo { max_layer_size, @@ -4646,45 +5341,241 @@ impl Timeline { shard_count: self.tenant_shard_id.shard_count, } } + + /// Persistently blocks gc for `Manual` reason. + /// + /// Returns true if no such block existed before, false otherwise. + pub(crate) async fn block_gc(&self, tenant: &super::Tenant) -> anyhow::Result { + use crate::tenant::remote_timeline_client::index::GcBlockingReason; + assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id); + tenant.gc_block.insert(self, GcBlockingReason::Manual).await + } + + /// Persistently unblocks gc for `Manual` reason. + pub(crate) async fn unblock_gc(&self, tenant: &super::Tenant) -> anyhow::Result<()> { + use crate::tenant::remote_timeline_client::index::GcBlockingReason; + assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id); + tenant.gc_block.remove(self, GcBlockingReason::Manual).await + } + + #[cfg(test)] + pub(super) fn force_advance_lsn(self: &Arc, new_lsn: Lsn) { + self.last_record_lsn.advance(new_lsn); + } + + #[cfg(test)] + pub(super) fn force_set_disk_consistent_lsn(&self, new_value: Lsn) { + self.disk_consistent_lsn.store(new_value); + } + + /// Force create an image layer and place it into the layer map. + /// + /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`] + /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run. 
+ #[cfg(test)] + pub(super) async fn force_create_image_layer( + self: &Arc, + lsn: Lsn, + mut images: Vec<(Key, Bytes)>, + check_start_lsn: Option, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let last_record_lsn = self.get_last_record_lsn(); + assert!( + lsn <= last_record_lsn, + "advance last record lsn before inserting a layer, lsn={lsn}, last_record_lsn={last_record_lsn}" + ); + if let Some(check_start_lsn) = check_start_lsn { + assert!(lsn >= check_start_lsn); + } + images.sort_unstable_by(|(ka, _), (kb, _)| ka.cmp(kb)); + let min_key = *images.first().map(|(k, _)| k).unwrap(); + let end_key = images.last().map(|(k, _)| k).unwrap().next(); + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &(min_key..end_key), + lsn, + ctx, + ) + .await?; + for (key, img) in images { + image_layer_writer.put_image(key, img, ctx).await?; + } + let image_layer = image_layer_writer.finish(self, ctx).await?; + + { + let mut guard = self.layers.write().await; + guard.open_mut().unwrap().force_insert_layer(image_layer); + } + + Ok(()) + } + + /// Force create a delta layer and place it into the layer map. + /// + /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`] + /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run. + #[cfg(test)] + pub(super) async fn force_create_delta_layer( + self: &Arc, + mut deltas: DeltaLayerTestDesc, + check_start_lsn: Option, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let last_record_lsn = self.get_last_record_lsn(); + deltas + .data + .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); + assert!(deltas.data.first().unwrap().0 >= deltas.key_range.start); + assert!(deltas.data.last().unwrap().0 < deltas.key_range.end); + for (_, lsn, _) in &deltas.data { + assert!(deltas.lsn_range.start <= *lsn && *lsn < deltas.lsn_range.end); + } + assert!( + deltas.lsn_range.end <= last_record_lsn, + "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}", + deltas.lsn_range.end, + last_record_lsn + ); + if let Some(check_start_lsn) = check_start_lsn { + assert!(deltas.lsn_range.start >= check_start_lsn); + } + // check if the delta layer does not violate the LSN invariant, the legacy compaction should always produce a batch of + // layers of the same start/end LSN, and so should the force inserted layer + { + /// Checks if a overlaps with b, assume a/b = [start, end). 
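// Note (not part of the original change): for half-open ranges the check below
// is equivalent to `a.start < b.end && b.start < a.end`; e.g. 0..5 and 5..10 do
// not overlap, while 0..6 and 5..10 do.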
+            pub fn overlaps_with(a: &Range, b: &Range) -> bool {
+                !(a.end <= b.start || b.end <= a.start)
+            }
+
+            if deltas.key_range.start.next() != deltas.key_range.end {
+                let guard = self.layers.read().await;
+                let mut invalid_layers =
+                    guard.layer_map()?.iter_historic_layers().filter(|layer| {
+                        layer.is_delta()
+                            && overlaps_with(&layer.lsn_range, &deltas.lsn_range)
+                            && layer.lsn_range != deltas.lsn_range
+                            // skip single-key layer files
+                            && layer.key_range.start.next() != layer.key_range.end
+                    });
+                if let Some(layer) = invalid_layers.next() {
+                    // If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic
+                    panic!(
+                        "inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}",
+                        deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end
+                    );
+                }
+            }
+        }
+        let mut delta_layer_writer = DeltaLayerWriter::new(
+            self.conf,
+            self.timeline_id,
+            self.tenant_shard_id,
+            deltas.key_range.start,
+            deltas.lsn_range,
+            ctx,
+        )
+        .await?;
+        for (key, lsn, val) in deltas.data {
+            delta_layer_writer.put_value(key, lsn, val, ctx).await?;
+        }
+        let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?;
+        let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+
+        {
+            let mut guard = self.layers.write().await;
+            guard.open_mut().unwrap().force_insert_layer(delta_layer);
+        }
+
+        Ok(())
+    }
+
+    /// Return all keys at the LSN in the image layers
+    #[cfg(test)]
+    pub(crate) async fn inspect_image_layers(
+        self: &Arc,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> anyhow::Result> {
+        let mut all_data = Vec::new();
+        let guard = self.layers.read().await;
+        for layer in guard.layer_map()?.iter_historic_layers() {
+            if !layer.is_delta() && layer.image_layer_lsn() == lsn {
+                let layer = guard.get_from_desc(&layer);
+                let mut reconstruct_data = ValuesReconstructState::default();
+                layer
+                    .get_values_reconstruct_data(
+                        KeySpace::single(Key::MIN..Key::MAX),
+                        lsn..Lsn(lsn.0 + 1),
+                        &mut reconstruct_data,
+                        ctx,
+                    )
+                    .await?;
+                for (k, v) in reconstruct_data.keys {
+                    all_data.push((k, v?.img.unwrap().1));
+                }
+            }
+        }
+        all_data.sort();
+        Ok(all_data)
+    }
+
+    /// Get all historic layer descriptors in the layer map
+    #[cfg(test)]
+    pub(crate) async fn inspect_historic_layers(
+        self: &Arc,
+    ) -> anyhow::Result> {
+        let mut layers = Vec::new();
+        let guard = self.layers.read().await;
+        for layer in guard.layer_map()?.iter_historic_layers() {
+            layers.push(layer.key());
+        }
+        Ok(layers)
+    }
+
+    #[cfg(test)]
+    pub(crate) fn add_extra_test_dense_keyspace(&self, ks: KeySpace) {
+        let mut keyspace = self.extra_test_dense_keyspace.load().as_ref().clone();
+        keyspace.merge(&ks);
+        self.extra_test_dense_keyspace.store(Arc::new(keyspace));
+    }
 }

-type TraversalPathItem = (
-    ValueReconstructResult,
-    Lsn,
-    Box TraversalId>,
-);
+/// Tracking writes ingestion does to a particular in-memory layer.
+///
+/// Cleared upon freezing a layer.
+pub(crate) struct TimelineWriterState {
+    open_layer: Arc,
+    current_size: u64,
+    // Previous Lsn which passed through
+    prev_lsn: Option,
+    // Largest Lsn which passed through the current writer
+    max_lsn: Option,
+    // Cached details of the last freeze. Avoids going through the atomic/lock on every put.
+    cached_last_freeze_at: Lsn,
+}

-/// Helper function for get_reconstruct_data() to add the path of layers traversed
-/// to an error, as anyhow context information.
-fn layer_traversal_error(msg: String, path: Vec) -> PageReconstructError { - // We want the original 'msg' to be the outermost context. The outermost context - // is the most high-level information, which also gets propagated to the client. - let mut msg_iter = path - .into_iter() - .map(|(r, c, l)| { - format!( - "layer traversal: result {:?}, cont_lsn {}, layer: {}", - r, - c, - l(), - ) - }) - .chain(std::iter::once(msg)); - // Construct initial message from the first traversed layer - let err = anyhow!(msg_iter.next().unwrap()); - - // Append all subsequent traversals, and the error message 'msg', as contexts. - let msg = msg_iter.fold(err, |err, msg| err.context(msg)); - PageReconstructError::from(msg) +impl TimelineWriterState { + fn new(open_layer: Arc, current_size: u64, last_freeze_at: Lsn) -> Self { + Self { + open_layer, + current_size, + prev_lsn: None, + max_lsn: None, + cached_last_freeze_at: last_freeze_at, + } + } } /// Various functions to mutate the timeline. // TODO Currently, Deref is used to allow easy access to read methods from this trait. // This is probably considered a bad practice in Rust and should be fixed eventually, // but will cause large code changes. -pub struct TimelineWriter<'a> { +pub(crate) struct TimelineWriter<'a> { tl: &'a Timeline, - _write_guard: tokio::sync::MutexGuard<'a, ()>, + write_guard: tokio::sync::MutexGuard<'a, Option>, } impl Deref for TimelineWriter<'_> { @@ -4695,31 +5586,191 @@ impl Deref for TimelineWriter<'_> { } } +#[derive(PartialEq)] +enum OpenLayerAction { + Roll, + Open, + None, +} + impl<'a> TimelineWriter<'a> { - /// Put a new page version that can be constructed from a WAL record - /// - /// This will implicitly extend the relation, if the page is beyond the - /// current end-of-file. 
-    /// Put a new page version that can be constructed from a WAL record
-    ///
-    /// This will implicitly extend the relation, if the page is beyond the
-    /// current end-of-file.
-    pub async fn put(
-        &self,
+    async fn handle_open_layer_action(
+        &mut self,
+        at: Lsn,
+        action: OpenLayerAction,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<&Arc<InMemoryLayer>> {
+        match action {
+            OpenLayerAction::Roll => {
+                let freeze_at = self.write_guard.as_ref().unwrap().max_lsn.unwrap();
+                self.roll_layer(freeze_at).await?;
+                self.open_layer(at, ctx).await?;
+            }
+            OpenLayerAction::Open => self.open_layer(at, ctx).await?,
+            OpenLayerAction::None => {
+                assert!(self.write_guard.is_some());
+            }
+        }
+
+        Ok(&self.write_guard.as_ref().unwrap().open_layer)
+    }
+
+    async fn open_layer(&mut self, at: Lsn, ctx: &RequestContext) -> anyhow::Result<()> {
+        let layer = self
+            .tl
+            .get_layer_for_write(at, &self.write_guard, ctx)
+            .await?;
+        let initial_size = layer.size().await?;
+
+        let last_freeze_at = self.last_freeze_at.load();
+        self.write_guard.replace(TimelineWriterState::new(
+            layer,
+            initial_size,
+            last_freeze_at,
+        ));
+
+        Ok(())
+    }
+
+    async fn roll_layer(&mut self, freeze_at: Lsn) -> Result<(), FlushLayerError> {
+        let current_size = self.write_guard.as_ref().unwrap().current_size;
+
+        // self.write_guard will be taken by the freezing
+        self.tl
+            .freeze_inmem_layer_at(freeze_at, &mut self.write_guard)
+            .await?;
+
+        assert!(self.write_guard.is_none());
+
+        if current_size >= self.get_checkpoint_distance() * 2 {
+            warn!("Flushed oversized open layer with size {}", current_size)
+        }
+
+        Ok(())
+    }
+
+    fn get_open_layer_action(&self, lsn: Lsn, new_value_size: u64) -> OpenLayerAction {
+        let state = &*self.write_guard;
+        let Some(state) = &state else {
+            return OpenLayerAction::Open;
+        };
+
+        #[cfg(feature = "testing")]
+        if state.cached_last_freeze_at < self.tl.last_freeze_at.load() {
+            // This check and assertion are not really needed, because
+            // LayerManager::try_freeze_in_memory_layer will always clear out the
+            // TimelineWriterState if something is frozen. However, we can advance last_freeze_at
+            // when there is no TimelineWriterState.
+            assert!(
+                state.open_layer.end_lsn.get().is_some(),
+                "our open_layer must be outdated"
+            );
+
+            // this would be a memory leak waiting to happen because the in-memory layer always has
+            // an index
+            panic!("BUG: TimelineWriterState held on to frozen in-memory layer.");
+        }
+
+        if state.prev_lsn == Some(lsn) {
+            // Rolling mid LSN is not supported by [downstream code].
+            // Hence, only roll at LSN boundaries.
+            //
+            // [downstream code]: https://github.com/neondatabase/neon/pull/7993#discussion_r1633345422
+            return OpenLayerAction::None;
+        }
+
+        if state.current_size == 0 {
+            // Don't roll empty layers
+            return OpenLayerAction::None;
+        }
+
+        if self.tl.should_roll(
+            state.current_size,
+            state.current_size + new_value_size,
+            self.get_checkpoint_distance(),
+            lsn,
+            state.cached_last_freeze_at,
+            state.open_layer.get_opened_at(),
+        ) {
+            OpenLayerAction::Roll
+        } else {
+            OpenLayerAction::None
+        }
+    }
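Taken together, the open/roll rules form a small, pure decision function. A simplified standalone sketch, assuming hypothetical `u64` LSNs and a size-only roll condition (the real `should_roll` also weighs checkpoint timeout and the LSN distance since the last freeze):

```rust
#[derive(Debug, PartialEq)]
enum OpenLayerAction {
    Roll,
    Open,
    None,
}

// Hypothetical stand-in for the fields of TimelineWriterState used here.
struct WriterState {
    current_size: u64,
    prev_lsn: Option<u64>,
}

fn get_open_layer_action(
    state: Option<&WriterState>,
    lsn: u64,
    new_value_size: u64,
    checkpoint_distance: u64,
) -> OpenLayerAction {
    let Some(state) = state else {
        return OpenLayerAction::Open; // no open layer yet
    };
    if state.prev_lsn == Some(lsn) {
        return OpenLayerAction::None; // never roll in the middle of an LSN
    }
    if state.current_size == 0 {
        return OpenLayerAction::None; // never roll an empty layer
    }
    if state.current_size + new_value_size >= checkpoint_distance {
        OpenLayerAction::Roll
    } else {
        OpenLayerAction::None
    }
}

fn main() {
    assert_eq!(get_open_layer_action(None, 0x10, 100, 1024), OpenLayerAction::Open);
    let s = WriterState { current_size: 1000, prev_lsn: Some(0x10) };
    assert_eq!(get_open_layer_action(Some(&s), 0x20, 100, 1024), OpenLayerAction::Roll);
    assert_eq!(get_open_layer_action(Some(&s), 0x10, 100, 1024), OpenLayerAction::None);
}
```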
+    /// Put a batch of keys at the specified Lsns.
+    pub(crate) async fn put_batch(
+        &mut self,
+        batch: Vec<(CompactKey, Lsn, usize, Value)>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        if batch.is_empty() {
+            return Ok(());
+        }
+
+        let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch)?;
+        let batch_max_lsn = serialized_batch.max_lsn;
+        let buf_size: u64 = serialized_batch.raw.len() as u64;
+
+        let action = self.get_open_layer_action(batch_max_lsn, buf_size);
+        let layer = self
+            .handle_open_layer_action(batch_max_lsn, action, ctx)
+            .await?;
+
+        let res = layer.put_batch(serialized_batch, ctx).await;
+
+        if res.is_ok() {
+            // Update the current size only when the entire write was ok.
+            // In case of failures, we may have had partial writes which
+            // render the size tracking out of sync. That's ok because
+            // the checkpoint distance should be significantly smaller
+            // than the S3 single shot upload limit of 5GiB.
+            let state = self.write_guard.as_mut().unwrap();
+
+            state.current_size += buf_size;
+            state.prev_lsn = Some(batch_max_lsn);
+            state.max_lsn = std::cmp::max(state.max_lsn, Some(batch_max_lsn));
+        }
+
+        res
+    }
+
+    #[cfg(test)]
+    /// Test helper, for tests that would like to poke individual values without composing a batch
+    pub(crate) async fn put(
+        &mut self,
         key: Key,
         lsn: Lsn,
         value: &Value,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        self.tl.put_value(key, lsn, value, ctx).await
+        use utils::bin_ser::BeSer;
+        if !key.is_valid_key_on_write_path() {
+            bail!(
+                "the request contains data not supported by pageserver at TimelineWriter::put: {}",
+                key
+            );
+        }
+        let val_ser_size = value.serialized_size().unwrap() as usize;
+        self.put_batch(
+            vec![(key.to_compact(), lsn, val_ser_size, value.clone())],
+            ctx,
+        )
+        .await
     }
 
-    pub(crate) async fn put_batch(
-        &self,
-        batch: &HashMap<Key, Vec<(Lsn, Value)>>,
+    pub(crate) async fn delete_batch(
+        &mut self,
+        batch: &[(Range<Key>, Lsn)],
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        self.tl.put_values(batch, ctx).await
-    }
+        if let Some((_, lsn)) = batch.first() {
+            let action = self.get_open_layer_action(*lsn, 0);
+            let layer = self.handle_open_layer_action(*lsn, action, ctx).await?;
+            layer.put_tombstones(batch).await?;
+        }
 
-    pub(crate) async fn delete_batch(&self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
-        self.tl.put_tombstones(batch).await
+        Ok(())
     }
 
     /// Track the end of the latest digested WAL record.
@@ -4747,41 +5798,119 @@ fn is_send() {
     _assert_send::<TimelineWriter<'_>>();
 }
 
-/// Add a suffix to a layer file's name: .{num}.old
-/// Uses the first available num (starts at 0)
-fn rename_to_backup(path: &Utf8Path) -> anyhow::Result<()> {
-    let filename = path
-        .file_name()
-        .ok_or_else(|| anyhow!("Path {path} don't have a file name"))?;
-    let mut new_path = path.to_owned();
-
-    for i in 0u32.. {
-        new_path.set_file_name(format!("{filename}.{i}.old"));
-        if !new_path.exists() {
-            std::fs::rename(path, &new_path)
-                .with_context(|| format!("rename {path:?} to {new_path:?}"))?;
-            return Ok(());
-        }
-    }
-
-    bail!("couldn't find an unused backup number for {:?}", path)
-}
-
 #[cfg(test)]
 mod tests {
+    use pageserver_api::key::Key;
     use utils::{id::TimelineId, lsn::Lsn};
 
-    use crate::tenant::{
-        harness::TenantHarness, storage_layer::Layer, timeline::EvictionError, Timeline,
+    use crate::{
+        repository::Value,
+        tenant::{
+            harness::{test_img, TenantHarness},
+            layer_map::LayerMap,
+            storage_layer::{Layer, LayerName},
+            timeline::{DeltaLayerTestDesc, EvictionError},
+            Timeline,
+        },
     };
 
     #[tokio::test]
-    async fn two_layer_eviction_attempts_at_the_same_time() {
-        let harness =
-            TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
+    async fn test_heatmap_generation() {
+        let harness = TenantHarness::create("heatmap_generation").await.unwrap();
 
-        let ctx = any_context();
-        let tenant = harness.try_load(&ctx).await.unwrap();
+        let covered_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
+            Lsn(0x10)..Lsn(0x20),
+            vec![(
+                Key::from_hex("620000000033333333444444445500000000").unwrap(),
+                Lsn(0x11),
+                Value::Image(test_img("foo")),
+            )],
+        );
+        let visible_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
+            Lsn(0x10)..Lsn(0x20),
+            vec![(
+                Key::from_hex("720000000033333333444444445500000000").unwrap(),
+                Lsn(0x11),
+                Value::Image(test_img("foo")),
+            )],
+        );
+        let l0_delta = DeltaLayerTestDesc::new(
+            Lsn(0x20)..Lsn(0x30),
+            Key::from_hex("000000000000000000000000000000000000").unwrap()
+                ..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(),
+            vec![(
+                Key::from_hex("720000000033333333444444445500000000").unwrap(),
+                Lsn(0x25),
+                Value::Image(test_img("foo")),
+            )],
+        );
+        let delta_layers = vec![
+            covered_delta.clone(),
+            visible_delta.clone(),
+            l0_delta.clone(),
+        ];
+
+        let image_layer = (
+            Lsn(0x40),
+            vec![(
+                Key::from_hex("620000000033333333444444445500000000").unwrap(),
+                test_img("bar"),
+            )],
+        );
+        let image_layers = vec![image_layer];
+
+        let (tenant, ctx) = harness.load().await;
+        let timeline = tenant
+            .create_test_timeline_with_layers(
+                TimelineId::generate(),
+                Lsn(0x10),
+                14,
+                &ctx,
+                delta_layers,
+                image_layers,
+                Lsn(0x100),
+            )
+            .await
+            .unwrap();
+
+        // Layer visibility is an input to heatmap generation, so refresh it first
+        timeline.update_layer_visibility().await.unwrap();
+
+        let heatmap = timeline
+            .generate_heatmap()
+            .await
+            .expect("Infallible while timeline is not shut down");
+
+        assert_eq!(heatmap.timeline_id, timeline.timeline_id);
+
+        // L0 should come last
+        assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
+
+        let mut last_lsn = Lsn::MAX;
+        for layer in heatmap.layers {
+            // The covered layer should be omitted
+            assert!(layer.name != covered_delta.layer_name());
+
+            let layer_lsn = match &layer.name {
+                LayerName::Delta(d) => d.lsn_range.end,
+                LayerName::Image(i) => i.lsn,
+            };
+
+            // Apart from L0s, the newest layers should come first
+            if !LayerMap::is_l0(layer.name.key_range(), layer.name.is_delta()) {
+                assert!(layer_lsn <= last_lsn);
+                last_lsn = layer_lsn;
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn two_layer_eviction_attempts_at_the_same_time() {
+        let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time")
+            .await
+            .unwrap();
+
+        let (tenant, ctx) = harness.load().await;
         let timeline = tenant
             .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
             .await
@@ -4792,16 +5921,17 @@ mod tests {
             .keep_resident()
             .await
             .expect("no download => no downloading errors")
-            .expect("should had been resident")
             .drop_eviction_guard();
 
-        let first = async { layer.evict_and_wait().await };
-        let second = async { layer.evict_and_wait().await };
+        let forever = std::time::Duration::from_secs(120);
+
+        let first = layer.evict_and_wait(forever);
+        let second = layer.evict_and_wait(forever);
 
         let (first, second) = tokio::join!(first, second);
 
         let res = layer.keep_resident().await;
-        assert!(matches!(res, Ok(None)), "{res:?}");
+        assert!(res.is_none(), "{res:?}");
 
         match (first, second) {
             (Ok(()), Ok(())) => {
@@ -4815,16 +5945,11 @@ mod tests {
         }
     }
 
-    fn any_context() -> crate::context::RequestContext {
-        use crate::context::*;
-        use crate::task_mgr::*;
-        RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
-    }
-
     async fn find_some_layer(timeline: &Timeline) -> Layer {
         let layers = timeline.layers.read().await;
         let desc = layers
             .layer_map()
+            .unwrap()
             .iter_historic_layers()
             .next()
            .expect("must find one layer to evict");
diff --git a/pageserver/src/tenant/timeline/analysis.rs b/pageserver/src/tenant/timeline/analysis.rs
new file mode 100644
index 0000000000..cd61418f3d
--- /dev/null
+++ b/pageserver/src/tenant/timeline/analysis.rs
@@ -0,0 +1,90 @@
+use std::{collections::BTreeSet, ops::Range};
+
+use utils::lsn::Lsn;
+
+use super::Timeline;
+
+#[derive(serde::Serialize)]
+pub(crate) struct RangeAnalysis {
+    start: String,
+    end: String,
+    has_image: bool,
+    num_of_deltas_above_image: usize,
+    total_num_of_deltas: usize,
+}
+
+impl Timeline {
+    pub(crate) async fn perf_info(&self) -> Vec<RangeAnalysis> {
+        // First, collect all split points of the layers.
+        let mut split_points = BTreeSet::new();
+        let mut delta_ranges = Vec::new();
+        let mut image_ranges = Vec::new();
+
+        let all_layer_files = {
+            let guard = self.layers.read().await;
+            guard.all_persistent_layers()
+        };
+        let lsn = self.get_last_record_lsn();
+
+        for key in all_layer_files {
+            split_points.insert(key.key_range.start);
+            split_points.insert(key.key_range.end);
+            if key.is_delta {
+                delta_ranges.push((key.key_range.clone(), key.lsn_range.clone()));
+            } else {
+                image_ranges.push((key.key_range.clone(), key.lsn_range.start));
+            }
+        }
+
+        // For each split range, compute the estimated read amplification.
+        let split_points = split_points.into_iter().collect::<Vec<_>>();
+
+        let mut result = Vec::new();
+
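For each consecutive pair of split points, the loop below finds the newest image at or below the requested LSN and counts the deltas stacked above it. A self-contained sketch of that estimate, assuming simplified stand-in types (`u64` keys and LSNs instead of the real `Key`/`Lsn`):

```rust
use std::ops::Range;

// Simplified stand-ins: (key range, image LSN) and (key range, delta LSN range).
type ImageDesc = (Range<u64>, u64);
type DeltaDesc = (Range<u64>, Range<u64>);

/// Estimated read amplification for `key` at `lsn`: the number of delta
/// layers that must be replayed on top of the newest covering image.
fn deltas_above_image(key: u64, lsn: u64, images: &[ImageDesc], deltas: &[DeltaDesc]) -> usize {
    let last_image_lsn = images
        .iter()
        .filter(|(kr, img_lsn)| kr.contains(&key) && *img_lsn <= lsn)
        .map(|(_, img_lsn)| *img_lsn)
        .max()
        .unwrap_or(0);
    deltas
        .iter()
        // Keep deltas covering the key whose LSN range intersects (last_image_lsn, lsn].
        .filter(|(kr, lr)| kr.contains(&key) && lr.start < lsn && lr.end > last_image_lsn)
        .count()
}

fn main() {
    let images = vec![(0..100, 10)];
    let deltas = vec![(0..100, 10..20), (0..100, 20..30), (50..100, 30..40)];
    assert_eq!(deltas_above_image(60, 35, &images, &deltas), 3);
    assert_eq!(deltas_above_image(10, 35, &images, &deltas), 2); // third delta doesn't cover key 10
}
```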
+        for i in 0..(split_points.len() - 1) {
+            let start = split_points[i];
+            let end = split_points[i + 1];
+            // Find the latest image layer that contains the information.
+            let mut maybe_image_layers = image_ranges
+                .iter()
+                // We insert split points for all image layers, and therefore a `contains` check for the start point should be enough.
+                .filter(|(key_range, img_lsn)| key_range.contains(&start) && img_lsn <= &lsn)
+                .cloned()
+                .collect::<Vec<_>>();
+            maybe_image_layers.sort_by(|a, b| a.1.cmp(&b.1));
+            let image_layer = maybe_image_layers.last().cloned();
+            let lsn_filter_start = image_layer
+                .as_ref()
+                .map(|(_, lsn)| *lsn)
+                .unwrap_or(Lsn::INVALID);
+
+            fn overlaps_with(lsn_range_a: &Range<Lsn>, lsn_range_b: &Range<Lsn>) -> bool {
+                !(lsn_range_a.end <= lsn_range_b.start || lsn_range_a.start >= lsn_range_b.end)
+            }
+
+            let maybe_delta_layers = delta_ranges
+                .iter()
+                .filter(|(key_range, lsn_range)| {
+                    key_range.contains(&start) && overlaps_with(&(lsn_filter_start..lsn), lsn_range)
+                })
+                .cloned()
+                .collect::<Vec<_>>();
+
+            let pitr_delta_layers = delta_ranges
+                .iter()
+                .filter(|(key_range, _)| key_range.contains(&start))
+                .cloned()
+                .collect::<Vec<_>>();
+
+            result.push(RangeAnalysis {
+                start: start.to_string(),
+                end: end.to_string(),
+                has_image: image_layer.is_some(),
+                num_of_deltas_above_image: maybe_delta_layers.len(),
+                total_num_of_deltas: pitr_delta_layers.len(),
+            });
+        }
+
+        result
+    }
+}
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
new file mode 100644
index 0000000000..a87b502cd6
--- /dev/null
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -0,0 +1,2416 @@
+//! New compaction implementation. The algorithm itself is implemented in the
+//! compaction crate. This file implements the callbacks and structs that allow
+//! the algorithm to drive the process.
+//!
+//! The old legacy algorithm is implemented directly in `timeline.rs`.
+
+use std::collections::{BinaryHeap, HashSet};
+use std::ops::{Deref, Range};
+use std::sync::Arc;
+
+use super::layer_manager::LayerManager;
+use super::{
+    CompactFlags, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
+    RecordedDuration, Timeline,
+};
+
+use anyhow::{anyhow, bail, Context};
+use bytes::Bytes;
+use enumset::EnumSet;
+use fail::fail_point;
+use itertools::Itertools;
+use pageserver_api::key::KEY_SIZE;
+use pageserver_api::keyspace::ShardedRange;
+use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
+use serde::Serialize;
+use tokio_util::sync::CancellationToken;
+use tracing::{debug, info, info_span, trace, warn, Instrument};
+use utils::id::TimelineId;
+
+use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
+use crate::page_cache;
+use crate::tenant::remote_timeline_client::WaitCompletionError;
+use crate::tenant::storage_layer::merge_iterator::MergeIterator;
+use crate::tenant::storage_layer::split_writer::{
+    SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
+};
+use crate::tenant::storage_layer::{
+    AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
+};
+use crate::tenant::timeline::ImageLayerCreationOutcome;
+use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
+use crate::tenant::timeline::{Layer, ResidentLayer};
+use crate::tenant::DeltaLayer;
+use crate::virtual_file::{MaybeFatalIo, VirtualFile};
+use pageserver_api::config::tenant_conf_defaults::{
+    DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
+};
+
+use crate::keyspace::KeySpace;
+use crate::repository::{Key, Value};
+use crate::walrecord::NeonWalRecord;
+
+use utils::lsn::Lsn;
+
+use pageserver_compaction::helpers::overlaps_with;
+use pageserver_compaction::interface::*;
+
+use super::CompactionError;
+
+/// Maximum number of deltas before generating an image layer in bottom-most compaction.
+const COMPACTION_DELTA_THRESHOLD: usize = 5;
+
+/// The result of bottom-most compaction for a single key at each LSN.
+#[derive(Debug)]
+#[cfg_attr(test, derive(PartialEq))]
+pub struct KeyLogAtLsn(pub Vec<(Lsn, Value)>);
+
+/// The result of bottom-most compaction.
+#[derive(Debug)]
+#[cfg_attr(test, derive(PartialEq))]
+pub(crate) struct KeyHistoryRetention {
+    /// Stores logs to reconstruct the value at the given LSN, that is to say, logs <= LSN or image == LSN.
+    pub(crate) below_horizon: Vec<(Lsn, KeyLogAtLsn)>,
+    /// Stores logs to reconstruct the value at any LSN above the horizon, that is to say, log > LSN.
+    pub(crate) above_horizon: KeyLogAtLsn,
+}
+
+impl KeyHistoryRetention {
+    /// Hack: skip the delta layer if we need to produce a layer with the same key-lsn range.
+    ///
+    /// This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range.
+    /// For example, consider the case where a single delta with range [0x10,0x50) exists.
+    /// And we have branches at LSN 0x10, 0x20, 0x30.
+    /// Then we delete branch @ 0x20.
+    /// Bottom-most compaction may now delete the delta [0x20,0x30).
+    /// And that wouldn't change the shape of the layer.
+    ///
+    /// Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes.
+    ///
+    /// `discard_key` will only be called when the writer reaches its target (instead of for every key), so it's fine to grab a lock inside.
+    async fn discard_key(key: &PersistentLayerKey, tline: &Arc<Timeline>, dry_run: bool) -> bool {
+        if dry_run {
+            return true;
+        }
+        let guard = tline.layers.read().await;
+        if !guard.contains_key(key) {
+            return false;
+        }
+        let layer_generation = guard.get_from_key(key).metadata().generation;
+        drop(guard);
+        if layer_generation == tline.generation {
+            info!(
+                key=%key,
+                ?layer_generation,
+                "discard layer due to duplicated layer key in the same generation",
+            );
+            true
+        } else {
+            false
+        }
+    }
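The generation comparison is the heart of `discard_key`; a condensed sketch of just that decision, with a hypothetical `Generation` newtype standing in for the real metadata type:

```rust
#[derive(Clone, Copy, PartialEq)]
struct Generation(u32);

/// Discard a freshly produced layer only when a layer with the same key
/// already exists and was written by this same pageserver generation;
/// in dry-run mode everything is discarded, since nothing should be written.
fn should_discard(
    dry_run: bool,
    existing: Option<Generation>, // generation of a layer already holding this key, if any
    current: Generation,
) -> bool {
    if dry_run {
        return true;
    }
    match existing {
        Some(g) => g == current,
        None => false,
    }
}

fn main() {
    let current = Generation(3);
    assert!(should_discard(false, Some(Generation(3)), current)); // same generation: keep the old file
    assert!(!should_discard(false, Some(Generation(2)), current)); // older generation: rewrite
    assert!(!should_discard(false, None, current));
}
```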
+    /// Pipe a history of a single key to the writers.
+    ///
+    /// If `image_writer` is none, the images will be placed into the delta layers.
+    /// The delta writer will contain all images and deltas (below and above the horizon) except the bottom-most images.
+    #[allow(clippy::too_many_arguments)]
+    async fn pipe_to(
+        self,
+        key: Key,
+        tline: &Arc<Timeline>,
+        delta_writer: &mut SplitDeltaLayerWriter,
+        mut image_writer: Option<&mut SplitImageLayerWriter>,
+        stat: &mut CompactionStatistics,
+        dry_run: bool,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        let mut first_batch = true;
+        let discard = |key: &PersistentLayerKey| {
+            let key = key.clone();
+            async move { Self::discard_key(&key, tline, dry_run).await }
+        };
+        for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon {
+            if first_batch {
+                if logs.len() == 1 && logs[0].1.is_image() {
+                    let Value::Image(img) = &logs[0].1 else {
+                        unreachable!()
+                    };
+                    stat.produce_image_key(img);
+                    if let Some(image_writer) = image_writer.as_mut() {
+                        image_writer
+                            .put_image_with_discard_fn(key, img.clone(), tline, ctx, discard)
+                            .await?;
+                    } else {
+                        delta_writer
+                            .put_value_with_discard_fn(
+                                key,
+                                cutoff_lsn,
+                                Value::Image(img.clone()),
+                                tline,
+                                ctx,
+                                discard,
+                            )
+                            .await?;
+                    }
+                } else {
+                    for (lsn, val) in logs {
+                        stat.produce_key(&val);
+                        delta_writer
+                            .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
+                            .await?;
+                    }
+                }
+                first_batch = false;
+            } else {
+                for (lsn, val) in logs {
+                    stat.produce_key(&val);
+                    delta_writer
+                        .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
+                        .await?;
+                }
+            }
+        }
+        let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
+        for (lsn, val) in above_horizon_logs {
+            stat.produce_key(&val);
+            delta_writer
+                .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
+                .await?;
+        }
+        Ok(())
+    }
+}
+
+#[derive(Debug, Serialize, Default)]
+struct CompactionStatisticsNumSize {
+    num: u64,
+    size: u64,
+}
+
+#[derive(Debug, Serialize, Default)]
+pub struct CompactionStatistics {
+    delta_layer_visited: CompactionStatisticsNumSize,
+    image_layer_visited: CompactionStatisticsNumSize,
+    delta_layer_produced: CompactionStatisticsNumSize,
+    image_layer_produced: CompactionStatisticsNumSize,
+    num_delta_layer_discarded: usize,
+    num_image_layer_discarded: usize,
+    num_unique_keys_visited: usize,
+    wal_keys_visited: CompactionStatisticsNumSize,
+    image_keys_visited: CompactionStatisticsNumSize,
+    wal_produced: CompactionStatisticsNumSize,
+    image_produced: CompactionStatisticsNumSize,
+}
+
+impl CompactionStatistics {
+    fn estimated_size_of_value(val: &Value) -> usize {
+        match val {
+            Value::Image(img) => img.len(),
+            Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(),
+            _ => std::mem::size_of::<NeonWalRecord>(),
+        }
+    }
+    fn estimated_size_of_key() -> usize {
+        KEY_SIZE // TODO: distinguish image layer and delta layer (count LSN in delta layer)
+    }
+    fn visit_delta_layer(&mut self, size: u64) {
+        self.delta_layer_visited.num += 1;
+        self.delta_layer_visited.size += size;
+    }
+    fn visit_image_layer(&mut self, size: u64) {
+        self.image_layer_visited.num += 1;
+        self.image_layer_visited.size += size;
+    }
+    fn on_unique_key_visited(&mut self) {
+        self.num_unique_keys_visited += 1;
+    }
+    fn visit_wal_key(&mut self, val: &Value) {
+        self.wal_keys_visited.num += 1;
+        self.wal_keys_visited.size +=
+            Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
+    }
+    fn visit_image_key(&mut self, val: &Value) {
+        self.image_keys_visited.num += 1;
+        self.image_keys_visited.size +=
+            Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
+    }
+    fn produce_key(&mut self, val: &Value) {
+        match val {
+            Value::Image(img) => self.produce_image_key(img),
+            Value::WalRecord(_) => self.produce_wal_key(val),
+        }
+    }
+    fn produce_wal_key(&mut self, val: &Value) {
+        self.wal_produced.num += 1;
+        self.wal_produced.size +=
+            Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
+    }
+    fn produce_image_key(&mut self, val: &Bytes) {
+        self.image_produced.num += 1;
+        self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
+    }
+    fn discard_delta_layer(&mut self) {
+        self.num_delta_layer_discarded += 1;
+    }
+    fn discard_image_layer(&mut self) {
+        self.num_image_layer_discarded += 1;
+    }
+    fn produce_delta_layer(&mut self, size: u64) {
+        self.delta_layer_produced.num += 1;
+        self.delta_layer_produced.size += size;
+    }
+    fn produce_image_layer(&mut self, size: u64) {
+        self.image_layer_produced.num += 1;
+        self.image_layer_produced.size += size;
+    }
+}
+
+impl Timeline {
+    /// TODO: cancellation
+    ///
+    /// Returns whether the compaction has pending tasks.
+    pub(crate) async fn compact_legacy(
+        self: &Arc<Self>,
+        cancel: &CancellationToken,
+        flags: EnumSet<CompactFlags>,
+        ctx: &RequestContext,
+    ) -> Result<bool, CompactionError> {
+        if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) {
+            self.compact_with_gc(cancel, flags, ctx)
+                .await
+                .map_err(CompactionError::Other)?;
+            return Ok(false);
+        }
+
+        if flags.contains(CompactFlags::DryRun) {
+            return Err(CompactionError::Other(anyhow!(
+                "dry-run mode is not supported for legacy compaction for now"
+            )));
+        }
+
The "churn" is fuzzy concept, it's a + // combination of too many delta files, or too much WAL in + // total in the delta file. Or perhaps: if creating an image + // file would allow to delete some older files. + // + // 3. After that, we compact all level0 delta files if there + // are too many of them. While compacting, we also garbage + // collect any page versions that are no longer needed because + // of the new image layers we created in step 2. + // + // TODO: This high level strategy hasn't been implemented yet. + // Below are functions compact_level0() and create_image_layers() + // but they are a bit ad hoc and don't quite work like it's explained + // above. Rewrite it. + + // Is the timeline being deleted? + if self.is_stopping() { + trace!("Dropping out of compaction on timeline shutdown"); + return Err(CompactionError::ShuttingDown); + } + + let target_file_size = self.get_checkpoint_distance(); + + // Define partitioning schema if needed + + // FIXME: the match should only cover repartitioning, not the next steps + let (partition_count, has_pending_tasks) = match self + .repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + flags, + ctx, + ) + .await + { + Ok(((dense_partitioning, sparse_partitioning), lsn)) => { + // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them + let image_ctx = RequestContextBuilder::extend(ctx) + .access_stats_behavior(AccessStatsBehavior::Skip) + .build(); + + // 2. Compact + let timer = self.metrics.compact_time_histo.start_timer(); + let fully_compacted = self.compact_level0(target_file_size, ctx).await?; + timer.stop_and_record(); + + let mut partitioning = dense_partitioning; + partitioning + .parts + .extend(sparse_partitioning.into_dense().parts); + + // 3. Create new image layers for partitions that have been modified + // "enough". Skip image layer creation if L0 compaction cannot keep up. + if fully_compacted { + let image_layers = self + .create_image_layers( + &partitioning, + lsn, + if flags.contains(CompactFlags::ForceImageLayerCreation) { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + ) + .await?; + + self.upload_new_image_layers(image_layers)?; + } else { + info!("skipping image layer generation due to L0 compaction did not include all layers."); + } + (partitioning.parts.len(), !fully_compacted) + } + Err(err) => { + // no partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline + // as a simple key-value store, ignoring the datadir layout. Log the + // error but continue. + // + // Suppress error when it's due to cancellation + if !self.cancel.is_cancelled() { + tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); + } + (1, false) + } + }; + + if self.shard_identity.count >= ShardCount::new(2) { + // Limit the number of layer rewrites to the number of partitions: this means its + // runtime should be comparable to a full round of image layer creations, rather than + // being potentially much longer. + let rewrite_max = partition_count; + + self.compact_shard_ancestors(rewrite_max, ctx).await?; + } + + Ok(has_pending_tasks) + } + + /// Check for layers that are elegible to be rewritten: + /// - Shard splitting: After a shard split, ancestor layers beyond pitr_interval, so that + /// we don't indefinitely retain keys in this shard that aren't needed. 
+    /// Check for layers that are eligible to be rewritten:
+    /// - Shard splitting: after a shard split, rewrite ancestor layers beyond pitr_interval, so that
+    ///   we don't indefinitely retain keys in this shard that aren't needed.
+    /// - For future use: layers beyond pitr_interval that are in formats we would
+    ///   rather not maintain compatibility with indefinitely.
+    ///
+    /// Note: this phase may read and write many gigabytes of data: use rewrite_max to bound
+    /// how much work it will try to do in each compaction pass.
+    async fn compact_shard_ancestors(
+        self: &Arc<Self>,
+        rewrite_max: usize,
+        ctx: &RequestContext,
+    ) -> Result<(), CompactionError> {
+        let mut drop_layers = Vec::new();
+        let mut layers_to_rewrite: Vec<Layer> = Vec::new();
+
+        // We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a
+        // layer is behind this Lsn, it indicates that the layer is being retained beyond the
+        // pitr_interval, for example because a branchpoint references it.
+        //
+        // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
+        // are rewriting layers.
+        let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn();
+
+        tracing::info!(
+            "latest_gc_cutoff: {}, pitr cutoff {}",
+            *latest_gc_cutoff,
+            self.gc_info.read().unwrap().cutoffs.time
+        );
+
+        let layers = self.layers.read().await;
+        for layer_desc in layers.layer_map()?.iter_historic_layers() {
+            let layer = layers.get_from_desc(&layer_desc);
+            if layer.metadata().shard.shard_count == self.shard_identity.count {
+                // This layer does not belong to a historic ancestor, no need to re-image it.
+                continue;
+            }
+
+            // This layer was created on an ancestor shard: check if it contains any data for this shard.
+            let sharded_range = ShardedRange::new(layer_desc.get_key_range(), &self.shard_identity);
+            let layer_local_page_count = sharded_range.page_count();
+            let layer_raw_page_count = ShardedRange::raw_size(&layer_desc.get_key_range());
+            if layer_local_page_count == 0 {
+                // This ancestral layer only covers keys that belong to other shards.
+                // We include the full metadata in the log: if we had some critical bug that caused
+                // us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers.
+                info!(%layer, old_metadata=?layer.metadata(),
+                    "dropping layer after shard split, contains no keys for this shard.",
+                );
+
+                if cfg!(debug_assertions) {
+                    // Expensive, exhaustive check of keys in this layer: this guards against ShardedRange's calculations being
+                    // wrong. If ShardedRange claims the local page count is zero, then no keys in this layer
+                    // should be !is_key_disposable()
+                    let range = layer_desc.get_key_range();
+                    let mut key = range.start;
+                    while key < range.end {
+                        debug_assert!(self.shard_identity.is_key_disposable(&key));
+                        key = key.next();
+                    }
+                }
+
+                drop_layers.push(layer);
+                continue;
+            } else if layer_local_page_count != u32::MAX
+                && layer_local_page_count == layer_raw_page_count
+            {
+                debug!(%layer,
+                    "layer is entirely shard local ({} keys), no need to filter it",
+                    layer_local_page_count
+                );
+                continue;
+            }
+
+            // Don't bother re-writing a layer unless it will at least halve its size
+            if layer_local_page_count != u32::MAX
+                && layer_local_page_count > layer_raw_page_count / 2
+            {
+                debug!(%layer,
+                    "layer is already mostly local ({}/{}), not rewriting",
+                    layer_local_page_count,
+                    layer_raw_page_count
+                );
+                continue;
+            }
+
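Anticipating the remaining checks that follow (PITR window, delta format, generation), the whole per-layer filter can be summarized as one predicate. A sketch with simplified integer inputs, where the `u32::MAX` sentinel means "page count unknown", as above:

```rust
/// Returns true when an ancestor layer is worth rewriting for this shard.
/// Mirrors the filter chain sketched above; inputs are simplified stand-ins.
fn eligible_for_rewrite(
    ancestor_layer: bool,  // created on an ancestor shard?
    local_pages: u32,      // pages belonging to this shard (u32::MAX = unknown)
    raw_pages: u32,        // total pages in the layer's key range
    layer_end_lsn: u64,
    gc_cutoff_lsn: u64,
    is_delta: bool,
    same_generation: bool, // already written by our own generation?
) -> bool {
    ancestor_layer
        && local_pages != 0                                          // empty layers are dropped instead
        && !(local_pages != u32::MAX && local_pages == raw_pages)    // already entirely local
        && !(local_pages != u32::MAX && local_pages > raw_pages / 2) // rewrite must at least halve it
        && layer_end_lsn < gc_cutoff_lsn                             // outside the PITR window
        && !is_delta                                                 // delta rewrite not implemented
        && !same_generation                                          // generations must differ
}

fn main() {
    assert!(eligible_for_rewrite(true, 100, 1000, 10, 50, false, false));
    assert!(!eligible_for_rewrite(true, 600, 1000, 10, 50, false, false)); // mostly local already
    assert!(!eligible_for_rewrite(true, 100, 1000, 60, 50, false, false)); // still in PITR window
}
```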
+            // Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually
+            // without incurring the I/O cost of a rewrite.
+            if layer_desc.get_lsn_range().end >= *latest_gc_cutoff {
+                debug!(%layer, "Skipping rewrite of layer still in GC window ({} >= {})",
+                    layer_desc.get_lsn_range().end, *latest_gc_cutoff);
+                continue;
+            }
+
+            if layer_desc.is_delta() {
+                // We do not yet implement rewrite of delta layers
+                debug!(%layer, "Skipping rewrite of delta layer");
+                continue;
+            }
+
+            // Only rewrite layers if their generations differ. This guarantees:
+            // - that the local rewrite is safe, as local layer paths will differ between the existing layer and the rewritten one
+            // - that the layer is persistent in remote storage, as we only see an old-generation'd layer via loading from remote storage
+            if layer.metadata().generation == self.generation {
+                debug!(%layer, "Skipping rewrite, is not from old generation");
+                continue;
+            }
+
+            if layers_to_rewrite.len() >= rewrite_max {
+                tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
+                    layers_to_rewrite.len()
+                );
+                continue;
+            }
+
+            // Fall through: all our conditions for doing a rewrite passed.
+            layers_to_rewrite.push(layer);
+        }
+
+        // Drop the read lock on the layer map before we start doing time-consuming I/O
+        drop(layers);
+
+        let mut replace_image_layers = Vec::new();
+
+        for layer in layers_to_rewrite {
+            tracing::info!(layer=%layer, "Rewriting layer after shard split...");
+            let mut image_layer_writer = ImageLayerWriter::new(
+                self.conf,
+                self.timeline_id,
+                self.tenant_shard_id,
+                &layer.layer_desc().key_range,
+                layer.layer_desc().image_layer_lsn(),
+                ctx,
+            )
+            .await
+            .map_err(CompactionError::Other)?;
+
+            // Safety of layer rewrites:
+            // - We are writing to a different local file path than we are reading from, so the old Layer
+            //   cannot interfere with the new one.
+            // - In the page cache, contents for a particular VirtualFile are stored with a file_id that
+            //   is different for two layers with the same name (in `ImageLayerInner::new` we always
+            //   acquire a fresh id from [`crate::page_cache::next_file_id`]). So readers do not risk
+            //   reading the index from one layer file, and then data blocks from the rewritten layer file.
+            // - Any readers that have a reference to the old layer will keep it alive until they are done
+            //   with it. If they are trying to promote from remote storage, that will fail, but this is the same
+            //   as for compaction generally: compaction is allowed to delete layers that readers might be trying to use.
+            // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are:
+            //   - GC, which at worst witnesses us "undelete" a layer that they just deleted.
+            //   - ingestion, which only inserts layers, and therefore cannot collide with us.
+            let resident = layer.download_and_keep_resident().await?;
+
+            let keys_written = resident
+                .filter(&self.shard_identity, &mut image_layer_writer, ctx)
+                .await?;
+
+            if keys_written > 0 {
+                let new_layer = image_layer_writer
+                    .finish(self, ctx)
+                    .await
+                    .map_err(CompactionError::Other)?;
+                tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
+                    layer.metadata().file_size,
+                    new_layer.metadata().file_size);
+
+                replace_image_layers.push((layer, new_layer));
+            } else {
+                // Drop the old layer. Usually for this case we would already have noticed that
+                // the layer has no data for us with the ShardedRange check above, but this also
+                // covers the case where the rewrite itself finds no keys to write.
+                drop_layers.push(layer);
+            }
+        }
+
+        // At this point, we have replaced local layer files with their rewritten form, but not yet
+        // uploaded metadata to reflect that.
+        // If we restart here, the replaced layer files will look invalid (size mismatch
+        // to remote index) and will be removed. This is inefficient but safe.
+        fail::fail_point!("compact-shard-ancestors-localonly");
+
+        // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
+        self.rewrite_layers(replace_image_layers, drop_layers)
+            .await?;
+
+        fail::fail_point!("compact-shard-ancestors-enqueued");
+
+        // We wait for all uploads to complete before finishing this compaction stage. This is not
+        // necessary for correctness, but it simplifies testing, and avoids proceeding with another
+        // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
+        // load.
+        match self.remote_client.wait_completion().await {
+            Ok(()) => (),
+            Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
+            Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
+                return Err(CompactionError::ShuttingDown)
+            }
+        }
+
+        fail::fail_point!("compact-shard-ancestors-persistent");
+
+        Ok(())
+    }
+
+    /// Update the LayerVisibilityHint of layers covered by image layers, based on whether there is
+    /// an image layer between them and the most recent readable LSN (branch point or tip of timeline). The
+    /// purpose of the visibility hint is to record which layers need to be available to service reads.
+    ///
+    /// The result may be used as an input to eviction and secondary downloads to de-prioritize layers
+    /// that we know won't be needed for reads.
+    pub(super) async fn update_layer_visibility(
+        &self,
+    ) -> Result<(), super::layer_manager::Shutdown> {
+        let head_lsn = self.get_last_record_lsn();
+
+        // We will sweep through layers in reverse-LSN order. We only do historic layers. L0 deltas
+        // are implicitly left visible, because LayerVisibilityHint's default is Visible, and we never modify it here.
+        // Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that
+        // they will be subject to L0->L1 compaction in the near future.
+        let layer_manager = self.layers.read().await;
+        let layer_map = layer_manager.layer_map()?;
+
+        let readable_points = {
+            let children = self.gc_info.read().unwrap().retain_lsns.clone();
+
+            let mut readable_points = Vec::with_capacity(children.len() + 1);
+            for (child_lsn, _child_timeline_id) in &children {
+                readable_points.push(*child_lsn);
+            }
+            readable_points.push(head_lsn);
+            readable_points
+        };
+
+        let (layer_visibility, covered) = layer_map.get_visibility(readable_points);
+        for (layer_desc, visibility) in layer_visibility {
+            // FIXME: a more efficient bulk zip() through the layers, rather than N log N lookups to get each one
+            let layer = layer_manager.get_from_desc(&layer_desc);
+            layer.set_visibility(visibility);
+        }
+
+        // TODO: publish our covered KeySpace to our parent, so that when they update their visibility, they can
+        // avoid assuming that everything at a branch point is visible.
+        drop(covered);
+        Ok(())
+    }
+
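In one dimension (a single key), the visibility rule reduces to: a historic layer is invisible iff some image layer sits between it and every readable point. A toy sketch for a single key, assuming simplified `u64` LSNs (the real `LayerMap::get_visibility` works over key ranges, not single keys):

```rust
/// For a single key: a layer at `layer_lsn` is visible iff there is some
/// readable point (branch LSN or timeline head) with no newer image layer
/// standing between the layer and that point.
fn is_visible(layer_lsn: u64, image_lsns: &[u64], readable_points: &[u64]) -> bool {
    readable_points.iter().any(|&point| {
        point >= layer_lsn
            && !image_lsns
                .iter()
                .any(|&img| img > layer_lsn && img <= point)
    })
}

fn main() {
    let images = [0x40];
    // Covered: the only readable point (head at 0x100) sees the image at 0x40 first.
    assert!(!is_visible(0x11, &images, &[0x100]));
    // A branch point at 0x20 keeps the old layer visible.
    assert!(is_visible(0x11, &images, &[0x20, 0x100]));
    // The image itself, and anything above it, stays visible.
    assert!(is_visible(0x40, &images, &[0x100]));
}
```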
+    /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
+    /// Level 1 files. Returns whether the L0 layers are fully compacted.
+    async fn compact_level0(
+        self: &Arc<Self>,
+        target_file_size: u64,
+        ctx: &RequestContext,
+    ) -> Result<bool, CompactionError> {
+        let CompactLevel0Phase1Result {
+            new_layers,
+            deltas_to_compact,
+            fully_compacted,
+        } = {
+            let phase1_span = info_span!("compact_level0_phase1");
+            let ctx = ctx.attached_child();
+            let mut stats = CompactLevel0Phase1StatsBuilder {
+                version: Some(2),
+                tenant_id: Some(self.tenant_shard_id),
+                timeline_id: Some(self.timeline_id),
+                ..Default::default()
+            };
+
+            let begin = tokio::time::Instant::now();
+            let phase1_layers_locked = self.layers.read().await;
+            let now = tokio::time::Instant::now();
+            stats.read_lock_acquisition_micros =
+                DurationRecorder::Recorded(RecordedDuration(now - begin), now);
+            self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx)
+                .instrument(phase1_span)
+                .await?
+        };
+
+        if new_layers.is_empty() && deltas_to_compact.is_empty() {
+            // nothing to do
+            return Ok(true);
+        }
+
+        self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact)
+            .await?;
+        Ok(fully_compacted)
+    }
+
+    /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
+    async fn compact_level0_phase1<'a>(
+        self: &'a Arc<Self>,
+        guard: tokio::sync::RwLockReadGuard<'a, LayerManager>,
+        mut stats: CompactLevel0Phase1StatsBuilder,
+        target_file_size: u64,
+        ctx: &RequestContext,
+    ) -> Result<CompactLevel0Phase1Result, CompactionError> {
+        stats.read_lock_held_spawn_blocking_startup_micros =
+            stats.read_lock_acquisition_micros.till_now(); // set by caller
+        let layers = guard.layer_map()?;
+        let level0_deltas = layers.level0_deltas();
+        stats.level0_deltas_count = Some(level0_deltas.len());
+
+        // Only compact if enough layers have accumulated.
+        let threshold = self.get_compaction_threshold();
+        if level0_deltas.is_empty() || level0_deltas.len() < threshold {
+            debug!(
+                level0_deltas = level0_deltas.len(),
+                threshold, "too few deltas to compact"
+            );
+            return Ok(CompactLevel0Phase1Result::default());
+        }
+
+        let mut level0_deltas = level0_deltas
+            .iter()
+            .map(|x| guard.get_from_desc(x))
+            .collect::<Vec<_>>();
+
+        // Gather the files to compact in this iteration.
+        //
+        // Start with the oldest Level 0 delta file, and collect any other
+        // level 0 files that form a contiguous sequence, such that the end
+        // LSN of the previous file matches the start LSN of the next file.
+        //
+        // Note that if the files don't form such a sequence, we might
+        // "compact" just a single file. That's a bit pointless, but it allows
+        // us to get rid of the level 0 file, and compact the other files on
+        // the next iteration. This could probably be made smarter, but such
+        // "gaps" in the sequence of level 0 files should only happen in case
+        // of a crash, partial download from cloud storage, or something like
+        // that, so it's not a big deal in practice.
+        level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start);
+        let mut level0_deltas_iter = level0_deltas.iter();
+
+        let first_level0_delta = level0_deltas_iter.next().unwrap();
+        let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
+        let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len());
+
+        // Accumulate the size of layers in `deltas_to_compact`
+        let mut deltas_to_compact_bytes = 0;
+
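The contiguity rule just described is easy to state as a standalone function: starting from the oldest L0, extend the run while each layer's LSN range starts exactly where the previous one ended. A sketch over plain `Range<u64>` LSN ranges:

```rust
use std::ops::Range;

/// Select the longest prefix of `sorted_l0s` (sorted by lsn_range.start)
/// that forms a contiguous LSN chain starting at the oldest layer.
fn contiguous_run(sorted_l0s: &[Range<u64>]) -> &[Range<u64>] {
    let mut len = 0;
    let mut prev_end = None;
    for lsn_range in sorted_l0s {
        match prev_end {
            Some(end) if lsn_range.start != end => break, // gap: stop here
            _ => {
                prev_end = Some(lsn_range.end);
                len += 1;
            }
        }
    }
    &sorted_l0s[..len]
}

fn main() {
    let l0s = vec![0x10..0x20, 0x20..0x30, 0x40..0x50]; // gap before 0x40
    assert_eq!(contiguous_run(&l0s), &l0s[..2]);
}
```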
+        // Under normal circumstances, we will accumulate up to compaction_interval L0s of size
+        // checkpoint_distance each. To avoid edge cases using extra system resources, bound our
+        // work in this function to only operate on this much delta data at once.
+        //
+        // Take the max of the configured value & the default, so that tests that configure tiny values
+        // can still use a sensible amount of memory, but if a deployed system configures bigger values we
+        // still let them compact a full stack of L0s in one go.
+        let delta_size_limit = std::cmp::max(
+            self.get_compaction_threshold(),
+            DEFAULT_COMPACTION_THRESHOLD,
+        ) as u64
+            * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE);
+
+        let mut fully_compacted = true;
+
+        deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
+        for l in level0_deltas_iter {
+            let lsn_range = &l.layer_desc().lsn_range;
+
+            if lsn_range.start != prev_lsn_end {
+                break;
+            }
+            deltas_to_compact.push(l.download_and_keep_resident().await?);
+            deltas_to_compact_bytes += l.metadata().file_size;
+            prev_lsn_end = lsn_range.end;
+
+            if deltas_to_compact_bytes >= delta_size_limit {
+                info!(
+                    l0_deltas_selected = deltas_to_compact.len(),
+                    l0_deltas_total = level0_deltas.len(),
+                    "L0 compaction picker hit max delta layer size limit: {}",
+                    delta_size_limit
+                );
+                fully_compacted = false;
+
+                // Proceed with compaction, but only a subset of L0s
+                break;
+            }
+        }
+        let lsn_range = Range {
+            start: deltas_to_compact
+                .first()
+                .unwrap()
+                .layer_desc()
+                .lsn_range
+                .start,
+            end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end,
+        };
+
+        info!(
+            "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)",
+            lsn_range.start,
+            lsn_range.end,
+            deltas_to_compact.len(),
+            level0_deltas.len()
+        );
+
+        for l in deltas_to_compact.iter() {
+            info!("compact includes {l}");
+        }
+
+        // We don't need the original list of layers anymore. Drop it, so that
+        // we don't accidentally use it later in the function.
+        drop(level0_deltas);
+
+        stats.read_lock_held_prerequisites_micros = stats
+            .read_lock_held_spawn_blocking_startup_micros
+            .till_now();
+
+        // TODO: replace with streaming k-merge
+        let all_keys = {
+            let mut all_keys = Vec::new();
+            for l in deltas_to_compact.iter() {
+                if self.cancel.is_cancelled() {
+                    return Err(CompactionError::ShuttingDown);
+                }
+                all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
+            }
+            // The current stdlib sorting implementation is designed in a way where it is
+            // particularly fast when the slice is made up of sorted sub-ranges.
+            all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
+            all_keys
+        };
+
+        stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now();
+
+        // Determine the N largest holes, where N is the number of compacted layers. The vec is sorted by key range start.
+        //
+        // A hole is a key range for which this compaction doesn't have any WAL records.
+        // Our goal in this compaction iteration is to avoid creating L1s that, in terms of their key range,
+        // cover the hole, but actually don't contain any WAL records for that key range.
+        // The reason is that the mere stack of L1s (`count_deltas`) triggers image layer creation (`create_image_layers`).
+        // That image layer creation would be useless for a hole range covered by L1s that don't contain any WAL records.
+        //
+        // The algorithm chooses holes as follows.
+        // - Slide a 2-window over the keys in key order to get the hole range (=distance between two keys).
+        // - Filter: min threshold on range length
+        // - Rank: by coverage size (=number of image layers required to reconstruct each key in the range for which we have any data)
+        //
+        // For more details, intuition, and some ASCII art see https://github.com/neondatabase/neon/pull/3597#discussion_r1112704451
+        #[derive(PartialEq, Eq)]
+        struct Hole {
+            key_range: Range<Key>,
+            coverage_size: usize,
+        }
+        let holes: Vec<Hole> = {
+            use std::cmp::Ordering;
+            impl Ord for Hole {
+                fn cmp(&self, other: &Self) -> Ordering {
+                    self.coverage_size.cmp(&other.coverage_size).reverse()
+                }
+            }
+            impl PartialOrd for Hole {
+                fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+                    Some(self.cmp(other))
+                }
+            }
+            let max_holes = deltas_to_compact.len();
+            let last_record_lsn = self.get_last_record_lsn();
+            let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
+            let min_hole_coverage_size = 3; // TODO: something more flexible?
+            // min-heap (reserve space for one more element added before eviction)
+            let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
+            let mut prev: Option<Key> = None;
+
+            for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
+                if let Some(prev_key) = prev {
+                    // Just a first fast filter; do not create hole entries for metadata keys. The last hole in the
+                    // compaction is the gap between the data keys and the metadata keys.
+                    if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
+                        && !Key::is_metadata_key(&prev_key)
+                    {
+                        let key_range = prev_key..next_key;
+                        // Measuring a hole by simple subtraction of the i128 representations of the key range
+                        // boundaries doesn't make much sense, because the largest holes will correspond to
+                        // field1/field2 changes. But we are mostly interested in eliminating holes which cause
+                        // generation of excessive image layers. That is why it is better to measure the size
+                        // of a hole as the number of covering image layers.
+                        let coverage_size =
+                            layers.image_coverage(&key_range, last_record_lsn).len();
+                        if coverage_size >= min_hole_coverage_size {
+                            heap.push(Hole {
+                                key_range,
+                                coverage_size,
+                            });
+                            if heap.len() > max_holes {
+                                heap.pop(); // remove the smallest hole
+                            }
+                        }
+                    }
+                }
+                prev = Some(next_key.next());
+            }
+            let mut holes = heap.into_vec();
+            holes.sort_unstable_by_key(|hole| hole.key_range.start);
+            holes
+        };
+        stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
+        drop_rlock(guard);
+
+        if self.cancel.is_cancelled() {
+            return Err(CompactionError::ShuttingDown);
+        }
+
+        stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
+
+        // This iterator walks through all key-value pairs from all the layers
+        // we're compacting, in key, LSN order.
+        // If there's both a Value::Image and Value::WalRecord for the same (key,lsn),
+        // then the Value::Image is ordered before Value::WalRecord.
+        let mut all_values_iter = {
+            let mut deltas = Vec::with_capacity(deltas_to_compact.len());
+            for l in deltas_to_compact.iter() {
+                let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
+                deltas.push(l);
+            }
+            MergeIterator::create(&deltas, &[], ctx)
+        };
+
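Keeping only the N largest holes with a reversed-`Ord` `BinaryHeap` is a classic top-N pattern; a standalone sketch with integer coverage sizes, using std's `Reverse` wrapper in place of the custom `Ord` impl above:

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// Keep the `max_holes` largest coverage sizes seen in a stream.
/// `Reverse` turns the max-heap into a min-heap, so the smallest of the
/// retained candidates sits on top and is evicted first.
fn top_n_holes(coverage_sizes: impl Iterator<Item = usize>, max_holes: usize) -> Vec<usize> {
    let mut heap: BinaryHeap<Reverse<usize>> = BinaryHeap::with_capacity(max_holes + 1);
    for size in coverage_sizes {
        heap.push(Reverse(size));
        if heap.len() > max_holes {
            heap.pop(); // evict the current smallest
        }
    }
    heap.into_iter().map(|Reverse(s)| s).collect()
}

fn main() {
    let mut kept = top_n_holes([3, 9, 4, 7, 5].into_iter(), 3);
    kept.sort_unstable();
    assert_eq!(kept, vec![5, 7, 9]); // the three largest survive
}
```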
+        // This iterator walks through all keys and is needed to calculate the size used by each key
+        let mut all_keys_iter = all_keys
+            .iter()
+            .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size))
+            .coalesce(|mut prev, cur| {
+                // Coalesce keys that belong to the same key pair.
+                // This ensures that compaction doesn't put them
+                // into different layer files.
+                // Still limit this by the target file size,
+                // so that we keep the size of the files in
+                // check.
+                if prev.0 == cur.0 && prev.2 < target_file_size {
+                    prev.2 += cur.2;
+                    Ok(prev)
+                } else {
+                    Err((prev, cur))
+                }
+            });
+
+        // Merge the contents of all the input delta layers into a new set
+        // of delta layers, based on the current partitioning.
+        //
+        // We split the new delta layers on the key dimension. We iterate through the key space, and for each key,
+        // check if adding the next key to the current output layer we're building would cause the layer to become
+        // too large. If so, dump the current output layer and start a new one.
+        // It's possible that there is a single key with so many page versions that storing all of them in a single layer file
+        // would be too large. In that case, we also split on the LSN dimension.
+        //
+        // LSN
+        //  ^
+        //  |
+        //  | +-----------+            +--+--+--+--+
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            |  |  |  |  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+     ==>    |  |  |  |  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            |  |  |  |  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            +--+--+--+--+
+        //  |
+        //  +--------------> key
+        //
+        //
+        // If one key (X) has a lot of page versions:
+        //
+        // LSN
+        //  ^
+        //  |                               (X)
+        //  | +-----------+            +--+--+--+--+
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            |  |  +--+  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+     ==>    |  |  |  |  |
+        //  | |           |            |  |  +--+  |
+        //  | +-----------+            |  |  |  |  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            +--+--+--+--+
+        //  |
+        //  +--------------> key
+        //
+        // TODO: this actually divides the layers into fixed-size chunks, not
+        // based on the partitioning.
+        //
+        // TODO: we should also opportunistically materialize and
+        // garbage collect what we can.
+        let mut new_layers = Vec::new();
+        let mut prev_key: Option<Key> = None;
+        let mut writer: Option<DeltaLayerWriter> = None;
+        let mut key_values_total_size = 0u64;
+        let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
+        let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
+        let mut next_hole = 0; // index of the next hole in the holes vector
+
+        let mut keys = 0;
+
+        while let Some((key, lsn, value)) = all_values_iter
+            .next()
+            .await
+            .map_err(CompactionError::Other)?
+        {
+            keys += 1;
+
+            if keys % 32_768 == 0 && self.cancel.is_cancelled() {
+                // Avoid hitting the cancellation token on every key. In benches, we end up
+                // shuffling on the order of a million keys per layer; this means we'll check it
+                // around tens of times per layer.
+                return Err(CompactionError::ShuttingDown);
+            }
+
+            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
+            // We need to check key boundaries once we reach the next key or the end of the layer with the same key
+            if !same_key || lsn == dup_end_lsn {
+                let mut next_key_size = 0u64;
+                let is_dup_layer = dup_end_lsn.is_valid();
+                dup_start_lsn = Lsn::INVALID;
+                if !same_key {
+                    dup_end_lsn = Lsn::INVALID;
+                }
+                // Determine the size occupied by this key. We stop at the next key or when the size becomes larger than target_file_size.
+                for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
+                    next_key_size = next_size;
+                    if key != next_key {
+                        if dup_end_lsn.is_valid() {
+                            // We are writing a segment with duplicates:
+                            // place all remaining values of this key in a separate segment
+                            dup_start_lsn = dup_end_lsn; // new segment starts where the old one stops
+                            dup_end_lsn = lsn_range.end; // there are no more values of this key till the end of the LSN range
+                        }
+                        break;
+                    }
+                    key_values_total_size += next_size;
+                    // Check if it is time to split the segment: the total key size is larger than the target file size.
+                    // We need to avoid generation of empty segments if next_size > target_file_size.
+                    if key_values_total_size > target_file_size && lsn != next_lsn {
+                        // Split the key between multiple layers: such a layer can contain only a single key
+                        dup_start_lsn = if dup_end_lsn.is_valid() {
+                            dup_end_lsn // new segment with duplicates starts where the old one stops
+                        } else {
+                            lsn // start with the first LSN for this key
+                        };
+                        dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
+                        break;
+                    }
+                }
+                // Handle the case when the loop reaches the last key: in this case dup_end is non-zero but dup_start is not set.
+                if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
+                    dup_start_lsn = dup_end_lsn;
+                    dup_end_lsn = lsn_range.end;
+                }
+                if writer.is_some() {
+                    let written_size = writer.as_mut().unwrap().size();
+                    let contains_hole =
+                        next_hole < holes.len() && key >= holes[next_hole].key_range.end;
+                    // Check if the key causes layer overflow or contains a hole...
+                    if is_dup_layer
+                        || dup_end_lsn.is_valid()
+                        || written_size + key_values_total_size > target_file_size
+                        || contains_hole
+                    {
+                        // ... if so, flush the previous layer and prepare to write a new one
+                        let (desc, path) = writer
+                            .take()
+                            .unwrap()
+                            .finish(prev_key.unwrap().next(), ctx)
+                            .await
+                            .map_err(CompactionError::Other)?;
+                        let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
+                            .map_err(CompactionError::Other)?;
+
+                        new_layers.push(new_delta);
+                        writer = None;
+
+                        if contains_hole {
+                            // skip hole
+                            next_hole += 1;
+                        }
+                    }
+                }
+                // Remember the size of the key value, because at the next iteration we will access the next item
+                key_values_total_size = next_key_size;
+            }
+            fail_point!("delta-layer-writer-fail-before-finish", |_| {
+                Err(CompactionError::Other(anyhow::anyhow!(
+                    "failpoint delta-layer-writer-fail-before-finish"
+                )))
+            });
+
+            if !self.shard_identity.is_key_disposable(&key) {
+                if writer.is_none() {
+                    if self.cancel.is_cancelled() {
+                        // to be somewhat responsive to cancellation, check for each new layer
+                        return Err(CompactionError::ShuttingDown);
+                    }
+                    // Create the writer if it is not initialized yet
+                    writer = Some(
+                        DeltaLayerWriter::new(
+                            self.conf,
+                            self.timeline_id,
+                            self.tenant_shard_id,
+                            key,
+                            if dup_end_lsn.is_valid() {
+                                // this is a layer containing a slice of values of the same key
+                                debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
+                                dup_start_lsn..dup_end_lsn
+                            } else {
+                                debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
+                                lsn_range.clone()
+                            },
+                            ctx,
+                        )
+                        .await
+                        .map_err(CompactionError::Other)?,
+                    );
+
+                    keys = 0;
+                }
+
+                writer
+                    .as_mut()
+                    .unwrap()
+                    .put_value(key, lsn, value, ctx)
+                    .await
+                    .map_err(CompactionError::Other)?;
+            } else {
+                debug!(
+                    "Dropping key {} during compaction (it belongs on shard {:?})",
+                    key,
+                    self.shard_identity.get_shard_number(&key)
+                );
+            }
+
+            if !new_layers.is_empty() {
+                fail_point!("after-timeline-compacted-first-L1");
+            }
+
+            prev_key = Some(key);
+        }
+        if let Some(writer) = writer {
+            let (desc, path) = writer
+                .finish(prev_key.unwrap().next(), ctx)
+                .await
+                .map_err(CompactionError::Other)?;
+            let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
+                .map_err(CompactionError::Other)?;
+            new_layers.push(new_delta);
+        }
+
+        // Sync layers
+        if !new_layers.is_empty() {
+            // Print a warning if the created layer is larger than double the target size
+            // Add two pages for potential overhead. This should in theory be already
+            // accounted for in the target calculation, but for very small targets,
+            // we still might easily hit the limit otherwise.
+            let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2;
+            for layer in new_layers.iter() {
+                if layer.layer_desc().file_size > warn_limit {
+                    warn!(
+                        %layer,
+                        "created delta file of size {} larger than double the target of {target_file_size}", layer.layer_desc().file_size
+                    );
+                }
+            }
+
+            // The writer.finish() above already did the fsync of the inodes.
+            // We just need to fsync the directory in which these inodes are linked,
+            // which we know to be the timeline directory.
+            //
+            // We use fatal_err() below because after writer.finish() returns with success,
+            // the in-memory state of the filesystem already has the layer file in its final place,
+            // and subsequent pageserver code could think it's durable while it really isn't.
+            let timeline_dir = VirtualFile::open(
+                &self
+                    .conf
+                    .timeline_path(&self.tenant_shard_id, &self.timeline_id),
+                ctx,
+            )
+            .await
+            .fatal_err("VirtualFile::open for timeline dir fsync");
+            timeline_dir
+                .sync_all()
+                .await
+                .fatal_err("VirtualFile::sync_all timeline dir");
+        }
+
+        stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
+        stats.new_deltas_count = Some(new_layers.len());
+        stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum());
+
+        match TryInto::<CompactLevel0Phase1Stats>::try_into(stats)
+            .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string"))
+        {
+            Ok(stats_json) => {
+                info!(
+                    stats_json = stats_json.as_str(),
+                    "compact_level0_phase1 stats available"
+                )
+            }
+            Err(e) => {
+                warn!("compact_level0_phase1 stats failed to serialize: {:#}", e);
+            }
+        }
+
+        // Without this, rustc complains about deltas_to_compact still
+        // being borrowed when we `.into_iter()` below.
+        drop(all_values_iter);
+
+        Ok(CompactLevel0Phase1Result {
+            new_layers,
+            deltas_to_compact: deltas_to_compact
+                .into_iter()
+                .map(|x| x.drop_eviction_guard())
+                .collect::<Vec<_>>(),
+            fully_compacted,
+        })
+    }
+}
+
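The LSN-dimension split for an oversized single-key history reduces to a chunking rule: accumulate values of one key until the running size would exceed the target, then cut, but only at an LSN boundary. A sketch over (lsn, size) pairs for a single key, with hypothetical `u64` stand-ins for `Lsn` and byte counts:

```rust
/// Split one key's (lsn, size) history into chunks of roughly `target` bytes,
/// cutting only at LSN boundaries (all values sharing an LSN stay together).
fn split_key_history(values: &[(u64, u64)], target: u64) -> Vec<Vec<(u64, u64)>> {
    let mut chunks = Vec::new();
    let mut current: Vec<(u64, u64)> = Vec::new();
    let mut current_size = 0;
    for &(lsn, size) in values {
        let same_lsn = current.last().map_or(false, |&(l, _)| l == lsn);
        if !current.is_empty() && !same_lsn && current_size + size > target {
            chunks.push(std::mem::take(&mut current));
            current_size = 0;
        }
        current.push((lsn, size));
        current_size += size;
    }
    if !current.is_empty() {
        chunks.push(current);
    }
    chunks
}

fn main() {
    let history = [(0x10, 600), (0x20, 600), (0x20, 100), (0x30, 600)];
    let chunks = split_key_history(&history, 1000);
    // 0x20's two values stay together even though the chunk overflows slightly.
    assert_eq!(chunks.len(), 3);
    assert_eq!(chunks[1], vec![(0x20, 600), (0x20, 100)]);
}
```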
+    fully_compacted: bool,
+}
+
+#[derive(Default)]
+struct CompactLevel0Phase1StatsBuilder {
+    version: Option<u64>,
+    tenant_id: Option<TenantShardId>,
+    timeline_id: Option<TimelineId>,
+    read_lock_acquisition_micros: DurationRecorder,
+    read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
+    read_lock_held_key_sort_micros: DurationRecorder,
+    read_lock_held_prerequisites_micros: DurationRecorder,
+    read_lock_held_compute_holes_micros: DurationRecorder,
+    read_lock_drop_micros: DurationRecorder,
+    write_layer_files_micros: DurationRecorder,
+    level0_deltas_count: Option<usize>,
+    new_deltas_count: Option<usize>,
+    new_deltas_size: Option<u64>,
+}
+
+#[derive(serde::Serialize)]
+struct CompactLevel0Phase1Stats {
+    version: u64,
+    tenant_id: TenantShardId,
+    timeline_id: TimelineId,
+    read_lock_acquisition_micros: RecordedDuration,
+    read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
+    read_lock_held_key_sort_micros: RecordedDuration,
+    read_lock_held_prerequisites_micros: RecordedDuration,
+    read_lock_held_compute_holes_micros: RecordedDuration,
+    read_lock_drop_micros: RecordedDuration,
+    write_layer_files_micros: RecordedDuration,
+    level0_deltas_count: usize,
+    new_deltas_count: usize,
+    new_deltas_size: u64,
+}
+
+impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
+    type Error = anyhow::Error;
+
+    fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result<Self, Self::Error> {
+        Ok(Self {
+            version: value.version.ok_or_else(|| anyhow!("version not set"))?,
+            tenant_id: value
+                .tenant_id
+                .ok_or_else(|| anyhow!("tenant_id not set"))?,
+            timeline_id: value
+                .timeline_id
+                .ok_or_else(|| anyhow!("timeline_id not set"))?,
+            read_lock_acquisition_micros: value
+                .read_lock_acquisition_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?,
+            read_lock_held_spawn_blocking_startup_micros: value
+                .read_lock_held_spawn_blocking_startup_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?,
+            read_lock_held_key_sort_micros: value
+                .read_lock_held_key_sort_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?,
+            read_lock_held_prerequisites_micros: value
+                .read_lock_held_prerequisites_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?,
+            read_lock_held_compute_holes_micros: value
+                .read_lock_held_compute_holes_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?,
+            read_lock_drop_micros: value
+                .read_lock_drop_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?,
+            write_layer_files_micros: value
+                .write_layer_files_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?,
+            level0_deltas_count: value
+                .level0_deltas_count
+                .ok_or_else(|| anyhow!("level0_deltas_count not set"))?,
+            new_deltas_count: value
+                .new_deltas_count
+                .ok_or_else(|| anyhow!("new_deltas_count not set"))?,
+            new_deltas_size: value
+                .new_deltas_size
+                .ok_or_else(|| anyhow!("new_deltas_size not set"))?,
+        })
+    }
+}
+
+impl Timeline {
+    /// Entry point for new tiered compaction algorithm.
+    ///
+    /// All the real work is in the implementation in the pageserver_compaction
+    /// crate. The code here would apply to any algorithm implemented by the
+    /// same interface, but tiered is the only one at the moment.
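The stats builder above uses a common Rust shape: optional fields are filled in as the phases complete, and a TryFrom conversion turns a missing phase into an explicit error rather than a silent default. A reduced sketch of the pattern (illustrative field names):

```rust
use anyhow::anyhow;

#[derive(Default)]
struct StatsBuilder {
    deltas_count: Option<usize>,
    deltas_size: Option<u64>,
}

struct Stats {
    deltas_count: usize,
    deltas_size: u64,
}

impl TryFrom<StatsBuilder> for Stats {
    type Error = anyhow::Error;

    // Each missing Option becomes a named error instead of a silent zero.
    fn try_from(b: StatsBuilder) -> Result<Self, Self::Error> {
        Ok(Self {
            deltas_count: b.deltas_count.ok_or_else(|| anyhow!("deltas_count not set"))?,
            deltas_size: b.deltas_size.ok_or_else(|| anyhow!("deltas_size not set"))?,
        })
    }
}

fn main() {
    let b = StatsBuilder { deltas_count: Some(3), deltas_size: Some(4096) };
    let s = Stats::try_from(b).expect("all phases recorded");
    assert_eq!((s.deltas_count, s.deltas_size), (3, 4096));
}
```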
+    ///
+    /// TODO: cancellation
+    pub(crate) async fn compact_tiered(
+        self: &Arc<Self>,
+        _cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> Result<(), CompactionError> {
+        let fanout = self.get_compaction_threshold() as u64;
+        let target_file_size = self.get_checkpoint_distance();
+
+        // Find the top of the historical layers
+        let end_lsn = {
+            let guard = self.layers.read().await;
+            let layers = guard.layer_map()?;
+
+            let l0_deltas = layers.level0_deltas();
+
+            // As an optimization, if we find that there are too few L0 layers,
+            // bail out early. We know that the compaction algorithm would do
+            // nothing in that case.
+            if l0_deltas.len() < fanout as usize {
+                // doesn't need compacting
+                return Ok(());
+            }
+            l0_deltas.iter().map(|l| l.lsn_range.end).max().unwrap()
+        };
+
+        // Is the timeline being deleted?
+        if self.is_stopping() {
+            trace!("Dropping out of compaction on timeline shutdown");
+            return Err(CompactionError::ShuttingDown);
+        }
+
+        let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?;
+        // TODO(chi): ignore sparse_keyspace for now, compact it in the future.
+        let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks));
+
+        pageserver_compaction::compact_tiered::compact_tiered(
+            &mut adaptor,
+            end_lsn,
+            target_file_size,
+            fanout,
+            ctx,
+        )
+        .await
+        // TODO: compact_tiered needs to return CompactionError
+        .map_err(CompactionError::Other)?;
+
+        adaptor.flush_updates().await?;
+        Ok(())
+    }
+
+    /// Take a list of images and deltas, produce images and deltas according to GC horizon and retain_lsns.
+    ///
+    /// It takes a key, the values of the key within the compaction process, a GC horizon, and all retain_lsns below the horizon.
+    /// For now, it requires that `accumulated_values` contains the full history of the key (i.e., the key with the lowest LSN is
+    /// an image or a WAL record that does not require a base image). This restriction will be removed once we implement gc-compaction on branches.
+    ///
+    /// The function returns the deltas and the base image that need to be placed at each of the retain LSNs. For example, we have:
+    ///
+    /// A@0x10, +B@0x20, +C@0x30, +D@0x40, +E@0x50, +F@0x60
+    /// horizon = 0x50, retain_lsn = 0x20, 0x40, delta_threshold=3
+    ///
+    /// The function will produce:
+    ///
+    /// ```plain
+    /// 0x20(retain_lsn) -> img=AB@0x20 always produce a single image below the lowest retain LSN
+    /// 0x40(retain_lsn) -> deltas=[+C@0x30, +D@0x40] two deltas since the last base image, keeping the deltas
+    /// 0x50(horizon) -> deltas=[ABCDE@0x50] three deltas since the last base image, generate an image but put it in the delta
+    /// above_horizon -> deltas=[+F@0x60] full history above the horizon
+    /// ```
+    ///
+    /// Note that `accumulated_values` must be sorted by LSN and should belong to a single key.
+    pub(crate) async fn generate_key_retention(
+        self: &Arc<Self>,
+        key: Key,
+        full_history: &[(Key, Lsn, Value)],
+        horizon: Lsn,
+        retain_lsn_below_horizon: &[Lsn],
+        delta_threshold_cnt: usize,
+        base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
+    ) -> anyhow::Result<KeyHistoryRetention> {
+        // Pre-checks for the invariants
+        if cfg!(debug_assertions) {
+            for (log_key, _, _) in full_history {
+                assert_eq!(log_key, &key, "mismatched key");
+            }
+            for i in 1..full_history.len() {
+                assert!(full_history[i - 1].1 <= full_history[i].1, "unordered LSN");
+                if full_history[i - 1].1 == full_history[i].1 {
+                    assert!(
+                        matches!(full_history[i - 1].2, Value::Image(_)),
+                        "unordered delta/image, or duplicated delta"
+                    );
+                }
+            }
+            // There was an assertion for no base image that checks if the first
+            // record in the history is `will_init` before, but it was removed.
+            // This is explained in the test cases for generate_key_retention.
+            // Search "incomplete history" for more information.
+            for lsn in retain_lsn_below_horizon {
+                assert!(lsn < &horizon, "retain lsn must be below horizon")
+            }
+            for i in 1..retain_lsn_below_horizon.len() {
+                assert!(
+                    retain_lsn_below_horizon[i - 1] <= retain_lsn_below_horizon[i],
+                    "unordered LSN"
+                );
+            }
+        }
+        let has_ancestor = base_img_from_ancestor.is_some();
+        // Step 1: split history into len(retain_lsn_below_horizon) + 2 buckets, where the last bucket is for all deltas above the horizon,
+        // and the second-to-last bucket is for the horizon. Each bucket contains lsn_last_bucket < deltas <= lsn_this_bucket.
+        let (mut split_history, lsn_split_points) = {
+            let mut split_history = Vec::new();
+            split_history.resize_with(retain_lsn_below_horizon.len() + 2, Vec::new);
+            let mut lsn_split_points = Vec::with_capacity(retain_lsn_below_horizon.len() + 1);
+            for lsn in retain_lsn_below_horizon {
+                lsn_split_points.push(*lsn);
+            }
+            lsn_split_points.push(horizon);
+            let mut current_idx = 0;
+            for item @ (_, lsn, _) in full_history {
+                while current_idx < lsn_split_points.len() && *lsn > lsn_split_points[current_idx] {
+                    current_idx += 1;
+                }
+                split_history[current_idx].push(item);
+            }
+            (split_history, lsn_split_points)
+        };
+        // Step 2: filter out duplicated records due to the k-merge of image/delta layers
+        for split_for_lsn in &mut split_history {
+            let mut prev_lsn = None;
+            let mut new_split_for_lsn = Vec::with_capacity(split_for_lsn.len());
+            for record @ (_, lsn, _) in std::mem::take(split_for_lsn) {
+                if let Some(prev_lsn) = &prev_lsn {
+                    if *prev_lsn == lsn {
+                        // The case that we have an LSN with both data from the delta layer and the image layer. As
+                        // `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply
+                        // drop this delta and keep the image.
+                        //
+                        // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will
+                        // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply
+                        // dropped.
+                        //
+                        // TODO: in case we have both delta + images for a given LSN and it does not exceed the delta
+                        // threshold, we could have kept delta instead to save space. This is an optimization for the future.
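Step 1 above is a linear bucketing pass: because full_history is LSN-sorted, a single cursor over the split points suffices. A self-contained sketch with u64 LSNs and char payloads standing in for Lsn and Value:

```rust
// Partition an LSN-sorted history into len(split_points) + 1 buckets:
// bucket i holds records with split_points[i-1] < lsn <= split_points[i],
// and the final bucket holds everything above the last split point (the horizon).
fn split_by_lsn(history: &[(u64, char)], split_points: &[u64]) -> Vec<Vec<(u64, char)>> {
    let mut buckets = vec![Vec::new(); split_points.len() + 1];
    let mut idx = 0;
    for &(lsn, val) in history {
        while idx < split_points.len() && lsn > split_points[idx] {
            idx += 1; // advance the cursor; history is sorted, so idx never moves back
        }
        buckets[idx].push((lsn, val));
    }
    buckets
}

fn main() {
    // Mirrors the doc example: retain_lsns = [0x20, 0x40], horizon = 0x50.
    let history = [(0x10, 'A'), (0x20, 'B'), (0x30, 'C'), (0x40, 'D'), (0x50, 'E'), (0x60, 'F')];
    let buckets = split_by_lsn(&history, &[0x20, 0x40, 0x50]);
    assert_eq!(buckets.len(), 4);
    assert_eq!(buckets[0], vec![(0x10, 'A'), (0x20, 'B')]); // lowest retain_lsn
    assert_eq!(buckets[3], vec![(0x60, 'F')]); // above the horizon
}
```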
+ continue; + } + } + prev_lsn = Some(lsn); + new_split_for_lsn.push(record); + } + *split_for_lsn = new_split_for_lsn; + } + // Step 3: generate images when necessary + let mut retention = Vec::with_capacity(split_history.len()); + let mut records_since_last_image = 0; + let batch_cnt = split_history.len(); + assert!( + batch_cnt >= 2, + "should have at least below + above horizon batches" + ); + let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new(); + if let Some((key, lsn, img)) = base_img_from_ancestor { + replay_history.push((key, lsn, Value::Image(img))); + } + + /// Generate debug information for the replay history + fn generate_history_trace(replay_history: &[(Key, Lsn, Value)]) -> String { + use std::fmt::Write; + let mut output = String::new(); + if let Some((key, _, _)) = replay_history.first() { + write!(output, "key={} ", key).unwrap(); + let mut cnt = 0; + for (_, lsn, val) in replay_history { + if val.is_image() { + write!(output, "i@{} ", lsn).unwrap(); + } else if val.will_init() { + write!(output, "di@{} ", lsn).unwrap(); + } else { + write!(output, "d@{} ", lsn).unwrap(); + } + cnt += 1; + if cnt >= 128 { + write!(output, "... and more").unwrap(); + break; + } + } + } else { + write!(output, "").unwrap(); + } + output + } + + fn generate_debug_trace( + replay_history: Option<&[(Key, Lsn, Value)]>, + full_history: &[(Key, Lsn, Value)], + lsns: &[Lsn], + horizon: Lsn, + ) -> String { + use std::fmt::Write; + let mut output = String::new(); + if let Some(replay_history) = replay_history { + writeln!( + output, + "replay_history: {}", + generate_history_trace(replay_history) + ) + .unwrap(); + } else { + writeln!(output, "replay_history: ",).unwrap(); + } + writeln!( + output, + "full_history: {}", + generate_history_trace(full_history) + ) + .unwrap(); + writeln!( + output, + "when processing: [{}] horizon={}", + lsns.iter().map(|l| format!("{l}")).join(","), + horizon + ) + .unwrap(); + output + } + + for (i, split_for_lsn) in split_history.into_iter().enumerate() { + // TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly. 
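The loop that follows decides, per LSN batch, whether to materialize an image or keep deltas. Extracted as a standalone predicate (hypothetical name, mirroring the branch structure below):

```rust
// Materialize an image for the lowest batch (unless an ancestor image already
// serves as the base), never for the batch above the horizon, and otherwise
// only once enough delta records have piled up since the last image.
fn should_generate_image(
    batch_idx: usize,
    batch_cnt: usize,
    has_ancestor: bool,
    records_since_last_image: usize,
    delta_threshold_cnt: usize,
) -> bool {
    if batch_idx == 0 && !has_ancestor {
        true
    } else if batch_idx == batch_cnt - 1 {
        false // the batch above the horizon keeps its full history
    } else {
        records_since_last_image >= delta_threshold_cnt
    }
}

fn main() {
    assert!(should_generate_image(0, 4, false, 1, 3)); // lowest retain_lsn
    assert!(!should_generate_image(3, 4, false, 10, 3)); // above horizon
    assert!(should_generate_image(2, 4, false, 3, 3)); // threshold reached
}
```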
+ records_since_last_image += split_for_lsn.len(); + let generate_image = if i == 0 && !has_ancestor { + // We always generate images for the first batch (below horizon / lowest retain_lsn) + true + } else if i == batch_cnt - 1 { + // Do not generate images for the last batch (above horizon) + false + } else if records_since_last_image >= delta_threshold_cnt { + // Generate images when there are too many records + true + } else { + false + }; + replay_history.extend(split_for_lsn.iter().map(|x| (*x).clone())); + // Only retain the items after the last image record + for idx in (0..replay_history.len()).rev() { + if replay_history[idx].2.will_init() { + replay_history = replay_history[idx..].to_vec(); + break; + } + } + if let Some((_, _, val)) = replay_history.first() { + if !val.will_init() { + return Err(anyhow::anyhow!("invalid history, no base image")).with_context( + || { + generate_debug_trace( + Some(&replay_history), + full_history, + retain_lsn_below_horizon, + horizon, + ) + }, + ); + } + } + if generate_image && records_since_last_image > 0 { + records_since_last_image = 0; + let replay_history_for_debug = if cfg!(debug_assertions) { + Some(replay_history.clone()) + } else { + None + }; + let replay_history_for_debug_ref = replay_history_for_debug.as_deref(); + let history = std::mem::take(&mut replay_history); + let mut img = None; + let mut records = Vec::with_capacity(history.len()); + if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() { + img = Some((*lsn, val.clone())); + for (_, lsn, val) in history.into_iter().skip(1) { + let Value::WalRecord(rec) = val else { + return Err(anyhow::anyhow!( + "invalid record, first record is image, expect walrecords" + )) + .with_context(|| { + generate_debug_trace( + replay_history_for_debug_ref, + full_history, + retain_lsn_below_horizon, + horizon, + ) + }); + }; + records.push((lsn, rec)); + } + } else { + for (_, lsn, val) in history.into_iter() { + let Value::WalRecord(rec) = val else { + return Err(anyhow::anyhow!("invalid record, first record is walrecord, expect rest are walrecord")) + .with_context(|| generate_debug_trace( + replay_history_for_debug_ref, + full_history, + retain_lsn_below_horizon, + horizon, + )); + }; + records.push((lsn, rec)); + } + } + records.reverse(); + let state = ValueReconstructState { img, records }; + let request_lsn = lsn_split_points[i]; // last batch does not generate image so i is always in range + let img = self.reconstruct_value(key, request_lsn, state).await?; + replay_history.push((key, request_lsn, Value::Image(img.clone()))); + retention.push(vec![(request_lsn, Value::Image(img))]); + } else { + let deltas = split_for_lsn + .iter() + .map(|(_, lsn, value)| (*lsn, value.clone())) + .collect_vec(); + retention.push(deltas); + } + } + let mut result = Vec::with_capacity(retention.len()); + assert_eq!(retention.len(), lsn_split_points.len() + 1); + for (idx, logs) in retention.into_iter().enumerate() { + if idx == lsn_split_points.len() { + return Ok(KeyHistoryRetention { + below_horizon: result, + above_horizon: KeyLogAtLsn(logs), + }); + } else { + result.push((lsn_split_points[idx], KeyLogAtLsn(logs))); + } + } + unreachable!("key retention is empty") + } + + /// An experimental compaction building block that combines compaction with garbage collection. + /// + /// The current implementation picks all delta + image layers that are below or intersecting with + /// the GC horizon without considering retain_lsns. 
Then, it does a full compaction over all these delta
+    /// layers and image layers, which generates image layers on the gc horizon, drops deltas below the gc horizon,
+    /// and creates delta layers with all deltas >= gc horizon.
+    pub(crate) async fn compact_with_gc(
+        self: &Arc<Self>,
+        cancel: &CancellationToken,
+        flags: EnumSet<CompactFlags>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        use std::collections::BTreeSet;
+
+        // Block other compaction/GC tasks from running for now. GC-compaction could run along
+        // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
+        // Note that we already acquired the compaction lock when the outer `compact` function gets called.
+
+        let gc_lock = async {
+            tokio::select! {
+                guard = self.gc_lock.lock() => Ok(guard),
+                // TODO: refactor to CompactionError to correctly pass cancelled error
+                _ = cancel.cancelled() => Err(anyhow!("cancelled")),
+            }
+        };
+
+        let gc_lock = crate::timed(
+            gc_lock,
+            "acquires gc lock",
+            std::time::Duration::from_secs(5),
+        )
+        .await?;
+
+        let dry_run = flags.contains(CompactFlags::DryRun);
+
+        info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
+
+        scopeguard::defer! {
+            info!("done enhanced gc bottom-most compaction");
+        };
+
+        let mut stat = CompactionStatistics::default();
+
+        // Step 0: pick all delta layers + image layers below or intersecting with the GC horizon.
+        // The layer selection has the following properties:
+        // 1. If a layer is in the selection, all layers below it are in the selection.
+        // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
+        let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = {
+            let guard = self.layers.read().await;
+            let layers = guard.layer_map()?;
+            let gc_info = self.gc_info.read().unwrap();
+            let mut retain_lsns_below_horizon = Vec::new();
+            let gc_cutoff = gc_info.cutoffs.select_min();
+            for (lsn, _timeline_id) in &gc_info.retain_lsns {
+                if lsn < &gc_cutoff {
+                    retain_lsns_below_horizon.push(*lsn);
+                }
+            }
+            for lsn in gc_info.leases.keys() {
+                if lsn < &gc_cutoff {
+                    retain_lsns_below_horizon.push(*lsn);
+                }
+            }
+            let mut selected_layers = Vec::new();
+            drop(gc_info);
+            // Pick all the layers that intersect or are below the gc_cutoff, and get the largest LSN in the selected layers.
+            let Some(max_layer_lsn) = layers
+                .iter_historic_layers()
+                .filter(|desc| desc.get_lsn_range().start <= gc_cutoff)
+                .map(|desc| desc.get_lsn_range().end)
+                .max()
+            else {
+                info!("no layers to compact with gc");
+                return Ok(());
+            };
+            // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
+            // layers to compact.
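The gc_lock acquisition above races the lock future against the cancellation token inside tokio::select!, so shutdown is not blocked behind a long GC pass. As a generic, self-contained sketch of the same pattern:

```rust
use tokio::sync::{Mutex, MutexGuard};
use tokio_util::sync::CancellationToken;

// Race lock acquisition against cancellation: whichever completes first wins,
// so a shutdown request is never stuck waiting for a long-held lock.
async fn lock_or_cancel<'a, T>(
    lock: &'a Mutex<T>,
    cancel: &CancellationToken,
) -> anyhow::Result<MutexGuard<'a, T>> {
    tokio::select! {
        guard = lock.lock() => Ok(guard),
        _ = cancel.cancelled() => Err(anyhow::anyhow!("cancelled")),
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let lock = Mutex::new(0u32);
    let cancel = CancellationToken::new();
    let guard = lock_or_cancel(&lock, &cancel).await?;
    assert_eq!(*guard, 0);
    Ok(())
}
```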
+            for desc in layers.iter_historic_layers() {
+                if desc.get_lsn_range().end <= max_layer_lsn {
+                    selected_layers.push(guard.get_from_desc(&desc));
+                }
+            }
+            if selected_layers.is_empty() {
+                info!("no layers to compact with gc");
+                return Ok(());
+            }
+            retain_lsns_below_horizon.sort();
+            (selected_layers, gc_cutoff, retain_lsns_below_horizon)
+        };
+        let lowest_retain_lsn = if self.ancestor_timeline.is_some() {
+            Lsn(self.ancestor_lsn.0 + 1)
+        } else {
+            let res = retain_lsns_below_horizon
+                .first()
+                .copied()
+                .unwrap_or(gc_cutoff);
+            if cfg!(debug_assertions) {
+                assert_eq!(
+                    res,
+                    retain_lsns_below_horizon
+                        .iter()
+                        .min()
+                        .copied()
+                        .unwrap_or(gc_cutoff)
+                );
+            }
+            res
+        };
+        info!(
+            "picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}",
+            layer_selection.len(),
+            gc_cutoff,
+            lowest_retain_lsn
+        );
+        // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
+        // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
+        let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
+        for layer in &layer_selection {
+            let desc = layer.layer_desc();
+            if desc.is_delta() {
+                // ignore single-key layer files
+                if desc.key_range.start.next() != desc.key_range.end {
+                    let lsn_range = &desc.lsn_range;
+                    lsn_split_point.insert(lsn_range.start);
+                    lsn_split_point.insert(lsn_range.end);
+                }
+                stat.visit_delta_layer(desc.file_size());
+            } else {
+                stat.visit_image_layer(desc.file_size());
+            }
+        }
+        for layer in &layer_selection {
+            let desc = layer.layer_desc();
+            let key_range = &desc.key_range;
+            if desc.is_delta() && key_range.start.next() != key_range.end {
+                let lsn_range = desc.lsn_range.clone();
+                let intersects = lsn_split_point.range(lsn_range).collect_vec();
+                if intersects.len() > 1 {
+                    bail!(
+                        "cannot run gc-compaction because it violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
+                        desc.key(),
+                        intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
+                    );
+                }
+            }
+        }
+        // The maximum LSN we are processing in this compaction loop
+        let end_lsn = layer_selection
+            .iter()
+            .map(|l| l.layer_desc().lsn_range.end)
+            .max()
+            .unwrap();
+        // We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized
+        // as an L0 layer.
+        let hack_end_key = Key::NON_L0_MAX;
+        let mut delta_layers = Vec::new();
+        let mut image_layers = Vec::new();
+        let mut downloaded_layers = Vec::new();
+        for layer in &layer_selection {
+            let resident_layer = layer.download_and_keep_resident().await?;
+            downloaded_layers.push(resident_layer);
+        }
+        for resident_layer in &downloaded_layers {
+            if resident_layer.layer_desc().is_delta() {
+                let layer = resident_layer.get_as_delta(ctx).await?;
+                delta_layers.push(layer);
+            } else {
+                let layer = resident_layer.get_as_image(ctx).await?;
+                image_layers.push(layer);
+            }
+        }
+        let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx);
+        // Step 2: Produce images+deltas. TODO: ensure the newly-produced deltas do not overlap with other deltas.
+        // Data of the same key.
+        let mut accumulated_values = Vec::new();
+        let mut last_key: Option<Key> = None;
+
+        // Only create image layers when there are no ancestor branches. TODO: create a covering image layer
+        // when certain conditions are met.
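The split-point verification above relies on BTreeSet::range: a multi-key delta layer whose LSN range contains more than one split point (its own start counts as one) would make a horizontal cut of the layer map impossible. A minimal sketch of that check:

```rust
use std::collections::BTreeSet;
use std::ops::Range;

// A multi-key delta layer may touch at most one LSN split point (its own
// start); if its half-open LSN range contains more, the layer map cannot be
// split by horizontal lines and gc-compaction must bail out.
fn violates_split_assumption(split_points: &BTreeSet<u64>, lsn_range: Range<u64>) -> bool {
    split_points.range(lsn_range).count() > 1
}

fn main() {
    let splits: BTreeSet<u64> = [0x10, 0x20, 0x30].into_iter().collect();
    assert!(!violates_split_assumption(&splits, 0x10..0x20)); // only its own start
    assert!(violates_split_assumption(&splits, 0x10..0x30)); // straddles 0x20
}
```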
+        let mut image_layer_writer = if self.ancestor_timeline.is_none() {
+            Some(
+                SplitImageLayerWriter::new(
+                    self.conf,
+                    self.timeline_id,
+                    self.tenant_shard_id,
+                    Key::MIN,
+                    lowest_retain_lsn,
+                    self.get_compaction_target_size(),
+                    ctx,
+                )
+                .await?,
+            )
+        } else {
+            None
+        };
+
+        let mut delta_layer_writer = SplitDeltaLayerWriter::new(
+            self.conf,
+            self.timeline_id,
+            self.tenant_shard_id,
+            Key::MIN,
+            lowest_retain_lsn..end_lsn,
+            self.get_compaction_target_size(),
+            ctx,
+        )
+        .await?;
+
+        /// Returns None if there is no ancestor branch. Throws an error when the key is not found.
+        ///
+        /// Currently, we always get the ancestor image for each key in the child branch, no matter whether the image
+        /// is needed for reconstruction. This should be fixed in the future.
+        ///
+        /// Furthermore, we should do a vectored get instead of a single get, or better, use k-merge for ancestor
+        /// images.
+        async fn get_ancestor_image(
+            tline: &Arc<Timeline>,
+            key: Key,
+            ctx: &RequestContext,
+        ) -> anyhow::Result<Option<(Key, Lsn, Bytes)>> {
+            if tline.ancestor_timeline.is_none() {
+                return Ok(None);
+            };
+            // This function is implemented as a get of the current timeline at ancestor LSN, therefore reusing
+            // as much existing code as possible.
+            let img = tline.get(key, tline.ancestor_lsn, ctx).await?;
+            Ok(Some((key, tline.ancestor_lsn, img)))
+        }
+
+        // Actually, we can decide not to write to the image layer at all at this point because
+        // the key and LSN range are determined. However, to keep things simple here, we still
+        // create this writer, and discard the writer in the end.
+
+        while let Some((key, lsn, val)) = merge_iter.next().await? {
+            if cancel.is_cancelled() {
+                return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass the cancel error
+            }
+            match val {
+                Value::Image(_) => stat.visit_image_key(&val),
+                Value::WalRecord(_) => stat.visit_wal_key(&val),
+            }
+            if last_key.is_none() || last_key.as_ref() == Some(&key) {
+                if last_key.is_none() {
+                    last_key = Some(key);
+                }
+                accumulated_values.push((key, lsn, val));
+            } else {
+                let last_key = last_key.as_mut().unwrap();
+                stat.on_unique_key_visited();
+                let retention = self
+                    .generate_key_retention(
+                        *last_key,
+                        &accumulated_values,
+                        gc_cutoff,
+                        &retain_lsns_below_horizon,
+                        COMPACTION_DELTA_THRESHOLD,
+                        get_ancestor_image(self, *last_key, ctx).await?,
+                    )
+                    .await?;
+                // Put the image into the image layer. Currently we have a single big layer for the compaction.
+                retention
+                    .pipe_to(
+                        *last_key,
+                        self,
+                        &mut delta_layer_writer,
+                        image_layer_writer.as_mut(),
+                        &mut stat,
+                        dry_run,
+                        ctx,
+                    )
+                    .await?;
+                accumulated_values.clear();
+                *last_key = key;
+                accumulated_values.push((key, lsn, val));
+            }
+        }
+
+        let last_key = last_key.expect("no keys produced during compaction");
+        // TODO: move this part to the loop body
+        stat.on_unique_key_visited();
+        let retention = self
+            .generate_key_retention(
+                last_key,
+                &accumulated_values,
+                gc_cutoff,
+                &retain_lsns_below_horizon,
+                COMPACTION_DELTA_THRESHOLD,
+                get_ancestor_image(self, last_key, ctx).await?,
+            )
+            .await?;
+        // Put the image into the image layer. Currently we have a single big layer for the compaction.
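The merge loop above groups a key-sorted stream into per-key batches, flushing when the key changes and once more after the loop for the final key. The same control flow in a self-contained form (u32 keys and u64 LSNs standing in for Key and Lsn):

```rust
// Accumulate values per key from a key-sorted stream; hand each complete
// batch to `flush`, mirroring the generate_key_retention call sites above.
fn for_each_key_group(sorted: &[(u32, u64)], mut flush: impl FnMut(u32, &[(u32, u64)])) {
    let mut acc: Vec<(u32, u64)> = Vec::new();
    let mut last_key: Option<u32> = None;
    for &(key, lsn) in sorted {
        if last_key.is_some() && last_key != Some(key) {
            flush(last_key.unwrap(), &acc); // key changed: hand off the batch
            acc.clear();
        }
        last_key = Some(key);
        acc.push((key, lsn));
    }
    if let Some(key) = last_key {
        flush(key, &acc); // mirrors the post-loop flush for the last key
    }
}

fn main() {
    let mut groups = Vec::new();
    for_each_key_group(&[(1, 10), (1, 20), (2, 10)], |key, vals| {
        groups.push((key, vals.len()));
    });
    assert_eq!(groups, vec![(1, 2), (2, 1)]);
}
```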
+ retention + .pipe_to( + last_key, + self, + &mut delta_layer_writer, + image_layer_writer.as_mut(), + &mut stat, + dry_run, + ctx, + ) + .await?; + + let discard = |key: &PersistentLayerKey| { + let key = key.clone(); + async move { KeyHistoryRetention::discard_key(&key, self, dry_run).await } + }; + + let produced_image_layers = if let Some(writer) = image_layer_writer { + if !dry_run { + writer + .finish_with_discard_fn(self, ctx, hack_end_key, discard) + .await? + } else { + let (layers, _) = writer.take()?; + assert!(layers.is_empty(), "image layers produced in dry run mode?"); + Vec::new() + } + } else { + Vec::new() + }; + + let produced_delta_layers = if !dry_run { + delta_layer_writer + .finish_with_discard_fn(self, ctx, hack_end_key, discard) + .await? + } else { + let (layers, _) = delta_layer_writer.take()?; + assert!(layers.is_empty(), "delta layers produced in dry run mode?"); + Vec::new() + }; + + let mut compact_to = Vec::new(); + let mut keep_layers = HashSet::new(); + let produced_delta_layers_len = produced_delta_layers.len(); + let produced_image_layers_len = produced_image_layers.len(); + for action in produced_delta_layers { + match action { + SplitWriterResult::Produced(layer) => { + stat.produce_delta_layer(layer.layer_desc().file_size()); + compact_to.push(layer); + } + SplitWriterResult::Discarded(l) => { + keep_layers.insert(l); + stat.discard_delta_layer(); + } + } + } + for action in produced_image_layers { + match action { + SplitWriterResult::Produced(layer) => { + stat.produce_image_layer(layer.layer_desc().file_size()); + compact_to.push(layer); + } + SplitWriterResult::Discarded(l) => { + keep_layers.insert(l); + stat.discard_image_layer(); + } + } + } + let mut layer_selection = layer_selection; + layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); + + info!( + "gc-compaction statistics: {}", + serde_json::to_string(&stat)? + ); + + if dry_run { + return Ok(()); + } + + info!( + "produced {} delta layers and {} image layers, {} layers are kept", + produced_delta_layers_len, + produced_image_layers_len, + layer_selection.len() + ); + + // Step 3: Place back to the layer map. + { + let mut guard = self.layers.write().await; + guard + .open_mut()? 
+                .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
+        };
+        self.remote_client
+            .schedule_compaction_update(&layer_selection, &compact_to)?;
+
+        drop(gc_lock);
+
+        Ok(())
+    }
+}
+
+struct TimelineAdaptor {
+    timeline: Arc<Timeline>,
+
+    keyspace: (Lsn, KeySpace),
+
+    new_deltas: Vec<ResidentLayer>,
+    new_images: Vec<ResidentLayer>,
+    layers_to_delete: Vec<Arc<PersistentLayerDesc>>,
+}
+
+impl TimelineAdaptor {
+    pub fn new(timeline: &Arc<Timeline>, keyspace: (Lsn, KeySpace)) -> Self {
+        Self {
+            timeline: timeline.clone(),
+            keyspace,
+            new_images: Vec::new(),
+            new_deltas: Vec::new(),
+            layers_to_delete: Vec::new(),
+        }
+    }
+
+    pub async fn flush_updates(&mut self) -> Result<(), CompactionError> {
+        let layers_to_delete = {
+            let guard = self.timeline.layers.read().await;
+            self.layers_to_delete
+                .iter()
+                .map(|x| guard.get_from_desc(x))
+                .collect::<Vec<_>>()
+        };
+        self.timeline
+            .finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete)
+            .await?;
+
+        self.timeline
+            .upload_new_image_layers(std::mem::take(&mut self.new_images))?;
+
+        self.new_deltas.clear();
+        self.layers_to_delete.clear();
+        Ok(())
+    }
+}
+
+#[derive(Clone)]
+struct ResidentDeltaLayer(ResidentLayer);
+#[derive(Clone)]
+struct ResidentImageLayer(ResidentLayer);
+
+impl CompactionJobExecutor for TimelineAdaptor {
+    type Key = crate::repository::Key;
+
+    type Layer = OwnArc<PersistentLayerDesc>;
+    type DeltaLayer = ResidentDeltaLayer;
+    type ImageLayer = ResidentImageLayer;
+
+    type RequestContext = crate::context::RequestContext;
+
+    fn get_shard_identity(&self) -> &ShardIdentity {
+        self.timeline.get_shard_identity()
+    }
+
+    async fn get_layers(
+        &mut self,
+        key_range: &Range<Key>,
+        lsn_range: &Range<Lsn>,
+        _ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<OwnArc<PersistentLayerDesc>>> {
+        self.flush_updates().await?;
+
+        let guard = self.timeline.layers.read().await;
+        let layer_map = guard.layer_map()?;
+
+        let result = layer_map
+            .iter_historic_layers()
+            .filter(|l| {
+                overlaps_with(&l.lsn_range, lsn_range) && overlaps_with(&l.key_range, key_range)
+            })
+            .map(OwnArc)
+            .collect();
+        Ok(result)
+    }
+
+    async fn get_keyspace(
+        &mut self,
+        key_range: &Range<Key>,
+        lsn: Lsn,
+        _ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<Range<Key>>> {
+        if lsn == self.keyspace.0 {
+            Ok(pageserver_compaction::helpers::intersect_keyspace(
+                &self.keyspace.1.ranges,
+                key_range,
+            ))
+        } else {
+            // The current compaction implementation only ever requests the key space
+            // at the compaction end LSN.
+            anyhow::bail!("keyspace not available for requested lsn");
+        }
+    }
+
+    async fn downcast_delta_layer(
+        &self,
+        layer: &OwnArc<PersistentLayerDesc>,
+    ) -> anyhow::Result<Option<ResidentDeltaLayer>> {
+        // this is a lot more complex than a simple downcast...
+        if layer.is_delta() {
+            let l = {
+                let guard = self.timeline.layers.read().await;
+                guard.get_from_desc(layer)
+            };
+            let result = l.download_and_keep_resident().await?;
+
+            Ok(Some(ResidentDeltaLayer(result)))
+        } else {
+            Ok(None)
+        }
+    }
+
+    async fn create_image(
+        &mut self,
+        lsn: Lsn,
+        key_range: &Range<Key>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        Ok(self.create_image_impl(lsn, key_range, ctx).await?)
+    }
+
+    async fn create_delta(
+        &mut self,
+        lsn_range: &Range<Lsn>,
+        key_range: &Range<Key>,
+        input_layers: &[ResidentDeltaLayer],
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
+
+        let mut all_entries = Vec::new();
+        for dl in input_layers.iter() {
+            all_entries.extend(dl.load_keys(ctx).await?);
+        }
+
+        // The current stdlib sorting implementation is designed in a way where it is
+        // particularly fast where the slice is made up of sorted sub-ranges.
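The create_delta loop below relies on the (key, lsn) sort order to drop exact duplicates, which can appear when an interrupted compaction is re-run over the same range. The skip logic in isolation (plain tuples instead of DeltaEntry):

```rust
// Collapse repeated (key, lsn) pairs in a sorted slice to a single occurrence,
// the same prev-comparison trick the loop below uses while writing values.
fn dedup_sorted(entries: &[(u32, u64)]) -> Vec<(u32, u64)> {
    let mut prev: Option<(u32, u64)> = None;
    let mut out = Vec::with_capacity(entries.len());
    for &e in entries {
        if prev == Some(e) {
            continue; // duplicate from a re-run compaction: skip it
        }
        out.push(e);
        prev = Some(e);
    }
    out
}

fn main() {
    let entries = [(1, 10), (1, 10), (1, 20), (2, 10)];
    assert_eq!(dedup_sorted(&entries), vec![(1, 10), (1, 20), (2, 10)]);
}
```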
+ all_entries.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); + + let mut writer = DeltaLayerWriter::new( + self.timeline.conf, + self.timeline.timeline_id, + self.timeline.tenant_shard_id, + key_range.start, + lsn_range.clone(), + ctx, + ) + .await?; + + let mut dup_values = 0; + + // This iterator walks through all key-value pairs from all the layers + // we're compacting, in key, LSN order. + let mut prev: Option<(Key, Lsn)> = None; + for &DeltaEntry { + key, lsn, ref val, .. + } in all_entries.iter() + { + if prev == Some((key, lsn)) { + // This is a duplicate. Skip it. + // + // It can happen if compaction is interrupted after writing some + // layers but not all, and we are compacting the range again. + // The calculations in the algorithm assume that there are no + // duplicates, so the math on targeted file size is likely off, + // and we will create smaller files than expected. + dup_values += 1; + continue; + } + + let value = val.load(ctx).await?; + + writer.put_value(key, lsn, value, ctx).await?; + + prev = Some((key, lsn)); + } + + if dup_values > 0 { + warn!("delta layer created with {} duplicate values", dup_values); + } + + fail_point!("delta-layer-writer-fail-before-finish", |_| { + Err(anyhow::anyhow!( + "failpoint delta-layer-writer-fail-before-finish" + )) + }); + + let (desc, path) = writer.finish(prev.unwrap().0.next(), ctx).await?; + let new_delta_layer = + Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; + + self.new_deltas.push(new_delta_layer); + Ok(()) + } + + async fn delete_layer( + &mut self, + layer: &OwnArc, + _ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.layers_to_delete.push(layer.clone().0); + Ok(()) + } +} + +impl TimelineAdaptor { + async fn create_image_impl( + &mut self, + lsn: Lsn, + key_range: &Range, + ctx: &RequestContext, + ) -> Result<(), CreateImageLayersError> { + let timer = self.timeline.metrics.create_images_time_histo.start_timer(); + + let image_layer_writer = ImageLayerWriter::new( + self.timeline.conf, + self.timeline.timeline_id, + self.timeline.tenant_shard_id, + key_range, + lsn, + ctx, + ) + .await?; + + fail_point!("image-layer-writer-fail-before-finish", |_| { + Err(CreateImageLayersError::Other(anyhow::anyhow!( + "failpoint image-layer-writer-fail-before-finish" + ))) + }); + + let keyspace = KeySpace { + ranges: self.get_keyspace(key_range, lsn, ctx).await?, + }; + // TODO set proper (stateful) start. 
The create_image_layer_for_rel_blocks function mostly
+        let start = Key::MIN;
+        let ImageLayerCreationOutcome {
+            image,
+            next_start_key: _,
+        } = self
+            .timeline
+            .create_image_layer_for_rel_blocks(
+                &keyspace,
+                image_layer_writer,
+                lsn,
+                ctx,
+                key_range.clone(),
+                start,
+            )
+            .await?;
+
+        if let Some(image_layer) = image {
+            self.new_images.push(image_layer);
+        }
+
+        timer.stop_and_record();
+
+        Ok(())
+    }
+}
+
+impl CompactionRequestContext for crate::context::RequestContext {}
+
+#[derive(Debug, Clone)]
+pub struct OwnArc<T>(pub Arc<T>);
+
+impl<T> Deref for OwnArc<T> {
+    type Target = <Arc<T> as Deref>::Target;
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl<T> AsRef<T> for OwnArc<T> {
+    fn as_ref(&self) -> &T {
+        self.0.as_ref()
+    }
+}
+
+impl CompactionLayer<Key> for OwnArc<PersistentLayerDesc> {
+    fn key_range(&self) -> &Range<Key> {
+        &self.key_range
+    }
+    fn lsn_range(&self) -> &Range<Lsn> {
+        &self.lsn_range
+    }
+    fn file_size(&self) -> u64 {
+        self.file_size
+    }
+    fn short_id(&self) -> std::string::String {
+        self.as_ref().short_id().to_string()
+    }
+    fn is_delta(&self) -> bool {
+        self.as_ref().is_delta()
+    }
+}
+
+impl CompactionLayer<Key> for OwnArc<DeltaLayer> {
+    fn key_range(&self) -> &Range<Key> {
+        &self.layer_desc().key_range
+    }
+    fn lsn_range(&self) -> &Range<Lsn> {
+        &self.layer_desc().lsn_range
+    }
+    fn file_size(&self) -> u64 {
+        self.layer_desc().file_size
+    }
+    fn short_id(&self) -> std::string::String {
+        self.layer_desc().short_id().to_string()
+    }
+    fn is_delta(&self) -> bool {
+        true
+    }
+}
+
+use crate::tenant::timeline::DeltaEntry;
+
+impl CompactionLayer<Key> for ResidentDeltaLayer {
+    fn key_range(&self) -> &Range<Key> {
+        &self.0.layer_desc().key_range
+    }
+    fn lsn_range(&self) -> &Range<Lsn> {
+        &self.0.layer_desc().lsn_range
+    }
+    fn file_size(&self) -> u64 {
+        self.0.layer_desc().file_size
+    }
+    fn short_id(&self) -> std::string::String {
+        self.0.layer_desc().short_id().to_string()
+    }
+    fn is_delta(&self) -> bool {
+        true
+    }
+}
+
+impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
+    type DeltaEntry<'a> = DeltaEntry<'a>;
+
+    async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
+        self.0.load_keys(ctx).await
+    }
+}
+
+impl CompactionLayer<Key> for ResidentImageLayer {
+    fn key_range(&self) -> &Range<Key> {
+        &self.0.layer_desc().key_range
+    }
+    fn lsn_range(&self) -> &Range<Lsn> {
+        &self.0.layer_desc().lsn_range
+    }
+    fn file_size(&self) -> u64 {
+        self.0.layer_desc().file_size
+    }
+    fn short_id(&self) -> std::string::String {
+        self.0.layer_desc().short_id().to_string()
+    }
+    fn is_delta(&self) -> bool {
+        false
+    }
+}
+impl CompactionImageLayer<TimelineAdaptor> for ResidentImageLayer {}
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index be873181d9..dc4118bb4a 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -6,106 +6,40 @@ use std::{
 use anyhow::Context;
 use pageserver_api::{models::TimelineState, shard::TenantShardId};
 use tokio::sync::OwnedMutexGuard;
-use tracing::{debug, error, info, instrument, warn, Instrument, Span};
-use utils::{crashsafe, fs_ext, id::TimelineId};
+use tracing::{error, info, instrument, Instrument};
+use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
 
 use crate::{
     config::PageServerConf,
-    deletion_queue::DeletionQueueClient,
     task_mgr::{self, TaskKind},
     tenant::{
-        debug_assert_current_span_has_tenant_and_timeline_id,
         metadata::TimelineMetadata,
-        remote_timeline_client::{
-            self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
-        },
+
remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, CreateTimelineCause, DeleteTimelineError, Tenant, }, }; use super::{Timeline, TimelineResources}; -/// Now that the Timeline is in Stopping state, request all the related tasks to shut down. -async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { - debug_assert_current_span_has_tenant_and_timeline_id(); - // Notify any timeline work to drop out of loops/requests - tracing::debug!("Cancelling CancellationToken"); - timeline.cancel.cancel(); - - // Stop the walreceiver first. - debug!("waiting for wal receiver to shutdown"); - let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() }; - if let Some(walreceiver) = maybe_started_walreceiver { - walreceiver.stop().await; - } - debug!("wal receiver shutdown confirmed"); - - // Shut down the layer flush task before the remote client, as one depends on the other - task_mgr::shutdown_tasks( - Some(TaskKind::LayerFlushTask), - Some(timeline.tenant_shard_id), - Some(timeline.timeline_id), - ) - .await; - - // Prevent new uploads from starting. - if let Some(remote_client) = timeline.remote_client.as_ref() { - let res = remote_client.stop(); - match res { - Ok(()) => {} - Err(e) => match e { - remote_timeline_client::StopError::QueueUninitialized => { - // This case shouldn't happen currently because the - // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart. - // That is, before we declare the Tenant as Active. - // But we only allow calls to delete_timeline on Active tenants. - return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs"))); - } - }, - } - } - - // Stop & wait for the remaining timeline tasks, including upload tasks. - // NB: This and other delete_timeline calls do not run as a task_mgr task, - // so, they are not affected by this shutdown_tasks() call. - info!("waiting for timeline tasks to shutdown"); - task_mgr::shutdown_tasks( - None, - Some(timeline.tenant_shard_id), - Some(timeline.timeline_id), - ) - .await; - - fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { - Err(anyhow::anyhow!( - "failpoint: timeline-delete-before-index-deleted-at" - ))? - }); - - tracing::debug!("Waiting for gate..."); - timeline.gate.close().await; - tracing::debug!("Shutdown complete"); - - Ok(()) -} - /// Mark timeline as deleted in S3 so we won't pick it up next time /// during attach or pageserver restart. /// See comment in persist_index_part_with_deleted_flag. async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> { - if let Some(remote_client) = timeline.remote_client.as_ref() { - match remote_client.persist_index_part_with_deleted_flag().await { - // If we (now, or already) marked it successfully as deleted, we can proceed - Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), - // Bail out otherwise - // - // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents - // two tasks from performing the deletion at the same time. The first task - // that starts deletion should run it to completion. 
- Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) - | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { - return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); - } + match timeline + .remote_client + .persist_index_part_with_deleted_flag() + .await + { + // If we (now, or already) marked it successfully as deleted, we can proceed + Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), + // Bail out otherwise + // + // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents + // two tasks from performing the deletion at the same time. The first task + // that starts deletion should run it to completion. + Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) + | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { + return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); } } Ok(()) @@ -124,15 +58,24 @@ async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTi /// No timeout here, GC & Compaction should be responsive to the /// `TimelineState::Stopping` change. // pub(super): documentation link -pub(super) async fn delete_local_layer_files( +pub(super) async fn delete_local_timeline_directory( conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline: &Timeline, ) -> anyhow::Result<()> { - let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) }; - let guards = crate::timed( - guards, - "acquire gc and compaction locks", + // Always ensure the lock order is compaction -> gc. + let compaction_lock = timeline.compaction_lock.lock(); + let compaction_lock = crate::timed( + compaction_lock, + "acquires compaction lock", + std::time::Duration::from_secs(5), + ) + .await; + + let gc_lock = timeline.gc_lock.lock(); + let gc_lock = crate::timed( + gc_lock, + "acquires gc lock", std::time::Duration::from_secs(5), ) .await; @@ -149,8 +92,6 @@ pub(super) async fn delete_local_layer_files( // NB: This need not be atomic because the deleted flag in the IndexPart // will be observed during tenant/timeline load. The deletion will be resumed there. // - // For configurations without remote storage, we guarantee crash-safety by persising delete mark file. - // // Note that here we do not bail out on std::io::ErrorKind::NotFound. // This can happen if we're called a second time, e.g., // because of a previous failure/cancellation at/after @@ -158,123 +99,10 @@ pub(super) async fn delete_local_layer_files( // // ErrorKind::NotFound can also happen if we race with tenant detach, because, // no locks are shared. - // - // For now, log and continue. - // warn! level is technically not appropriate for the - // first case because we should expect retries to happen. - // But the error is so rare, it seems better to get attention if it happens. - // - // Note that metadata removal is skipped, this is not technically needed, - // but allows to reuse timeline loading code during resumed deletion. - // (we always expect that metadata is in place when timeline is being loaded) - - #[cfg(feature = "testing")] - let mut counter = 0; - - // Timeline directory may not exist if we failed to delete mark file and request was retried. 
- if !local_timeline_directory.exists() { - return Ok(()); - } - - let metadata_path = conf.metadata_path(&tenant_shard_id, &timeline.timeline_id); - - for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) { - #[cfg(feature = "testing")] - { - counter += 1; - if counter == 2 { - fail::fail_point!("timeline-delete-during-rm", |_| { - Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))? - }); - } - } - - let entry = entry?; - if entry.path() == metadata_path { - debug!("found metadata, skipping"); - continue; - } - - if entry.path() == local_timeline_directory { - // Keeping directory because metedata file is still there - debug!("found timeline dir itself, skipping"); - continue; - } - - let metadata = match entry.metadata() { - Ok(metadata) => metadata, - Err(e) => { - if crate::is_walkdir_io_not_found(&e) { - warn!( - timeline_dir=?local_timeline_directory, - path=?entry.path().display(), - "got not found err while removing timeline dir, proceeding anyway" - ); - continue; - } - anyhow::bail!(e); - } - }; - - if metadata.is_dir() { - warn!(path=%entry.path().display(), "unexpected directory under timeline dir"); - tokio::fs::remove_dir(entry.path()).await - } else { - tokio::fs::remove_file(entry.path()).await - } - .with_context(|| format!("Failed to remove: {}", entry.path().display()))?; - } - - info!("finished deleting layer files, releasing locks"); - drop(guards); - - fail::fail_point!("timeline-delete-after-rm", |_| { - Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? - }); - - Ok(()) -} - -/// Removes remote layers and an index file after them. -async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> { - if let Some(remote_client) = &timeline.remote_client { - remote_client.delete_all().await.context("delete_all")? - }; - - Ok(()) -} - -// This function removs remaining traces of a timeline on disk. -// Namely: metadata file, timeline directory, delete mark. -// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir. -// delete mark should be present because it is the last step during deletion. -// (nothing can fail after its deletion) -async fn cleanup_remaining_timeline_fs_traces( - conf: &PageServerConf, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, -) -> anyhow::Result<()> { - // Remove local metadata - tokio::fs::remove_file(conf.metadata_path(&tenant_shard_id, &timeline_id)) + tokio::fs::remove_dir_all(local_timeline_directory) .await .or_else(fs_ext::ignore_not_found) - .context("remove metadata")?; - - fail::fail_point!("timeline-delete-after-rm-metadata", |_| { - Err(anyhow::anyhow!( - "failpoint: timeline-delete-after-rm-metadata" - ))? - }); - - // Remove timeline dir - tokio::fs::remove_dir(conf.timeline_path(&tenant_shard_id, &timeline_id)) - .await - .or_else(fs_ext::ignore_not_found) - .context("timeline dir")?; - - fail::fail_point!("timeline-delete-after-rm-dir", |_| { - Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))? - }); + .context("remove local timeline directory")?; // Make sure previous deletions are ordered before mark removal. // Otherwise there is no guarantee that they reach the disk before mark deletion. @@ -287,6 +115,36 @@ async fn cleanup_remaining_timeline_fs_traces( .await .context("fsync_pre_mark_remove")?; + info!("finished deleting layer files, releasing locks"); + drop(gc_lock); + drop(compaction_lock); + + fail::fail_point!("timeline-delete-after-rm", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? 
+    });
+
+    Ok(())
+}
+
+/// Removes remote layers and an index file after them.
+async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
+    timeline
+        .remote_client
+        .delete_all()
+        .await
+        .context("delete_all")
+}
+
+// This function removes remaining traces of a timeline on disk.
+// Namely: metadata file, timeline directory, delete mark.
+// Note: io::ErrorKind::NotFound is ignored for metadata and timeline dir.
+// The delete mark should be present because it is the last step during deletion.
+// (nothing can fail after its deletion)
+async fn cleanup_remaining_timeline_fs_traces(
+    conf: &PageServerConf,
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
+) -> anyhow::Result<()> {
     // Remove delete mark
     // TODO: once we are confident that no more exist in the field, remove this
     // line. It cleans up a legacy marker file that might in rare cases be present.
@@ -300,14 +158,14 @@ async fn cleanup_remaining_timeline_fs_traces(
 /// For more context see comments in [`DeleteTimelineFlow::prepare`]
 async fn remove_timeline_from_tenant(
     tenant: &Tenant,
-    timeline_id: TimelineId,
+    timeline: &Timeline,
     _: &DeletionGuard, // using it as a witness
 ) -> anyhow::Result<()> {
     // Remove the timeline from the map.
     let mut timelines = tenant.timelines.lock().unwrap();
     let children_exist = timelines
         .iter()
-        .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
+        .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id));
     // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
     // We already deleted the layer files, so it's probably best to panic.
     // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
@@ -316,7 +174,7 @@ async fn remove_timeline_from_tenant(
     }
 
     timelines
-        .remove(&timeline_id)
+        .remove(&timeline.timeline_id)
         .expect("timeline that we were deleting was concurrently removed from 'timelines' map");
 
     drop(timelines);
@@ -334,13 +192,15 @@
 /// 5. Delete index part
 /// 6. Delete meta, timeline directory
 /// 7. Delete mark file
+///
 /// It is resumable from any step in case a crash/restart occurs.
 /// There are three entrypoints to the process:
 /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
 /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
-/// and we possibly neeed to continue deletion of remote files.
+///    and we possibly need to continue deletion of remote files.
 /// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
-/// index but still have local metadata, timeline directory and delete mark.
+///    index but still have local metadata, timeline directory and delete mark.
+///
 /// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
 #[derive(Default)]
 pub enum DeleteTimelineFlow {
@@ -356,17 +216,27 @@ impl DeleteTimelineFlow {
     // NB: If this fails half-way through, and is retried, the retry will go through
     // all the same steps again. Make sure the code here is idempotent, and don't
     // error out if some of the shutdown tasks have already been completed!
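The NB comment above insists on idempotency, and the remove_dir_all().or_else(fs_ext::ignore_not_found) call earlier in this file is the core trick: a retried deletion treats "already gone" as success. A std-only sketch of that pattern (hypothetical helper names):

```rust
use std::io;
use std::path::Path;

// Treat NotFound as success so that a deletion retried after a crash or
// cancellation does not fail on work that already happened.
fn ignore_not_found(e: io::Error) -> io::Result<()> {
    if e.kind() == io::ErrorKind::NotFound {
        Ok(())
    } else {
        Err(e)
    }
}

fn remove_dir_all_idempotent(path: &Path) -> io::Result<()> {
    std::fs::remove_dir_all(path).or_else(ignore_not_found)
}

fn main() -> io::Result<()> {
    // Removing a directory that is already gone succeeds: the NotFound error
    // is swallowed, making the operation safe to retry.
    remove_dir_all_idempotent(Path::new("/tmp/does-not-exist-either-way"))
}
```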
- #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_shard_id.tenant_id, shard_id=%tenant.tenant_shard_id.shard_slug()))] + #[instrument(skip_all)] pub async fn run( tenant: &Arc, timeline_id: TimelineId, - inplace: bool, ) -> Result<(), DeleteTimelineError> { + super::debug_assert_current_span_has_tenant_and_timeline_id(); + let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?; guard.mark_in_progress()?; - stop_tasks(&timeline).await?; + // Now that the Timeline is in Stopping state, request all the related tasks to shut down. + timeline.shutdown(super::ShutdownMode::Hard).await; + + tenant.gc_block.before_delete(&timeline); + + fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-before-index-deleted-at" + ))? + }); set_deleted_in_remote_index(&timeline).await?; @@ -376,11 +246,7 @@ impl DeleteTimelineFlow { ))? }); - if inplace { - Self::background(guard, tenant.conf, tenant, &timeline).await? - } else { - Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline); - } + Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline); Ok(()) } @@ -398,14 +264,12 @@ impl DeleteTimelineFlow { } /// Shortcut to create Timeline in stopping state and spawn deletion task. - /// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`] #[instrument(skip_all, fields(%timeline_id))] pub async fn resume_deletion( tenant: Arc, timeline_id: TimelineId, local_metadata: &TimelineMetadata, - remote_client: Option, - deletion_queue_client: DeletionQueueClient, + remote_client: RemoteTimelineClient, ) -> anyhow::Result<()> { // Note: here we even skip populating layer map. Timeline is essentially uninitialized. // RemoteTimelineClient is the only functioning part. @@ -416,11 +280,14 @@ impl DeleteTimelineFlow { None, // Ancestor is not needed for deletion. TimelineResources { remote_client, - deletion_queue_client, + timeline_get_throttle: tenant.timeline_get_throttle.clone(), + l0_flush_global_state: tenant.l0_flush_global_state.clone(), }, // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. 
CreateTimelineCause::Delete, + // Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace + None, ) .context("create_timeline_struct")?; @@ -528,10 +395,9 @@ impl DeleteTimelineFlow { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, - Some(tenant_shard_id), + tenant_shard_id, Some(timeline_id), "timeline_delete", - false, async move { if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await { error!("Error: {err:#}"); @@ -539,12 +405,7 @@ impl DeleteTimelineFlow { }; Ok(()) } - .instrument({ - let span = - tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id); - span.follows_from(Span::current()); - span - }), + .instrument(tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id)), ); } @@ -554,24 +415,21 @@ impl DeleteTimelineFlow { tenant: &Tenant, timeline: &Timeline, ) -> Result<(), DeleteTimelineError> { - delete_local_layer_files(conf, tenant.tenant_shard_id, timeline).await?; + delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?; delete_remote_layers_and_index(timeline).await?; pausable_failpoint!("in_progress_delete"); - cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_shard_id, timeline.timeline_id) - .await?; - - remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?; + remove_timeline_from_tenant(tenant, timeline, &guard).await?; *guard = Self::Finished; Ok(()) } - pub(crate) fn is_finished(&self) -> bool { - matches!(self, Self::Finished) + pub(crate) fn is_not_started(&self) -> bool { + matches!(self, Self::NotStarted) } } diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs new file mode 100644 index 0000000000..641faada25 --- /dev/null +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -0,0 +1,952 @@ +use std::{collections::HashSet, sync::Arc}; + +use super::{layer_manager::LayerManager, FlushLayerError, Timeline}; +use crate::{ + context::{DownloadBehavior, RequestContext}, + task_mgr::TaskKind, + tenant::{ + remote_timeline_client::index::GcBlockingReason::DetachAncestor, + storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}, + Tenant, + }, + virtual_file::{MaybeFatalIo, VirtualFile}, +}; +use anyhow::Context; +use pageserver_api::models::detach_ancestor::AncestorDetached; +use tokio::sync::Semaphore; +use tokio_util::sync::CancellationToken; +use tracing::Instrument; +use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; + +#[derive(Debug, thiserror::Error)] +pub(crate) enum Error { + #[error("no ancestors")] + NoAncestor, + + #[error("too many ancestors")] + TooManyAncestors, + + #[error("shutting down, please retry later")] + ShuttingDown, + + #[error(transparent)] + NotFound(crate::tenant::GetTimelineError), + + #[error("failed to reparent all candidate timelines, please retry")] + FailedToReparentAll, + + #[error("ancestor is already being detached by: {}", .0)] + OtherTimelineDetachOngoing(TimelineId), + + #[error("preparing to timeline ancestor detach failed")] + Prepare(#[source] anyhow::Error), + + #[error("detaching and reparenting failed")] + DetachReparent(#[source] anyhow::Error), + + #[error("completing ancestor detach failed")] + Complete(#[source] anyhow::Error), + + 
#[error("failpoint: {}", .0)]
+    Failpoint(&'static str),
+}
+
+impl Error {
+    /// Try to catch cancellation from within the `anyhow::Error`, or wrap the anyhow as the given
+    /// variant or the fancier `or_else`.
+    fn launder<F>(e: anyhow::Error, or_else: F) -> Error
+    where
+        F: Fn(anyhow::Error) -> Error,
+    {
+        use crate::tenant::remote_timeline_client::WaitCompletionError;
+        use crate::tenant::upload_queue::NotInitialized;
+        use remote_storage::TimeoutOrCancel;
+
+        if e.is::<NotInitialized>()
+            || TimeoutOrCancel::caused_by_cancel(&e)
+            || e.downcast_ref::<FlushLayerError>()
+                .is_some_and(|e| e.is_cancelled())
+            || e.is::<WaitCompletionError>()
+        {
+            Error::ShuttingDown
+        } else {
+            or_else(e)
+        }
+    }
+}
+
+impl From<Error> for ApiError {
+    fn from(value: Error) -> Self {
+        match value {
+            Error::NoAncestor => ApiError::Conflict(value.to_string()),
+            Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", value)),
+            Error::ShuttingDown => ApiError::ShuttingDown,
+            Error::OtherTimelineDetachOngoing(_) | Error::FailedToReparentAll => {
+                ApiError::ResourceUnavailable(value.to_string().into())
+            }
+            Error::NotFound(e) => ApiError::from(e),
+            // these variants should have no cancellation errors because of Error::launder
+            Error::Prepare(_)
+            | Error::DetachReparent(_)
+            | Error::Complete(_)
+            | Error::Failpoint(_) => ApiError::InternalServerError(value.into()),
+        }
+    }
+}
+
+impl From<crate::tenant::upload_queue::NotInitialized> for Error {
+    fn from(_: crate::tenant::upload_queue::NotInitialized) -> Self {
+        // treat all as shutting down signals, even though that is not entirely correct
+        // (uninitialized state)
+        Error::ShuttingDown
+    }
+}
+impl From<super::layer_manager::Shutdown> for Error {
+    fn from(_: super::layer_manager::Shutdown) -> Self {
+        Error::ShuttingDown
+    }
+}
+
+pub(crate) enum Progress {
+    Prepared(Attempt, PreparedTimelineDetach),
+    Done(AncestorDetached),
+}
+
+pub(crate) struct PreparedTimelineDetach {
+    layers: Vec<Layer>,
+}
+
+/// TODO: this should be part of PageserverConf because we cannot easily modify cplane arguments.
+#[derive(Debug)]
+pub(crate) struct Options {
+    pub(crate) rewrite_concurrency: std::num::NonZeroUsize,
+    pub(crate) copy_concurrency: std::num::NonZeroUsize,
+}
+
+impl Default for Options {
+    fn default() -> Self {
+        Self {
+            rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(),
+            copy_concurrency: std::num::NonZeroUsize::new(100).unwrap(),
+        }
+    }
+}
+
+/// Represents a single attempt to detach the ancestor, exclusive across tenant resets.
+#[derive(Debug)]
+pub(crate) struct Attempt {
+    pub(crate) timeline_id: TimelineId,
+
+    _guard: completion::Completion,
+    gate_entered: Option<utils::sync::gate::GateGuard>,
+}
+
+impl Attempt {
+    pub(crate) fn before_reset_tenant(&mut self) {
+        let taken = self.gate_entered.take();
+        assert!(taken.is_some());
+    }
+
+    pub(crate) fn new_barrier(&self) -> completion::Barrier {
+        self._guard.barrier()
+    }
+}
+
+/// See [`Timeline::prepare_to_detach_from_ancestor`]
+pub(super) async fn prepare(
+    detached: &Arc<Timeline>,
+    tenant: &Tenant,
+    options: Options,
+    ctx: &RequestContext,
+) -> Result<Progress, Error> {
+    use Error::*;
+
+    let Some((ancestor, ancestor_lsn)) = detached
+        .ancestor_timeline
+        .as_ref()
+        .map(|tl| (tl.clone(), detached.ancestor_lsn))
+    else {
+        let still_in_progress = {
+            let accessor = detached.remote_client.initialized_upload_queue()?;
+
+            // we are safe to inspect the latest uploaded, because we can only witness this after
+            // restart is complete and ancestor is no more.
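Error::launder above centralizes the question "is this anyhow::Error really a cancellation?" by downcasting against known types before wrapping. A self-contained sketch of the same idea, with an illustrative marker type standing in for the pageserver's error types:

```rust
use anyhow::anyhow;

// Illustrative marker error; stands in for NotInitialized / WaitCompletionError.
#[derive(Debug)]
struct Cancelled;

impl std::fmt::Display for Cancelled {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "cancelled")
    }
}

impl std::error::Error for Cancelled {}

// Check whether the anyhow error is a known cancellation-like type; if so,
// collapse it to a shutdown message, otherwise defer to the caller-supplied
// constructor, just like Error::launder defers to `or_else`.
fn launder(e: anyhow::Error, or_else: impl Fn(anyhow::Error) -> String) -> String {
    if e.is::<Cancelled>() {
        "shutting down, please retry later".to_string()
    } else {
        or_else(e)
    }
}

fn main() {
    let e = anyhow::Error::new(Cancelled);
    assert_eq!(launder(e, |e| format!("prepare failed: {e}")), "shutting down, please retry later");
    let e = anyhow!("disk full");
    assert_eq!(launder(e, |e| format!("prepare failed: {e}")), "prepare failed: disk full");
}
```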
+/// See [`Timeline::prepare_to_detach_from_ancestor`]
+pub(super) async fn prepare(
+    detached: &Arc<Timeline>,
+    tenant: &Tenant,
+    options: Options,
+    ctx: &RequestContext,
+) -> Result<Progress, Error> {
+    use Error::*;
+
+    let Some((ancestor, ancestor_lsn)) = detached
+        .ancestor_timeline
+        .as_ref()
+        .map(|tl| (tl.clone(), detached.ancestor_lsn))
+    else {
+        let still_in_progress = {
+            let accessor = detached.remote_client.initialized_upload_queue()?;
+
+            // we are safe to inspect the latest uploaded, because we can only witness this after
+            // restart is complete and ancestor is no more.
+            let latest = accessor.latest_uploaded_index_part();
+            if latest.lineage.detached_previous_ancestor().is_none() {
+                return Err(NoAncestor);
+            };
+
+            latest
+                .gc_blocking
+                .as_ref()
+                .is_some_and(|b| b.blocked_by(DetachAncestor))
+        };
+
+        if still_in_progress {
+            // gc is still blocked, we can still reparent and complete.
+            // we are safe to reparent remaining, because they were locked in at the beginning.
+            let attempt = continue_with_blocked_gc(detached, tenant).await?;
+
+            // because the ancestor of detached is already set to none, we have published all
+            // of the layers, so we are still "prepared."
+            return Ok(Progress::Prepared(
+                attempt,
+                PreparedTimelineDetach { layers: Vec::new() },
+            ));
+        }
+
+        let reparented_timelines = reparented_direct_children(detached, tenant)?;
+        return Ok(Progress::Done(AncestorDetached {
+            reparented_timelines,
+        }));
+    };
+
+    if !ancestor_lsn.is_valid() {
+        // rare case, probably wouldn't even load
+        tracing::error!("ancestor is set, but ancestor_lsn is invalid, this timeline needs fixing");
+        return Err(NoAncestor);
+    }
+
+    if ancestor.ancestor_timeline.is_some() {
+        // non-technical requirement; we could flatten N ancestors just as easily but we chose
+        // not to, at least initially
+        return Err(TooManyAncestors);
+    }
+
+    let attempt = start_new_attempt(detached, tenant).await?;
+
+    utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking-pausable");
+
+    fail::fail_point!(
+        "timeline-detach-ancestor::before_starting_after_locking",
+        |_| Err(Error::Failpoint(
+            "timeline-detach-ancestor::before_starting_after_locking"
+        ))
+    );
+
+    if ancestor_lsn >= ancestor.get_disk_consistent_lsn() {
+        let span =
+            tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id);
+        async {
+            let started_at = std::time::Instant::now();
+            let freeze_and_flush = ancestor.freeze_and_flush0();
+            let mut freeze_and_flush = std::pin::pin!(freeze_and_flush);
+
+            let res =
+                tokio::time::timeout(std::time::Duration::from_secs(1), &mut freeze_and_flush)
+                    .await;
+
+            let res = match res {
+                Ok(res) => res,
+                Err(_elapsed) => {
+                    tracing::info!("freezing and flushing ancestor is still ongoing");
+                    freeze_and_flush.await
+                }
+            };
+
+            res.map_err(|e| {
+                use FlushLayerError::*;
+                match e {
+                    Cancelled | NotRunning(_) => {
+                        // FIXME(#6424): technically statically unreachable right now, given how we never
+                        // drop the sender
+                        Error::ShuttingDown
+                    }
+                    CreateImageLayersError(_) | Other(_) => Error::Prepare(e.into()),
+                }
+            })?;
+
+            // we do not need to wait for uploads to complete, but we do need `struct Layer`;
+            // copying a delta prefix is currently unsupported for `InMemoryLayer`.
+            tracing::info!(
+                elapsed_ms = started_at.elapsed().as_millis(),
+                "froze and flushed the ancestor"
+            );
+            Ok::<_, Error>(())
+        }
+        .instrument(span)
+        .await?;
+    }
+
+    let end_lsn = ancestor_lsn + 1;
+
+    let (filtered_layers, straddling_branchpoint, rest_of_historic) = {
+        // we do not need to start from our layers, because they can only be layers that come
+        // *after* ancestor_lsn
+        let layers = tokio::select! {
+            guard = ancestor.layers.read() => guard,
+            _ = detached.cancel.cancelled() => {
+                return Err(ShuttingDown);
+            }
+            _ = ancestor.cancel.cancelled() => {
+                return Err(ShuttingDown);
+            }
+        };
+
+        // between retries, these can change if compaction or gc ran in between. this will mean
+        // we have to redo work.
+        partition_work(ancestor_lsn, &layers)?
+    };
+
+    // TODO: layers are already sorted by something: use that to determine how much of remote
+    // copies are already done -- gc is blocked, but a compaction could have happened on ancestor,
+    // which is something to keep in mind if copy skipping is implemented.
+    tracing::info!(filtered=%filtered_layers, to_rewrite = straddling_branchpoint.len(), historic=%rest_of_historic.len(), "collected layers");
+
+    // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after
+    let mut new_layers: Vec<Layer> =
+        Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len());
+
+    {
+        tracing::debug!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers");
+
+        let mut tasks = tokio::task::JoinSet::new();
+
+        let mut wrote_any = false;
+
+        let limiter = Arc::new(Semaphore::new(options.rewrite_concurrency.get()));
+
+        for layer in straddling_branchpoint {
+            let limiter = limiter.clone();
+            let timeline = detached.clone();
+            let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download);
+
+            let span = tracing::info_span!("upload_rewritten_layer", %layer);
+            tasks.spawn(
+                async move {
+                    let _permit = limiter.acquire().await;
+                    let copied =
+                        upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx)
+                            .await?;
+                    if let Some(copied) = copied.as_ref() {
+                        tracing::info!(%copied, "rewrote and uploaded");
+                    }
+                    Ok(copied)
+                }
+                .instrument(span),
+            );
+        }
+
+        while let Some(res) = tasks.join_next().await {
+            match res {
+                Ok(Ok(Some(copied))) => {
+                    wrote_any = true;
+                    new_layers.push(copied);
+                }
+                Ok(Ok(None)) => {}
+                Ok(Err(e)) => return Err(e),
+                Err(je) => return Err(Error::Prepare(je.into())),
+            }
+        }
+
+        // FIXME: the fsync should be mandatory, after both rewrites and copies
+        if wrote_any {
+            let timeline_dir = VirtualFile::open(
+                &detached
+                    .conf
+                    .timeline_path(&detached.tenant_shard_id, &detached.timeline_id),
+                ctx,
+            )
+            .await
+            .fatal_err("VirtualFile::open for timeline dir fsync");
+            timeline_dir
+                .sync_all()
+                .await
+                .fatal_err("VirtualFile::sync_all timeline dir");
+        }
+    }
+
+    let mut tasks = tokio::task::JoinSet::new();
+    let limiter = Arc::new(Semaphore::new(options.copy_concurrency.get()));
+
+    for adopted in rest_of_historic {
+        let limiter = limiter.clone();
+        let timeline = detached.clone();
+
+        tasks.spawn(
+            async move {
+                let _permit = limiter.acquire().await;
+                let owned =
+                    remote_copy(&adopted, &timeline, timeline.generation, &timeline.cancel).await?;
+                tracing::info!(layer=%owned, "remote copied");
+                Ok(owned)
+            }
+            .in_current_span(),
+        );
+    }
+
+    while let Some(res) = tasks.join_next().await {
+        match res {
+            Ok(Ok(owned)) => {
+                new_layers.push(owned);
+            }
+            Ok(Err(failed)) => {
+                return Err(failed);
+            }
+            Err(je) => return Err(Error::Prepare(je.into())),
+        }
+    }
+
+    // TODO: fsync directory again if we hardlinked something
+
+    let prepared = PreparedTimelineDetach { layers: new_layers };
+
+    Ok(Progress::Prepared(attempt, prepared))
+}
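Both fan-out phases in `prepare` use the same bounded-concurrency shape: spawn every unit of work into a `JoinSet`, but gate the actual work behind a shared `Semaphore` sized by the configured concurrency (`rewrite_concurrency`, `copy_concurrency`). A self-contained sketch of that pattern, with illustrative numbers rather than the pageserver's defaults:

```rust
use std::sync::Arc;
use tokio::{sync::Semaphore, task::JoinSet};

#[tokio::main]
async fn main() {
    // e.g. Options::copy_concurrency
    let limiter = Arc::new(Semaphore::new(4));
    let mut tasks = JoinSet::new();

    for i in 0..16 {
        let limiter = limiter.clone();
        tasks.spawn(async move {
            // at most 4 "copies" run concurrently; the rest wait here
            let _permit = limiter.acquire().await.expect("semaphore never closed");
            // stand-in for remote_copy / upload_rewritten_layer
            tokio::time::sleep(std::time::Duration::from_millis(10)).await;
            i
        });
    }

    let mut done = Vec::new();
    while let Some(res) = tasks.join_next().await {
        done.push(res.expect("task panicked"));
    }
    assert_eq!(done.len(), 16);
}
```

Spawning everything up front while letting the semaphore do the throttling keeps `join_next` simple: the driver loop only needs to collect results and surface the first error, exactly as the diff does.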
+async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
+    let attempt = obtain_exclusive_attempt(detached, tenant)?;
+
+    // insert the block in the index_part.json, if not already there.
+    let _dont_care = tenant
+        .gc_block
+        .insert(
+            detached,
+            crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor,
+        )
+        .await
+        .map_err(|e| Error::launder(e, Error::Prepare))?;
+
+    Ok(attempt)
+}
+
+async fn continue_with_blocked_gc(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
+    // FIXME: it would be nice to confirm that there is an in-memory version, since we've just
+    // verified there is a persistent one?
+    obtain_exclusive_attempt(detached, tenant)
+}
+
+fn obtain_exclusive_attempt(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
+    use Error::{OtherTimelineDetachOngoing, ShuttingDown};
+
+    // ensure we are the only active attempt for this tenant
+    let (guard, barrier) = completion::channel();
+    {
+        let mut guard = tenant.ongoing_timeline_detach.lock().unwrap();
+        if let Some((tl, other)) = guard.as_ref() {
+            if !other.is_ready() {
+                return Err(OtherTimelineDetachOngoing(*tl));
+            }
+            // FIXME: no test enters here
+        }
+        *guard = Some((detached.timeline_id, barrier));
+    }
+
+    // ensure the gate is still open
+    let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?;
+
+    Ok(Attempt {
+        timeline_id: detached.timeline_id,
+        _guard: guard,
+        gate_entered: Some(_gate_entered),
+    })
+}
+
+fn reparented_direct_children(
+    detached: &Arc<Timeline>,
+    tenant: &Tenant,
+) -> Result<HashSet<TimelineId>, Error> {
+    let mut all_direct_children = tenant
+        .timelines
+        .lock()
+        .unwrap()
+        .values()
+        .filter_map(|tl| {
+            let is_direct_child = matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached));
+
+            if is_direct_child {
+                Some(tl.clone())
+            } else {
+                if let Some(timeline) = tl.ancestor_timeline.as_ref() {
+                    assert_ne!(timeline.timeline_id, detached.timeline_id, "we cannot have two timelines with the same timeline_id live");
+                }
+                None
+            }
+        })
+        // Collect to avoid lock taking order problem with Tenant::timelines and
+        // Timeline::remote_client
+        .collect::<Vec<_>>();
+
+    let mut any_shutdown = false;
+
+    all_direct_children.retain(|tl| match tl.remote_client.initialized_upload_queue() {
+        Ok(accessor) => accessor
+            .latest_uploaded_index_part()
+            .lineage
+            .is_reparented(),
+        Err(_shutdownalike) => {
+            // not 100% a shutdown, but let's bail early not to give inconsistent results in
+            // sharded environment.
+            any_shutdown = true;
+            true
+        }
+    });
+
+    if any_shutdown {
+        // it could be one or many being deleted; have client retry
+        return Err(Error::ShuttingDown);
+    }
+
+    Ok(all_direct_children
+        .into_iter()
+        .map(|tl| tl.timeline_id)
+        .collect())
+}
+
+fn partition_work(
+    ancestor_lsn: Lsn,
+    source: &LayerManager,
+) -> Result<(usize, Vec<Layer>, Vec<Layer>), Error> {
+    let mut straddling_branchpoint = vec![];
+    let mut rest_of_historic = vec![];
+
+    let mut later_by_lsn = 0;
+
+    for desc in source.layer_map()?.iter_historic_layers() {
+        // off by one chances here:
+        // - start is inclusive
+        // - end is exclusive
+        if desc.lsn_range.start > ancestor_lsn {
+            later_by_lsn += 1;
+            continue;
+        }
+
+        let target = if desc.lsn_range.start <= ancestor_lsn
+            && desc.lsn_range.end > ancestor_lsn
+            && desc.is_delta
+        {
+            // TODO: image layer at Lsn optimization
+            &mut straddling_branchpoint
+        } else {
+            &mut rest_of_historic
+        };
+
+        target.push(source.get_from_desc(&desc));
+    }
+
+    Ok((later_by_lsn, straddling_branchpoint, rest_of_historic))
+}
+
+async fn upload_rewritten_layer(
+    end_lsn: Lsn,
+    layer: &Layer,
+    target: &Arc<Timeline>,
+    cancel: &CancellationToken,
+    ctx: &RequestContext,
+) -> Result<Option<Layer>, Error> {
+    let copied = copy_lsn_prefix(end_lsn, layer, target, ctx).await?;
+
+    let Some(copied) = copied else {
+        return Ok(None);
+    };
+
+    target
+        .remote_client
+        .upload_layer_file(&copied, cancel)
+        .await
+        .map_err(|e| Error::launder(e, Error::Prepare))?;
+
+    Ok(Some(copied.into()))
+}
+
+async fn copy_lsn_prefix(
+    end_lsn: Lsn,
+    layer: &Layer,
+    target_timeline: &Arc<Timeline>,
+    ctx: &RequestContext,
+) -> Result<Option<ResidentLayer>, Error> {
+    if target_timeline.cancel.is_cancelled() {
+        return Err(Error::ShuttingDown);
+    }
+
+    tracing::debug!(%layer, %end_lsn, "copying lsn prefix");
+
+    let mut writer = DeltaLayerWriter::new(
+        target_timeline.conf,
+        target_timeline.timeline_id,
+        target_timeline.tenant_shard_id,
+        layer.layer_desc().key_range.start,
+        layer.layer_desc().lsn_range.start..end_lsn,
+        ctx,
+    )
+    .await
+    .with_context(|| format!("prepare to copy lsn prefix of ancestors {layer}"))
+    .map_err(Error::Prepare)?;
+
+    let resident = layer.download_and_keep_resident().await.map_err(|e| {
+        if e.is_cancelled() {
+            Error::ShuttingDown
+        } else {
+            Error::Prepare(e.into())
+        }
+    })?;
+
+    let records = resident
+        .copy_delta_prefix(&mut writer, end_lsn, ctx)
+        .await
+        .with_context(|| format!("copy lsn prefix of ancestors {layer}"))
+        .map_err(Error::Prepare)?;
+
+    drop(resident);
+
+    tracing::debug!(%layer, records, "copied records");
+
+    if records == 0 {
+        drop(writer);
+        // TODO: we might want to store an empty marker in remote storage for this
+        // layer so that we will not needlessly walk `layer` on repeated attempts.
+        Ok(None)
+    } else {
+        // reuse the key instead of adding more holes between layers by using the real
+        // highest key in the layer.
+        let reused_highest_key = layer.layer_desc().key_range.end;
+        let (desc, path) = writer
+            .finish(reused_highest_key, ctx)
+            .await
+            .map_err(Error::Prepare)?;
+        let copied = Layer::finish_creating(target_timeline.conf, target_timeline, desc, &path)
+            .map_err(Error::Prepare)?;
+
+        tracing::debug!(%layer, %copied, "new layer produced");
+
+        Ok(Some(copied))
+    }
+}
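`partition_work` classifies every historic layer by how its LSN range relates to the branch point: delta layers whose range straddles `ancestor_lsn` must be rewritten (prefix-copied up to `end_lsn`), layers at or below it can be adopted verbatim, and layers entirely above it already belong to the detached timeline. A reduced sketch of that classification over plain ranges, with simplified descriptors standing in for the real `PersistentLayerDesc`:

```rust
use std::ops::Range;

/// Simplified layer descriptor: an LSN range plus whether it is a delta layer.
struct Desc {
    lsn_range: Range<u64>,
    is_delta: bool,
}

/// Mirrors the partitioning: (count skipped as later, to-rewrite, to-copy).
fn partition(ancestor_lsn: u64, descs: &[Desc]) -> (usize, Vec<&Desc>, Vec<&Desc>) {
    let mut later = 0;
    let mut straddling = Vec::new();
    let mut rest = Vec::new();
    for d in descs {
        if d.lsn_range.start > ancestor_lsn {
            later += 1; // wholly above the branch point: not inherited
        } else if d.lsn_range.end > ancestor_lsn && d.is_delta {
            straddling.push(d); // needs an LSN-prefix rewrite
        } else {
            rest.push(d); // at/below the branch point: remote-copy as-is
        }
    }
    (later, straddling, rest)
}

fn main() {
    let descs = [
        Desc { lsn_range: 0..10, is_delta: true },   // ends exactly at the branch point
        Desc { lsn_range: 5..20, is_delta: true },   // straddles lsn 10
        Desc { lsn_range: 15..30, is_delta: false }, // later than the branch point
    ];
    let (later, straddling, rest) = partition(10, &descs);
    assert_eq!((later, straddling.len(), rest.len()), (1, 1, 1));
}
```

Note the boundary behavior the original comment flags: `lsn_range.start` is inclusive and `lsn_range.end` exclusive, so a delta ending exactly at `ancestor_lsn + 1` still straddles, while one ending at `ancestor_lsn` does not.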
+/// Creates a new Layer instance for the adopted layer, and ensures it is found in remote
+/// storage on successful return, without the adopted layer being added to `index_part.json`.
+async fn remote_copy(
+    adopted: &Layer,
+    adoptee: &Arc<Timeline>,
+    generation: Generation,
+    cancel: &CancellationToken,
+) -> Result<Layer, Error> {
+    // depending on Layer::keep_resident, we could hardlink
+
+    let mut metadata = adopted.metadata();
+    debug_assert!(metadata.generation <= generation);
+    metadata.generation = generation;
+
+    let owned = crate::tenant::storage_layer::Layer::for_evicted(
+        adoptee.conf,
+        adoptee,
+        adopted.layer_desc().layer_name(),
+        metadata,
+    );
+
+    adoptee
+        .remote_client
+        .copy_timeline_layer(adopted, &owned, cancel)
+        .await
+        .map(move |()| owned)
+        .map_err(|e| Error::launder(e, Error::Prepare))
+}
+
+pub(crate) enum DetachingAndReparenting {
+    /// All of the following timeline ids were reparented and the timeline ancestor detach must be
+    /// marked as completed.
+    Reparented(HashSet<TimelineId>),
+
+    /// Some of the reparentings failed. The timeline ancestor detach must **not** be marked as
+    /// completed.
+    ///
+    /// Nested `must_reset_tenant` is set to true when any restart requiring changes were made.
+    SomeReparentingFailed { must_reset_tenant: bool },
+
+    /// Detaching and reparentings were completed in a previous attempt. Timeline ancestor detach
+    /// must be marked as completed.
+    AlreadyDone(HashSet<TimelineId>),
+}
+
+impl DetachingAndReparenting {
+    pub(crate) fn reset_tenant_required(&self) -> bool {
+        use DetachingAndReparenting::*;
+        match self {
+            Reparented(_) => true,
+            SomeReparentingFailed { must_reset_tenant } => *must_reset_tenant,
+            AlreadyDone(_) => false,
+        }
+    }
+
+    pub(crate) fn completed(self) -> Option<HashSet<TimelineId>> {
+        use DetachingAndReparenting::*;
+        match self {
+            Reparented(x) | AlreadyDone(x) => Some(x),
+            SomeReparentingFailed { .. } => None,
+        }
+    }
+}
+
+/// See [`Timeline::detach_from_ancestor_and_reparent`].
+pub(super) async fn detach_and_reparent(
+    detached: &Arc<Timeline>,
+    tenant: &Tenant,
+    prepared: PreparedTimelineDetach,
+    _ctx: &RequestContext,
+) -> Result<DetachingAndReparenting, Error> {
+    let PreparedTimelineDetach { layers } = prepared;
+
+    #[derive(Debug)]
+    enum Ancestor {
+        NotDetached(Arc<Timeline>, Lsn),
+        Detached(Arc<Timeline>, Lsn),
+    }
+
+    let (recorded_branchpoint, still_ongoing) = {
+        let access = detached.remote_client.initialized_upload_queue()?;
+        let latest = access.latest_uploaded_index_part();
+
+        (
+            latest.lineage.detached_previous_ancestor(),
+            latest
+                .gc_blocking
+                .as_ref()
+                .is_some_and(|b| b.blocked_by(DetachAncestor)),
+        )
+    };
+    assert!(
+        still_ongoing,
+        "cannot (detach? reparent)? complete if the operation is not still ongoing"
+    );
+
+    let ancestor = match (detached.ancestor_timeline.as_ref(), recorded_branchpoint) {
+        (Some(ancestor), None) => {
+            assert!(
+                !layers.is_empty(),
+                "there should always be at least one layer to inherit"
+            );
+            Ancestor::NotDetached(ancestor.clone(), detached.ancestor_lsn)
+        }
+        (Some(_), Some(_)) => {
+            panic!(
+                "it should be impossible to get to here without having gone through the tenant reset; if the tenant was reset, then the ancestor_timeline would be None"
+            );
+        }
+        (None, Some((ancestor_id, ancestor_lsn))) => {
+            // it has been either:
+            // - detached but still exists => we can try reparenting
+            // - detached and deleted
+            //
+            // either way, we must complete
+            assert!(
+                layers.is_empty(),
+                "no layers should have been copied as detach is done"
+            );
+
+            let existing = tenant.timelines.lock().unwrap().get(&ancestor_id).cloned();
+
+            if let Some(ancestor) = existing {
+                Ancestor::Detached(ancestor, ancestor_lsn)
+            } else {
+                let direct_children = reparented_direct_children(detached, tenant)?;
+                return Ok(DetachingAndReparenting::AlreadyDone(direct_children));
+            }
+        }
+        (None, None) => {
+            // TODO: make sure there are no `?` early returns between here and the tenant_reset.
+            panic!(
+                "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor"
+            );
+        }
+    };
+
+    // publish the prepared layers before we reparent any of the timelines, so that on restart
+    // reparented timelines find layers. also do the actual detaching.
+    //
+    // if we crash after this operation, a retry will allow reparenting the remaining timelines as
+    // gc is blocked.
+
+    let (ancestor, ancestor_lsn, was_detached) = match ancestor {
+        Ancestor::NotDetached(ancestor, ancestor_lsn) => {
+            // this has to complete before any reparentings because otherwise they would not have
+            // layers on the new parent.
+            detached
+                .remote_client
+                .schedule_adding_existing_layers_to_index_detach_and_wait(
+                    &layers,
+                    (ancestor.timeline_id, ancestor_lsn),
+                )
+                .await
+                .context("publish layers and detach ancestor")
+                .map_err(|e| Error::launder(e, Error::DetachReparent))?;
+
+            tracing::info!(
+                ancestor=%ancestor.timeline_id,
+                %ancestor_lsn,
+                inherited_layers=%layers.len(),
+                "detached from ancestor"
+            );
+            (ancestor, ancestor_lsn, true)
+        }
+        Ancestor::Detached(ancestor, ancestor_lsn) => (ancestor, ancestor_lsn, false),
+    };
+
+    let mut tasks = tokio::task::JoinSet::new();
+
+    // Returns a single permit semaphore which will be used to make one reparenting succeed,
+    // others will fail as if those timelines had been stopped for whatever reason.
+    #[cfg(feature = "testing")]
+    let failpoint_sem = || -> Option<Arc<Semaphore>> {
+        fail::fail_point!("timeline-detach-ancestor::allow_one_reparented", |_| Some(
+            Arc::new(Semaphore::new(1))
+        ));
+        None
+    }();
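The testing failpoint above hands out a one-permit semaphore so that exactly one reparenting task proceeds while the rest fail, exercising the partial-failure path. The trick is that the winner closes the semaphore while still holding the permit, so every other `acquire` returns an error. A self-contained sketch of that pattern, assuming tokio:

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;

#[tokio::main]
async fn main() {
    // One permit: the first task to acquire it "wins" and closes the semaphore,
    // mirroring the failpoint's intent.
    let sem = Arc::new(Semaphore::new(1));

    let mut handles = Vec::new();
    for i in 0..4u32 {
        let sem = sem.clone();
        handles.push(tokio::spawn(async move {
            match sem.acquire().await {
                Ok(_permit) => {
                    sem.close(); // everyone still waiting gets an AcquireError
                    Ok(i)
                }
                Err(_closed) => Err(i),
            }
        }));
    }

    let mut winners = 0;
    for h in handles {
        if h.await.expect("task panicked").is_ok() {
            winners += 1;
        }
    }
    assert_eq!(winners, 1, "exactly one reparenting may succeed");
}
```

Because the close happens before the winner's permit is released, no second task can ever sneak through between release and close.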
+    // because we are now keeping the slot in progress, it is unlikely that there will be any
+    // timeline deletions during this time. if we raced one, then we'll just ignore it.
+    {
+        let g = tenant.timelines.lock().unwrap();
+        reparentable_timelines(g.values(), detached, &ancestor, ancestor_lsn)
+            .cloned()
+            .for_each(|timeline| {
+                // important in this scope: we are holding the Tenant::timelines lock
+                let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id);
+                let new_parent = detached.timeline_id;
+                #[cfg(feature = "testing")]
+                let failpoint_sem = failpoint_sem.clone();
+
+                tasks.spawn(
+                    async move {
+                        let res = async {
+                            #[cfg(feature = "testing")]
+                            if let Some(failpoint_sem) = failpoint_sem {
+                                let _permit = failpoint_sem.acquire().await.map_err(|_| {
+                                    anyhow::anyhow!(
+                                        "failpoint: timeline-detach-ancestor::allow_one_reparented",
+                                    )
+                                })?;
+                                failpoint_sem.close();
+                            }
+
+                            timeline
+                                .remote_client
+                                .schedule_reparenting_and_wait(&new_parent)
+                                .await
+                        }
+                        .await;
+
+                        match res {
+                            Ok(()) => {
+                                tracing::info!("reparented");
+                                Some(timeline)
+                            }
+                            Err(e) => {
+                                // with the use of tenant slot, raced timeline deletion is the most
+                                // likely reason.
+                                tracing::warn!("reparenting failed: {e:#}");
+                                None
+                            }
+                        }
+                    }
+                    .instrument(span),
+                );
+            });
+    }
+
+    let reparenting_candidates = tasks.len();
+    let mut reparented = HashSet::with_capacity(tasks.len());
+
+    while let Some(res) = tasks.join_next().await {
+        match res {
+            Ok(Some(timeline)) => {
+                assert!(
+                    reparented.insert(timeline.timeline_id),
+                    "duplicate reparenting? timeline_id={}",
+                    timeline.timeline_id
+                );
+            }
+            Err(je) if je.is_cancelled() => unreachable!("not used"),
+            // just ignore failures now, we can retry
+            Ok(None) => {}
+            Err(je) if je.is_panic() => {}
+            Err(je) => tracing::error!("unexpected join error: {je:?}"),
+        }
+    }
+
+    let reparented_all = reparenting_candidates == reparented.len();
+
+    if reparented_all {
+        Ok(DetachingAndReparenting::Reparented(reparented))
+    } else {
+        tracing::info!(
+            reparented = reparented.len(),
+            candidates = reparenting_candidates,
+            "failed to reparent all candidates; they can be retried after the tenant_reset",
+        );
+
+        let must_reset_tenant = !reparented.is_empty() || was_detached;
+        Ok(DetachingAndReparenting::SomeReparentingFailed { must_reset_tenant })
+    }
+}
+
+pub(super) async fn complete(
+    detached: &Arc<Timeline>,
+    tenant: &Tenant,
+    mut attempt: Attempt,
+    _ctx: &RequestContext,
+) -> Result<(), Error> {
+    assert_eq!(detached.timeline_id, attempt.timeline_id);
+
+    if attempt.gate_entered.is_none() {
+        let entered = detached.gate.enter().map_err(|_| Error::ShuttingDown)?;
+        attempt.gate_entered = Some(entered);
+    } else {
+        // Some(gate_entered) means the tenant was not restarted, which is not required
+    }
+
+    assert!(detached.ancestor_timeline.is_none());
+
+    // this should be a 503 at least...?
+    fail::fail_point!(
+        "timeline-detach-ancestor::complete_before_uploading",
+        |_| Err(Error::Failpoint(
+            "timeline-detach-ancestor::complete_before_uploading"
+        ))
+    );
+
+    tenant
+        .gc_block
+        .remove(
+            detached,
+            crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor,
+        )
+        .await
+        .map_err(|e| Error::launder(e, Error::Complete))?;
+
+    Ok(())
+}
+
+/// Query against a locked `Tenant::timelines`.
+fn reparentable_timelines<'a, I>(
+    timelines: I,
+    detached: &'a Arc<Timeline>,
+    ancestor: &'a Arc<Timeline>,
+    ancestor_lsn: Lsn,
+) -> impl Iterator<Item = &'a Arc<Timeline>> + 'a
+where
+    I: Iterator<Item = &'a Arc<Timeline>> + 'a,
+{
+    timelines.filter_map(move |tl| {
+        if Arc::ptr_eq(tl, detached) {
+            return None;
+        }
+
+        let tl_ancestor = tl.ancestor_timeline.as_ref()?;
+        let is_same = Arc::ptr_eq(ancestor, tl_ancestor);
+        let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn;
+
+        let is_deleting = tl
+            .delete_progress
+            .try_lock()
+            .map(|flow| !flow.is_not_started())
+            .unwrap_or(true);
+
+        if is_same && is_earlier && !is_deleting {
+            Some(tl)
+        } else {
+            None
+        }
+    })
+}
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 01a5bfc32b..2f6cb4d73a 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -23,18 +23,19 @@ use std::{
 }
 
 use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold};
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
+use tracing::{debug, info, info_span, instrument, warn, Instrument};
 
 use crate::{
     context::{DownloadBehavior, RequestContext},
     pgdatadir_mapping::CollectKeySpaceError,
     task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
     tenant::{
-        tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant,
+        storage_layer::LayerVisibilityHint, tasks::BackgroundLoopKind, timeline::EvictionError,
+        LogicalSizeCalculationCause, Tenant,
     },
 };
 
-use utils::completion;
+use utils::{completion, sync::gate::GateGuard};
 
 use super::Timeline;
 
@@ -51,6 +52,7 @@ pub struct EvictionTaskTenantState {
 impl Timeline {
     pub(super) fn launch_eviction_task(
         self: &Arc<Self>,
+        parent: Arc<Tenant>,
         background_tasks_can_start: Option<&completion::Barrier>,
     ) {
         let self_clone = Arc::clone(self);
@@ -58,36 +60,41 @@ impl Timeline {
         task_mgr::spawn(
             BACKGROUND_RUNTIME.handle(),
             TaskKind::Eviction,
-            Some(self.tenant_shard_id),
+            self.tenant_shard_id,
             Some(self.timeline_id),
             &format!(
                 "layer eviction for {}/{}",
                 self.tenant_shard_id, self.timeline_id
             ),
-            false,
             async move {
-                let cancel = task_mgr::shutdown_token();
                 tokio::select! {
-                    _ = cancel.cancelled() => { return Ok(()); }
+                    _ = self_clone.cancel.cancelled() => { return Ok(()); }
                     _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
                 };
 
-                self_clone.eviction_task(cancel).await;
+                self_clone.eviction_task(parent).await;
                 Ok(())
             },
         );
     }
 
     #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
-    async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
+    async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>) {
         use crate::tenant::tasks::random_init_delay;
+
+        // acquire the gate guard only once within a useful span
+        let Ok(guard) = self.gate.enter() else {
+            return;
+        };
+
         {
             let policy = self.get_eviction_policy();
             let period = match policy {
                 EvictionPolicy::LayerAccessThreshold(lat) => lat.period,
+                EvictionPolicy::OnlyImitiate(lat) => lat.period,
                 EvictionPolicy::NoEviction => Duration::from_secs(10),
             };
-            if random_init_delay(period, &cancel).await.is_err() {
+            if random_init_delay(period, &self.cancel).await.is_err() {
                 return;
             }
         }
@@ -95,12 +102,14 @@ impl Timeline {
         let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn);
         loop {
             let policy = self.get_eviction_policy();
-            let cf = self.eviction_iteration(&policy, &cancel, &ctx).await;
+            let cf = self
+                .eviction_iteration(&tenant, &policy, &self.cancel, &guard, &ctx)
+                .await;
 
             match cf {
                 ControlFlow::Break(()) => break,
                 ControlFlow::Continue(sleep_until) => {
-                    if tokio::time::timeout_at(sleep_until, cancel.cancelled())
+                    if tokio::time::timeout_at(sleep_until, self.cancel.cancelled())
                         .await
                         .is_ok()
                     {
@@ -114,95 +123,84 @@ impl Timeline {
     #[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))]
     async fn eviction_iteration(
         self: &Arc<Self>,
+        tenant: &Tenant,
         policy: &EvictionPolicy,
         cancel: &CancellationToken,
+        gate: &GateGuard,
         ctx: &RequestContext,
     ) -> ControlFlow<(), Instant> {
         debug!("eviction iteration: {policy:?}");
-        match policy {
+        let start = Instant::now();
+        let (period, threshold) = match policy {
             EvictionPolicy::NoEviction => {
                 // check again in 10 seconds; XXX config watch mechanism
-                ControlFlow::Continue(Instant::now() + Duration::from_secs(10))
+                return ControlFlow::Continue(Instant::now() + Duration::from_secs(10));
             }
             EvictionPolicy::LayerAccessThreshold(p) => {
-                let start = Instant::now();
-                match self.eviction_iteration_threshold(p, cancel, ctx).await {
+                match self
+                    .eviction_iteration_threshold(tenant, p, cancel, gate, ctx)
+                    .await
+                {
                     ControlFlow::Break(()) => return ControlFlow::Break(()),
                     ControlFlow::Continue(()) => (),
                 }
-                let elapsed = start.elapsed();
-                crate::tenant::tasks::warn_when_period_overrun(
-                    elapsed,
-                    p.period,
-                    BackgroundLoopKind::Eviction,
-                );
-                crate::metrics::EVICTION_ITERATION_DURATION
-                    .get_metric_with_label_values(&[
-                        &format!("{}", p.period.as_secs()),
-                        &format!("{}", p.threshold.as_secs()),
-                    ])
-                    .unwrap()
-                    .observe(elapsed.as_secs_f64());
-                ControlFlow::Continue(start + p.period)
+                (p.period, p.threshold)
             }
-        }
+            EvictionPolicy::OnlyImitiate(p) => {
+                if self
+                    .imitiate_only(tenant, p, cancel, gate, ctx)
+                    .await
+                    .is_break()
+                {
+                    return ControlFlow::Break(());
+                }
+                (p.period, p.threshold)
+            }
+        };
+
+        let elapsed = start.elapsed();
+        crate::tenant::tasks::warn_when_period_overrun(
+            elapsed,
+            period,
+            BackgroundLoopKind::Eviction,
+        );
+        // FIXME: if we were to mix policies on a pageserver, we would have no way to sense this. I
+        // don't think that is a relevant fear however, and regardless the imitation should be the
+        // most costly part.
+        crate::metrics::EVICTION_ITERATION_DURATION
+            .get_metric_with_label_values(&[
+                &format!("{}", period.as_secs()),
+                &format!("{}", threshold.as_secs()),
+            ])
+            .unwrap()
+            .observe(elapsed.as_secs_f64());
+
+        ControlFlow::Continue(start + period)
     }
 
     async fn eviction_iteration_threshold(
         self: &Arc<Self>,
+        tenant: &Tenant,
         p: &EvictionPolicyLayerAccessThreshold,
         cancel: &CancellationToken,
+        gate: &GateGuard,
         ctx: &RequestContext,
     ) -> ControlFlow<()> {
         let now = SystemTime::now();
 
-        let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
-            BackgroundLoopKind::Eviction,
-            ctx,
-        );
+        let permit = self.acquire_imitation_permit(cancel, ctx).await?;
 
-        let _permit = tokio::select! {
-            permit = acquire_permit => permit,
-            _ = cancel.cancelled() => return ControlFlow::Break(()),
-            _ = self.cancel.cancelled() => return ControlFlow::Break(()),
-        };
+        self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx)
+            .await?;
 
-        // If we evict layers but keep cached values derived from those layers, then
-        // we face a storm of on-demand downloads after pageserver restart.
-        // The reason is that the restart empties the caches, and so, the values
-        // need to be re-computed by accessing layers, which we evicted while the
-        // caches were filled.
-        //
-        // Solutions here would be one of the following:
-        // 1. Have a persistent cache.
-        // 2. Count every access to a cached value to the access stats of all layers
-        //    that were accessed to compute the value in the first place.
-        // 3. Invalidate the caches at a period of < p.threshold/2, so that the values
-        //    get re-computed from layers, thereby counting towards layer access stats.
-        // 4. Make the eviction task imitate the layer accesses that typically hit caches.
-        //
-        // We follow approach (4) here because in Neon prod deployment:
-        // - page cache is quite small => high churn => low hit rate
-        //   => eviction gets correct access stats
-        // - value-level caches such as logical size & repatition have a high hit rate,
-        //   especially for inactive tenants
-        //   => eviction sees zero accesses for these
-        //   => they cause the on-demand download storm on pageserver restart
-        //
        // We should probably move to persistent caches in the future, or avoid
-        // having inactive tenants attached to pageserver in the first place.
-        match self.imitate_layer_accesses(p, cancel, ctx).await {
-            ControlFlow::Break(()) => return ControlFlow::Break(()),
-            ControlFlow::Continue(()) => (),
-        }
-
-        #[allow(dead_code)]
         #[derive(Debug, Default)]
         struct EvictionStats {
             candidates: usize,
             evicted: usize,
             errors: usize,
             not_evictable: usize,
+            timeouts: usize,
+            #[allow(dead_code)]
             skipped_for_shutdown: usize,
         }
 
@@ -213,67 +211,63 @@ impl Timeline {
         // So, we just need to deal with this.
 
-        if self.remote_client.is_none() {
-            error!("no remote storage configured, cannot evict layers");
-            return ControlFlow::Continue(());
-        }
-
         let mut js = tokio::task::JoinSet::new();
         {
             let guard = self.layers.read().await;
-            let layers = guard.layer_map();
-            for hist_layer in layers.iter_historic_layers() {
-                let hist_layer = guard.get_from_desc(&hist_layer);
-
-                // guard against eviction while we inspect it; it might be that eviction_task and
-                // disk_usage_eviction_task both select the same layers to be evicted, and
-                // seemingly free up double the space. both succeeding is of no consequence.
- let guard = match hist_layer.keep_resident().await { - Ok(Some(l)) => l, - Ok(None) => continue, - Err(e) => { - // these should not happen, but we cannot make them statically impossible right - // now. - tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}"); - continue; + guard + .likely_resident_layers() + .filter(|layer| { + let last_activity_ts = layer.latest_activity(); + + let no_activity_for = match now.duration_since(last_activity_ts) { + Ok(d) => d, + Err(_e) => { + // We reach here if `now` < `last_activity_ts`, which can legitimately + // happen if there is an access between us getting `now`, and us getting + // the access stats from the layer. + // + // The other reason why it can happen is system clock skew because + // SystemTime::now() is not monotonic, so, even if there is no access + // to the layer after we get `now` at the beginning of this function, + // it could be that `now` < `last_activity_ts`. + // + // To distinguish the cases, we would need to record `Instant`s in the + // access stats (i.e., monotonic timestamps), but then, the timestamps + // values in the access stats would need to be `Instant`'s, and hence + // they would be meaningless outside of the pageserver process. + // At the time of writing, the trade-off is that access stats are more + // valuable than detecting clock skew. + return false; + } + }; + + match layer.visibility() { + LayerVisibilityHint::Visible => { + // Usual case: a visible layer might be read any time, and we will keep it + // resident until it hits our configured TTL threshold. + no_activity_for > p.threshold + } + LayerVisibilityHint::Covered => { + // Covered layers: this is probably a layer that was recently covered by + // an image layer during compaction. We don't evict it immediately, but + // it doesn't stay resident for the full `threshold`: we just keep it + // for a shorter time in case + // - it is used for Timestamp->LSN lookups + // - a new branch is created in recent history which will read this layer + no_activity_for > p.period + } } - }; - - let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| { - // We only use this fallback if there's an implementation error. - // `latest_activity` already does rate-limited warn!() log. - debug!(layer=%hist_layer, "last_activity returns None, using SystemTime::now"); - SystemTime::now() - }); - - let no_activity_for = match now.duration_since(last_activity_ts) { - Ok(d) => d, - Err(_e) => { - // We reach here if `now` < `last_activity_ts`, which can legitimately - // happen if there is an access between us getting `now`, and us getting - // the access stats from the layer. - // - // The other reason why it can happen is system clock skew because - // SystemTime::now() is not monotonic, so, even if there is no access - // to the layer after we get `now` at the beginning of this function, - // it could be that `now` < `last_activity_ts`. - // - // To distinguish the cases, we would need to record `Instant`s in the - // access stats (i.e., monotonic timestamps), but then, the timestamps - // values in the access stats would need to be `Instant`'s, and hence - // they would be meaningless outside of the pageserver process. - // At the time of writing, the trade-off is that access stats are more - // valuable than detecting clock skew. 
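The comment block above hinges on `SystemTime::duration_since` being fallible: unlike `Instant`, `SystemTime` is not monotonic, so `now.duration_since(later_ts)` returns `Err` instead of a negative duration, and the eviction loop treats that case as "recently accessed, do not evict". A tiny demonstration of that behavior:

```rust
use std::time::{Duration, SystemTime};

fn main() {
    let now = SystemTime::now();
    let in_the_future = now + Duration::from_secs(5);

    // Ok: the argument is earlier than (or equal to) `now`.
    assert!(now.duration_since(now).is_ok());

    // Err: the argument is *later* than `now` -- the case the eviction code
    // maps to "do not evict", since it means either a fresh access or clock skew.
    let err = now.duration_since(in_the_future).unwrap_err();
    // The error still reports how far off the two timestamps were.
    assert_eq!(err.duration(), Duration::from_secs(5));
}
```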
-                    continue;
-                }
-            };
-            let layer = guard.drop_eviction_guard();
-            if no_activity_for > p.threshold {
-                // this could cause a lot of allocations in some cases
-                js.spawn(async move { layer.evict_and_wait().await });
+                })
+                .cloned()
+                .for_each(|layer| {
+                    js.spawn(async move {
+                        layer
+                            .evict_and_wait(std::time::Duration::from_secs(5))
+                            .await
+                    });
                     stats.candidates += 1;
-            }
-        }
+                });
         };
 
         let join_all = async move {
@@ -283,6 +277,9 @@ impl Timeline {
                     Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
                         stats.not_evictable += 1;
                     }
+                    Ok(Err(EvictionError::Timeout)) => {
+                        stats.timeouts += 1;
+                    }
                     Err(je) if je.is_cancelled() => unreachable!("not used"),
                     Err(je) if je.is_panic() => {
                         /* already logged */
@@ -298,7 +295,8 @@ impl Timeline {
             stats = join_all => {
                 if stats.candidates == stats.not_evictable {
                     debug!(stats=?stats, "eviction iteration complete");
-                } else if stats.errors > 0 || stats.not_evictable > 0 {
+                } else if stats.errors > 0 || stats.not_evictable > 0 || stats.timeouts > 0 {
+                    // reminder: timeouts are not eviction cancellations
                    warn!(stats=?stats, "eviction iteration complete");
                 } else {
                     info!(stats=?stats, "eviction iteration complete");
@@ -312,13 +310,80 @@ impl Timeline {
         ControlFlow::Continue(())
     }
 
+    /// Like `eviction_iteration_threshold`, but without any eviction. Eviction will be done by
+    /// disk usage based eviction task.
+    async fn imitiate_only(
+        self: &Arc<Self>,
+        tenant: &Tenant,
+        p: &EvictionPolicyLayerAccessThreshold,
+        cancel: &CancellationToken,
+        gate: &GateGuard,
+        ctx: &RequestContext,
+    ) -> ControlFlow<()> {
+        let permit = self.acquire_imitation_permit(cancel, ctx).await?;
+
+        self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx)
+            .await
+    }
+
+    async fn acquire_imitation_permit(
+        &self,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> {
+        let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
+            BackgroundLoopKind::Eviction,
+            ctx,
+        );
+
+        tokio::select! {
+            permit = acquire_permit => ControlFlow::Continue(permit),
+            _ = cancel.cancelled() => ControlFlow::Break(()),
+            _ = self.cancel.cancelled() => ControlFlow::Break(()),
+        }
+    }
+
+    /// If we evict layers but keep cached values derived from those layers, then
+    /// we face a storm of on-demand downloads after pageserver restart.
+    /// The reason is that the restart empties the caches, and so, the values
+    /// need to be re-computed by accessing layers, which we evicted while the
+    /// caches were filled.
+    ///
+    /// Solutions here would be one of the following:
+    /// 1. Have a persistent cache.
+    /// 2. Count every access to a cached value to the access stats of all layers
+    ///    that were accessed to compute the value in the first place.
+    /// 3. Invalidate the caches at a period of < p.threshold/2, so that the values
+    ///    get re-computed from layers, thereby counting towards layer access stats.
+    /// 4. Make the eviction task imitate the layer accesses that typically hit caches.
+    ///
+    /// We follow approach (4) here because in Neon prod deployment:
+    /// - page cache is quite small => high churn => low hit rate
+    ///   => eviction gets correct access stats
+    /// - value-level caches such as logical size & repartition have a high hit rate,
+    ///   especially for inactive tenants
+    ///   => eviction sees zero accesses for these
+    ///   => they cause the on-demand download storm on pageserver restart
+    ///
+    /// We should probably move to persistent caches in the future, or avoid
+    /// having inactive tenants attached to pageserver in the first place.
     #[instrument(skip_all)]
     async fn imitate_layer_accesses(
         &self,
+        tenant: &Tenant,
         p: &EvictionPolicyLayerAccessThreshold,
         cancel: &CancellationToken,
+        gate: &GateGuard,
+        permit: tokio::sync::SemaphorePermit<'static>,
         ctx: &RequestContext,
     ) -> ControlFlow<()> {
+        if !self.tenant_shard_id.is_shard_zero() {
+            // Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size
+            // for consumption metrics (consumption metrics are only sent from shard 0). We may therefore
+            // skip imitating logical size accesses for eviction purposes.
+            return ControlFlow::Continue(());
+        }
+
         let mut state = self.eviction_task_timeline_state.lock().await;
 
         // Only do the imitate_layer accesses approximately as often as the threshold. A little
@@ -328,7 +393,7 @@ impl Timeline {
         match state.last_layer_access_imitation {
             Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
             _ => {
-                self.imitate_timeline_cached_layer_accesses(ctx).await;
+                self.imitate_timeline_cached_layer_accesses(gate, ctx).await;
                 state.last_layer_access_imitation = Some(tokio::time::Instant::now())
             }
         }
@@ -342,17 +407,32 @@ impl Timeline {
         // Make one of the tenant's timelines draw the short straw and run the calculation.
         // The others wait until the calculation is done so that they take into account the
         // imitated accesses that the winner made.
-        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) {
-            Ok(t) => t,
-            Err(_) => {
-                return ControlFlow::Break(());
+        let (mut state, _permit) = {
+            if let Ok(locked) = tenant.eviction_task_tenant_state.try_lock() {
+                (locked, permit)
+            } else {
+                // we might need to wait for a long time here in case of pathological synthetic
+                // size calculation performance
+                drop(permit);
+                let locked = tokio::select! {
+                    locked = tenant.eviction_task_tenant_state.lock() => locked,
+                    _ = self.cancel.cancelled() => {
+                        return ControlFlow::Break(())
+                    },
+                    _ = cancel.cancelled() => {
+                        return ControlFlow::Break(())
+                    }
+                };
+                // then reacquire -- this will be bad if there is a lot of traffic, but because we
+                // released the permit, the overall latency will be much better.
+                let permit = self.acquire_imitation_permit(cancel, ctx).await?;
+                (locked, permit)
             }
         };
-        let mut state = tenant.eviction_task_tenant_state.lock().await;
         match state.last_layer_access_imitation {
             Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
             _ => {
-                self.imitate_synthetic_size_calculation_worker(&tenant, cancel, ctx)
+                self.imitate_synthetic_size_calculation_worker(tenant, cancel, ctx)
                     .await;
                 state.last_layer_access_imitation = Some(tokio::time::Instant::now());
             }
@@ -368,12 +448,21 @@ impl Timeline {
 
     /// Recompute the values which would cause on-demand downloads during restart.
     #[instrument(skip_all)]
-    async fn imitate_timeline_cached_layer_accesses(&self, ctx: &RequestContext) {
+    async fn imitate_timeline_cached_layer_accesses(
+        &self,
+        guard: &GateGuard,
+        ctx: &RequestContext,
+    ) {
         let lsn = self.get_last_record_lsn();
 
         // imitate on-restart initial logical size
         let size = self
-            .calculate_logical_size(lsn, LogicalSizeCalculationCause::EvictionTaskImitation, ctx)
+            .calculate_logical_size(
+                lsn,
+                LogicalSizeCalculationCause::EvictionTaskImitation,
+                guard,
+                ctx,
+            )
             .instrument(info_span!("calculate_logical_size"))
             .await;
 
@@ -417,7 +506,7 @@ impl Timeline {
     #[instrument(skip_all)]
     async fn imitate_synthetic_size_calculation_worker(
         &self,
-        tenant: &Arc<Tenant>,
+        tenant: &Tenant,
         cancel: &CancellationToken,
         ctx: &RequestContext,
     ) {
diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs
new file mode 100644
index 0000000000..e82559b8b3
--- /dev/null
+++ b/pageserver/src/tenant/timeline/handle.rs
@@ -0,0 +1,967 @@
+//! An efficient way to keep the timeline gate open without preventing
+//! timeline shutdown for longer than a single call to a timeline method.
+//!
+//! # Motivation
+//!
+//! On a single page service connection, we're typically serving a single TenantTimelineId.
+//!
+//! Without sharding, there is a single Timeline object to which we dispatch
+//! all requests. For example, a getpage request gets dispatched to the
+//! Timeline::get method of the Timeline object that represents the
+//! (tenant,timeline) of that connection.
+//!
+//! With sharding, for each request that comes in on the connection,
+//! we first have to perform shard routing based on the requested key (=~ page number).
+//! The result of shard routing is a Timeline object.
+//! We then dispatch the request to that Timeline object.
+//!
+//! Regardless of whether the tenant is sharded or not, we want to ensure that
+//! we hold the Timeline gate open while we're invoking the method on the
+//! Timeline object.
+//!
+//! However, we want to avoid the overhead of entering the gate for every
+//! method invocation.
+//!
+//! Further, for shard routing, we want to avoid calling the tenant manager to
+//! resolve the shard for every request. Instead, we want to cache the
+//! routing result so we can bypass the tenant manager for all subsequent requests
+//! that get routed to that shard.
+//!
+//! Regardless of how we accomplish the above, it should not
+//! prevent the Timeline from shutting down promptly.
+//!
+//! # Design
+//!
+//! There are three user-facing data structures:
+//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
+//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
+//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
+//!   Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request)
+//!
+//! The `Handle` is just a wrapper around an `Arc<HandleInner>`.
+//!
+//! There is one long-lived `Arc<HandleInner>`, which is stored in the `PerTimelineState`.
+//! The `Cache` stores a `Weak<HandleInner>` for each cached Timeline.
+//!
+//! To dispatch a request, the page service connection calls `Cache::get`.
+//!
+//! A cache miss means we consult the tenant manager for shard routing,
+//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and construct an
+//! `Arc<HandleInner>`. We store a `Weak<HandleInner>` in the cache
+//! and the `Arc<HandleInner>` in the `PerTimelineState`.
+//!
+//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
+//! and find the `Weak<HandleInner>` in the cache.
+//! We upgrade the `Weak<HandleInner>` to an `Arc<HandleInner>` and wrap it in the user-facing `Handle` type.
+//!
+//! The request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
+//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
+//!
+//! # Memory Management / How The Reference Cycle Is Broken
+//!
+//! The attentive reader may have noticed the strong reference cycle
+//! from `Arc<Timeline>` to `PerTimelineState` to `Arc<HandleInner>`.
+//!
+//! This cycle is intentional: while it exists, the `Cache` can upgrade its
+//! `Weak<HandleInner>` to an `Arc<HandleInner>` in a single atomic operation.
+//!
+//! The cycle is broken by either
+//! - `PerTimelineState::shutdown` or
+//! - dropping the `Cache`.
+//!
+//! Concurrently existing `Handle`s will extend the existence of the cycle.
+//! However, since `Handle`s are short-lived and new `Handle`s are not
+//! handed out after either `PerTimelineState::shutdown` or `Cache` drop,
+//! that extension of the cycle is bounded.
+//!
+//! # Fast Path for Shard Routing
+//!
+//! The `Cache` has a fast path for shard routing to avoid calling into
+//! the tenant manager for every request.
+//!
+//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak<HandleInner>`.
+//!
+//! The current implementation uses the first entry in the hash map
+//! to determine the `ShardParameters` and derive the correct
+//! `ShardIndex` for the requested key.
+//!
+//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`.
+//!
+//! If the lookup is successful and the `Weak<HandleInner>` can be upgraded,
+//! it's a hit.
+//!
+//! ## Cache invalidation
+//!
+//! The insight is that cache invalidation is sufficient and most efficiently done lazily.
+//! The only reasons why an entry in the cache can become stale are:
+//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is
+//!    being detached, timeline or shard deleted, or pageserver is shutting down.
+//! 2. We're doing a shard split and new traffic should be routed to the child shards.
+//!
+//! Regarding (1), we will eventually fail to upgrade the `Weak<HandleInner>` once the
+//! timeline has shut down, and when that happens, we remove the entry from the cache.
+//!
+//! Regarding (2), the insight is that it is totally fine to keep dispatching requests
+//! to the parent shard during a shard split. Eventually, the shard split task will
+//! shut down the parent => case (1).
+
+use std::collections::hash_map;
+use std::collections::HashMap;
+use std::sync::atomic::AtomicBool;
+use std::sync::atomic::Ordering;
+use std::sync::Arc;
+use std::sync::Mutex;
+use std::sync::Weak;
+
+use pageserver_api::shard::ShardIdentity;
+use tracing::instrument;
+use tracing::trace;
+use utils::id::TimelineId;
+use utils::shard::ShardIndex;
+use utils::shard::ShardNumber;
+
+use crate::tenant::mgr::ShardSelector;
+
+/// The requirement for Debug is so that #[derive(Debug)] works in some places.
+pub(crate) trait Types: Sized + std::fmt::Debug {
+    type TenantManagerError: Sized + std::fmt::Debug;
+    type TenantManager: TenantManager<Self> + Sized;
+    type Timeline: ArcTimeline<Self> + Sized;
+}
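The module comment above boils down to one mechanism: the per-connection cache holds a `Weak`, upgrades it on every request, and lazily evicts entries whose strong side has been shut down. Stripped of sharding, gates, and the tenant manager, the core upgrade-or-reinsert pattern looks like the following sketch; the names here are illustrative, not the module's real API:

```rust
use std::collections::HashMap;
use std::sync::{Arc, Weak};

struct Conn {
    cache: HashMap<u64, Weak<String>>, // id -> cached "timeline"
}

impl Conn {
    /// Upgrade the cached Weak; on failure, fall back to `resolve` and re-cache.
    fn get(&mut self, id: u64, resolve: impl Fn(u64) -> Arc<String>) -> Arc<String> {
        if let Some(weak) = self.cache.get(&id) {
            if let Some(strong) = weak.upgrade() {
                return strong; // fast path: a single atomic refcount bump
            }
            self.cache.remove(&id); // lazy invalidation: the owner dropped it
        }
        let strong = resolve(id); // slow path: consult the "tenant manager"
        self.cache.insert(id, Arc::downgrade(&strong));
        strong
    }
}

fn main() {
    let resolve = |_: u64| Arc::new("timeline-a".to_owned());
    let mut conn = Conn { cache: HashMap::new() };

    let owner = conn.get(1, resolve); // miss: resolves and caches a Weak
    let hit = conn.get(1, resolve);   // hit: upgraded from the Weak
    assert!(Arc::ptr_eq(&owner, &hit));

    drop(hit);
    drop(owner); // "shutdown": the strong side is gone, the Weak dangles
    let fresh = conn.get(1, resolve); // falls back to resolve, re-caches
    assert_eq!(&*fresh, "timeline-a");
}
```

The real module adds the pieces this sketch omits: the strong side lives in `PerTimelineState` (which is what `shutdown()` clears), each miss enters the timeline gate exactly once, and a `shut_down` flag makes even successful upgrades fail after shutdown.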
+/// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
+/// Required so [`Cache::drop`] can take out the handles from the [`PerTimelineState`].
+/// Alternative to this would be to allocate [`Cache`] in a `Box` and identify it by the pointer.
+#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
+struct CacheId(u64);
+
+impl CacheId {
+    fn next() -> Self {
+        static NEXT_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
+        let id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        if id == 0 {
+            panic!("CacheId::new() returned 0, overflow");
+        }
+        Self(id)
+    }
+}
+
+/// See module-level comment.
+pub(crate) struct Cache<T: Types> {
+    id: CacheId,
+    map: Map<T>,
+}
+
+type Map<T> = HashMap<ShardTimelineId, Weak<HandleInner<T>>>;
+
+impl<T: Types> Default for Cache<T> {
+    fn default() -> Self {
+        Self {
+            id: CacheId::next(),
+            map: Default::default(),
+        }
+    }
+}
+
+#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
+pub(crate) struct ShardTimelineId {
+    pub(crate) shard_index: ShardIndex,
+    pub(crate) timeline_id: TimelineId,
+}
+
+/// See module-level comment.
+pub(crate) struct Handle<T: Types>(Arc<HandleInner<T>>);
+struct HandleInner<T: Types> {
+    shut_down: AtomicBool,
+    timeline: T::Timeline,
+    // The timeline's gate held open.
+    _gate_guard: utils::sync::gate::GateGuard,
+}
+
+/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`.
+///
+/// See module-level comment for details.
+pub struct PerTimelineState<T: Types> {
+    // None = shutting down
+    handles: Mutex<Option<HashMap<CacheId, Arc<HandleInner<T>>>>>,
+}
+
+impl<T: Types> Default for PerTimelineState<T> {
+    fn default() -> Self {
+        Self {
+            handles: Mutex::new(Some(Default::default())),
+        }
+    }
+}
+
+/// Abstract view of [`crate::tenant::mgr`], for testability.
+pub(crate) trait TenantManager<T: Types> {
+    /// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`].
+    /// Errors are returned as [`GetError::TenantManager`].
+    async fn resolve(
+        &self,
+        timeline_id: TimelineId,
+        shard_selector: ShardSelector,
+    ) -> Result<T::Timeline, T::TenantManagerError>;
+}
+
+/// Abstract view of an [`Arc<Timeline>`], for testability.
+pub(crate) trait ArcTimeline<T: Types>: Clone {
+    fn gate(&self) -> &utils::sync::gate::Gate;
+    fn shard_timeline_id(&self) -> ShardTimelineId;
+    fn get_shard_identity(&self) -> &ShardIdentity;
+    fn per_timeline_state(&self) -> &PerTimelineState<T>;
+}
+
+/// Errors returned by [`Cache::get`].
+#[derive(Debug)]
+pub(crate) enum GetError<T: Types> {
+    TenantManager(T::TenantManagerError),
+    TimelineGateClosed,
+    PerTimelineStateShutDown,
+}
+
+/// Internal type used in [`Cache::get`].
+enum RoutingResult<T: Types> {
+    FastPath(Handle<T>),
+    SlowPath(ShardTimelineId),
+    NeedConsultTenantManager,
+}
+
+impl<T: Types> Cache<T> {
+    /// See module-level comment for details.
+    ///
+    /// Does NOT check for the shutdown state of [`Types::Timeline`].
+    /// Instead, the methods of [`Types::Timeline`] that are invoked through
+    /// the [`Handle`] are responsible for checking these conditions
+    /// and if so, return an error that causes the page service to
+    /// close the connection.
+    #[instrument(level = "trace", skip_all)]
+    pub(crate) async fn get(
+        &mut self,
+        timeline_id: TimelineId,
+        shard_selector: ShardSelector,
+        tenant_manager: &T::TenantManager,
+    ) -> Result<Handle<T>, GetError<T>> {
+        // terminates because each iteration removes an element from the map
+        loop {
+            let handle = self
+                .get_impl(timeline_id, shard_selector, tenant_manager)
+                .await?;
+            if handle.0.shut_down.load(Ordering::Relaxed) {
+                let removed = self
+                    .map
+                    .remove(&handle.0.timeline.shard_timeline_id())
+                    .expect("invariant of get_impl is that the returned handle is in the map");
+                assert!(
+                    Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)),
+                    "shard_timeline_id() incorrect?"
+                );
+            } else {
+                return Ok(handle);
+            }
+        }
+    }
+
+    #[instrument(level = "trace", skip_all)]
+    async fn get_impl(
+        &mut self,
+        timeline_id: TimelineId,
+        shard_selector: ShardSelector,
+        tenant_manager: &T::TenantManager,
+    ) -> Result<Handle<T>, GetError<T>> {
+        let miss: ShardSelector = {
+            let routing_state = self.shard_routing(timeline_id, shard_selector);
+            match routing_state {
+                RoutingResult::FastPath(handle) => return Ok(handle),
+                RoutingResult::SlowPath(key) => match self.map.get(&key) {
+                    Some(cached) => match cached.upgrade() {
+                        Some(upgraded) => return Ok(Handle(upgraded)),
+                        None => {
+                            trace!("handle cache stale");
+                            self.map.remove(&key).unwrap();
+                            ShardSelector::Known(key.shard_index)
+                        }
+                    },
+                    None => ShardSelector::Known(key.shard_index),
+                },
+                RoutingResult::NeedConsultTenantManager => shard_selector,
+            }
+        };
+        self.get_miss(timeline_id, miss, tenant_manager).await
+    }
+
+    #[inline(always)]
+    fn shard_routing(
+        &mut self,
+        timeline_id: TimelineId,
+        shard_selector: ShardSelector,
+    ) -> RoutingResult<T> {
+        loop {
+            // terminates because each iteration removes an element from the map
+            let Some((first_key, first_handle)) = self.map.iter().next() else {
+                return RoutingResult::NeedConsultTenantManager;
+            };
+            let Some(first_handle) = first_handle.upgrade() else {
+                // TODO: dedup with get()
+                trace!("handle cache stale");
+                let first_key_owned = *first_key;
+                self.map.remove(&first_key_owned).unwrap();
+                continue;
+            };
+
+            let first_handle_shard_identity = first_handle.timeline.get_shard_identity();
+            let make_shard_index = |shard_num: ShardNumber| ShardIndex {
+                shard_number: shard_num,
+                shard_count: first_handle_shard_identity.count,
+            };
+
+            let need_idx = match shard_selector {
+                ShardSelector::Page(key) => {
+                    make_shard_index(first_handle_shard_identity.get_shard_number(&key))
+                }
+                ShardSelector::Zero => make_shard_index(ShardNumber(0)),
+                ShardSelector::Known(shard_idx) => shard_idx,
+            };
+            let need_shard_timeline_id = ShardTimelineId {
+                shard_index: need_idx,
+                timeline_id,
+            };
+            let first_handle_shard_timeline_id = ShardTimelineId {
+                shard_index: first_handle_shard_identity.shard_index(),
+                timeline_id: first_handle.timeline.shard_timeline_id().timeline_id,
+            };
+
+            if need_shard_timeline_id == first_handle_shard_timeline_id {
+                return RoutingResult::FastPath(Handle(first_handle));
+            } else {
+                return RoutingResult::SlowPath(need_shard_timeline_id);
+            }
+        }
+    }
+
+    #[instrument(level = "trace", skip_all)]
+    #[inline(always)]
+    async fn get_miss(
+        &mut self,
+        timeline_id: TimelineId,
+        shard_selector: ShardSelector,
+        tenant_manager: &T::TenantManager,
+    ) -> Result<Handle<T>, GetError<T>> {
+        match tenant_manager.resolve(timeline_id, shard_selector).await {
+            Ok(timeline) => {
+                let key = timeline.shard_timeline_id();
+                match &shard_selector {
+                    ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
+                    ShardSelector::Page(_) => (), // gotta trust tenant_manager
+                    ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
+                }
+
+                let gate_guard = match timeline.gate().enter() {
+                    Ok(guard) => guard,
+                    Err(_) => {
+                        return Err(GetError::TimelineGateClosed);
+                    }
+                };
+                trace!("creating new HandleInner");
+                let handle = Arc::new(
+                    // TODO: global metric that keeps track of the number of live HandlerTimeline instances
+                    // so we can identify reference cycle bugs.
+                    HandleInner {
+                        shut_down: AtomicBool::new(false),
+                        _gate_guard: gate_guard,
+                        timeline: timeline.clone(),
+                    },
+                );
+                let handle = {
+                    let mut lock_guard = timeline
+                        .per_timeline_state()
+                        .handles
+                        .lock()
+                        .expect("mutex poisoned");
+                    match &mut *lock_guard {
+                        Some(per_timeline_state) => {
+                            let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle));
+                            assert!(replaced.is_none(), "some earlier code left a stale handle");
+                            match self.map.entry(key) {
+                                hash_map::Entry::Occupied(_o) => {
+                                    // This cannot happen because
+                                    // 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and
+                                    // 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle
+                                    //    while we were waiting for the tenant manager.
+                                    unreachable!()
+                                }
+                                hash_map::Entry::Vacant(v) => {
+                                    v.insert(Arc::downgrade(&handle));
+                                    handle
+                                }
+                            }
+                        }
+                        None => {
+                            return Err(GetError::PerTimelineStateShutDown);
+                        }
+                    }
+                };
+                Ok(Handle(handle))
+            }
+            Err(e) => Err(GetError::TenantManager(e)),
+        }
+    }
+}
+
+impl<T: Types> PerTimelineState<T> {
+    /// After this method returns, [`Cache::get`] will never again return a [`Handle`]
+    /// to the [`Types::Timeline`] that embeds this per-timeline state.
+    /// Even if [`TenantManager::resolve`] would still resolve to it.
+    ///
+    /// Already-alive [`Handle`]s will remain open, usable, and will keep the [`ArcTimeline`] alive.
+    /// That's ok because they're short-lived. See module-level comment for details.
+    #[instrument(level = "trace", skip_all)]
+    pub(super) fn shutdown(&self) {
+        let handles = self
+            .handles
+            .lock()
+            .expect("mutex poisoned")
+            // NB: this .take() sets locked to None.
+            // That's what makes future `Cache::get` misses fail.
+            // Cache hits are taken care of below.
+            .take();
+        let Some(handles) = handles else {
+            trace!("already shut down");
+            return;
+        };
+        for handle in handles.values() {
+            // Make hits fail.
+            handle.shut_down.store(true, Ordering::Relaxed);
+        }
+        drop(handles);
+    }
+}
+
+impl<T: Types> std::ops::Deref for Handle<T> {
+    type Target = T::Timeline;
+    fn deref(&self) -> &Self::Target {
+        &self.0.timeline
+    }
+}
+
+#[cfg(test)]
+impl<T: Types> Drop for HandleInner<T> {
+    fn drop(&mut self) {
+        trace!("HandleInner dropped");
+    }
+}
+
+// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle.
+impl<T: Types> Drop for Cache<T> {
+    fn drop(&mut self) {
+        for (_, weak) in self.map.drain() {
+            if let Some(strong) = weak.upgrade() {
+                // handle is still being kept alive in PerTimelineState
+                let timeline = strong.timeline.per_timeline_state();
+                let mut handles = timeline.handles.lock().expect("mutex poisoned");
+                if let Some(handles) = &mut *handles {
+                    let Some(removed) = handles.remove(&self.id) else {
+                        // There could have been a shutdown in between us upgrading the weak and locking the mutex.
+                        continue;
+                    };
+                    assert!(Arc::ptr_eq(&removed, &strong));
+                }
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use pageserver_api::{
+        key::{rel_block_to_key, Key, DBDIR_KEY},
+        models::ShardParameters,
+        reltag::RelTag,
+        shard::ShardStripeSize,
+    };
+    use utils::shard::ShardCount;
+
+    use super::*;
+
+    const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX);
+
+    #[derive(Debug)]
+    struct TestTypes;
+    impl Types for TestTypes {
+        type TenantManagerError = anyhow::Error;
+        type TenantManager = StubManager;
+        type Timeline = Arc<StubTimeline>;
+    }
+
+    struct StubManager {
+        shards: Vec<Arc<StubTimeline>>,
+    }
+
+    struct StubTimeline {
+        gate: utils::sync::gate::Gate,
+        id: TimelineId,
+        shard: ShardIdentity,
+        per_timeline_state: PerTimelineState<TestTypes>,
+        myself: Weak<StubTimeline>,
+    }
+
+    impl StubTimeline {
+        fn getpage(&self) {
+            // do nothing
+        }
+    }
+
+    impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
+        fn gate(&self) -> &utils::sync::gate::Gate {
+            &self.gate
+        }
+
+        fn shard_timeline_id(&self) -> ShardTimelineId {
+            ShardTimelineId {
+                shard_index: self.shard.shard_index(),
+                timeline_id: self.id,
+            }
+        }
+
+        fn get_shard_identity(&self) -> &ShardIdentity {
+            &self.shard
+        }
+
+        fn per_timeline_state(&self) -> &PerTimelineState<TestTypes> {
+            &self.per_timeline_state
+        }
+    }
+
+    impl TenantManager<TestTypes> for StubManager {
+        async fn resolve(
+            &self,
+            timeline_id: TimelineId,
+            shard_selector: ShardSelector,
+        ) -> anyhow::Result<Arc<StubTimeline>> {
+            for timeline in &self.shards {
+                if timeline.id == timeline_id {
+                    match &shard_selector {
+                        ShardSelector::Zero if timeline.shard.is_shard_zero() => {
+                            return Ok(Arc::clone(timeline));
+                        }
+                        ShardSelector::Zero => continue,
+                        ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
+                            return Ok(Arc::clone(timeline));
+                        }
+                        ShardSelector::Page(_) => continue,
+                        ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
+                            return Ok(Arc::clone(timeline));
+                        }
+                        ShardSelector::Known(_) => continue,
+                    }
+                }
+            }
+            anyhow::bail!("not found")
+        }
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn test_timeline_shutdown() {
+        crate::tenant::harness::setup_logging();
+
+        let timeline_id = TimelineId::generate();
+        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
+            gate: Default::default(),
+            id: timeline_id,
+            shard: ShardIdentity::unsharded(),
+            per_timeline_state: PerTimelineState::default(),
+            myself: myself.clone(),
+        });
+        let mgr = StubManager {
+            shards: vec![shard0.clone()],
+        };
+        let key = DBDIR_KEY;
+
+        let mut cache = Cache::<TestTypes>::default();
+
+        //
+        // fill the cache
+        //
+        assert_eq!(
+            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
+            (2, 1),
+            "strong: shard0, mgr; weak: myself"
+        );
+
+        let handle: Handle<_> = cache
+            .get(timeline_id, ShardSelector::Page(key), &mgr)
+            .await
+            .expect("we have the timeline");
+        let handle_inner_weak = Arc::downgrade(&handle.0);
+        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
+        assert_eq!(
+            (
+                Weak::strong_count(&handle_inner_weak),
+                Weak::weak_count(&handle_inner_weak)
+            ),
+            (2, 2),
+            "strong: handle, per_timeline_state, weak: handle_inner_weak, cache"
+        );
+        assert_eq!(cache.map.len(), 1);
+
+        assert_eq!(
+            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
+            (3, 1),
+            "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
+        );
+        drop(handle);
+        assert_eq!(
+            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
+            (3, 1),
+            "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
+        );
+
+        //
+        // demonstrate that Handle holds up gate closure
+        // but shutdown prevents new handles from being handed out
+        //
+
+        tokio::select! {
+            _ = shard0.gate.close() => {
+                panic!("cache and per-timeline handler state keep cache open");
+            }
+            _ = tokio::time::sleep(FOREVER) => {
+                // NB: first poll of close() makes it enter closing state
+            }
+        }
+
+        let handle = cache
+            .get(timeline_id, ShardSelector::Page(key), &mgr)
+            .await
+            .expect("we have the timeline");
+        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
+
+        // SHUTDOWN
+        shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown
+
+        assert_eq!(
+            1,
+            Weak::strong_count(&handle_inner_weak),
+            "through local var handle"
+        );
+        assert_eq!(
+            cache.map.len(),
+            1,
+            "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after"
+        );
+        assert_eq!(
+            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
+            (3, 1),
+            "strong: handleinner(via handle), shard0, mgr; weak: myself"
+        );
+
+        // this handle is perfectly usable
+        handle.getpage();
+
+        cache
+            .get(timeline_id, ShardSelector::Page(key), &mgr)
+            .await
+            .err()
+            .expect("documented behavior: can't get new handle after shutdown, even if there is an alive Handle");
+        assert_eq!(
+            cache.map.len(),
+            0,
+            "first access after shutdown cleans up the Weak's from the cache"
+        );
+
+        tokio::select! {
+            _ = shard0.gate.close() => {
+                panic!("handle is keeping gate open");
+            }
+            _ = tokio::time::sleep(FOREVER) => { }
+        }
+
+        drop(handle);
+        assert_eq!(
+            0,
+            Weak::strong_count(&handle_inner_weak),
+            "the HandleInner destructor already ran"
+        );
+        assert_eq!(
+            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
+            (2, 1),
+            "strong: shard0, mgr; weak: myself"
+        );
+
+        // closing gate succeeds after dropping handle
+        tokio::select! {
+            _ = shard0.gate.close() => { }
+            _ = tokio::time::sleep(FOREVER) => {
+                panic!("handle is dropped, no other gate holders exist")
+            }
+        }
+
+        // map gets cleaned on next lookup
+        cache
+            .get(timeline_id, ShardSelector::Page(key), &mgr)
+            .await
+            .err()
+            .expect("documented behavior: can't get new handle after shutdown");
+        assert_eq!(cache.map.len(), 0);
+
+        // ensure all refs to shard0 are gone and we're not leaking anything
+        let myself = Weak::clone(&shard0.myself);
+        drop(shard0);
+        drop(mgr);
+        assert_eq!(Weak::strong_count(&myself), 0);
+    }
+
+    #[tokio::test]
+    async fn test_multiple_timelines_and_deletion() {
+        crate::tenant::harness::setup_logging();
+
+        let timeline_a = TimelineId::generate();
+        let timeline_b = TimelineId::generate();
+        assert_ne!(timeline_a, timeline_b);
+        let timeline_a = Arc::new_cyclic(|myself| StubTimeline {
+            gate: Default::default(),
+            id: timeline_a,
+            shard: ShardIdentity::unsharded(),
+            per_timeline_state: PerTimelineState::default(),
+            myself: myself.clone(),
+        });
+        let timeline_b = Arc::new_cyclic(|myself| StubTimeline {
+            gate: Default::default(),
+            id: timeline_b,
+            shard: ShardIdentity::unsharded(),
+            per_timeline_state: PerTimelineState::default(),
+            myself: myself.clone(),
+        });
+        let mut mgr = StubManager {
+            shards: vec![timeline_a.clone(), timeline_b.clone()],
+        };
+        let key = DBDIR_KEY;
+
+        let mut cache = Cache::<TestTypes>::default();
+
+        cache
+            .get(timeline_a.id, ShardSelector::Page(key), &mgr)
+            .await
+            .expect("we have it");
+        cache
+            .get(timeline_b.id, ShardSelector::Page(key), &mgr)
+            .await
+            .expect("we have it");
+        assert_eq!(cache.map.len(), 2);
+
+        // delete timeline A
+        timeline_a.per_timeline_state.shutdown();
+        mgr.shards.retain(|t| t.id != timeline_a.id);
+        assert!(
+            mgr.resolve(timeline_a.id, ShardSelector::Page(key))
+                .await
+                .is_err(),
+            "broken StubManager implementation"
+        );
+
+        assert_eq!(
+            cache.map.len(),
+            2,
+            "cache still has a Weak handle to Timeline A"
+        );
+        cache
+            .get(timeline_a.id, ShardSelector::Page(key), &mgr)
+            .await
+            .err()
+            .expect("documented behavior: can't get new handle after shutdown");
+        assert_eq!(cache.map.len(), 1, "next access cleans up the cache");
+
+        cache
+            .get(timeline_b.id, ShardSelector::Page(key), &mgr)
+            .await
+            .expect("we still have it");
+    }
+
+    fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key {
+        rel_block_to_key(
+            RelTag {
+                spcnode: 1663,
+                dbnode: 208101,
+                relnode: 2620,
+                forknum: 0,
+            },
+            shard.0 as u32 * params.stripe_size.0,
+        )
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn test_shard_split() {
+        crate::tenant::harness::setup_logging();
+        let timeline_id = TimelineId::generate();
+        let parent = Arc::new_cyclic(|myself| StubTimeline {
+            gate: Default::default(),
+            id: timeline_id,
+            shard: ShardIdentity::unsharded(),
+            per_timeline_state: PerTimelineState::default(),
+            myself: myself.clone(),
+        });
+        let child_params = ShardParameters {
+            count: ShardCount(2),
+            stripe_size: ShardStripeSize::default(),
+        };
+        let child0 = Arc::new_cyclic(|myself| StubTimeline {
+            gate: Default::default(),
+            id: timeline_id,
+            shard: ShardIdentity::from_params(ShardNumber(0), &child_params),
+            per_timeline_state: PerTimelineState::default(),
+            myself: myself.clone(),
+        });
+        let child1 = Arc::new_cyclic(|myself| StubTimeline {
+            gate: Default::default(),
+            id: timeline_id,
+            shard: ShardIdentity::from_params(ShardNumber(1), &child_params),
+            per_timeline_state: PerTimelineState::default(),
+            myself: myself.clone(),
+        });
+        let child_shards_by_shard_number = [child0.clone(), child1.clone()];
+
+        let mut cache = Cache::<TestTypes>::default();
+
+        // fill the cache with the parent
+        for i in 0..2 {
+            let handle = cache
+                .get(
+                    timeline_id,
+                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
+                    &StubManager {
+                        shards: vec![parent.clone()],
+                    },
+                )
+                .await
+                .expect("we have it");
+            assert!(
+                Weak::ptr_eq(&handle.myself, &parent.myself),
+                "mgr returns parent first"
+            );
+            drop(handle);
+        }
+
+        //
+        // SHARD SPLIT: tenant manager changes, but the cache isn't informed
+        //
+
+        // while we haven't shut down the parent, the cache will return the cached parent, even
+        // if the tenant manager returns the child
+        for i in 0..2 {
+            let handle = cache
+                .get(
+                    timeline_id,
+                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
+                    &StubManager {
+                        shards: vec![], // doesn't matter what's in here, the cache is fully loaded
+                    },
+                )
+                .await
+                .expect("we have it");
+            assert!(
+                Weak::ptr_eq(&handle.myself, &parent.myself),
+                "mgr returns parent"
+            );
+            drop(handle);
+        }
+
+        let parent_handle = cache
+            .get(
+                timeline_id,
+                ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)),
+                &StubManager {
+                    shards: vec![parent.clone()],
+                },
+            )
+            .await
+            .expect("we have it");
+        assert!(Weak::ptr_eq(&parent_handle.myself, &parent.myself));
+
+        // invalidate the cache
+        parent.per_timeline_state.shutdown();
+
+        // the cache will now return the child, even though the parent handle still exists
+        for i in 0..2 {
+            let handle = cache
+                .get(
+                    timeline_id,
+                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
+                    &StubManager {
+                        shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to
previous loop + }, + ) + .await + .expect("we have it"); + assert!( + Weak::ptr_eq( + &handle.myself, + &child_shards_by_shard_number[i as usize].myself + ), + "mgr returns child" + ); + drop(handle); + } + + // all the while the parent handle kept the parent gate open + tokio::select! { + _ = parent_handle.gate.close() => { + panic!("parent handle is keeping gate open"); + } + _ = tokio::time::sleep(FOREVER) => { } + } + drop(parent_handle); + tokio::select! { + _ = parent.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("parent handle is dropped, no other gate holders exist") + } + } + } + + #[tokio::test(start_paused = true)] + async fn test_connection_handler_exit() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + let key = DBDIR_KEY; + + // Simulate 10 connections that's opened, used, and closed + let mut used_handles = vec![]; + for _ in 0..10 { + let mut cache = Cache::::default(); + let handle = { + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + handle + }; + handle.getpage(); + used_handles.push(Arc::downgrade(&handle.0)); + } + + // No handles exist, thus gates are closed and don't require shutdown + assert!(used_handles + .iter() + .all(|weak| Weak::strong_count(weak) == 0)); + + // ... thus the gate should close immediately, even without shutdown + tokio::select! { + _ = shard0.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("handle is dropped, no other gate holders exist") + } + } + } +} diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 916ebfc6d9..5bc67c7133 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -6,31 +6,29 @@ use crate::{ self, index::{IndexPart, LayerFileMetadata}, }, - storage_layer::LayerFileName, - Generation, + storage_layer::LayerName, }, - METADATA_FILE_NAME, }; use anyhow::Context; -use camino::Utf8Path; -use pageserver_api::shard::ShardIndex; -use std::{collections::HashMap, str::FromStr}; +use camino::{Utf8Path, Utf8PathBuf}; +use std::{ + collections::{hash_map, HashMap}, + str::FromStr, +}; use utils::lsn::Lsn; /// Identified files in the timeline directory. 
 pub(super) enum Discovered {
     /// The only one we care about
-    Layer(LayerFileName, u64),
+    Layer(LayerName, LocalLayerFileMetadata),
     /// Old ephmeral files from previous launches, should be removed
     Ephemeral(String),
     /// Old temporary timeline files, unsure what these really are, should be removed
     Temporary(String),
     /// Temporary on-demand download files, should be removed
     TemporaryDownload(String),
-    /// "metadata" file we persist locally and include in `index_part.json`
-    Metadata,
     /// Backup file from previously future layers
-    IgnoredBackup,
+    IgnoredBackup(Utf8PathBuf),
     /// Unrecognized, warn about these
     Unknown(String),
 }
@@ -43,17 +41,18 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovered>> {
                 let file_size = direntry.metadata()?.len();
-                Discovered::Layer(file_name, file_size)
+                Discovered::Layer(
+                    file_name,
+                    LocalLayerFileMetadata::new(direntry.path().to_owned(), file_size),
+                )
             }
             Err(_) => {
-                if file_name == METADATA_FILE_NAME {
-                    Discovered::Metadata
-                } else if file_name.ends_with(".old") {
+                if file_name.ends_with(".old") {
                     // ignore these
-                    Discovered::IgnoredBackup
+                    Discovered::IgnoredBackup(direntry.path().to_owned())
                 } else if remote_timeline_client::is_temp_download_file(direntry.path()) {
                     Discovered::TemporaryDownload(file_name)
                 } else if is_ephemeral_file(&file_name) {
@@ -72,19 +71,36 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovered>> {
+#[derive(Clone, Debug)]
+pub(super) struct LocalLayerFileMetadata {
+    pub(super) local_path: Utf8PathBuf,
+    pub(super) file_size: u64,
+}
+
+impl LocalLayerFileMetadata {
+    pub(super) fn new(local_path: Utf8PathBuf, file_size: u64) -> Self {
+        Self {
+            local_path,
+            file_size,
+        }
+    }
+}
 
+/// For a layer that is present in remote metadata, this type describes how to handle
+/// it during startup: it is either Resident (and we have some metadata about a local file),
+/// or it is Evicted (and we only have remote metadata).
 #[derive(Clone, Debug)]
 pub(super) enum Decision {
     /// The layer is not present locally.
     Evicted(LayerFileMetadata),
-    /// The layer is present locally, but local metadata does not match remote; we must
-    /// delete it and treat it as evicted.
-    UseRemote {
-        local: LayerFileMetadata,
+    /// The layer is present locally, and metadata matches: we may hook up this layer to the
+    /// existing file in local storage.
+    Resident {
+        local: LocalLayerFileMetadata,
        remote: LayerFileMetadata,
     },
-    /// The layer is present locally, and metadata matches.
-    UseLocal(LayerFileMetadata),
 }
 
 /// A layer needs to be left out of the layer map.
@@ -92,82 +108,89 @@ pub(super) enum Decision {
 pub(super) enum DismissedLayer {
     /// The related layer is is in future compared to disk_consistent_lsn, it must not be loaded.
     Future {
-        /// The local metadata. `None` if the layer is only known through [`IndexPart`].
-        local: Option<LayerFileMetadata>,
+        /// `None` if the layer is only known through [`IndexPart`].
+        local: Option<LocalLayerFileMetadata>,
     },
     /// The layer only exists locally.
     ///
     /// In order to make crash safe updates to layer map, we must dismiss layers which are only
     /// found locally or not yet included in the remote `index_part.json`.
-    LocalOnly(LayerFileMetadata),
+    LocalOnly(LocalLayerFileMetadata),
+
+    /// The layer exists in remote storage but the local layer's metadata (e.g. file size)
+    /// does not match it
+    BadMetadata(LocalLayerFileMetadata),
 }
 
 /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
 pub(super) fn reconcile(
-    discovered: Vec<(LayerFileName, u64)>,
+    local_layers: Vec<(LayerName, LocalLayerFileMetadata)>,
     index_part: Option<&IndexPart>,
     disk_consistent_lsn: Lsn,
-    generation: Generation,
-    shard: ShardIndex,
-) -> Vec<(LayerFileName, Result<Decision, DismissedLayer>)> {
-    use Decision::*;
+) -> Vec<(LayerName, Result<Decision, DismissedLayer>)> {
+    let Some(index_part) = index_part else {
+        // If we have no remote metadata, no local layer files are considered valid to load
+        return local_layers
+            .into_iter()
+            .map(|(layer_name, local_metadata)| {
+                (layer_name, Err(DismissedLayer::LocalOnly(local_metadata)))
+            })
+            .collect();
+    };
 
-    // name => (local, remote)
-    type Collected = HashMap<LayerFileName, (Option<LayerFileMetadata>, Option<LayerFileMetadata>)>;
+    let mut result = Vec::new();
 
-    let mut discovered = discovered
-        .into_iter()
-        .map(|(name, file_size)| {
-            (
-                name,
-                // The generation and shard here will be corrected to match IndexPart in the merge below, unless
-                // it is not in IndexPart, in which case using our current generation makes sense
-                // because it will be uploaded in this generation.
-                (
-                    Some(LayerFileMetadata::new(file_size, generation, shard)),
-                    None,
-                ),
-            )
-        })
-        .collect::<Collected>();
+    let mut remote_layers = HashMap::new();
 
-    // merge any index_part information, when available
+    // Construct Decisions for layers that are found locally, if they're in remote metadata. Otherwise
+    // construct DismissedLayers to get rid of them.
+    for (layer_name, local_metadata) in local_layers {
+        let Some(remote_metadata) = index_part.layer_metadata.get(&layer_name) else {
+            result.push((layer_name, Err(DismissedLayer::LocalOnly(local_metadata))));
+            continue;
+        };
+
+        if remote_metadata.file_size != local_metadata.file_size {
+            result.push((layer_name, Err(DismissedLayer::BadMetadata(local_metadata))));
+            continue;
+        }
+
+        remote_layers.insert(
+            layer_name,
+            Decision::Resident {
+                local: local_metadata,
+                remote: remote_metadata.clone(),
+            },
+        );
+    }
+
+    // Construct Decision for layers that were not found locally
     index_part
-        .as_ref()
-        .map(|ip| ip.layer_metadata.iter())
-        .into_iter()
-        .flatten()
-        .map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
+        .layer_metadata
+        .iter()
        .for_each(|(name, metadata)| {
-            if let Some(existing) = discovered.get_mut(name) {
-                existing.1 = Some(metadata);
-            } else {
-                discovered.insert(name.to_owned(), (None, Some(metadata)));
+            if let hash_map::Entry::Vacant(entry) = remote_layers.entry(name.clone()) {
+                entry.insert(Decision::Evicted(metadata.clone()));
            }
        });
 
-    discovered
-        .into_iter()
-        .map(|(name, (local, remote))| {
-            let decision = if name.is_in_future(disk_consistent_lsn) {
-                Err(DismissedLayer::Future { local })
-            } else {
-                match (local, remote) {
-                    (Some(local), Some(remote)) if local != remote => {
-                        Ok(UseRemote { local, remote })
-                    }
-                    (Some(x), Some(_)) => Ok(UseLocal(x)),
-                    (None, Some(x)) => Ok(Evicted(x)),
-                    (Some(x), None) => Err(DismissedLayer::LocalOnly(x)),
-                    (None, None) => {
-                        unreachable!("there must not be any non-local non-remote files")
-                    }
-                }
-            };
+    // For layers that were found in authoritative remote metadata, apply a final check that they are within
+    // the disk_consistent_lsn.
+    result.extend(remote_layers.into_iter().map(|(name, decision)| {
+        if name.is_in_future(disk_consistent_lsn) {
+            match decision {
+                Decision::Evicted(_remote) => (name, Err(DismissedLayer::Future { local: None })),
+                Decision::Resident {
+                    local,
+                    remote: _remote,
+                } => (name, Err(DismissedLayer::Future { local: Some(local) })),
+            }
+        } else {
+            (name, Ok(decision))
+        }
+    }));
 
-            (name, decision)
-        })
-        .collect::<Vec<_>>()
+    result
 }
 
 pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
@@ -176,30 +199,20 @@
     std::fs::remove_file(path).with_context(|| format!("failed to remove {kind} at {path}"))
 }
 
-pub(super) fn cleanup_local_file_for_remote(
-    path: &Utf8Path,
-    local: &LayerFileMetadata,
-    remote: &LayerFileMetadata,
-) -> anyhow::Result<()> {
-    let local_size = local.file_size();
-    let remote_size = remote.file_size();
-
+pub(super) fn cleanup_local_file_for_remote(local: &LocalLayerFileMetadata) -> anyhow::Result<()> {
+    let local_size = local.file_size;
+    let path = &local.local_path;
     let file_name = path.file_name().expect("must be file path");
-    tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
-    if let Err(err) = crate::tenant::timeline::rename_to_backup(path) {
-        assert!(
-            path.exists(),
-            "we would leave the local_layer without a file if this does not hold: {path}",
-        );
-        Err(err)
-    } else {
-        Ok(())
-    }
+    tracing::warn!(
+        "removing local file {file_name:?} because it has unexpected length {local_size};"
+    );
+
+    std::fs::remove_file(path).with_context(|| format!("failed to remove layer at {path}"))
 }
 
 pub(super) fn cleanup_future_layer(
     path: &Utf8Path,
-    name: &LayerFileName,
+    name: &LayerName,
     disk_consistent_lsn: Lsn,
 ) -> anyhow::Result<()> {
     // future image layers are allowed to be produced always for not yet flushed to disk
@@ -211,12 +224,14 @@
 }
 
 pub(super) fn cleanup_local_only_file(
-    path: &Utf8Path,
-    name: &LayerFileName,
-    local: &LayerFileMetadata,
+    name: &LayerName,
+    local: &LocalLayerFileMetadata,
 ) -> anyhow::Result<()> {
     let kind = name.kind();
-    tracing::info!("found local-only {kind} layer {name}, metadata {local:?}");
-    std::fs::remove_file(path)?;
+    tracing::info!(
+        "found local-only {kind} layer {name} size {}",
+        local.file_size
+    );
+    std::fs::remove_file(&local.local_path)?;
     Ok(())
 }
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index e38f5be209..8f20d84401 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -1,4 +1,5 @@
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::{bail, ensure, Context};
+use itertools::Itertools;
 use pageserver_api::shard::TenantShardId;
 use std::{collections::HashMap, sync::Arc};
 use tracing::trace;
@@ -9,6 +10,7 @@ use utils::{
 
 use crate::{
     config::PageServerConf,
+    context::RequestContext,
     metrics::TimelineMetrics,
     tenant::{
         layer_map::{BatchedUpdates, LayerMap},
@@ -19,42 +21,145 @@ use crate::{
     },
 };
 
+use super::TimelineWriterState;
+
 /// Provides semantic APIs to manipulate the layer map.
-pub(crate) struct LayerManager {
-    layer_map: LayerMap,
-    layer_fmgr: LayerFileManager<Layer>,
+pub(crate) enum LayerManager {
+    /// An open (i.e., not yet shut down) layer manager: we still have in-memory layers and we can
+    /// manipulate the layers.
+    Open(OpenLayerManager),
+    /// Shutdown layer manager where there are no more in-memory layers and persistent layers are
+    /// read-only.
+    Closed {
+        layers: HashMap<PersistentLayerKey, Layer>,
+    },
+}
+
+impl Default for LayerManager {
+    fn default() -> Self {
+        LayerManager::Open(OpenLayerManager::default())
+    }
+}
 
 impl LayerManager {
-    pub(crate) fn create() -> Self {
-        Self {
-            layer_map: LayerMap::default(),
-            layer_fmgr: LayerFileManager::new(),
-        }
+    pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
+        // The assumption for the `expect()` is that all code maintains the following invariant:
+        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
+        self.layers()
+            .get(key)
+            .with_context(|| format!("get layer from key: {key}"))
+            .expect("not found")
+            .clone()
     }
 
     pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
-        self.layer_fmgr.get_from_desc(desc)
+        self.get_from_key(&desc.key())
     }
 
     /// Get an immutable reference to the layer map.
     ///
     /// We expect users only to be able to get an immutable layer map. If users want to make modifications,
     /// they should use the below semantic APIs. This design makes us step closer to immutable storage state.
-    pub(crate) fn layer_map(&self) -> &LayerMap {
-        &self.layer_map
+    pub(crate) fn layer_map(&self) -> Result<&LayerMap, Shutdown> {
+        use LayerManager::*;
+        match self {
+            Open(OpenLayerManager { layer_map, .. }) => Ok(layer_map),
+            Closed { .. } => Err(Shutdown),
+        }
     }
 
+    pub(crate) fn open_mut(&mut self) -> Result<&mut OpenLayerManager, Shutdown> {
+        use LayerManager::*;
+
+        match self {
+            Open(open) => Ok(open),
+            Closed { .. } => Err(Shutdown),
+        }
+    }
+
+    /// LayerManager shutdown. The in-memory layers do cleanup on drop, so we must drop them in
+    /// order to allow shutdown to complete.
+    ///
+    /// If there was a want to flush in-memory layers, it must have happened earlier.
+    pub(crate) fn shutdown(&mut self, writer_state: &mut Option<TimelineWriterState>) {
+        use LayerManager::*;
+        match self {
+            Open(OpenLayerManager {
+                layer_map,
+                layer_fmgr: LayerFileManager(hashmap),
+            }) => {
+                let open = layer_map.open_layer.take();
+                let frozen = layer_map.frozen_layers.len();
+                let taken_writer_state = writer_state.take();
+                tracing::info!(open = open.is_some(), frozen, "dropped inmemory layers");
+                let layers = std::mem::take(hashmap);
+                *self = Closed { layers };
+                assert_eq!(open.is_some(), taken_writer_state.is_some());
+            }
+            Closed { .. } => {
+                tracing::debug!("ignoring multiple shutdowns on layer manager")
+            }
+        }
+    }
+
+    /// Sum up the historic layer sizes
+    pub(crate) fn layer_size_sum(&self) -> u64 {
+        self.layers()
+            .values()
+            .map(|l| l.layer_desc().file_size)
+            .sum()
+    }
+
+    pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = &Layer> + '_ {
+        self.layers().values().filter(|l| l.is_likely_resident())
+    }
+
+    pub(crate) fn contains(&self, layer: &Layer) -> bool {
+        self.contains_key(&layer.layer_desc().key())
+    }
+
+    pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
+        self.layers().contains_key(key)
+    }
+
+    pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
+        self.layers().keys().cloned().collect_vec()
+    }
+
+    fn layers(&self) -> &HashMap<PersistentLayerKey, Layer> {
+        use LayerManager::*;
+        match self {
+            Open(OpenLayerManager { layer_fmgr, .. }) => &layer_fmgr.0,
+            Closed { layers } => layers,
+        }
+    }
+}
+
+#[derive(Default)]
+pub(crate) struct OpenLayerManager {
+    layer_map: LayerMap,
+    layer_fmgr: LayerFileManager<Layer>,
+}
+
+impl std::fmt::Debug for OpenLayerManager {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("OpenLayerManager")
+            .field("layer_count", &self.layer_fmgr.0.len())
+            .finish()
+    }
+}
+
+#[derive(Debug, thiserror::Error)]
+#[error("layer manager has been shutdown")]
+pub(crate) struct Shutdown;
+
+impl OpenLayerManager {
     /// Called from `load_layer_map`. Initialize the layer manager with:
     /// 1. all on-disk layers
     /// 2. next open layer (with disk disk_consistent_lsn LSN)
-    pub(crate) fn initialize_local_layers(
-        &mut self,
-        on_disk_layers: Vec<Layer>,
-        next_open_layer_at: Lsn,
-    ) {
+    pub(crate) fn initialize_local_layers(&mut self, layers: Vec<Layer>, next_open_layer_at: Lsn) {
         let mut updates = self.layer_map.batch_update();
-        for layer in on_disk_layers {
+        for layer in layers {
             Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
         }
         updates.flush();
@@ -66,25 +171,19 @@ impl LayerManager {
         self.layer_map.next_open_layer_at = Some(next_open_layer_at);
     }
 
-    /// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
-    /// called within `get_layer_for_write`.
+    /// Open a new writable layer to append data if there is no open layer, otherwise return the
+    /// current open layer, called within `get_layer_for_write`.
     pub(crate) async fn get_layer_for_write(
         &mut self,
         lsn: Lsn,
-        last_record_lsn: Lsn,
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
-    ) -> Result<Arc<InMemoryLayer>> {
+        gate_guard: utils::sync::gate::GateGuard,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Arc<InMemoryLayer>> {
         ensure!(lsn.is_aligned());
 
-        ensure!(
-            lsn > last_record_lsn,
-            "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
-            lsn,
-            last_record_lsn,
-        );
-
         // Do we have a layer open for writing already?
         let layer = if let Some(open_layer) = &self.layer_map.open_layer {
             if open_layer.get_lsn_range().start > lsn {
@@ -110,8 +209,15 @@ impl LayerManager {
             lsn
         );
 
-        let new_layer =
-            InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn).await?;
+        let new_layer = InMemoryLayer::create(
+            conf,
+            timeline_id,
+            tenant_shard_id,
+            start_lsn,
+            gate_guard,
+            ctx,
+        )
+        .await?;
         let layer = Arc::new(new_layer);
 
         self.layer_map.open_layer = Some(layer.clone());
@@ -123,17 +229,20 @@ impl LayerManager {
         Ok(layer)
     }
 
-    /// Called from `freeze_inmem_layer`, returns true if successfully frozen.
-    pub(crate) async fn try_freeze_in_memory_layer(
+    /// Tries to freeze an open layer and also manages clearing the TimelineWriterState.
+    ///
+    /// Returns true if anything was frozen.
+    pub(super) async fn try_freeze_in_memory_layer(
         &mut self,
-        Lsn(last_record_lsn): Lsn,
+        lsn: Lsn,
         last_freeze_at: &AtomicLsn,
-    ) {
+        write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
+    ) -> bool {
+        let Lsn(last_record_lsn) = lsn;
         let end_lsn = Lsn(last_record_lsn + 1);
 
-        if let Some(open_layer) = &self.layer_map.open_layer {
+        let froze = if let Some(open_layer) = &self.layer_map.open_layer {
            let open_layer_rc = Arc::clone(open_layer);
-            // Does this layer need freezing?
            open_layer.freeze(end_lsn).await;
 
            // The layer is no longer open, update the layer map to reflect this.
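To make the new shape of this file easier to follow: LayerManager is now a two-state machine, where reads work in either state and mutation is only possible while Open. The following stand-alone Rust sketch reproduces just that pattern; Manager, the String keys, and the u64 values are illustrative stand-ins, not the pageserver's actual LayerMap/Layer types.

// Hedged sketch of the Open/Closed state-machine pattern, with simplified
// stand-in types. Not the actual pageserver implementation.
use std::collections::HashMap;

#[derive(Debug)]
struct Shutdown;

enum Manager {
    Open { layer_fmgr: HashMap<String, u64> },
    Closed { layers: HashMap<String, u64> },
}

impl Manager {
    // Read-only accessors work in both states...
    fn layers(&self) -> &HashMap<String, u64> {
        match self {
            Manager::Open { layer_fmgr } => layer_fmgr,
            Manager::Closed { layers } => layers,
        }
    }

    // ...while mutation requires the Open state, enforced by the type system.
    fn open_mut(&mut self) -> Result<&mut HashMap<String, u64>, Shutdown> {
        match self {
            Manager::Open { layer_fmgr } => Ok(layer_fmgr),
            Manager::Closed { .. } => Err(Shutdown),
        }
    }

    // Shutdown moves the read-only view into the Closed state; it is idempotent.
    fn shutdown(&mut self) {
        if let Manager::Open { layer_fmgr } = self {
            let layers = std::mem::take(layer_fmgr);
            *self = Manager::Closed { layers };
        }
    }
}

fn main() {
    let mut mgr = Manager::Open { layer_fmgr: HashMap::new() };
    mgr.open_mut().unwrap().insert("layer-1".to_string(), 42);
    mgr.shutdown();
    assert_eq!(mgr.layers().len(), 1); // reads still work after shutdown
    assert!(mgr.open_mut().is_err()); // writes now fail with Shutdown
}

Modelling shutdown as a state transition rather than a boolean flag means the compiler, not runtime discipline, keeps mutating call sites honest.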
@@ -141,11 +250,28 @@ impl LayerManager {
             self.layer_map.frozen_layers.push_back(open_layer_rc);
             self.layer_map.open_layer = None;
             self.layer_map.next_open_layer_at = Some(end_lsn);
-            last_freeze_at.store(end_lsn);
-        }
+
+            true
+        } else {
+            false
+        };
+
+        // Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this
+        // accounts for regions in the LSN range where we might have ingested no data due to sharding.
+        last_freeze_at.store(end_lsn);
+
+        // the writer state must no longer have a reference to the frozen layer
+        let taken = write_lock.take();
+        assert_eq!(
+            froze,
+            taken.is_some(),
+            "should only have frozen a layer when TimelineWriterState existed"
+        );
+
+        froze
     }
 
-    /// Add image layers to the layer map, called from `create_image_layers`.
+    /// Add image layers to the layer map, called from [`super::Timeline::create_image_layers`].
     pub(crate) fn track_new_image_layers(
         &mut self,
         image_layers: &[ResidentLayer],
@@ -207,6 +333,61 @@
         updates.flush();
     }
 
+    /// Called when a GC-compaction is completed.
+    pub(crate) fn finish_gc_compaction(
+        &mut self,
+        compact_from: &[Layer],
+        compact_to: &[ResidentLayer],
+        metrics: &TimelineMetrics,
+    ) {
+        // We can simply reuse compact l0 logic. Use a different function name to indicate a different type of layer map modification.
+        self.finish_compact_l0(compact_from, compact_to, metrics)
+    }
+
+    /// Called post-compaction when some previous generation image layers were trimmed.
+    pub(crate) fn rewrite_layers(
+        &mut self,
+        rewrite_layers: &[(Layer, ResidentLayer)],
+        drop_layers: &[Layer],
+        metrics: &TimelineMetrics,
+    ) {
+        let mut updates = self.layer_map.batch_update();
+        for (old_layer, new_layer) in rewrite_layers {
+            debug_assert_eq!(
+                old_layer.layer_desc().key_range,
+                new_layer.layer_desc().key_range
+            );
+            debug_assert_eq!(
+                old_layer.layer_desc().lsn_range,
+                new_layer.layer_desc().lsn_range
+            );
+
+            // Transfer visibility hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to
+            // be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents
+            // always marking rewritten layers as visible.
+            new_layer.as_ref().set_visibility(old_layer.visibility());
+
+            // Safety: we may never rewrite the same file in-place. Callers are responsible
+            // for ensuring that they only rewrite layers after something changes the path,
+            // such as an increment in the generation number.
+            assert_ne!(old_layer.local_path(), new_layer.local_path());
+
+            Self::delete_historic_layer(old_layer, &mut updates, &mut self.layer_fmgr);
+
+            Self::insert_historic_layer(
+                new_layer.as_ref().clone(),
+                &mut updates,
+                &mut self.layer_fmgr,
+            );
+
+            metrics.record_new_file_metrics(new_layer.layer_desc().file_size);
+        }
+        for l in drop_layers {
+            Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
+        }
+        updates.flush();
+    }
+
     /// Called when garbage collect has selected the layers to be removed.
     pub(crate) fn finish_gc_timeline(&mut self, gc_layers: &[Layer]) {
         let mut updates = self.layer_map.batch_update();
@@ -216,6 +397,13 @@
         updates.flush()
     }
 
+    #[cfg(test)]
+    pub(crate) fn force_insert_layer(&mut self, layer: ResidentLayer) {
+        let mut updates = self.layer_map.batch_update();
+        Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
+        updates.flush()
+    }
+
     /// Helper function to insert a layer into the layer map and file manager.
     fn insert_historic_layer(
         layer: Layer,
@@ -245,25 +433,17 @@
         mapping.remove(layer);
         layer.delete_on_drop();
     }
-
-    pub(crate) fn contains(&self, layer: &Layer) -> bool {
-        self.layer_fmgr.contains(layer)
-    }
 }
 
 pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
 
-impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
-    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
-        // The assumption for the `expect()` is that all code maintains the following invariant:
-        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
-        self.0
-            .get(&desc.key())
-            .with_context(|| format!("get layer from desc: {}", desc.filename()))
-            .expect("not found")
-            .clone()
+impl<T> Default for LayerFileManager<T> {
+    fn default() -> Self {
+        Self(HashMap::default())
     }
+}
 
+impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
     pub(crate) fn insert(&mut self, layer: T) {
         let present = self.0.insert(layer.layer_desc().key(), layer.clone());
         if present.is_some() && cfg!(debug_assertions) {
@@ -271,14 +451,6 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
         }
     }
 
-    pub(crate) fn contains(&self, layer: &T) -> bool {
-        self.0.contains_key(&layer.layer_desc().key())
-    }
-
-    pub(crate) fn new() -> Self {
-        Self(HashMap::new())
-    }
-
     pub(crate) fn remove(&mut self, layer: &T) {
         let present = self.0.remove(&layer.layer_desc().key());
         if present.is_none() && cfg!(debug_assertions) {
diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs
index 03bc59ea38..f4a4eea54a 100644
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -11,11 +11,11 @@ use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
 /// Calculation consists of two stages:
 ///
 /// 1. Initial size calculation. That might take a long time, because it requires
-/// reading all layers containing relation sizes at `initial_part_end`.
+///    reading all layers containing relation sizes at `initial_part_end`.
 ///
 /// 2. Collecting an incremental part and adding that to the initial size.
-/// Increments are appended on walreceiver writing new timeline data,
-/// which result in increase or decrease of the logical size.
+///    Increments are appended on walreceiver writing new timeline data,
+///    which result in increase or decrease of the logical size.
 pub(super) struct LogicalSize {
     /// Size, potentially slow to compute. Calculating this might require reading multiple
     /// layers, and even ancestor's layers.
@@ -45,17 +45,17 @@ pub(super) struct LogicalSize {
     /// Size shouldn't ever be negative, but this is signed for two reasons:
     ///
     /// 1. If we initialized the "baseline" size lazily, while we already
-    /// process incoming WAL, the incoming WAL records could decrement the
-    /// variable and temporarily make it negative. (This is just future-proofing;
-    /// the initialization is currently not done lazily.)
+    ///    process incoming WAL, the incoming WAL records could decrement the
+    ///    variable and temporarily make it negative. (This is just future-proofing;
+    ///    the initialization is currently not done lazily.)
     ///
     /// 2. If there is a bug and we e.g. forget to increment it in some cases
-    /// when size grows, but remember to decrement it when it shrinks again, the
-    /// variable could go negative. In that case, it seems better to at least
-    /// try to keep tracking it, rather than clamp or overflow it. Note that
-    /// get_current_logical_size() will clamp the returned value to zero if it's
-    /// negative, and log an error. Could set it permanently to zero or some
-    /// special value to indicate "broken" instead, but this will do for now.
+    ///    when size grows, but remember to decrement it when it shrinks again, the
+    ///    variable could go negative. In that case, it seems better to at least
+    ///    try to keep tracking it, rather than clamp or overflow it. Note that
+    ///    get_current_logical_size() will clamp the returned value to zero if it's
+    ///    negative, and log an error. Could set it permanently to zero or some
+    ///    special value to indicate "broken" instead, but this will do for now.
     ///
     /// Note that we also expose a copy of this value as a prometheus metric,
     /// see `current_logical_size_gauge`. Use the `update_current_logical_size`
@@ -101,6 +101,14 @@ impl From<&Exact> for u64 {
     }
 }
 
+impl Approximate {
+    /// For use in situations where we don't have a sane logical size value but need
+    /// to return something, e.g. in HTTP API on shard >0 of a sharded tenant.
+    pub(crate) fn zero() -> Self {
+        Self(0)
+    }
+}
+
 impl CurrentLogicalSize {
     pub(crate) fn size_dont_care_about_accuracy(&self) -> u64 {
         match self {
@@ -114,6 +122,10 @@ impl CurrentLogicalSize {
             Self::Exact(_) => Accuracy::Exact,
         }
     }
+
+    pub(crate) fn is_exact(&self) -> bool {
+        matches!(self, Self::Exact(_))
+    }
 }
 
 impl LogicalSize {
diff --git a/pageserver/src/tenant/timeline/span.rs b/pageserver/src/tenant/timeline/span.rs
index 3b580c9d1b..8b13789179 100644
--- a/pageserver/src/tenant/timeline/span.rs
+++ b/pageserver/src/tenant/timeline/span.rs
@@ -1,20 +1 @@
-#[cfg(debug_assertions)]
-use utils::tracing_span_assert::{check_fields_present, Extractor, MultiNameExtractor};
-
-#[cfg(not(debug_assertions))]
-pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
-
-#[cfg(debug_assertions)]
-#[track_caller]
-pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
-    static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<1>> =
-        once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TimelineId", ["timeline_id"]));
-
-    let fields: [&dyn Extractor; 2] = [
-        &*crate::tenant::span::TENANT_ID_EXTRACTOR,
-        &*TIMELINE_ID_EXTRACTOR,
-    ];
-    if let Err(missing) = check_fields_present!(fields) {
-        panic!("missing extractors: {missing:?}")
-    }
-}
+
diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs
index 27d6fd9c28..2b60e670ea 100644
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -2,8 +2,8 @@ use std::{collections::hash_map::Entry, fs, sync::Arc};
 
 use anyhow::Context;
 use camino::Utf8PathBuf;
-use tracing::{error, info, info_span, warn};
-use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn};
+use tracing::{error, info, info_span};
+use utils::{fs_ext, id::TimelineId, lsn::Lsn};
 
 use crate::{context::RequestContext, import_datadir, tenant::Tenant};
 
@@ -11,22 +11,22 @@ use super::Timeline;
 
 /// A timeline with some of its files on disk, being initialized.
 /// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or
-/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory
-/// to be removed on next restart.
+/// its local files are removed. If we crash while this class exists, then the timeline's local
+/// state is cleaned up during [`Tenant::clean_up_timelines`], because the timeline's content isn't in remote storage.
 ///
 /// The caller is responsible for proper timeline data filling before the final init.
 #[must_use]
 pub struct UninitializedTimeline<'t> {
     pub(crate) owning_tenant: &'t Tenant,
     timeline_id: TimelineId,
-    raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>,
+    raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>,
 }
 
 impl<'t> UninitializedTimeline<'t> {
     pub(crate) fn new(
         owning_tenant: &'t Tenant,
         timeline_id: TimelineId,
-        raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>,
+        raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>,
     ) -> Self {
         Self {
             owning_tenant,
@@ -35,8 +35,7 @@ impl<'t> UninitializedTimeline<'t> {
         }
     }
 
-    /// Finish timeline creation: insert it into the Tenant's timelines map and remove the
-    /// uninit mark file.
+    /// Finish timeline creation: insert it into the Tenant's timelines map
     ///
     /// This function launches the flush loop if not already done.
     ///
@@ -72,16 +71,9 @@ impl<'t> UninitializedTimeline<'t> {
             Entry::Vacant(v) => {
                 // after taking here should be no fallible operations, because the drop guard will not
                 // cleanup after and would block for example the tenant deletion
-                let (new_timeline, uninit_mark) =
+                let (new_timeline, _create_guard) =
                     self.raw_timeline.take().expect("already checked");
 
-                // this is the mutual exclusion between different retries to create the timeline;
-                // this should be an assertion.
-                uninit_mark.remove_uninit_mark().with_context(|| {
-                    format!(
-                        "Failed to remove uninit mark file for timeline {tenant_shard_id}/{timeline_id}"
-                    )
-                })?;
                 v.insert(Arc::clone(&new_timeline));
 
                 new_timeline.maybe_spawn_flush_loop();
@@ -94,6 +86,7 @@ impl<'t> UninitializedTimeline<'t> {
     /// Prepares timeline data by loading it from the basebackup archive.
     pub(crate) async fn import_basebackup_from_tar(
         self,
+        tenant: Arc<Tenant>,
         copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
         base_lsn: Lsn,
         broker_client: storage_broker::BrokerClientChannel,
@@ -120,10 +113,9 @@ impl<'t> UninitializedTimeline<'t> {
             .await
             .context("Failed to flush after basebackup import")?;
 
-        // All the data has been imported. Insert the Timeline into the tenant's timelines
-        // map and remove the uninit mark file.
+        // All the data has been imported. Insert the Timeline into the tenant's timelines map
         let tl = self.finish_creation()?;
-        tl.activate(broker_client, None, ctx);
+        tl.activate(tenant, broker_client, None, ctx);
         Ok(tl)
     }
 
@@ -143,37 +135,35 @@ impl<'t> UninitializedTimeline<'t> {
 
 impl Drop for UninitializedTimeline<'_> {
     fn drop(&mut self) {
-        if let Some((_, uninit_mark)) = self.raw_timeline.take() {
+        if let Some((_, create_guard)) = self.raw_timeline.take() {
             let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered();
             error!("Timeline got dropped without initializing, cleaning its files");
-            cleanup_timeline_directory(uninit_mark);
+            cleanup_timeline_directory(create_guard);
         }
     }
 }
 
-pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
-    let timeline_path = &uninit_mark.timeline_path;
+pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) {
+    let timeline_path = &create_guard.timeline_path;
     match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
         Ok(()) => {
-            info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
+            info!("Timeline dir {timeline_path:?} removed successfully")
         }
         Err(e) => {
             error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}")
         }
     }
-    drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists
+    // Having cleaned up, we can release this TimelineId in [`Tenant::timelines_creating`] to allow other
+    // timeline creation attempts under this TimelineId to proceed
+    drop(create_guard);
 }
 
-/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
-/// or gets removed eventually.
-///
-/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first.
+/// A guard for timeline creations in process: as long as this object exists, the timeline ID
+/// is kept in [`Tenant::timelines_creating`] to exclude concurrent attempts to create the same timeline.
 #[must_use]
-pub(crate) struct TimelineUninitMark<'t> {
+pub(crate) struct TimelineCreateGuard<'t> {
     owning_tenant: &'t Tenant,
     timeline_id: TimelineId,
-    uninit_mark_deleted: bool,
-    uninit_mark_path: Utf8PathBuf,
     pub(crate) timeline_path: Utf8PathBuf,
 }
 
@@ -190,11 +180,10 @@ pub(crate) enum TimelineExclusionError {
     Other(#[from] anyhow::Error),
 }
 
-impl<'t> TimelineUninitMark<'t> {
+impl<'t> TimelineCreateGuard<'t> {
     pub(crate) fn new(
         owning_tenant: &'t Tenant,
         timeline_id: TimelineId,
-        uninit_mark_path: Utf8PathBuf,
         timeline_path: Utf8PathBuf,
     ) -> Result<Self, TimelineExclusionError> {
         // Lock order: this is the only place we take both locks. During drop() we only
@@ -214,56 +203,14 @@ impl<'t> TimelineUninitMark<'t> {
         Ok(Self {
             owning_tenant,
             timeline_id,
-            uninit_mark_deleted: false,
-            uninit_mark_path,
             timeline_path,
         })
     }
-
-    fn remove_uninit_mark(mut self) -> anyhow::Result<()> {
-        if !self.uninit_mark_deleted {
-            self.delete_mark_file_if_present()?;
-        }
-
-        Ok(())
-    }
-
-    fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> {
-        let uninit_mark_file = &self.uninit_mark_path;
-        let uninit_mark_parent = uninit_mark_file
-            .parent()
-            .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
-        fs_ext::ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
-            format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
-        })?;
-        crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
-        self.uninit_mark_deleted = true;
-
-        Ok(())
-    }
 }
 
-impl Drop for TimelineUninitMark<'_> {
+impl Drop for TimelineCreateGuard<'_> {
     fn drop(&mut self) {
-        if !self.uninit_mark_deleted {
-            if self.timeline_path.exists() {
-                error!(
-                    "Uninit mark {} is not removed, timeline {} stays uninitialized",
-                    self.uninit_mark_path, self.timeline_path
-                )
-            } else {
-                // unblock later timeline creation attempts
-                warn!(
-                    "Removing intermediate uninit mark file {}",
-                    self.uninit_mark_path
-                );
-                if let Err(e) = self.delete_mark_file_if_present() {
-                    error!("Failed to remove the uninit mark file: {e}")
-                }
-            }
-        }
-
         self.owning_tenant
             .timelines_creating
             .lock()
diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs
index 2fab6722b8..4a3a5c621b 100644
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -2,13 +2,13 @@
 //! To do so, a current implementation needs to do the following:
 //!
 //! * acknowledge the timelines that it needs to stream WAL into.
-//! Pageserver is able to dynamically (un)load tenants on attach and detach,
-//! hence WAL receiver needs to react on such events.
+//!   Pageserver is able to dynamically (un)load tenants on attach and detach,
+//!   hence WAL receiver needs to react on such events.
 //!
 //! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming.
-//! For that, it watches specific keys in storage_broker and pulls the relevant data periodically.
-//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other.
-//! Without this data, no WAL streaming is possible currently.
+//!   For that, it watches specific keys in storage_broker and pulls the relevant data periodically.
+//!   The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other.
+//!   Without this data, no WAL streaming is possible currently.
 //!
 //! Only one active WAL streaming connection is allowed at a time.
 //! The connection is supposed to be updated periodically, based on safekeeper timeline data.
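The remaining hunks migrate walreceiver shutdown from task_mgr to a CancellationToken hierarchy: Timeline owns the root token, WalReceiver holds a child of it, and each spawned task observes its token cooperatively. A minimal runnable sketch of that pattern follows, using only tokio and tokio-util; the names timeline_cancel and walreceiver_cancel are illustrative, not the actual struct fields.

// Hedged sketch of the parent/child cancellation pattern; assumes the
// `tokio` (with "full" features) and `tokio-util` crates.
use tokio_util::sync::CancellationToken;

#[tokio::main]
async fn main() {
    // Root token, analogous to Timeline::cancel.
    let timeline_cancel = CancellationToken::new();

    // A component-level child token, analogous to WalReceiver::cancel:
    // cancelling it stops the component's tasks early, while cancelling
    // the parent would stop everything.
    let walreceiver_cancel = timeline_cancel.child_token();

    let task = tokio::spawn({
        let cancel = walreceiver_cancel.clone();
        async move {
            loop {
                tokio::select! {
                    _ = cancel.cancelled() => break, // cooperative shutdown
                    _ = tokio::time::sleep(std::time::Duration::from_millis(10)) => {
                        // ... one iteration of the manager loop ...
                    }
                }
            }
        }
    });

    // Component-level shutdown (cf. WalReceiver::cancel) ...
    walreceiver_cancel.cancel();
    task.await.unwrap();

    // ... leaves the parent untouched: cancellation only flows downwards.
    assert!(!timeline_cancel.is_cancelled());
}

Cancelling a child never affects the parent, which is what lets a timeline shut down its walreceiver early while the rest of timeline shutdown proceeds on the parent token.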
@@ -24,26 +24,21 @@ mod connection_manager; mod walreceiver_connection; use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME}; +use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::timeline::walreceiver::connection_manager::{ connection_manager_loop_step, ConnectionManagerState, }; -use pageserver_api::shard::TenantShardId; use std::future::Future; use std::num::NonZeroU64; -use std::ops::ControlFlow; use std::sync::Arc; use std::time::Duration; use storage_broker::BrokerClientChannel; -use tokio::select; use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::id::TimelineId; - use self::connection_manager::ConnectionManagerStatus; use super::Timeline; @@ -62,9 +57,10 @@ pub struct WalReceiverConf { } pub struct WalReceiver { - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, manager_status: Arc>>, + /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token. + /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`. + cancel: CancellationToken, } impl WalReceiver { @@ -78,65 +74,58 @@ impl WalReceiver { let timeline_id = timeline.timeline_id; let walreceiver_ctx = ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); - let loop_status = Arc::new(std::sync::RwLock::new(None)); let manager_status = Arc::clone(&loop_status); - task_mgr::spawn( - WALRECEIVER_RUNTIME.handle(), - TaskKind::WalReceiverManager, - Some(timeline.tenant_shard_id), - Some(timeline_id), - &format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"), - false, + let cancel = timeline.cancel.child_token(); + WALRECEIVER_RUNTIME.spawn({ + let cancel = cancel.clone(); async move { debug_assert_current_span_has_tenant_and_timeline_id(); + // acquire timeline gate so we know the task doesn't outlive the Timeline + let Ok(_guard) = timeline.gate.enter() else { + debug!("WAL receiver manager could not enter the gate timeline gate, it's closed already"); + return; + }; debug!("WAL receiver manager started, connecting to broker"); let mut connection_manager_state = ConnectionManagerState::new( timeline, conf, + cancel.clone(), ); - loop { - select! 
{ - _ = task_mgr::shutdown_watcher() => { - trace!("WAL receiver shutdown requested, shutting down"); + while !cancel.is_cancelled() { + let loop_step_result = connection_manager_loop_step( + &mut broker_client, + &mut connection_manager_state, + &walreceiver_ctx, + &cancel, + &loop_status, + ).await; + match loop_step_result { + Ok(()) => continue, + Err(_cancelled) => { + trace!("Connection manager loop ended, shutting down"); break; - }, - loop_step_result = connection_manager_loop_step( - &mut broker_client, - &mut connection_manager_state, - &walreceiver_ctx, - &loop_status, - ) => match loop_step_result { - ControlFlow::Continue(()) => continue, - ControlFlow::Break(()) => { - trace!("Connection manager loop ended, shutting down"); - break; - } - }, + } } } - connection_manager_state.shutdown().await; *loop_status.write().unwrap() = None; - Ok(()) + debug!("task exits"); } .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id)) - ); + }); Self { - tenant_shard_id, - timeline_id, manager_status, + cancel, } } - pub async fn stop(self) { - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(self.tenant_shard_id), - Some(self.timeline_id), - ) - .await; + #[instrument(skip_all, level = tracing::Level::DEBUG)] + pub fn cancel(&self) { + debug_assert_current_span_has_tenant_and_timeline_id(); + debug!("cancelling walreceiver tasks"); + self.cancel.cancel(); } pub(crate) fn status(&self) -> Option { @@ -170,14 +159,18 @@ enum TaskStateUpdate { impl TaskHandle { /// Initializes the task, starting it immediately after the creation. + /// + /// The second argument to `task` is a child token of `cancel_parent` ([`CancellationToken::child_token`]). + /// It being a child token enables us to provide a [`Self::shutdown`] method. fn spawn( + cancel_parent: &CancellationToken, task: impl FnOnce(watch::Sender>, CancellationToken) -> Fut + Send + 'static, ) -> Self where Fut: Future> + Send, E: Send + Sync + 'static, { - let cancellation = CancellationToken::new(); + let cancellation = cancel_parent.child_token(); let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started); let cancellation_clone = cancellation.clone(); @@ -197,6 +190,9 @@ impl TaskHandle { } } + /// # Cancel-Safety + /// + /// Cancellation-safe. 
async fn next_task_event(&mut self) -> TaskEvent { match self.events_receiver.changed().await { Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()), diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index cf6dee114f..de50f217d8 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -17,17 +17,19 @@ use crate::metrics::{ WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED, WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES, }; -use crate::task_mgr::{shutdown_token, TaskKind}; +use crate::task_mgr::TaskKind; use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use pageserver_api::models::TimelineState; -use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey; -use storage_broker::proto::SafekeeperTimelineInfo; -use storage_broker::proto::SubscribeSafekeeperInfoRequest; + use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, + SubscribeByFilterRequest, TypeSubscription, TypedMessage, +}; use storage_broker::{BrokerClientChannel, Code, Streaming}; -use tokio::select; +use tokio_util::sync::CancellationToken; use tracing::*; use postgres_connection::PgConnectionConfig; @@ -45,27 +47,33 @@ use super::{ TaskEvent, TaskHandle, }; +pub(crate) struct Cancelled; + /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker. /// Based on the updates, desides whether to start, keep or stop a WAL receiver task. /// If storage broker subscription is cancelled, exits. +/// +/// # Cancel-Safety +/// +/// Not cancellation-safe. Use `cancel` token to request cancellation. pub(super) async fn connection_manager_loop_step( broker_client: &mut BrokerClientChannel, connection_manager_state: &mut ConnectionManagerState, ctx: &RequestContext, + cancel: &CancellationToken, manager_status: &std::sync::RwLock>, -) -> ControlFlow<(), ()> { - match connection_manager_state - .timeline - .wait_to_become_active(ctx) - .await - { +) -> Result<(), Cancelled> { + match tokio::select! { + _ = cancel.cancelled() => { return Err(Cancelled); }, + st = connection_manager_state.timeline.wait_to_become_active(ctx) => { st } + } { Ok(()) => {} Err(new_state) => { debug!( ?new_state, "state changed, stopping wal connection manager loop" ); - return ControlFlow::Break(()); + return Err(Cancelled); } } @@ -83,17 +91,28 @@ pub(super) async fn connection_manager_loop_step( .timeline .subscribe_for_state_updates(); + let mut wait_lsn_status = connection_manager_state + .timeline + .subscribe_for_wait_lsn_updates(); + + // TODO: create a separate config option for discovery request interval + let discovery_request_interval = connection_manager_state.conf.lagging_wal_timeout; + let mut last_discovery_ts: Option = None; + // Subscribe to the broker updates. Stream shares underlying TCP connection // with other streams on this client (other connection managers). When // object goes out of scope, stream finishes in drop() automatically. 
- let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await; + let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?; debug!("Subscribed for broker timeline updates"); loop { let time_until_next_retry = connection_manager_state.time_until_next_retry(); + let any_activity = connection_manager_state.wal_connection.is_some() + || !connection_manager_state.wal_stream_candidates.is_empty(); // These things are happening concurrently: // + // - cancellation request // - keep receiving WAL on the current connection // - if the shared state says we need to change connection, disconnect and return // - this runs in a separate task and we receive updates via a watch channel @@ -101,7 +120,12 @@ pub(super) async fn connection_manager_loop_step( // - receive updates from broker // - this might change the current desired connection // - timeline state changes to something that does not allow walreceiver to run concurrently - select! { + // - if there's no connection and no candidates, try to send a discovery request + + // NB: make sure each of the select expressions are cancellation-safe + // (no need for arms to be cancellation-safe). + tokio::select! { + _ = cancel.cancelled() => { return Err(Cancelled); } Some(wal_connection_update) = async { match connection_manager_state.wal_connection.as_mut() { Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await), @@ -133,7 +157,7 @@ pub(super) async fn connection_manager_loop_step( }, // Got a new update from the broker - broker_update = broker_subscription.message() => { + broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => { match broker_update { Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update), Err(status) => { @@ -147,16 +171,17 @@ pub(super) async fn connection_manager_loop_step( warn!("broker subscription failed: {status}"); } } - return ControlFlow::Continue(()); + return Ok(()); } Ok(None) => { error!("broker subscription stream ended"); // can't happen - return ControlFlow::Continue(()); + return Ok(()); } } }, new_event = async { + // Reminder: this match arm needs to be cancellation-safe. loop { if connection_manager_state.timeline.current_state() == TimelineState::Loading { warn!("wal connection manager should only be launched after timeline has become active"); @@ -182,11 +207,11 @@ pub(super) async fn connection_manager_loop_step( } } => match new_event { ControlFlow::Continue(()) => { - return ControlFlow::Continue(()); + return Ok(()); } ControlFlow::Break(()) => { debug!("Timeline is no longer active, stopping wal connection manager loop"); - return ControlFlow::Break(()); + return Err(Cancelled); } }, @@ -202,6 +227,65 @@ pub(super) async fn connection_manager_loop_step( } } } => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"), + + Some(()) = async { + // Reminder: this match arm needs to be cancellation-safe. + // Calculating time needed to wait until sending the next discovery request. + // Current implementation is conservative and sends discovery requests only when there are no candidates. + + if any_activity { + // No need to send discovery requests if there is an active connection or candidates. + return None; + } + + // Waiting for an active wait_lsn request. 
+ while wait_lsn_status.borrow().is_none() { + if wait_lsn_status.changed().await.is_err() { + // wait_lsn_status channel was closed, exiting + warn!("wait_lsn_status channel was closed in connection_manager_loop_step"); + return None; + } + } + + // All preconditions met, preparing to send a discovery request. + let now = std::time::Instant::now(); + let next_discovery_ts = last_discovery_ts + .map(|ts| ts + discovery_request_interval) + .unwrap_or_else(|| now); + + if next_discovery_ts > now { + // Prevent sending discovery requests too frequently. + tokio::time::sleep(next_discovery_ts - now).await; + } + + let tenant_timeline_id = Some(ProtoTenantTimelineId { + tenant_id: id.tenant_id.as_ref().to_owned(), + timeline_id: id.timeline_id.as_ref().to_owned(), + }); + let request = SafekeeperDiscoveryRequest { tenant_timeline_id }; + let msg = TypedMessage { + r#type: MessageType::SafekeeperDiscoveryRequest as i32, + safekeeper_timeline_info: None, + safekeeper_discovery_request: Some(request), + safekeeper_discovery_response: None, + }; + + last_discovery_ts = Some(std::time::Instant::now()); + debug!("No active connection and no candidates, sending discovery request to the broker"); + + // Cancellation safety: we want to send a message to the broker, but publish_one() + // function can get cancelled by the other select! arm. This is absolutely fine, because + // we just want to receive broker updates and discovery is not important if we already + // receive updates. + // + // It is possible that `last_discovery_ts` will be updated, but the message will not be sent. + // This is totally fine because of the reason above. + + // This is a fire-and-forget request, we don't care about the response + let _ = broker_client.publish_one(msg).await; + debug!("Discovery request sent to the broker"); + None + } => {} } if let Some(new_candidate) = connection_manager_state.next_connection_candidate() { @@ -218,32 +302,46 @@ pub(super) async fn connection_manager_loop_step( async fn subscribe_for_timeline_updates( broker_client: &mut BrokerClientChannel, id: TenantTimelineId, -) -> Streaming { + cancel: &CancellationToken, +) -> Result, Cancelled> { let mut attempt = 0; - let cancel = shutdown_token(); - loop { exponential_backoff( attempt, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, - &cancel, + cancel, ) .await; attempt += 1; // subscribe to the specific timeline - let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId { - tenant_id: id.tenant_id.as_ref().to_owned(), - timeline_id: id.timeline_id.as_ref().to_owned(), - }); - let request = SubscribeSafekeeperInfoRequest { - subscription_key: Some(key), + let request = SubscribeByFilterRequest { + types: vec![ + TypeSubscription { + r#type: MessageType::SafekeeperTimelineInfo as i32, + }, + TypeSubscription { + r#type: MessageType::SafekeeperDiscoveryResponse as i32, + }, + ], + tenant_timeline_id: Some(FilterTenantTimelineId { + enabled: true, + tenant_timeline_id: Some(ProtoTenantTimelineId { + tenant_id: id.tenant_id.as_ref().to_owned(), + timeline_id: id.timeline_id.as_ref().to_owned(), + }), + }), }; - match broker_client.subscribe_safekeeper_info(request).await { + match { + tokio::select! 
{ + r = broker_client.subscribe_by_filter(request) => { r } + _ = cancel.cancelled() => { return Err(Cancelled); } + } + } { Ok(resp) => { - return resp.into_inner(); + return Ok(resp.into_inner()); } Err(e) => { // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and @@ -264,6 +362,8 @@ pub(super) struct ConnectionManagerState { id: TenantTimelineId, /// Use pageserver data about the timeline to filter out some of the safekeepers. timeline: Arc, + /// Child token of [`super::WalReceiver::cancel`], inherited to all tasks we spawn. + cancel: CancellationToken, conf: WalReceiverConf, /// Current connection to safekeeper for WAL streaming. wal_connection: Option, @@ -380,13 +480,17 @@ struct RetryInfo { /// Data about the timeline to connect to, received from the broker. #[derive(Debug, Clone)] struct BrokerSkTimeline { - timeline: SafekeeperTimelineInfo, + timeline: SafekeeperDiscoveryResponse, /// Time at which the data was fetched from the broker last time, to track the stale data. latest_update: NaiveDateTime, } impl ConnectionManagerState { - pub(super) fn new(timeline: Arc, conf: WalReceiverConf) -> Self { + pub(super) fn new( + timeline: Arc, + conf: WalReceiverConf, + cancel: CancellationToken, + ) -> Self { let id = TenantTimelineId { tenant_id: timeline.tenant_shard_id.tenant_id, timeline_id: timeline.timeline_id, @@ -394,6 +498,7 @@ impl ConnectionManagerState { Self { id, timeline, + cancel, conf, wal_connection: None, wal_stream_candidates: HashMap::new(), @@ -401,6 +506,22 @@ impl ConnectionManagerState { } } + fn spawn( + &self, + task: impl FnOnce( + tokio::sync::watch::Sender>, + CancellationToken, + ) -> Fut + + Send + + 'static, + ) -> TaskHandle + where + Fut: std::future::Future> + Send, + { + // TODO: get rid of TaskHandle + super::TaskHandle::spawn(&self.cancel, task) + } + /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) { WALRECEIVER_SWITCHES @@ -419,7 +540,7 @@ impl ConnectionManagerState { ); let span = info_span!("connection", %node_id); - let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { + let connection_handle = self.spawn(move |events_sender, cancellation| { async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -447,6 +568,12 @@ impl ConnectionManagerState { info!("walreceiver connection handling ended: {e}"); Ok(()) } + WalReceiverError::ClosedGate => { + info!( + "walreceiver connection handling ended because of closed gate" + ); + Ok(()) + } WalReceiverError::Other(e) => { // give out an error to have task_mgr give it a really verbose logging if cancellation.is_cancelled() { @@ -486,6 +613,10 @@ impl ConnectionManagerState { /// Drops the current connection (if any) and updates retry timeout for the next /// connection attempt to the same safekeeper. + /// + /// # Cancel-Safety + /// + /// Not cancellation-safe. async fn drop_old_connection(&mut self, needs_shutdown: bool) { let wal_connection = match self.wal_connection.take() { Some(wal_connection) => wal_connection, @@ -493,7 +624,14 @@ impl ConnectionManagerState { }; if needs_shutdown { - wal_connection.connection_task.shutdown().await; + wal_connection + .connection_task + .shutdown() + // This here is why this function isn't cancellation-safe. + // If we got cancelled here, then self.wal_connection is already None and we lose track of the task. 
+ // Even if our caller diligently calls Self::shutdown(), it will find a self.wal_connection=None + // and thus be ineffective. + .await; } let retry = self @@ -550,9 +688,59 @@ impl ConnectionManagerState { } /// Adds another broker timeline into the state, if it's more recent than the one already added there for the same key. - fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) { + fn register_timeline_update(&mut self, typed_msg: TypedMessage) { + let mut is_discovery = false; + let timeline_update = match typed_msg.r#type() { + MessageType::SafekeeperTimelineInfo => { + let info = match typed_msg.safekeeper_timeline_info { + Some(info) => info, + None => { + warn!("bad proto message from broker: no safekeeper_timeline_info"); + return; + } + }; + SafekeeperDiscoveryResponse { + safekeeper_id: info.safekeeper_id, + tenant_timeline_id: info.tenant_timeline_id, + commit_lsn: info.commit_lsn, + safekeeper_connstr: info.safekeeper_connstr, + availability_zone: info.availability_zone, + standby_horizon: info.standby_horizon, + } + } + MessageType::SafekeeperDiscoveryResponse => { + is_discovery = true; + match typed_msg.safekeeper_discovery_response { + Some(response) => response, + None => { + warn!("bad proto message from broker: no safekeeper_discovery_response"); + return; + } + } + } + _ => { + // unexpected message + return; + } + }; + WALRECEIVER_BROKER_UPDATES.inc(); + trace!( + "safekeeper info update: standby_horizon(cutoff)={}", + timeline_update.standby_horizon + ); + if timeline_update.standby_horizon != 0 { + // ignore reports from safekeepers not connected to replicas + self.timeline + .standby_horizon + .store(Lsn(timeline_update.standby_horizon)); + self.timeline + .metrics + .standby_horizon_gauge + .set(timeline_update.standby_horizon as i64); + } + let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); let old_entry = self.wal_stream_candidates.insert( new_safekeeper_id, @@ -563,7 +751,11 @@ impl ConnectionManagerState { ); if old_entry.is_none() { - info!("New SK node was added: {new_safekeeper_id}"); + info!( + ?is_discovery, + %new_safekeeper_id, + "New SK node was added", + ); WALRECEIVER_CANDIDATES_ADDED.inc(); } } @@ -762,7 +954,7 @@ impl ConnectionManagerState { fn select_connection_candidate( &self, node_to_omit: Option<NodeId>, - ) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> { + ) -> Option<(NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> { self.applicable_connection_candidates() .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit) .max_by_key(|(_, info, _)| info.commit_lsn) @@ -772,7 +964,7 @@ impl ConnectionManagerState { /// Some safekeepers are filtered by the retry cooldown. fn applicable_connection_candidates( &self, - ) -> impl Iterator<Item = (NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> { + ) -> impl Iterator<Item = (NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> { let now = Utc::now().naive_utc(); self.wal_stream_candidates @@ -838,6 +1030,9 @@ impl ConnectionManagerState { } } + /// # Cancel-Safety + /// + /// Not cancellation-safe.
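// Aside: the hazard described above, reduced to a minimal sketch (names and
// types are hypothetical, not this codebase's TaskHandle). If the caller's
// future is dropped between `take()` and the completion of `.await`, the task
// has already left the slot, so a later shutdown pass finds `None` and the
// task is orphaned. That is the general shape of why take-then-await is not
// cancellation-safe.
async fn drop_old_connection_sketch<T, Fut>(
    slot: &mut Option<T>,
    shutdown: impl FnOnce(T) -> Fut,
) where
    Fut: std::future::Future<Output = ()>,
{
    if let Some(task) = slot.take() {
        // <- cancellation while this await is pending loses `task`: it is no
        //    longer in `slot`, and its shutdown never ran to completion.
        shutdown(task).await;
    }
}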
pub(super) async fn shutdown(mut self) { if let Some(wal_connection) = self.wal_connection.take() { wal_connection.connection_task.shutdown().await; @@ -909,20 +1104,13 @@ mod tests { latest_update: NaiveDateTime, ) -> BrokerSkTimeline { BrokerSkTimeline { - timeline: SafekeeperTimelineInfo { + timeline: SafekeeperDiscoveryResponse { safekeeper_id: 0, tenant_timeline_id: None, - term: 0, - last_log_term: 0, - flush_lsn: 0, commit_lsn, - backup_lsn: 0, - remote_consistent_lsn: 0, - peer_horizon_lsn: 0, - local_start_lsn: 0, safekeeper_connstr: safekeeper_connstr.to_owned(), - http_connstr: safekeeper_connstr.to_owned(), availability_zone: None, + standby_horizon: 0, }, latest_update, } @@ -930,7 +1118,7 @@ mod tests { #[tokio::test] async fn no_connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("no_connection_no_candidate")?; + let harness = TenantHarness::create("no_connection_no_candidate").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -963,7 +1151,7 @@ mod tests { #[tokio::test] async fn connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("connection_no_candidate")?; + let harness = TenantHarness::create("connection_no_candidate").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -986,7 +1174,7 @@ mod tests { sk_id: connected_sk_id, availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1028,7 +1216,7 @@ mod tests { #[tokio::test] async fn no_connection_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("no_connection_candidate")?; + let harness = TenantHarness::create("no_connection_candidate").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1091,7 +1279,7 @@ mod tests { #[tokio::test] async fn candidate_with_many_connection_failures() -> anyhow::Result<()> { - let harness = TenantHarness::create("candidate_with_many_connection_failures")?; + let harness = TenantHarness::create("candidate_with_many_connection_failures").await?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1131,7 +1319,7 @@ mod tests { #[tokio::test] async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?; + let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate").await?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1154,7 +1342,7 @@ mod tests { sk_id: connected_sk_id, availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1197,7 +1385,8 @@ mod tests { #[tokio::test] async fn timeout_connection_threshold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_connection_threshold_current_candidate")?; + let harness = + TenantHarness::create("timeout_connection_threshold_current_candidate").await?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1221,7 +1410,7 @@ mod tests { sk_id: 
NodeId(1), availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1260,7 +1449,7 @@ mod tests { #[tokio::test] async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate")?; + let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate").await?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let new_lsn = Lsn(100_100).align(); @@ -1285,7 +1474,7 @@ mod tests { sk_id: NodeId(1), availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }), + connection_task: state.spawn(move |_, _| async move { Ok(()) }), discovered_new_wal: Some(NewCommittedWAL { discovered_at: time_over_threshold, lsn: new_lsn, @@ -1341,6 +1530,7 @@ mod tests { timeline_id: TIMELINE_ID, }, timeline, + cancel: CancellationToken::new(), conf: WalReceiverConf { wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), @@ -1361,9 +1551,9 @@ mod tests { // and pageserver should prefer to connect to it. let test_az = Some("test_az".to_owned()); - let harness = TenantHarness::create("switch_to_same_availability_zone")?; + let harness = TenantHarness::create("switch_to_same_availability_zone").await?; let mut state = dummy_state(&harness).await; - state.conf.availability_zone = test_az.clone(); + state.conf.availability_zone.clone_from(&test_az); let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1384,7 +1574,7 @@ mod tests { sk_id: connected_sk_id, availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1396,7 +1586,7 @@ mod tests { // We have another safekeeper with the same commit_lsn, and it has the same availability zone as // the current pageserver.
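// Aside: the preference this test exercises, as a hypothetical simplification
// (the real logic lives in next_connection_candidate and
// select_connection_candidate): the highest commit_lsn wins, and on a tie a
// candidate in the pageserver's own availability zone is preferred.
fn pick_candidate<'a>(
    candidates: &[(Option<&'a str>, u64)], // (availability_zone, commit_lsn)
    own_az: Option<&str>,
) -> Option<(Option<&'a str>, u64)> {
    candidates
        .iter()
        .copied()
        .max_by_key(|&(az, commit_lsn)| (commit_lsn, az == own_az))
}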
let mut same_az_sk = dummy_broker_sk_timeline(current_lsn.0, "same_az", now); - same_az_sk.timeline.availability_zone = test_az.clone(); + same_az_sk.timeline.availability_zone.clone_from(&test_az); state.wal_stream_candidates = HashMap::from([ ( diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index e398d683e5..cee259e2e0 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -26,19 +26,18 @@ use tracing::{debug, error, info, trace, warn, Instrument}; use super::TaskStateUpdate; use crate::{ context::RequestContext, - metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, - task_mgr, - task_mgr::TaskKind, - task_mgr::WALRECEIVER_RUNTIME, + metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, + pgdatadir_mapping::DatadirModification, + task_mgr::{TaskKind, WALRECEIVER_RUNTIME}, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, walingest::WalIngest, - walrecord::DecodedWALRecord, + walrecord::{decode_wal_record, DecodedWALRecord}, }; use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::pageserver_feedback::PageserverFeedback; use utils::{id::NodeId, lsn::Lsn}; +use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; /// Status of the connection. #[derive(Debug, Clone, Copy)] @@ -68,6 +67,7 @@ pub(super) enum WalReceiverError { SuccessfulCompletion(String), /// Generic error Other(anyhow::Error), + ClosedGate, } impl From for WalReceiverError { @@ -119,6 +119,16 @@ pub(super) async fn handle_walreceiver_connection( ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); + // prevent timeline shutdown from finishing until we have exited + let _guard = timeline.gate.enter().map_err(|e| match e { + GateError::GateClosed => WalReceiverError::ClosedGate, + })?; + // This function spawns a side-car task (WalReceiverConnectionPoller). + // Get its gate guard now as well. + let poller_guard = timeline.gate.enter().map_err(|e| match e { + GateError::GateClosed => WalReceiverError::ClosedGate, + })?; + WALRECEIVER_STARTED_CONNECTIONS.inc(); // Connect to the database in replication mode. @@ -156,22 +166,19 @@ pub(super) async fn handle_walreceiver_connection( } // The connection object performs the actual communication with the database, - // so spawn it off to run on its own. + // so spawn it off to run on its own. It shouldn't outlive this function, but, + // due to lack of async drop, we can't enforce that. However, we ensure that + // 1. it is sensitive to `cancellation` and + // 2. holds the Timeline gate open so that after timeline shutdown, + // we know this task is gone. let _connection_ctx = ctx.detached_child( TaskKind::WalReceiverConnectionPoller, ctx.download_behavior(), ); let connection_cancellation = cancellation.clone(); - task_mgr::spawn( - WALRECEIVER_RUNTIME.handle(), - TaskKind::WalReceiverConnectionPoller, - Some(timeline.tenant_shard_id), - Some(timeline.timeline_id), - "walreceiver connection", - false, + WALRECEIVER_RUNTIME.spawn( async move { debug_assert_current_span_has_tenant_and_timeline_id(); - select! 
{ connection_result = connection => match connection_result { Ok(()) => debug!("Walreceiver db connection closed"), @@ -182,6 +189,9 @@ pub(super) async fn handle_walreceiver_connection( // with a similar error. }, WalReceiverError::SuccessfulCompletion(_) => {} + WalReceiverError::ClosedGate => { + // doesn't happen at runtime + } WalReceiverError::Other(err) => { warn!("Connection aborted: {err:#}") } @@ -190,7 +200,7 @@ pub(super) async fn handle_walreceiver_connection( }, _ = connection_cancellation.cancelled() => debug!("Connection cancelled"), } - Ok(()) + drop(poller_guard); } // Enrich the log lines emitted by this closure with meaningful context. // TODO: technically, this task outlives the surrounding function, so, the @@ -198,14 +208,9 @@ pub(super) async fn handle_walreceiver_connection( .instrument(tracing::info_span!("poller")), ); - // Immediately increment the gauge, then create a job to decrement it on task exit. - // One of the pros of `defer!` is that this will *most probably* - // get called, even in presence of panics. - let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]); - gauge.inc(); - scopeguard::defer! { - gauge.dec(); - } + let _guard = LIVE_CONNECTIONS + .with_label_values(&["wal_receiver"]) + .guard(); let identify = identify_system(&replication_client).await?; info!("{identify:?}"); @@ -303,13 +308,29 @@ pub(super) async fn handle_walreceiver_connection( trace!("received XLogData between {startlsn} and {endlsn}"); + WAL_INGEST.bytes_received.inc_by(data.len() as u64); waldecoder.feed_bytes(data); { - let mut decoded = DecodedWALRecord::default(); let mut modification = timeline.begin_modification(startlsn); let mut uncommitted_records = 0; let mut filtered_records = 0; + + async fn commit( + modification: &mut DatadirModification<'_>, + uncommitted: &mut u64, + filtered: &mut u64, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + WAL_INGEST + .records_committed + .inc_by(*uncommitted - *filtered); + modification.commit(ctx).await?; + *uncommitted = 0; + *filtered = 0; + Ok(()) + } + while let Some((lsn, recdata)) = waldecoder.poll_decode()? { // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. Without this alignment we are @@ -318,9 +339,28 @@ pub(super) async fn handle_walreceiver_connection( return Err(WalReceiverError::Other(anyhow!("LSN not aligned"))); } + // Deserialize WAL record + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, modification.tline.pg_version)?; + + if decoded.is_dbase_create_copy(timeline.pg_version) + && uncommitted_records > 0 + { + // Special case: legacy PG database creations operate by reading pages from a 'template' database: + // these are the only kinds of WAL record that require reading data blocks while ingesting. Ensure + // all earlier writes of data blocks are visible by committing any modification in flight. + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; + } + // Ingest the records without immediately committing them. 
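// Aside: the flush rule implemented by the commit() helper above, in miniature
// (names hypothetical; the byte-size trigger via approx_pending_bytes is
// elided). A batch is committed once it reaches `batch_size` records, and also
// *before* any record that must read pages written by records still buffered
// in the current batch (the legacy database-creation case above).
fn plan_flushes(reads_buffered_pages: &[bool], batch_size: u64) -> Vec<usize> {
    let mut flush_before_index = Vec::new();
    let mut uncommitted = 0u64;
    for (idx, &needs_read) in reads_buffered_pages.iter().enumerate() {
        if needs_read && uncommitted > 0 {
            flush_before_index.push(idx); // commit in-flight records first
            uncommitted = 0;
        }
        uncommitted += 1;
        if uncommitted >= batch_size {
            flush_before_index.push(idx + 1); // batch is full after this record
            uncommitted = 0;
        }
    }
    flush_before_index
}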
let ingested = walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) + .ingest_record(decoded, lsn, &mut modification, &ctx) .await .with_context(|| format!("could not ingest record at {lsn}"))?; if !ingested { @@ -329,6 +369,9 @@ pub(super) async fn handle_walreceiver_connection( filtered_records += 1; } + // FIXME: this cannot be made pausable_failpoint without fixing the + // failpoint library; in tests, the added amount of debugging will cause us + // to timeout the tests. fail_point!("walreceiver-after-ingest"); last_rec_lsn = lsn; @@ -336,22 +379,29 @@ pub(super) async fn handle_walreceiver_connection( // Commit every ingest_batch_size records. Even if we filtered out // all records, we still need to call commit to advance the LSN. uncommitted_records += 1; - if uncommitted_records >= ingest_batch_size { - WAL_INGEST - .records_committed - .inc_by(uncommitted_records - filtered_records); - modification.commit(&ctx).await?; - uncommitted_records = 0; - filtered_records = 0; + if uncommitted_records >= ingest_batch_size + || modification.approx_pending_bytes() + > DatadirModification::MAX_PENDING_BYTES + { + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; } } // Commit the remaining records. if uncommitted_records > 0 { - WAL_INGEST - .records_committed - .inc_by(uncommitted_records - filtered_records); - modification.commit(&ctx).await?; + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; } } @@ -389,16 +439,6 @@ pub(super) async fn handle_walreceiver_connection( } } - timeline - .check_checkpoint_distance() - .await - .with_context(|| { - format!( - "Failed to check checkpoint distance for timeline {}", - timeline.timeline_id - ) - })?; - if let Some(last_lsn) = status_update { let timeline_remote_consistent_lsn = timeline .get_remote_consistent_lsn_visible() @@ -426,19 +466,28 @@ pub(super) async fn handle_walreceiver_connection( // Send the replication feedback message. // Regular standby_status_update fields are put into this message. - let current_timeline_size = timeline - .get_current_logical_size( - crate::tenant::timeline::GetLogicalSizePriority::User, - &ctx, - ) - // FIXME: https://github.com/neondatabase/neon/issues/5963 - .size_dont_care_about_accuracy(); + let current_timeline_size = if timeline.tenant_shard_id.is_shard_zero() { + timeline + .get_current_logical_size( + crate::tenant::timeline::GetLogicalSizePriority::User, + &ctx, + ) + // FIXME: https://github.com/neondatabase/neon/issues/5963 + .size_dont_care_about_accuracy() + } else { + // Non-zero shards send zero for logical size. The safekeeper will ignore + // this number. This is because in a sharded tenant, only shard zero maintains + // accurate logical size. 
+ 0 + }; + let status_update = PageserverFeedback { current_timeline_size, last_received_lsn, disk_consistent_lsn, remote_consistent_lsn, replytime: ts, + shard_number: timeline.tenant_shard_id.shard_number.0 as u32, }; debug!("neon_status_update {status_update:?}"); diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 0b61bc0a10..592f41cb21 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,4 +1,4 @@ -use super::storage_layer::LayerFileName; +use super::storage_layer::LayerName; use super::storage_layer::ResidentLayer; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::IndexPart; @@ -43,28 +43,25 @@ pub(crate) struct UploadQueueInitialized { /// Counter to assign task IDs pub(crate) task_counter: u64, - /// All layer files stored in the remote storage, taking into account all - /// in-progress and queued operations - pub(crate) latest_files: HashMap<LayerFileName, LayerFileMetadata>, + /// The next uploaded index_part.json; assumed to be dirty. + /// + /// Should not be read directly, except for layer file updates. Instead you should add a + /// projected field. + pub(crate) dirty: IndexPart, + + /// The latest remote persisted IndexPart. + /// + /// Each completed metadata upload will update this. The second item is the task_id which last + /// updated the value, used to ensure we never store an older value over a newer one. + pub(crate) clean: (IndexPart, Option<u64>), /// How many file uploads or deletions have been scheduled, since the /// last (scheduling of) metadata index upload? pub(crate) latest_files_changes_since_metadata_upload_scheduled: u64, - /// Metadata stored in the remote storage, taking into account all - /// in-progress and queued operations. - /// DANGER: do not return to outside world, e.g., safekeepers. - pub(crate) latest_metadata: TimelineMetadata, - - /// `disk_consistent_lsn` from the last metadata file that was successfully - /// uploaded. `Lsn(0)` if nothing was uploaded yet. - /// Unlike `latest_files` or `latest_metadata`, this value is never ahead. - /// Safekeeper can rely on it to make decisions for WAL storage. - /// - /// visible_remote_consistent_lsn is only updated after our generation has been validated with + /// The Lsn is only updated after our generation has been validated with /// the control plane (unless a timeline's generation is None, in which case /// we skip validation) - pub(crate) projected_remote_consistent_lsn: Option<Lsn>, pub(crate) visible_remote_consistent_lsn: Arc<AtomicLsn>, // Breakdown of different kinds of tasks currently in-progress @@ -89,7 +86,7 @@ pub(crate) struct UploadQueueInitialized { /// Putting this behind a testing feature to catch problems in tests, but assuming we could have a /// bug causing leaks, then it's better to not leave this enabled for production builds. #[cfg(feature = "testing")] - pub(crate) dangling_files: HashMap<LayerFileName, Generation>, + pub(crate) dangling_files: HashMap<LayerName, Generation>, /// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`.
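// Aside: a sketch of the monotonicity rule described for `clean` above (the
// `IndexPart` parameter stands in for the real struct; names are illustrative).
// A completing metadata upload installs its snapshot only if no task with a
// higher id already did, so a late-finishing older upload can never overwrite
// a newer one.
fn maybe_install_clean<IndexPart>(
    clean: &mut (IndexPart, Option<u64>),
    uploaded: IndexPart,
    task_id: u64,
) {
    let newer_already_installed = matches!(clean.1, Some(last) if last > task_id);
    if !newer_already_installed {
        *clean = (uploaded, Some(task_id));
    }
}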
pub(crate) shutting_down: bool, @@ -110,7 +107,8 @@ impl UploadQueueInitialized { } pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option { - self.projected_remote_consistent_lsn + let lsn = self.clean.0.metadata.disk_consistent_lsn(); + self.clean.1.map(|_| lsn) } } @@ -121,16 +119,21 @@ pub(super) enum SetDeletedFlagProgress { Successful(NaiveDateTime), } -pub(super) struct UploadQueueStopped { +pub(super) struct UploadQueueStoppedDeletable { pub(super) upload_queue_for_deletion: UploadQueueInitialized, pub(super) deleted_at: SetDeletedFlagProgress, } +pub(super) enum UploadQueueStopped { + Deletable(UploadQueueStoppedDeletable), + Uninitialized, +} + #[derive(thiserror::Error, Debug)] -pub(crate) enum NotInitialized { +pub enum NotInitialized { #[error("queue is in state Uninitialized")] Uninitialized, - #[error("queue is in state Stopping")] + #[error("queue is in state Stopped")] Stopped, #[error("queue is shutting down")] ShuttingDown, @@ -161,12 +164,12 @@ impl UploadQueue { info!("initializing upload queue for empty remote"); + let index_part = IndexPart::empty(metadata.clone()); + let state = UploadQueueInitialized { - // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead. - latest_files: HashMap::new(), + dirty: index_part.clone(), + clean: (index_part, None), latest_files_changes_since_metadata_upload_scheduled: 0, - latest_metadata: metadata.clone(), - projected_remote_consistent_lsn: None, visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)), // what follows are boring default initializations task_counter: 0, @@ -196,24 +199,15 @@ impl UploadQueue { } } - let mut files = HashMap::with_capacity(index_part.layer_metadata.len()); - for (layer_name, layer_metadata) in &index_part.layer_metadata { - files.insert( - layer_name.to_owned(), - LayerFileMetadata::from(layer_metadata), - ); - } - info!( "initializing upload queue with remote index_part.disk_consistent_lsn: {}", index_part.metadata.disk_consistent_lsn() ); let state = UploadQueueInitialized { - latest_files: files, + dirty: index_part.clone(), + clean: (index_part.clone(), None), latest_files_changes_since_metadata_upload_scheduled: 0, - latest_metadata: index_part.metadata.clone(), - projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()), visible_remote_consistent_lsn: Arc::new( index_part.metadata.disk_consistent_lsn().into(), ), @@ -234,27 +228,32 @@ impl UploadQueue { Ok(self.initialized_mut().expect("we just set it")) } - pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> { + pub(crate) fn initialized_mut( + &mut self, + ) -> Result<&mut UploadQueueInitialized, NotInitialized> { use UploadQueue::*; match self { - Uninitialized => Err(NotInitialized::Uninitialized.into()), + Uninitialized => Err(NotInitialized::Uninitialized), Initialized(x) => { if x.shutting_down { - Err(NotInitialized::ShuttingDown.into()) + Err(NotInitialized::ShuttingDown) } else { Ok(x) } } - Stopped(_) => Err(NotInitialized::Stopped.into()), + Stopped(_) => Err(NotInitialized::Stopped), } } - pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStopped> { + pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStoppedDeletable> { match self { UploadQueue::Initialized(_) | UploadQueue::Uninitialized => { anyhow::bail!("queue is in state {}", self.as_str()) } - UploadQueue::Stopped(stopped) => Ok(stopped), + UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => { + 
anyhow::bail!("queue is in state Stopped(Uninitialized)") + } + UploadQueue::Stopped(UploadQueueStopped::Deletable(deletable)) => Ok(deletable), } } } @@ -273,7 +272,7 @@ pub(crate) struct UploadTask { /// for timeline deletion, which skips this queue and goes directly to DeletionQueue. #[derive(Debug)] pub(crate) struct Delete { - pub(crate) layers: Vec<(LayerFileName, LayerFileMetadata)>, + pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>, } #[derive(Debug)] @@ -281,13 +280,16 @@ pub(crate) enum UploadOp { /// Upload a layer file UploadLayer(ResidentLayer, LayerFileMetadata), - /// Upload the metadata file - UploadMetadata(IndexPart, Lsn), + /// Upload an index_part.json file + UploadMetadata { + /// The next [`UploadQueueInitialized::clean`] after this upload succeeds. + uploaded: Box<IndexPart>, + }, /// Delete layer files Delete(Delete), - /// Barrier. When the barrier operation is reached, + /// Barrier. When the barrier operation is reached, the channel is closed. Barrier(tokio::sync::watch::Sender<()>), /// Shutdown; upon encountering this operation no new operations will be spawned, otherwise @@ -302,13 +304,15 @@ impl std::fmt::Display for UploadOp { write!( f, "UploadLayer({}, size={:?}, gen={:?})", - layer, - metadata.file_size(), - metadata.generation + layer, metadata.file_size, metadata.generation ) } - UploadOp::UploadMetadata(_, lsn) => { - write!(f, "UploadMetadata(lsn: {})", lsn) + UploadOp::UploadMetadata { uploaded, .. } => { + write!( + f, + "UploadMetadata(lsn: {})", + uploaded.metadata.disk_consistent_lsn() + ) } UploadOp::Delete(delete) => { write!(f, "Delete({} layers)", delete.layers.len()) diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs new file mode 100644 index 0000000000..553edf6d8b --- /dev/null +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -0,0 +1,1062 @@ +//! +//! Utilities for vectored reading of variable-sized "blobs". +//! +//! The "blob" api is an abstraction on top of the "block" api, +//! with the main difference being that blobs do not have a fixed +//! size (each blob is prefixed with a 1- or 4-byte length field) +//! +//! The vectored apis provided in this module allow for planning +//! and executing disk IO which covers multiple blobs. +//! +//! Reads are planned with [`VectoredReadPlanner`], which coalesces +//! adjacent blocks into a single disk IO request, and executed by +//! [`VectoredBlobReader`], which does all the required offset juggling +//! and returns a buffer housing all the blobs and a list of offsets. +//! +//! Note that the vectored blob api does *not* go through the page cache. + +use std::collections::BTreeMap; + +use bytes::BytesMut; +use pageserver_api::key::Key; +use tokio::io::AsyncWriteExt; +use tokio_epoll_uring::BoundedBuf; +use utils::lsn::Lsn; +use utils::vec_map::VecMap; + +use crate::context::RequestContext; +use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; +use crate::virtual_file::{self, VirtualFile}; + +/// Metadata bundled with the start and end offset of a blob.
+#[derive(Copy, Clone, Debug)] +pub struct BlobMeta { + pub key: Key, + pub lsn: Lsn, +} + +/// Blob offsets into [`VectoredBlobsBuf::buf`] +pub struct VectoredBlob { + pub start: usize, + pub end: usize, + pub meta: BlobMeta, +} + +/// Return type of [`VectoredBlobReader::read_blobs`] +pub struct VectoredBlobsBuf { + /// Buffer for all blobs in this read + pub buf: BytesMut, + /// Offsets into the buffer and metadata for all blobs in this read + pub blobs: Vec<VectoredBlob>, +} + +/// Description of one disk read for multiple blobs. +/// Used as the argument for [`VectoredBlobReader::read_blobs`] +#[derive(Debug)] +pub struct VectoredRead { + pub start: u64, + pub end: u64, + /// Start offset and metadata for each blob in this read + pub blobs_at: VecMap<u64, BlobMeta>, +} + +impl VectoredRead { + pub(crate) fn size(&self) -> usize { + (self.end - self.start) as usize + } +} + +#[derive(Eq, PartialEq, Debug)] +pub(crate) enum VectoredReadExtended { + Yes, + No, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum VectoredReadCoalesceMode { + /// Only coalesce exactly adjacent reads. + AdjacentOnly, + /// In addition to adjacent reads, also consider reads whose corresponding + /// `end` and `start` offsets reside in the same chunk. + Chunked(usize), +} + +impl VectoredReadCoalesceMode { + /// [`AdjacentVectoredReadBuilder`] is used if alignment requirement is 0, + /// whereas [`ChunkedVectoredReadBuilder`] is used for alignment requirement 1 and higher. + pub(crate) fn get() -> Self { + let align = virtual_file::get_io_buffer_alignment_raw(); + if align == 0 { + VectoredReadCoalesceMode::AdjacentOnly + } else { + VectoredReadCoalesceMode::Chunked(align) + } + } +} + +pub(crate) enum VectoredReadBuilder { + Adjacent(AdjacentVectoredReadBuilder), + Chunked(ChunkedVectoredReadBuilder), +} + +impl VectoredReadBuilder { + fn new_impl( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: Option<usize>, + mode: VectoredReadCoalesceMode, + ) -> Self { + match mode { + VectoredReadCoalesceMode::AdjacentOnly => Self::Adjacent( + AdjacentVectoredReadBuilder::new(start_offset, end_offset, meta, max_read_size), + ), + VectoredReadCoalesceMode::Chunked(chunk_size) => { + Self::Chunked(ChunkedVectoredReadBuilder::new( + start_offset, + end_offset, + meta, + max_read_size, + chunk_size, + )) + } + } + } + + pub(crate) fn new( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: usize, + mode: VectoredReadCoalesceMode, + ) -> Self { + Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), mode) + } + + pub(crate) fn new_streaming( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + mode: VectoredReadCoalesceMode, + ) -> Self { + Self::new_impl(start_offset, end_offset, meta, None, mode) + } + + pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + match self { + VectoredReadBuilder::Adjacent(builder) => builder.extend(start, end, meta), + VectoredReadBuilder::Chunked(builder) => builder.extend(start, end, meta), + } + } + + pub(crate) fn build(self) -> VectoredRead { + match self { + VectoredReadBuilder::Adjacent(builder) => builder.build(), + VectoredReadBuilder::Chunked(builder) => builder.build(), + } + } + + pub(crate) fn size(&self) -> usize { + match self { + VectoredReadBuilder::Adjacent(builder) => builder.size(), + VectoredReadBuilder::Chunked(builder) => builder.size(), + } + } +} + +pub(crate) struct AdjacentVectoredReadBuilder { + /// Start offset of the read. + start: u64, + /// End offset of the read.
+ end: u64, + /// Start offset and metadata for each blob in this read + blobs_at: VecMap<u64, BlobMeta>, + max_read_size: Option<usize>, +} + +impl AdjacentVectoredReadBuilder { + /// Start building a new vectored read. + /// + /// Note that by design, this does not check against reading more than `max_read_size` to + /// support reading larger blobs than the configuration value. However, the builder is + /// single-use after that. + pub(crate) fn new( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: Option<usize>, + ) -> Self { + let mut blobs_at = VecMap::default(); + blobs_at + .append(start_offset, meta) + .expect("First insertion always succeeds"); + + Self { + start: start_offset, + end: end_offset, + blobs_at, + max_read_size, + } + } + /// Attempt to extend the current read with a new blob if the start + /// offset matches the current end of the vectored read + /// and the resulting size is below the max read size + pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + tracing::trace!(start, end, "trying to extend"); + let size = (end - start) as usize; + let not_limited_by_max_read_size = { + if let Some(max_read_size) = self.max_read_size { + self.size() + size <= max_read_size + } else { + true + } + }; + + if self.end == start && not_limited_by_max_read_size { + self.end = end; + self.blobs_at + .append(start, meta) + .expect("LSNs are ordered within vectored reads"); + + return VectoredReadExtended::Yes; + } + + VectoredReadExtended::No + } + + pub(crate) fn size(&self) -> usize { + (self.end - self.start) as usize + } + + pub(crate) fn build(self) -> VectoredRead { + VectoredRead { + start: self.start, + end: self.end, + blobs_at: self.blobs_at, + } + } +} + +pub(crate) struct ChunkedVectoredReadBuilder { + /// Start block number + start_blk_no: usize, + /// End block number (exclusive). + end_blk_no: usize, + /// Start offset and metadata for each blob in this read + blobs_at: VecMap<u64, BlobMeta>, + max_read_size: Option<usize>, + /// Chunk size reads are coalesced into. + chunk_size: usize, +} + +/// Computes x / d rounded up. +fn div_round_up(x: usize, d: usize) -> usize { + (x + (d - 1)) / d +} + +impl ChunkedVectoredReadBuilder { + /// Start building a new vectored read. + /// + /// Note that by design, this does not check against reading more than `max_read_size` to + /// support reading larger blobs than the configuration value. However, the builder is + /// single-use after that. + pub(crate) fn new( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: Option<usize>, + chunk_size: usize, + ) -> Self { + let mut blobs_at = VecMap::default(); + blobs_at + .append(start_offset, meta) + .expect("First insertion always succeeds"); + + let start_blk_no = start_offset as usize / chunk_size; + let end_blk_no = div_round_up(end_offset as usize, chunk_size); + Self { + start_blk_no, + end_blk_no, + blobs_at, + max_read_size, + chunk_size, + } + } + + /// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk. + /// + /// The resulting size also must be below the max read size.
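// Aside: a worked example of the chunk arithmetic above, assuming a chunk_size
// of 512 (the real value comes from the io buffer alignment). A blob spanning
// [700, 1100) occupies chunks 1..3: start_blk_no = 700 / 512 = 1 and
// end_blk_no = div_round_up(1100, 512) = 3. A following blob at [1100, 1300)
// has start_blk_no = 1100 / 512 = 2; since 3 == 2 + 1, it starts in the read's
// last chunk and the two coalesce into one read covering chunks 1..3.
fn chunk_arithmetic_worked_example() {
    let chunk_size = 512usize;
    assert_eq!(700 / chunk_size, 1);
    assert_eq!(div_round_up(1100, chunk_size), 3);
    assert_eq!(1100 / chunk_size, 2);
    assert_eq!(div_round_up(1300, chunk_size), 3); // end chunk is unchanged
}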
+ pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + tracing::trace!(start, end, "trying to extend"); + let start_blk_no = start as usize / self.chunk_size; + let end_blk_no = div_round_up(end as usize, self.chunk_size); + + let not_limited_by_max_read_size = { + if let Some(max_read_size) = self.max_read_size { + let coalesced_size = (end_blk_no - self.start_blk_no) * self.chunk_size; + coalesced_size <= max_read_size + } else { + true + } + }; + + // True if the new blob starts in the chunk where the current read ends, or in the + // chunk immediately after it. + // + // Note: This automatically handles the case where two blobs are adjacent to each other, + // whether or not they start on a chunk-size boundary. + let is_adjacent_chunk_read = { + // 1. first.end & second.start are in the same chunk + self.end_blk_no == start_blk_no + 1 || + // 2. first.end ends one chunk before second.start + self.end_blk_no == start_blk_no + }; + + if is_adjacent_chunk_read && not_limited_by_max_read_size { + self.end_blk_no = end_blk_no; + self.blobs_at + .append(start, meta) + .expect("LSNs are ordered within vectored reads"); + + return VectoredReadExtended::Yes; + } + + VectoredReadExtended::No + } + + pub(crate) fn size(&self) -> usize { + (self.end_blk_no - self.start_blk_no) * self.chunk_size + } + + pub(crate) fn build(self) -> VectoredRead { + let start = (self.start_blk_no * self.chunk_size) as u64; + let end = (self.end_blk_no * self.chunk_size) as u64; + VectoredRead { + start, + end, + blobs_at: self.blobs_at, + } + } +} + +#[derive(Copy, Clone, Debug)] +pub enum BlobFlag { + None, + Ignore, + ReplaceAll, +} + +/// Planner for vectored blob reads. +/// +/// Blob offsets are received via [`VectoredReadPlanner::handle`] +/// and coalesced into disk reads. +/// +/// The implementation is very simple: +/// * Collect all blob offsets in an ordered structure +/// * Iterate over the collected blobs and coalesce them into reads at the end +pub struct VectoredReadPlanner { + // Track all the blob offsets. Start offsets must be ordered. + blobs: BTreeMap<Key, Vec<(Lsn, u64, u64)>>, + // Arguments for previous blob passed into [`VectoredReadPlanner::handle`] + prev: Option<(Key, Lsn, u64, BlobFlag)>, + + max_read_size: usize, + + mode: VectoredReadCoalesceMode, +} + +impl VectoredReadPlanner { + pub fn new(max_read_size: usize) -> Self { + let mode = VectoredReadCoalesceMode::get(); + Self { + blobs: BTreeMap::new(), + prev: None, + max_read_size, + mode, + } + } + + /// Include a new blob in the read plan. + /// + /// This function is called from a B-Tree index visitor (see `DeltaLayerInner::plan_reads` + /// and `ImageLayerInner::plan_reads`). Said visitor wants to collect blob offsets for all + /// keys in a given keyspace. This function must be called for each key in the desired + /// keyspace (monotonically continuous). [`Self::handle_range_end`] must + /// be called after every range of offsets. + /// + /// In the event that keys are skipped, the behaviour is undefined and can lead to an + /// incorrect read plan. We can end up asserting, erroring in wal redo or returning + /// incorrect data to the user. + /// + /// The `flag` argument has two interesting values: + /// * [`BlobFlag::ReplaceAll`]: The blob for this key should replace all existing blobs. + /// This is used for WAL records that `will_init`. + /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens + /// if the blob is cached.
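// Aside: the intended end-to-end call pattern per the docs above (illustrative
// only; offsets normally come from a layer's B-Tree index visitor, and error
// handling is simplified here with anyhow).
async fn plan_and_read(
    file: &VirtualFile,
    offsets: Vec<(Key, Lsn, u64, BlobFlag)>, // ascending start offsets per key
    range_end: u64,
    ctx: &RequestContext,
) -> anyhow::Result<Vec<bytes::Bytes>> {
    let mut planner = VectoredReadPlanner::new(128 * 1024);
    for (key, lsn, offset, flag) in offsets {
        planner.handle(key, lsn, offset, flag);
    }
    planner.handle_range_end(range_end);

    let reader = VectoredBlobReader::new(file);
    let mut out = Vec::new();
    for read in planner.finish() {
        let buf = BytesMut::with_capacity(read.size());
        let blobs_buf = reader.read_blobs(&read, buf, ctx).await?;
        for blob in &blobs_buf.blobs {
            out.push(bytes::Bytes::copy_from_slice(
                &blobs_buf.buf[blob.start..blob.end],
            ));
        }
    }
    Ok(out)
}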
+ pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) { + // Implementation note: internally lag behind by one blob such that + // we have a start and end offset when initialising [`VectoredRead`] + let (prev_key, prev_lsn, prev_offset, prev_flag) = match self.prev { + None => { + self.prev = Some((key, lsn, offset, flag)); + return; + } + Some(prev) => prev, + }; + + self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag); + + self.prev = Some((key, lsn, offset, flag)); + } + + pub fn handle_range_end(&mut self, offset: u64) { + if let Some((prev_key, prev_lsn, prev_offset, prev_flag)) = self.prev { + self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag); + } + + self.prev = None; + } + + fn add_blob(&mut self, key: Key, lsn: Lsn, start_offset: u64, end_offset: u64, flag: BlobFlag) { + match flag { + BlobFlag::None => { + let blobs_for_key = self.blobs.entry(key).or_default(); + blobs_for_key.push((lsn, start_offset, end_offset)); + } + BlobFlag::ReplaceAll => { + let blobs_for_key = self.blobs.entry(key).or_default(); + blobs_for_key.clear(); + blobs_for_key.push((lsn, start_offset, end_offset)); + } + BlobFlag::Ignore => {} + } + } + + pub fn finish(self) -> Vec { + let mut current_read_builder: Option = None; + let mut reads = Vec::new(); + + for (key, blobs_for_key) in self.blobs { + for (lsn, start_offset, end_offset) in blobs_for_key { + let extended = match &mut current_read_builder { + Some(read_builder) => { + read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn }) + } + None => VectoredReadExtended::No, + }; + + if extended == VectoredReadExtended::No { + let next_read_builder = VectoredReadBuilder::new( + start_offset, + end_offset, + BlobMeta { key, lsn }, + self.max_read_size, + self.mode, + ); + + let prev_read_builder = current_read_builder.replace(next_read_builder); + + // `current_read_builder` is None in the first iteration of the outer loop + if let Some(read_builder) = prev_read_builder { + reads.push(read_builder.build()); + } + } + } + } + + if let Some(read_builder) = current_read_builder { + reads.push(read_builder.build()); + } + + reads + } +} + +/// Disk reader for vectored blob spans (does not go through the page cache) +pub struct VectoredBlobReader<'a> { + file: &'a VirtualFile, +} + +impl<'a> VectoredBlobReader<'a> { + pub fn new(file: &'a VirtualFile) -> Self { + Self { file } + } + + /// Read the requested blobs into the buffer. + /// + /// We have to deal with the fact that blobs are not fixed size. + /// Each blob is prefixed by a size header. + /// + /// The success return value is a struct which contains the buffer + /// filled from disk and a list of offsets at which each blob lies + /// in the buffer. + pub async fn read_blobs( + &self, + read: &VectoredRead, + buf: BytesMut, + ctx: &RequestContext, + ) -> Result { + assert!(read.size() > 0); + assert!( + read.size() <= buf.capacity(), + "{} > {}", + read.size(), + buf.capacity() + ); + + if cfg!(debug_assertions) { + let align = virtual_file::get_io_buffer_alignment() as u64; + debug_assert_eq!( + read.start % align, + 0, + "Read start at {} does not satisfy the required io buffer alignment ({} bytes)", + read.start, + align + ); + } + + let mut buf = self + .file + .read_exact_at(buf.slice(0..read.size()), read.start, ctx) + .await? + .into_inner(); + + let blobs_at = read.blobs_at.as_slice(); + + let start_offset = read.start; + + let mut metas = Vec::with_capacity(blobs_at.len()); + // Blobs in `read` only provide their starting offset. 
The end offset + // of a blob is implicit: the start of the next blob if one exists + // or the end of the read. + + // Some scratch space, put here for reusing the allocation + let mut decompressed_vec = Vec::new(); + + for (blob_start, meta) in blobs_at { + let blob_start_in_buf = blob_start - start_offset; + let first_len_byte = buf[blob_start_in_buf as usize]; + + // Each blob is prefixed by a header containing its size and compression information. + // Extract the size and skip that header to find the start of the data. + // The size can be 1 or 4 bytes. The most significant bit is 0 in the + // 1-byte case and 1 in the 4-byte case. + let (size_length, blob_size, compression_bits) = if first_len_byte < 0x80 { + (1, first_len_byte as u64, BYTE_UNCOMPRESSED) + } else { + let mut blob_size_buf = [0u8; 4]; + let offset_in_buf = blob_start_in_buf as usize; + + blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]); + blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK; + + let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK; + ( + 4, + u32::from_be_bytes(blob_size_buf) as u64, + compression_bits, + ) + }; + + let start_raw = blob_start_in_buf + size_length; + let end_raw = start_raw + blob_size; + let (start, end); + if compression_bits == BYTE_UNCOMPRESSED { + start = start_raw as usize; + end = end_raw as usize; + } else if compression_bits == BYTE_ZSTD { + let mut decoder = + async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec); + decoder + .write_all(&buf[start_raw as usize..end_raw as usize]) + .await?; + decoder.flush().await?; + start = buf.len(); + buf.extend_from_slice(&decompressed_vec); + end = buf.len(); + decompressed_vec.clear(); + } else { + let error = std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("invalid compression byte {compression_bits:x}"), + ); + return Err(error); + } + + metas.push(VectoredBlob { + start, + end, + meta: *meta, + }); + } + + Ok(VectoredBlobsBuf { buf, blobs: metas }) + } +} + +/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. +/// +/// It provides a streaming API for getting read blobs: a batch is returned from +/// `handle` when adding the current key would exceed the read_size or +/// max_cnt constraints. +pub struct StreamingVectoredReadPlanner { + read_builder: Option<VectoredReadBuilder>, + // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`] + prev: Option<(Key, Lsn, u64)>, + /// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150, + /// we will produce a single batch instead of splitting them.
+ max_read_size: u64, + /// Max item count per batch + max_cnt: usize, + /// Size of the current batch + cnt: usize, + + mode: VectoredReadCoalesceMode, +} + +impl StreamingVectoredReadPlanner { + pub fn new(max_read_size: u64, max_cnt: usize) -> Self { + assert!(max_cnt > 0); + assert!(max_read_size > 0); + let mode = VectoredReadCoalesceMode::get(); + Self { + read_builder: None, + prev: None, + max_cnt, + max_read_size, + cnt: 0, + mode, + } + } + + pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64) -> Option { + // Implementation note: internally lag behind by one blob such that + // we have a start and end offset when initialising [`VectoredRead`] + let (prev_key, prev_lsn, prev_offset) = match self.prev { + None => { + self.prev = Some((key, lsn, offset)); + return None; + } + Some(prev) => prev, + }; + + let res = self.add_blob(prev_key, prev_lsn, prev_offset, offset, false); + + self.prev = Some((key, lsn, offset)); + + res + } + + pub fn handle_range_end(&mut self, offset: u64) -> Option { + let res = if let Some((prev_key, prev_lsn, prev_offset)) = self.prev { + self.add_blob(prev_key, prev_lsn, prev_offset, offset, true) + } else { + None + }; + + self.prev = None; + + res + } + + fn add_blob( + &mut self, + key: Key, + lsn: Lsn, + start_offset: u64, + end_offset: u64, + is_last_blob_in_read: bool, + ) -> Option { + match &mut self.read_builder { + Some(read_builder) => { + let extended = read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn }); + assert_eq!(extended, VectoredReadExtended::Yes); + } + None => { + self.read_builder = { + Some(VectoredReadBuilder::new_streaming( + start_offset, + end_offset, + BlobMeta { key, lsn }, + self.mode, + )) + }; + } + } + let read_builder = self.read_builder.as_mut().unwrap(); + self.cnt += 1; + if is_last_blob_in_read + || read_builder.size() >= self.max_read_size as usize + || self.cnt >= self.max_cnt + { + let prev_read_builder = self.read_builder.take(); + self.cnt = 0; + + // `current_read_builder` is None in the first iteration + if let Some(read_builder) = prev_read_builder { + return Some(read_builder.build()); + } + } + None + } +} + +#[cfg(test)] +mod tests { + use anyhow::Error; + + use crate::context::DownloadBehavior; + use crate::page_cache::PAGE_SZ; + use crate::task_mgr::TaskKind; + + use super::super::blob_io::tests::{random_array, write_maybe_compressed}; + use super::*; + + fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { + let align = virtual_file::get_io_buffer_alignment() as u64; + assert_eq!(read.start % align, 0); + assert_eq!(read.start / align, offset_range.first().unwrap().2 / align); + + let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect(); + + let offsets_in_read: Vec<_> = read + .blobs_at + .as_slice() + .iter() + .map(|(offset, _)| *offset) + .collect(); + + assert_eq!(expected_offsets_in_read, offsets_in_read); + } + + #[test] + fn planner_chunked_coalesce_all_test() { + use crate::virtual_file; + + let chunk_size = virtual_file::get_io_buffer_alignment() as u64; + + // The test explicitly does not check chunk size < 512 + if chunk_size < 512 { + return; + } + + let max_read_size = chunk_size as usize * 8; + let key = Key::MIN; + let lsn = Lsn(0); + + let blob_descriptions = [ + (key, lsn, chunk_size / 8, BlobFlag::None), // Read 1 BEGIN + (key, lsn, chunk_size / 4, BlobFlag::Ignore), // Gap + (key, lsn, chunk_size / 2, BlobFlag::None), + (key, lsn, chunk_size - 2, BlobFlag::Ignore), // Gap + (key, lsn, chunk_size, 
BlobFlag::None), + (key, lsn, chunk_size * 2 - 1, BlobFlag::None), + (key, lsn, chunk_size * 2 + 1, BlobFlag::Ignore), // Gap + (key, lsn, chunk_size * 3 + 1, BlobFlag::None), + (key, lsn, chunk_size * 5 + 1, BlobFlag::None), + (key, lsn, chunk_size * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce. + (key, lsn, chunk_size * 7 + 1, BlobFlag::None), + (key, lsn, chunk_size * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size) + (key, lsn, chunk_size * 9, BlobFlag::Ignore), // ==== skipped a chunk + (key, lsn, chunk_size * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce) + ]; + + let ranges = [ + &[ + blob_descriptions[0], + blob_descriptions[2], + blob_descriptions[4], + blob_descriptions[5], + blob_descriptions[7], + blob_descriptions[8], + blob_descriptions[10], + ], + &blob_descriptions[11..12], + &blob_descriptions[13..], + ]; + + let mut planner = VectoredReadPlanner::new(max_read_size); + for (key, lsn, offset, flag) in blob_descriptions { + planner.handle(key, lsn, offset, flag); + } + + planner.handle_range_end(652 * 1024); + + let reads = planner.finish(); + + assert_eq!(reads.len(), ranges.len()); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } + + #[test] + fn planner_max_read_size_test() { + let max_read_size = 128 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (key, lsn, 0, BlobFlag::None), + (key, lsn, 32 * 1024, BlobFlag::None), + (key, lsn, 96 * 1024, BlobFlag::None), // Last in read 1 + (key, lsn, 128 * 1024, BlobFlag::None), // Last in read 2 + (key, lsn, 198 * 1024, BlobFlag::None), // Last in read 3 + (key, lsn, 268 * 1024, BlobFlag::None), // Last in read 4 + (key, lsn, 396 * 1024, BlobFlag::None), // Last in read 5 + (key, lsn, 652 * 1024, BlobFlag::None), // Last in read 6 + ]; + + let ranges = [ + &blob_descriptions[0..3], + &blob_descriptions[3..4], + &blob_descriptions[4..5], + &blob_descriptions[5..6], + &blob_descriptions[6..7], + &blob_descriptions[7..], + ]; + + let mut planner = VectoredReadPlanner::new(max_read_size); + for (key, lsn, offset, flag) in blob_descriptions.clone() { + planner.handle(key, lsn, offset, flag); + } + + planner.handle_range_end(652 * 1024); + + let reads = planner.finish(); + + assert_eq!(reads.len(), 6); + + // TODO: could remove zero reads to produce 5 reads here + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } + + #[test] + fn planner_replacement_test() { + let chunk_size = virtual_file::get_io_buffer_alignment() as u64; + let max_read_size = 128 * chunk_size as usize; + let first_key = Key::MIN; + let second_key = first_key.next(); + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (first_key, lsn, 0, BlobFlag::None), // First in read 1 + (first_key, lsn, chunk_size, BlobFlag::None), // Last in read 1 + (second_key, lsn, 2 * chunk_size, BlobFlag::ReplaceAll), + (second_key, lsn, 3 * chunk_size, BlobFlag::None), + (second_key, lsn, 4 * chunk_size, BlobFlag::ReplaceAll), // First in read 2 + (second_key, lsn, 5 * chunk_size, BlobFlag::None), // Last in read 2 + ]; + + let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]]; + + let mut planner = VectoredReadPlanner::new(max_read_size); + for (key, lsn, offset, flag) in blob_descriptions.clone() { + planner.handle(key, lsn, offset, flag); + } + + planner.handle_range_end(6 * chunk_size); + + let reads = planner.finish(); + assert_eq!(reads.len(), 2); + + for (idx, read) in reads.iter().enumerate() { 
+ validate_read(read, ranges[idx]); + } + } + + #[test] + fn streaming_planner_max_read_size_test() { + let max_read_size = 128 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (key, lsn, 0, BlobFlag::None), + (key, lsn, 32 * 1024, BlobFlag::None), + (key, lsn, 96 * 1024, BlobFlag::None), + (key, lsn, 128 * 1024, BlobFlag::None), + (key, lsn, 198 * 1024, BlobFlag::None), + (key, lsn, 268 * 1024, BlobFlag::None), + (key, lsn, 396 * 1024, BlobFlag::None), + (key, lsn, 652 * 1024, BlobFlag::None), + ]; + + let ranges = [ + &blob_descriptions[0..3], + &blob_descriptions[3..5], + &blob_descriptions[5..6], + &blob_descriptions[6..7], + &blob_descriptions[7..], + ]; + + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1000); + let mut reads = Vec::new(); + for (key, lsn, offset, _) in blob_descriptions.clone() { + reads.extend(planner.handle(key, lsn, offset)); + } + reads.extend(planner.handle_range_end(652 * 1024)); + + assert_eq!(reads.len(), ranges.len()); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } + + #[test] + fn streaming_planner_max_cnt_test() { + let max_read_size = 1024 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (key, lsn, 0, BlobFlag::None), + (key, lsn, 32 * 1024, BlobFlag::None), + (key, lsn, 96 * 1024, BlobFlag::None), + (key, lsn, 128 * 1024, BlobFlag::None), + (key, lsn, 198 * 1024, BlobFlag::None), + (key, lsn, 268 * 1024, BlobFlag::None), + (key, lsn, 396 * 1024, BlobFlag::None), + (key, lsn, 652 * 1024, BlobFlag::None), + ]; + + let ranges = [ + &blob_descriptions[0..2], + &blob_descriptions[2..4], + &blob_descriptions[4..6], + &blob_descriptions[6..], + ]; + + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2); + let mut reads = Vec::new(); + for (key, lsn, offset, _) in blob_descriptions.clone() { + reads.extend(planner.handle(key, lsn, offset)); + } + reads.extend(planner.handle_range_end(652 * 1024)); + + assert_eq!(reads.len(), ranges.len()); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } + + #[test] + fn streaming_planner_edge_test() { + let max_read_size = 1024 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); + let mut reads = Vec::new(); + reads.extend(planner.handle_range_end(652 * 1024)); + assert!(reads.is_empty()); + } + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); + let mut reads = Vec::new(); + reads.extend(planner.handle(key, lsn, 0)); + reads.extend(planner.handle_range_end(652 * 1024)); + assert_eq!(reads.len(), 1); + validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]); + } + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); + let mut reads = Vec::new(); + reads.extend(planner.handle(key, lsn, 0)); + reads.extend(planner.handle(key, lsn, 128 * 1024)); + reads.extend(planner.handle_range_end(652 * 1024)); + assert_eq!(reads.len(), 2); + validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]); + validate_read(&reads[1], &[(key, lsn, 128 * 1024, BlobFlag::None)]); + } + { + let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2); + let mut reads = Vec::new(); + reads.extend(planner.handle(key, lsn, 0)); + reads.extend(planner.handle(key, lsn, 128 * 1024)); + reads.extend(planner.handle_range_end(652 * 1024)); + assert_eq!(reads.len(), 1); + validate_read( + &reads[0], + &[ + (key, lsn, 0, 
BlobFlag::None),
+                    (key, lsn, 128 * 1024, BlobFlag::None),
+                ],
+            );
+        }
+    }
+
+    async fn round_trip_test_compressed(blobs: &[Vec<u8>], compression: bool) -> Result<(), Error> {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let (_temp_dir, pathbuf, offsets) =
+            write_maybe_compressed::<true>(blobs, compression, &ctx).await?;
+
+        let file = VirtualFile::open(&pathbuf, &ctx).await?;
+        let file_len = std::fs::metadata(&pathbuf)?.len();
+
+        // Multiply by two (compressed data might need more space), and add a few bytes for the header
+        let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16;
+        let mut buf = BytesMut::with_capacity(reserved_bytes);
+
+        let mode = VectoredReadCoalesceMode::get();
+        let vectored_blob_reader = VectoredBlobReader::new(&file);
+        let meta = BlobMeta {
+            key: Key::MIN,
+            lsn: Lsn(0),
+        };
+
+        for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
+            let end = offsets.get(idx + 1).unwrap_or(&file_len);
+            if idx + 1 == offsets.len() {
+                continue;
+            }
+            let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, mode);
+            let read = read_builder.build();
+            let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
+            assert_eq!(result.blobs.len(), 1);
+            let read_blob = &result.blobs[0];
+            let read_buf = &result.buf[read_blob.start..read_blob.end];
+            assert_eq!(blob, read_buf, "mismatch for idx={idx} at offset={offset}");
+            buf = result.buf;
+        }
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_really_big_array() -> Result<(), Error> {
+        let blobs = &[
+            b"test".to_vec(),
+            random_array(10 * PAGE_SZ),
+            b"hello".to_vec(),
+            random_array(66 * PAGE_SZ),
+            vec![0xf3; 24 * PAGE_SZ],
+            b"foobar".to_vec(),
+        ];
+        round_trip_test_compressed(blobs, false).await?;
+        round_trip_test_compressed(blobs, true).await?;
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_arrays_inc() -> Result<(), Error> {
+        let blobs = (0..PAGE_SZ / 8)
+            .map(|v| random_array(v * 16))
+            .collect::<Vec<_>>();
+        round_trip_test_compressed(&blobs, false).await?;
+        round_trip_test_compressed(&blobs, true).await?;
+        Ok(())
+    }
+
+    #[test]
+    fn test_div_round_up() {
+        const CHUNK_SIZE: usize = 512;
+        assert_eq!(1, div_round_up(200, CHUNK_SIZE));
+        assert_eq!(1, div_round_up(CHUNK_SIZE, CHUNK_SIZE));
+        assert_eq!(2, div_round_up(CHUNK_SIZE + 1, CHUNK_SIZE));
+    }
+}
diff --git a/pageserver/src/trace.rs b/pageserver/src/trace.rs
deleted file mode 100644
index 18ec269198..0000000000
--- a/pageserver/src/trace.rs
+++ /dev/null
@@ -1,36 +0,0 @@
-use bytes::Bytes;
-use camino::Utf8PathBuf;
-use std::{
-    fs::{create_dir_all, File},
-    io::{BufWriter, Write},
-};
-
-pub struct Tracer {
-    writer: BufWriter<File>,
-}
-
-impl Drop for Tracer {
-    fn drop(&mut self) {
-        self.flush()
-    }
-}
-
-impl Tracer {
-    pub fn new(path: Utf8PathBuf) -> Self {
-        let parent = path.parent().expect("failed to parse parent path");
-        create_dir_all(parent).expect("failed to create trace dir");
-
-        let file = File::create(path).expect("failed to create trace file");
-        Tracer {
-            writer: BufWriter::new(file),
-        }
-    }
-
-    pub fn trace(&mut self, msg: &Bytes) {
-        self.writer.write_all(msg).expect("failed to write trace");
-    }
-
-    pub fn flush(&mut self) {
-        self.writer.flush().expect("failed to flush trace file");
-    }
-}
diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs
new file mode 100644
index 0000000000..a0223f3bce
--- /dev/null
+++ b/pageserver/src/utilization.rs
@@ -0,0 +1,70 @@
+//! A utilization metric used to decide which pageserver to put the next tenant on.
+//!
+//! The metric is exposed via `GET /v1/utilization`. Refer to and maintain its openapi spec as the
+//! source of truth.
+
+use anyhow::Context;
+use std::path::Path;
+use utils::serde_percent::Percent;
+
+use pageserver_api::models::PageserverUtilization;
+
+use crate::{config::PageServerConf, metrics::NODE_UTILIZATION_SCORE, tenant::mgr::TenantManager};
+
+pub(crate) fn regenerate(
+    conf: &PageServerConf,
+    tenants_path: &Path,
+    tenant_manager: &TenantManager,
+) -> anyhow::Result<PageserverUtilization> {
+    let statvfs = nix::sys::statvfs::statvfs(tenants_path)
+        .map_err(std::io::Error::from)
+        .context("statvfs tenants directory")?;
+
+    // https://unix.stackexchange.com/a/703650
+    let blocksz = if statvfs.fragment_size() > 0 {
+        statvfs.fragment_size()
+    } else {
+        statvfs.block_size()
+    };
+
+    #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
+    let free = statvfs.blocks_available() as u64 * blocksz;
+
+    #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
+    let used = statvfs
+        .blocks()
+        // use blocks_free instead of available here to match df in case someone compares
+        .saturating_sub(statvfs.blocks_free()) as u64
+        * blocksz;
+
+    let captured_at = std::time::SystemTime::now();
+
+    // Calculate aggregate utilization from tenants on this pageserver
+    let (disk_wanted_bytes, shard_count) = tenant_manager.calculate_utilization()?;
+
+    // Fetch the fraction of disk space which may be used
+    let disk_usable_pct = match conf.disk_usage_based_eviction.clone() {
+        Some(e) => e.max_usage_pct,
+        None => Percent::new(100).unwrap(),
+    };
+
+    // Express a static value for how many shards we may schedule on one node
+    const MAX_SHARDS: u32 = 20000;
+
+    let mut doc = PageserverUtilization {
+        disk_usage_bytes: used,
+        free_space_bytes: free,
+        disk_wanted_bytes,
+        disk_usable_pct,
+        shard_count,
+        max_shard_count: MAX_SHARDS,
+        utilization_score: None,
+        captured_at: utils::serde_system_time::SystemTime(captured_at),
+    };
+
+    // Initialize `PageserverUtilization::utilization_score`
+    let score = doc.cached_score();
+    NODE_UTILIZATION_SCORE.set(score);
+
+    Ok(doc)
+}
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 06f58b5c52..57856eea80 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -1,6 +1,7 @@
-//!
 //! VirtualFile is like a normal File, but it's not bound directly to
-//! a file descriptor. Instead, the file is opened when it's read from,
+//! a file descriptor.
+//!
+//! Instead, the file is opened when it's read from,
 //! and if too many files are open globally in the system, least-recently
 //! used ones are closed.
 //!
@@ -10,18 +11,55 @@
 //! This is similar to PostgreSQL's virtual file descriptor facility in
 //! src/backend/storage/file/fd.c
 //!
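+//!
+//! A rough usage sketch (illustrative only, assembled from the call sites in this
+//! file's unit tests; not a new API surface):
+//! ```ignore
+//! let file = VirtualFile::open(&path, &ctx).await?;
+//! let slice = file
+//!     .read_exact_at(Vec::with_capacity(PAGE_SZ).slice_full(), 0, &ctx)
+//!     .await?;
+//! ```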
+use crate::context::RequestContext; use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; + +use crate::page_cache::{PageWriteGuard, PAGE_SZ}; use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; +use owned_buffers_io::io_buf_ext::FullSlice; +use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver_api::shard::TenantShardId; -use std::fs::{self, File, OpenOptions}; +use std::fs::File; use std::io::{Error, ErrorKind, Seek, SeekFrom}; -use std::os::unix::fs::FileExt; +use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; + +use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; -use utils::fs_ext; + +pub use pageserver_api::models::virtual_file as api; +pub(crate) mod io_engine; +pub use io_engine::feature_test as io_engine_feature_test; +pub use io_engine::io_engine_for_bench; +pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; +mod metadata; +mod open_options; +use self::owned_buffers_io::write::OwnedAsyncWriter; +pub(crate) use api::DirectIoMode; +pub(crate) use io_engine::IoEngineKind; +pub(crate) use metadata::Metadata; +pub(crate) use open_options::*; + +pub(crate) mod owned_buffers_io { + //! Abstractions for IO with owned buffers. + //! + //! Not actually tied to [`crate::virtual_file`] specifically, but, it's the primary + //! reason we need this abstraction. + //! + //! Over time, this could move into the `tokio-epoll-uring` crate, maybe `uring-common`, + //! but for the time being we're proving out the primitives in the neon.git repo + //! for faster iteration. + + pub(crate) mod io_buf_ext; + pub(crate) mod slice; + pub(crate) mod write; + pub(crate) mod util { + pub(crate) mod size_tracking_writer; + } +} /// /// A virtual file descriptor. You can use this just like std::fs::File, but internally @@ -106,7 +144,39 @@ struct SlotInner { tag: u64, /// the underlying file - file: Option, + file: Option, +} + +/// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`]. +struct PageWriteGuardBuf { + page: PageWriteGuard<'static>, +} +// Safety: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot, +// and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved. +// Page cache pages are zero-initialized, so, wrt uninitialized memory we're good. +// (Page cache tracks separately whether the contents are valid, see `PageWriteGuard::mark_valid`.) +unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf { + fn stable_ptr(&self) -> *const u8 { + self.page.as_ptr() + } + fn bytes_init(&self) -> usize { + self.page.len() + } + fn bytes_total(&self) -> usize { + self.page.len() + } +} +// Safety: see above, plus: the ownership of [`PageWriteGuard`] means exclusive access, +// hence it's safe to hand out the `stable_mut_ptr()`. +unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf { + fn stable_mut_ptr(&mut self) -> *mut u8 { + self.page.as_mut_ptr() + } + + unsafe fn set_init(&mut self, pos: usize) { + // There shouldn't really be any reason to call this API since bytes_init() == bytes_total(). + assert!(pos <= self.page.len()); + } } impl OpenFiles { @@ -274,20 +344,31 @@ macro_rules! 
with_file {
     let $ident = $this.lock_file().await?;
     observe_duration!($op, $($body)*)
 }};
+    ($this:expr, $op:expr, | mut $ident:ident | $($body:tt)*) => {{
+        let mut $ident = $this.lock_file().await?;
+        observe_duration!($op, $($body)*)
+    }};
 }
 
 impl VirtualFile {
     /// Open a file in read-only mode. Like File::open.
-    pub async fn open(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
-        Self::open_with_options(path, OpenOptions::new().read(true)).await
+    pub async fn open<P: AsRef<Utf8Path>>(
+        path: P,
+        ctx: &RequestContext,
+    ) -> Result<VirtualFile, std::io::Error> {
+        Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await
     }
 
     /// Create a new file for writing. If the file exists, it will be truncated.
     /// Like File::create.
-    pub async fn create(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
+    pub async fn create<P: AsRef<Utf8Path>>(
+        path: P,
+        ctx: &RequestContext,
+    ) -> Result<VirtualFile, std::io::Error> {
         Self::open_with_options(
-            path,
+            path.as_ref(),
             OpenOptions::new().write(true).create(true).truncate(true),
+            ctx,
         )
         .await
     }
@@ -297,11 +378,13 @@ impl VirtualFile {
     /// Note: If any custom flags were set in 'open_options' through OpenOptionsExt,
     /// they will be applied also when the file is subsequently re-opened, not only
     /// on the first time. Make sure that's sane!
-    pub async fn open_with_options(
-        path: &Utf8Path,
+    pub async fn open_with_options<P: AsRef<Utf8Path>>(
+        path: P,
         open_options: &OpenOptions,
+        _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
     ) -> Result<VirtualFile, std::io::Error> {
-        let path_str = path.to_string();
+        let path_ref = path.as_ref();
+        let path_str = path_ref.to_string();
         let parts = path_str.split('/').collect::<Vec<&str>>();
         let (tenant_id, shard_id, timeline_id) =
             if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME {
@@ -326,7 +409,9 @@ impl VirtualFile {
         // NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
         // where our caller doesn't get to use the returned VirtualFile before its
         // slot gets re-used by someone else.
-        let file = observe_duration!(StorageIoOperation::Open, open_options.open(path))?;
+        let file = observe_duration!(StorageIoOperation::Open, {
+            open_options.open(path_ref.as_std_path()).await?
+        });
 
         // Strip all options other than read and write.
         //
@@ -341,7 +426,7 @@ impl VirtualFile {
         let vfile = VirtualFile {
             handle: RwLock::new(handle),
             pos: 0,
-            path: path.to_path_buf(),
+            path: path_ref.to_path_buf(),
             open_options: reopen_options,
             tenant_id,
             shard_id,
@@ -356,54 +441,57 @@ impl VirtualFile {
         Ok(vfile)
     }
 
-    /// Async & [`VirtualFile`]-enabled version of [`::utils::crashsafe::overwrite`].
-    pub async fn crashsafe_overwrite(
-        final_path: &Utf8Path,
-        tmp_path: &Utf8Path,
-        content: &[u8],
+    /// Async version of [`::utils::crashsafe::overwrite`].
+    ///
+    /// # NB:
+    ///
+    /// Doesn't actually use the [`VirtualFile`] file descriptor cache, but it did at
+    /// an earlier time, and it will use this module's [`io_engine`] in the near
+    /// future, so it stays here.
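+    ///
+    /// A minimal usage sketch (mirrors the pattern in this patch's tests):
+    /// ```ignore
+    /// VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
+    ///     .await?;
+    /// ```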
+    pub async fn crashsafe_overwrite<B: BoundedBuf<Buf = Buf> + Send, Buf: IoBuf + Send>(
+        final_path: Utf8PathBuf,
+        tmp_path: Utf8PathBuf,
+        content: B,
     ) -> std::io::Result<()> {
-        let Some(final_path_parent) = final_path.parent() else {
-            return Err(std::io::Error::from_raw_os_error(
-                nix::errno::Errno::EINVAL as i32,
-            ));
-        };
-        std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?;
-        let mut file = Self::open_with_options(
-            tmp_path,
-            OpenOptions::new()
-                .write(true)
-                // Use `create_new` so that, if we race with ourselves or something else,
-                // we bail out instead of causing damage.
-                .create_new(true),
-        )
-        .await?;
-        file.write_all(content).await?;
-        file.sync_all().await?;
-        drop(file); // before the rename, that's important!
-        // renames are atomic
-        std::fs::rename(tmp_path, final_path)?;
-        // Only open final path parent dirfd now, so that this operation only
-        // ever holds one VirtualFile fd at a time. That's important because
-        // the current `find_victim_slot` impl might pick the same slot for both
-        // VirtualFile., and it eventually does a blocking write lock instead of
-        // try_lock.
-        let final_parent_dirfd =
-            Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?;
-        final_parent_dirfd.sync_all().await?;
-        Ok(())
+        // TODO: use tokio_epoll_uring if configured as `io_engine`.
+        // See https://github.com/neondatabase/neon/issues/6663
+
+        tokio::task::spawn_blocking(move || {
+            let slice_storage;
+            let content_len = content.bytes_init();
+            let content = if content.bytes_init() > 0 {
+                slice_storage = Some(content.slice(0..content_len));
+                slice_storage.as_deref().expect("just set it to Some()")
+            } else {
+                &[]
+            };
+            utils::crashsafe::overwrite(&final_path, &tmp_path, content)
+        })
+        .await
+        .expect("blocking task is never aborted")
     }
 
     /// Call File::sync_all() on the underlying File.
     pub async fn sync_all(&self) -> Result<(), Error> {
-        with_file!(self, StorageIoOperation::Fsync, |file| file
-            .as_ref()
-            .sync_all())
+        with_file!(self, StorageIoOperation::Fsync, |file_guard| {
+            let (_file_guard, res) = io_engine::get().sync_all(file_guard).await;
+            res
+        })
     }
 
-    pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
-        with_file!(self, StorageIoOperation::Metadata, |file| file
-            .as_ref()
-            .metadata())
+    /// Call File::sync_data() on the underlying File.
+    pub async fn sync_data(&self) -> Result<(), Error> {
+        with_file!(self, StorageIoOperation::Fsync, |file_guard| {
+            let (_file_guard, res) = io_engine::get().sync_data(file_guard).await;
+            res
+        })
+    }
+
+    pub async fn metadata(&self) -> Result<Metadata, Error> {
+        with_file!(self, StorageIoOperation::Metadata, |file_guard| {
+            let (_file_guard, res) = io_engine::get().metadata(file_guard).await;
+            res
+        })
     }
 
     /// Helper function internal to `VirtualFile` that looks up the underlying File,
@@ -412,7 +500,7 @@ impl VirtualFile {
     ///
     /// We are doing it via a macro as Rust doesn't support async closures that
    /// take on parameters with lifetimes.
-    async fn lock_file(&self) -> Result<FileGuard<'_>, Error> {
+    async fn lock_file(&self) -> Result<FileGuard, Error> {
         let open_files = get_open_files();
 
         let mut handle_guard = {
@@ -458,10 +546,9 @@ impl VirtualFile {
         // NB: we use StorageIoOperation::OpenAfterReplace for this to distinguish this
         // case from StorageIoOperation::Open. This helps with identifying thrashing
         // of the virtual file descriptor cache.
- let file = observe_duration!( - StorageIoOperation::OpenAfterReplace, - self.open_options.open(&self.path) - )?; + let file = observe_duration!(StorageIoOperation::OpenAfterReplace, { + self.open_options.open(self.path.as_std_path()).await? + }); // Store the File in the slot and update the handle in the VirtualFile // to point to it. @@ -486,9 +573,8 @@ impl VirtualFile { self.pos = offset; } SeekFrom::End(offset) => { - self.pos = with_file!(self, StorageIoOperation::Seek, |file| file - .as_ref() - .seek(SeekFrom::End(offset)))? + self.pos = with_file!(self, StorageIoOperation::Seek, |mut file_guard| file_guard + .with_std_file_mut(|std_file| std_file.seek(SeekFrom::End(offset))))? } SeekFrom::Current(offset) => { let pos = self.pos as i128 + offset as i128; @@ -507,136 +593,475 @@ impl VirtualFile { Ok(self.pos) } - // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 - pub async fn read_exact_at(&self, mut buf: &mut [u8], mut offset: u64) -> Result<(), Error> { - while !buf.is_empty() { - match self.read_at(buf, offset).await { - Ok(0) => { - return Err(Error::new( - std::io::ErrorKind::UnexpectedEof, - "failed to fill whole buffer", - )) - } - Ok(n) => { - buf = &mut buf[n..]; - offset += n as u64; - } - Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return Err(e), + /// Read the file contents in range `offset..(offset + slice.bytes_total())` into `slice[0..slice.bytes_total()]`. + /// + /// The returned `Slice` is equivalent to the input `slice`, i.e., it's the same view into the same buffer. + pub async fn read_exact_at( + &self, + slice: Slice, + offset: u64, + ctx: &RequestContext, + ) -> Result, Error> + where + Buf: IoBufMut + Send, + { + let assert_we_return_original_bounds = if cfg!(debug_assertions) { + Some((slice.stable_ptr() as usize, slice.bytes_total())) + } else { + None + }; + + let original_bounds = slice.bounds(); + let (buf, res) = + read_exact_at_impl(slice, offset, |buf, offset| self.read_at(buf, offset, ctx)).await; + let res = res.map(|_| buf.slice(original_bounds)); + + if let Some(original_bounds) = assert_we_return_original_bounds { + if let Ok(slice) = &res { + let returned_bounds = (slice.stable_ptr() as usize, slice.bytes_total()); + assert_eq!(original_bounds, returned_bounds); } } - Ok(()) + + res + } + + /// Like [`Self::read_exact_at`] but for [`PageWriteGuard`]. 
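+    ///
+    /// Rough call pattern (illustrative; assumes the caller already holds a
+    /// page-cache write guard `page`):
+    /// ```ignore
+    /// let page = file
+    ///     .read_exact_at_page(page, blknum as u64 * PAGE_SZ as u64, &ctx)
+    ///     .await?;
+    /// ```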
+ pub async fn read_exact_at_page( + &self, + page: PageWriteGuard<'static>, + offset: u64, + ctx: &RequestContext, + ) -> Result, Error> { + let buf = PageWriteGuardBuf { page }.slice_full(); + debug_assert_eq!(buf.bytes_total(), PAGE_SZ); + self.read_exact_at(buf, offset, ctx) + .await + .map(|slice| slice.into_inner().page) } // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 - pub async fn write_all_at(&self, mut buf: &[u8], mut offset: u64) -> Result<(), Error> { + pub async fn write_all_at( + &self, + buf: FullSlice, + mut offset: u64, + ctx: &RequestContext, + ) -> (FullSlice, Result<(), Error>) { + let buf = buf.into_raw_slice(); + let bounds = buf.bounds(); + let restore = + |buf: Slice<_>| FullSlice::must_new(Slice::from_buf_bounds(buf.into_inner(), bounds)); + let mut buf = buf; while !buf.is_empty() { - match self.write_at(buf, offset).await { + let (tmp, res) = self.write_at(FullSlice::must_new(buf), offset, ctx).await; + buf = tmp.into_raw_slice(); + match res { Ok(0) => { - return Err(Error::new( - std::io::ErrorKind::WriteZero, - "failed to write whole buffer", - )); + return ( + restore(buf), + Err(Error::new( + std::io::ErrorKind::WriteZero, + "failed to write whole buffer", + )), + ); } Ok(n) => { - buf = &buf[n..]; + buf = buf.slice(n..); offset += n as u64; } - Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return Err(e), + Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {} + Err(e) => return (restore(buf), Err(e)), } } - Ok(()) + (restore(buf), Ok(())) } - pub async fn write_all(&mut self, mut buf: &[u8]) -> Result<(), Error> { + /// Writes `buf` to the file at the current offset. + /// + /// Panics if there is an uninitialized range in `buf`, as that is most likely a bug in the caller. 
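+    ///
+    /// Illustrative call, as in the tests below (`slice_len` comes from `IoBufExt`):
+    /// ```ignore
+    /// let (_buf, res) = file.write_all(b"foobar".to_vec().slice_len(), &ctx).await;
+    /// let nbytes = res?;
+    /// ```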
+ pub async fn write_all( + &mut self, + buf: FullSlice, + ctx: &RequestContext, + ) -> (FullSlice, Result) { + let buf = buf.into_raw_slice(); + let bounds = buf.bounds(); + let restore = + |buf: Slice<_>| FullSlice::must_new(Slice::from_buf_bounds(buf.into_inner(), bounds)); + let nbytes = buf.len(); + let mut buf = buf; while !buf.is_empty() { - match self.write(buf).await { + let (tmp, res) = self.write(FullSlice::must_new(buf), ctx).await; + buf = tmp.into_raw_slice(); + match res { Ok(0) => { - return Err(Error::new( - std::io::ErrorKind::WriteZero, - "failed to write whole buffer", - )); + return ( + restore(buf), + Err(Error::new( + std::io::ErrorKind::WriteZero, + "failed to write whole buffer", + )), + ); } Ok(n) => { - buf = &buf[n..]; + buf = buf.slice(n..); } Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return Err(e), + Err(e) => return (restore(buf), Err(e)), } } - Ok(()) + (restore(buf), Ok(nbytes)) } - async fn write(&mut self, buf: &[u8]) -> Result { + async fn write( + &mut self, + buf: FullSlice, + ctx: &RequestContext, + ) -> (FullSlice, Result) { let pos = self.pos; - let n = self.write_at(buf, pos).await?; + let (buf, res) = self.write_at(buf, pos, ctx).await; + let n = match res { + Ok(n) => n, + Err(e) => return (buf, Err(e)), + }; self.pos += n as u64; - Ok(n) + (buf, Ok(n)) } - pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result { - let result = with_file!(self, StorageIoOperation::Read, |file| file - .as_ref() - .read_at(buf, offset)); - if let Ok(size) = result { - STORAGE_IO_SIZE - .with_label_values(&["read", &self.tenant_id, &self.shard_id, &self.timeline_id]) - .add(size as i64); - } - result + pub(crate) async fn read_at( + &self, + buf: tokio_epoll_uring::Slice, + offset: u64, + _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ + ) -> (tokio_epoll_uring::Slice, Result) + where + Buf: tokio_epoll_uring::IoBufMut + Send, + { + let file_guard = match self.lock_file().await { + Ok(file_guard) => file_guard, + Err(e) => return (buf, Err(e)), + }; + + observe_duration!(StorageIoOperation::Read, { + let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await; + if let Ok(size) = res { + STORAGE_IO_SIZE + .with_label_values(&[ + "read", + &self.tenant_id, + &self.shard_id, + &self.timeline_id, + ]) + .add(size as i64); + } + (buf, res) + }) } - async fn write_at(&self, buf: &[u8], offset: u64) -> Result { - let result = with_file!(self, StorageIoOperation::Write, |file| file - .as_ref() - .write_at(buf, offset)); - if let Ok(size) = result { - STORAGE_IO_SIZE - .with_label_values(&["write", &self.tenant_id, &self.shard_id, &self.timeline_id]) - .add(size as i64); - } - result + /// The function aborts the process if the error is fatal. 
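+    ///
+    /// Sketch of the owned-buffer convention used throughout this file: the buffer
+    /// travels with the result so the caller can reuse it.
+    /// ```ignore
+    /// let (buf, res) = self.write_at(buf, offset, ctx).await;
+    /// let n = res?;
+    /// ```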
+    async fn write_at<B: IoBuf + Send>(
+        &self,
+        buf: FullSlice<B>,
+        offset: u64,
+        _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
+    ) -> (FullSlice<B>, Result<usize, Error>) {
+        let (slice, result) = self.write_at_inner(buf, offset, _ctx).await;
+        let result = result.maybe_fatal_err("write_at");
+        (slice, result)
+    }
+
+    async fn write_at_inner<B: IoBuf + Send>(
+        &self,
+        buf: FullSlice<B>,
+        offset: u64,
+        _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
+    ) -> (FullSlice<B>, Result<usize, Error>) {
+        let file_guard = match self.lock_file().await {
+            Ok(file_guard) => file_guard,
+            Err(e) => return (buf, Err(e)),
+        };
+        observe_duration!(StorageIoOperation::Write, {
+            let ((_file_guard, buf), result) =
+                io_engine::get().write_at(file_guard, offset, buf).await;
+            if let Ok(size) = result {
+                STORAGE_IO_SIZE
+                    .with_label_values(&[
+                        "write",
+                        &self.tenant_id,
+                        &self.shard_id,
+                        &self.timeline_id,
+                    ])
+                    .add(size as i64);
+            }
+            (buf, result)
+        })
+    }
 }
 
-struct FileGuard<'a> {
-    slot_guard: RwLockReadGuard<'a, SlotInner>,
+// Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
+pub async fn read_exact_at_impl<Buf, F, Fut>(
+    mut buf: tokio_epoll_uring::Slice<Buf>,
+    mut offset: u64,
+    mut read_at: F,
+) -> (Buf, std::io::Result<()>)
+where
+    Buf: IoBufMut + Send,
+    F: FnMut(tokio_epoll_uring::Slice<Buf>, u64) -> Fut,
+    Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<Buf>, std::io::Result<usize>)>,
+{
+    while buf.bytes_total() != 0 {
+        let res;
+        (buf, res) = read_at(buf, offset).await;
+        match res {
+            Ok(0) => break,
+            Ok(n) => {
+                buf = buf.slice(n..);
+                offset += n as u64;
+            }
+            Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
+            Err(e) => return (buf.into_inner(), Err(e)),
+        }
+    }
+    // NB: don't use `buf.is_empty()` here; it is from the
+    // `impl Deref for Slice { Target = [u8] }`; the &[u8]
+    // returned by it only covers the initialized portion of `buf`.
+    // Whereas we're interested in ensuring that we filled the entire
+    // buffer that the user passed in.
+ if buf.bytes_total() != 0 { + ( + buf.into_inner(), + Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "failed to fill whole buffer", + )), + ) + } else { + assert_eq!(buf.len(), buf.bytes_total()); + (buf.into_inner(), Ok(())) + } } -impl<'a> AsRef for FileGuard<'a> { - fn as_ref(&self) -> &File { +#[cfg(test)] +mod test_read_exact_at_impl { + + use std::{collections::VecDeque, sync::Arc}; + + use tokio_epoll_uring::{BoundedBuf, BoundedBufMut}; + + use super::read_exact_at_impl; + + struct Expectation { + offset: u64, + bytes_total: usize, + result: std::io::Result>, + } + struct MockReadAt { + expectations: VecDeque, + } + + impl MockReadAt { + async fn read_at( + &mut self, + mut buf: tokio_epoll_uring::Slice>, + offset: u64, + ) -> (tokio_epoll_uring::Slice>, std::io::Result) { + let exp = self + .expectations + .pop_front() + .expect("read_at called but we have no expectations left"); + assert_eq!(exp.offset, offset); + assert_eq!(exp.bytes_total, buf.bytes_total()); + match exp.result { + Ok(bytes) => { + assert!(bytes.len() <= buf.bytes_total()); + buf.put_slice(&bytes); + (buf, Ok(bytes.len())) + } + Err(e) => (buf, Err(e)), + } + } + } + + impl Drop for MockReadAt { + fn drop(&mut self) { + assert_eq!(self.expectations.len(), 0); + } + } + + #[tokio::test] + async fn test_basic() { + let buf = Vec::with_capacity(5).slice_full(); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: VecDeque::from(vec![Expectation { + offset: 0, + bytes_total: 5, + result: Ok(vec![b'a', b'b', b'c', b'd', b'e']), + }]), + })); + let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let mock_read_at = Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + assert!(res.is_ok()); + assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']); + } + + #[tokio::test] + async fn test_empty_buf_issues_no_syscall() { + let buf = Vec::new().slice_full(); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: VecDeque::new(), + })); + let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let mock_read_at = Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + assert!(res.is_ok()); + } + + #[tokio::test] + async fn test_two_read_at_calls_needed_until_buf_filled() { + let buf = Vec::with_capacity(4).slice_full(); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: VecDeque::from(vec![ + Expectation { + offset: 0, + bytes_total: 4, + result: Ok(vec![b'a', b'b']), + }, + Expectation { + offset: 2, + bytes_total: 2, + result: Ok(vec![b'c', b'd']), + }, + ]), + })); + let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let mock_read_at = Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + assert!(res.is_ok()); + assert_eq!(buf, vec![b'a', b'b', b'c', b'd']); + } + + #[tokio::test] + async fn test_eof_before_buffer_full() { + let buf = Vec::with_capacity(3).slice_full(); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: VecDeque::from(vec![ + Expectation { + offset: 0, + bytes_total: 3, + result: Ok(vec![b'a']), + }, + Expectation { + offset: 1, + bytes_total: 2, + result: Ok(vec![b'b']), + }, + Expectation { + offset: 2, + bytes_total: 1, + result: Ok(vec![]), + }, + ]), + })); + let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let mock_read_at = 
Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + let Err(err) = res else { + panic!("should return an error"); + }; + assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof); + assert_eq!(format!("{err}"), "failed to fill whole buffer"); + // buffer contents on error are unspecified + } +} + +struct FileGuard { + slot_guard: RwLockReadGuard<'static, SlotInner>, +} + +impl AsRef for FileGuard { + fn as_ref(&self) -> &OwnedFd { // This unwrap is safe because we only create `FileGuard`s // if we know that the file is Some. self.slot_guard.file.as_ref().unwrap() } } +impl FileGuard { + /// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually. + fn with_std_file(&self, with: F) -> R + where + F: FnOnce(&File) -> R, + { + // SAFETY: + // - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`. + // - `&` usage below: `self` is `&`, hence Rust typesystem guarantees there are is no `&mut` + let file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) }; + let res = with(&file); + let _ = file.into_raw_fd(); + res + } + /// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually. + fn with_std_file_mut(&mut self, with: F) -> R + where + F: FnOnce(&mut File) -> R, + { + // SAFETY: + // - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`. + // - &mut usage below: `self` is `&mut`, hence this call is the only task/thread that has control over the underlying fd + let mut file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) }; + let res = with(&mut file); + let _ = file.into_raw_fd(); + res + } +} + +impl tokio_epoll_uring::IoFd for FileGuard { + unsafe fn as_fd(&self) -> RawFd { + let owned_fd: &OwnedFd = self.as_ref(); + owned_fd.as_raw_fd() + } +} + #[cfg(test)] impl VirtualFile { pub(crate) async fn read_blk( &self, blknum: u32, + ctx: &RequestContext, ) -> Result, std::io::Error> { use crate::page_cache::PAGE_SZ; - let mut buf = [0; PAGE_SZ]; - self.read_exact_at(&mut buf, blknum as u64 * (PAGE_SZ as u64)) + let slice = Vec::with_capacity(PAGE_SZ).slice_full(); + assert_eq!(slice.bytes_total(), PAGE_SZ); + let slice = self + .read_exact_at(slice, blknum as u64 * (PAGE_SZ as u64), ctx) .await?; - Ok(std::sync::Arc::new(buf).into()) + Ok(crate::tenant::block_io::BlockLease::Vec(slice.into_inner())) } - async fn read_to_end(&mut self, buf: &mut Vec) -> Result<(), Error> { + async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { + let mut tmp = vec![0; 128]; loop { - let mut tmp = [0; 128]; - match self.read_at(&mut tmp, self.pos).await { + let slice = tmp.slice(..128); + let (slice, res) = self.read_at(slice, self.pos, ctx).await; + match res { Ok(0) => return Ok(()), Ok(n) => { self.pos += n as u64; - buf.extend_from_slice(&tmp[..n]); + buf.extend_from_slice(&slice[..n]); } Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} Err(e) => return Err(e), } + tmp = slice.into_inner(); } } } @@ -682,6 +1107,18 @@ impl Drop for VirtualFile { } } +impl OwnedAsyncWriter for VirtualFile { + #[inline(always)] + async fn write_all( + &mut self, + buf: FullSlice, + ctx: &RequestContext, + ) -> std::io::Result<(usize, FullSlice)> { + let (buf, res) = VirtualFile::write_all(self, buf, ctx).await; + res.map(move |v| (v, buf)) + } +} + impl OpenFiles { fn new(num_slots: usize) -> OpenFiles { let mut slots = Box::new(Vec::with_capacity(num_slots)); @@ -704,10 +1141,15 @@ impl 
OpenFiles { /// Initialize the virtual file module. This must be called once at page /// server startup. /// -pub fn init(num_slots: usize) { +#[cfg(not(test))] +pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize) { if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() { panic!("virtual_file::init called twice"); } + if set_io_buffer_alignment(io_buffer_alignment).is_err() { + panic!("IO buffer alignment ({io_buffer_alignment}) is not a power of two"); + } + io_engine::init(engine); crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64); } @@ -730,14 +1172,66 @@ fn get_open_files() -> &'static OpenFiles { } } +static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT); + +/// Returns true if `x` is zero or a power of two. +fn is_zero_or_power_of_two(x: usize) -> bool { + (x == 0) || ((x & (x - 1)) == 0) +} + +#[allow(unused)] +pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> { + if is_zero_or_power_of_two(align) { + IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed); + Ok(()) + } else { + Err(align) + } +} + +/// Gets the io buffer alignment requirement. Returns 0 if there is no requirement specified. +/// +/// This function should be used to check the raw config value. +pub(crate) fn get_io_buffer_alignment_raw() -> usize { + let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed); + + if cfg!(test) { + let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT"; + if let Some(test_align) = utils::env::var(env_var_name) { + if is_zero_or_power_of_two(test_align) { + test_align + } else { + panic!("IO buffer alignment ({test_align}) is not a power of two"); + } + } else { + align + } + } else { + align + } +} + +/// Gets the io buffer alignment requirement. Returns 1 if the alignment config is set to zero. +/// +/// This function should be used for getting the actual alignment value to use. 
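+///
+/// For illustration, mirroring the planner tests elsewhere in this patch:
+/// ```ignore
+/// let chunk_size = virtual_file::get_io_buffer_alignment() as u64;
+/// ```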
+pub(crate) fn get_io_buffer_alignment() -> usize { + let align = get_io_buffer_alignment_raw(); + align.max(1) +} + #[cfg(test)] mod tests { + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use super::*; + use owned_buffers_io::io_buf_ext::IoBufExt; + use owned_buffers_io::slice::SliceMutExt; use rand::seq::SliceRandom; use rand::thread_rng; use rand::Rng; - use std::future::Future; use std::io::Write; + use std::os::unix::fs::FileExt; use std::sync::Arc; enum MaybeVirtualFile { @@ -752,16 +1246,32 @@ mod tests { } impl MaybeVirtualFile { - async fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<(), Error> { + async fn read_exact_at( + &self, + mut slice: tokio_epoll_uring::Slice>, + offset: u64, + ctx: &RequestContext, + ) -> Result>, Error> { match self { - MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await, - MaybeVirtualFile::File(file) => file.read_exact_at(buf, offset), + MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await, + MaybeVirtualFile::File(file) => { + let rust_slice: &mut [u8] = slice.as_mut_rust_slice_full_zeroed(); + file.read_exact_at(rust_slice, offset).map(|()| slice) + } } } - async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> { + async fn write_all_at( + &self, + buf: FullSlice, + offset: u64, + ctx: &RequestContext, + ) -> Result<(), Error> { match self { - MaybeVirtualFile::VirtualFile(file) => file.write_all_at(buf, offset).await, - MaybeVirtualFile::File(file) => file.write_all_at(buf, offset), + MaybeVirtualFile::VirtualFile(file) => { + let (_buf, res) = file.write_all_at(buf, offset, ctx).await; + res + } + MaybeVirtualFile::File(file) => file.write_all_at(&buf[..], offset), } } async fn seek(&mut self, pos: SeekFrom) -> Result { @@ -770,22 +1280,29 @@ mod tests { MaybeVirtualFile::File(file) => file.seek(pos), } } - async fn write_all(&mut self, buf: &[u8]) -> Result<(), Error> { + async fn write_all( + &mut self, + buf: FullSlice, + ctx: &RequestContext, + ) -> Result<(), Error> { match self { - MaybeVirtualFile::VirtualFile(file) => file.write_all(buf).await, - MaybeVirtualFile::File(file) => file.write_all(buf), + MaybeVirtualFile::VirtualFile(file) => { + let (_buf, res) = file.write_all(buf, ctx).await; + res.map(|_| ()) + } + MaybeVirtualFile::File(file) => file.write_all(&buf[..]), } } // Helper function to slurp contents of a file, starting at the current position, // into a string - async fn read_string(&mut self) -> Result { + async fn read_string(&mut self, ctx: &RequestContext) -> Result { use std::io::Read; let mut buf = String::new(); match self { MaybeVirtualFile::VirtualFile(file) => { let mut buf = Vec::new(); - file.read_to_end(&mut buf).await?; + file.read_to_end(&mut buf, ctx).await?; return Ok(String::from_utf8(buf).unwrap()); } MaybeVirtualFile::File(file) => { @@ -796,15 +1313,23 @@ mod tests { } // Helper function to slurp a portion of a file into a string - async fn read_string_at(&mut self, pos: u64, len: usize) -> Result { - let mut buf = vec![0; len]; - self.read_exact_at(&mut buf, pos).await?; - Ok(String::from_utf8(buf).unwrap()) + async fn read_string_at( + &mut self, + pos: u64, + len: usize, + ctx: &RequestContext, + ) -> Result { + let slice = Vec::with_capacity(len).slice_full(); + assert_eq!(slice.bytes_total(), len); + let slice = self.read_exact_at(slice, pos, ctx).await?; + let vec = slice.into_inner(); + assert_eq!(vec.len(), len); + Ok(String::from_utf8(vec).unwrap()) } } #[tokio::test] - async fn 
test_virtual_files() -> Result<(), Error> {
+    async fn test_virtual_files() -> anyhow::Result<()> {
         // The real work is done in the test_files() helper function. This
         // allows us to run the same set of tests against a native File, and
         // VirtualFile. We trust the native Files and wouldn't need to test them,
@@ -812,69 +1337,106 @@ mod tests {
         // results with VirtualFiles as with native Files. (Except that with
         // native files, you will run out of file descriptors if the ulimit
         // is low enough.)
-        test_files("virtual_files", |path, open_options| async move {
-            let vf = VirtualFile::open_with_options(&path, &open_options).await?;
-            Ok(MaybeVirtualFile::VirtualFile(vf))
-        })
-        .await
+        struct A;
+
+        impl Adapter for A {
+            async fn open(
+                path: Utf8PathBuf,
+                opts: OpenOptions,
+                ctx: &RequestContext,
+            ) -> Result<MaybeVirtualFile, anyhow::Error> {
+                let vf = VirtualFile::open_with_options(&path, &opts, ctx).await?;
+                Ok(MaybeVirtualFile::VirtualFile(vf))
+            }
+        }
+        test_files::<A>("virtual_files").await
     }
 
     #[tokio::test]
-    async fn test_physical_files() -> Result<(), Error> {
-        test_files("physical_files", |path, open_options| async move {
-            Ok(MaybeVirtualFile::File(open_options.open(path)?))
-        })
-        .await
+    async fn test_physical_files() -> anyhow::Result<()> {
+        struct B;
+
+        impl Adapter for B {
+            async fn open(
+                path: Utf8PathBuf,
+                opts: OpenOptions,
+                _ctx: &RequestContext,
+            ) -> Result<MaybeVirtualFile, anyhow::Error> {
+                Ok(MaybeVirtualFile::File({
+                    let owned_fd = opts.open(path.as_std_path()).await?;
+                    File::from(owned_fd)
+                }))
+            }
+        }
+
+        test_files::<B>("physical_files").await
     }
 
-    async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> Result<(), Error>
+    /// This is essentially a closure which returns a MaybeVirtualFile, but because Rust edition
+    /// 2024 is not yet out with new lifetime capture or outlives rules, this is an async function
+    /// in a trait, which benefits from the new lifetime capture rules already.
+    trait Adapter {
+        async fn open(
+            path: Utf8PathBuf,
+            opts: OpenOptions,
+            ctx: &RequestContext,
+        ) -> Result<MaybeVirtualFile, anyhow::Error>;
+    }
+
+    async fn test_files<A>(testname: &str) -> anyhow::Result<()>
     where
-        OF: Fn(Utf8PathBuf, OpenOptions) -> FT,
-        FT: Future<Output = Result<MaybeVirtualFile, std::io::Error>>,
+        A: Adapter,
     {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
         let testdir = crate::config::PageServerConf::test_repo_dir(testname);
         std::fs::create_dir_all(&testdir)?;
 
         let path_a = testdir.join("file_a");
-        let mut file_a = openfunc(
+        let mut file_a = A::open(
             path_a.clone(),
             OpenOptions::new()
                 .write(true)
                 .create(true)
                 .truncate(true)
                 .to_owned(),
+            &ctx,
         )
         .await?;
-        file_a.write_all(b"foobar").await?;
+        file_a
+            .write_all(b"foobar".to_vec().slice_len(), &ctx)
+            .await?;
 
         // cannot read from a file opened in write-only mode
-        let _ = file_a.read_string().await.unwrap_err();
+        let _ = file_a.read_string(&ctx).await.unwrap_err();
 
         // Close the file and re-open for reading
-        let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?;
+        let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?;
 
         // cannot write to a file opened in read-only mode
-        let _ = file_a.write_all(b"bar").await.unwrap_err();
+        let _ = file_a
+            .write_all(b"bar".to_vec().slice_len(), &ctx)
+            .await
+            .unwrap_err();
 
         // Try simple read
-        assert_eq!("foobar", file_a.read_string().await?);
+        assert_eq!("foobar", file_a.read_string(&ctx).await?);
 
         // It's positioned at the EOF now.
-        assert_eq!("", file_a.read_string().await?);
+        assert_eq!("", file_a.read_string(&ctx).await?);
 
         // Test seeks.
assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1); - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); assert_eq!(file_a.seek(SeekFrom::End(-2)).await?, 4); - assert_eq!("ar", file_a.read_string().await?); + assert_eq!("ar", file_a.read_string(&ctx).await?); assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1); assert_eq!(file_a.seek(SeekFrom::Current(2)).await?, 3); - assert_eq!("bar", file_a.read_string().await?); + assert_eq!("bar", file_a.read_string(&ctx).await?); assert_eq!(file_a.seek(SeekFrom::Current(-5)).await?, 1); - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); // Test erroneous seeks to before byte 0 file_a.seek(SeekFrom::End(-7)).await.unwrap_err(); @@ -882,11 +1444,11 @@ mod tests { file_a.seek(SeekFrom::Current(-2)).await.unwrap_err(); // the erroneous seek should have left the position unchanged - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); // Create another test file, and try FileExt functions on it. let path_b = testdir.join("file_b"); - let mut file_b = openfunc( + let mut file_b = A::open( path_b.clone(), OpenOptions::new() .read(true) @@ -894,12 +1456,17 @@ mod tests { .create(true) .truncate(true) .to_owned(), + &ctx, ) .await?; - file_b.write_all_at(b"BAR", 3).await?; - file_b.write_all_at(b"FOO", 0).await?; + file_b + .write_all_at(b"BAR".to_vec().slice_len(), 3, &ctx) + .await?; + file_b + .write_all_at(b"FOO".to_vec().slice_len(), 0, &ctx) + .await?; - assert_eq!(file_b.read_string_at(2, 3).await?, "OBA"); + assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA"); // Open a lot of files, enough to cause some evictions. (Or to be precise, // open the same file many times. The effect is the same.) @@ -909,9 +1476,13 @@ mod tests { let mut vfiles = Vec::new(); for _ in 0..100 { - let mut vfile = - openfunc(path_b.clone(), OpenOptions::new().read(true).to_owned()).await?; - assert_eq!("FOOBAR", vfile.read_string().await?); + let mut vfile = A::open( + path_b.clone(), + OpenOptions::new().read(true).to_owned(), + &ctx, + ) + .await?; + assert_eq!("FOOBAR", vfile.read_string(&ctx).await?); vfiles.push(vfile); } @@ -920,13 +1491,13 @@ mod tests { // The underlying file descriptor for 'file_a' should be closed now. Try to read // from it again. We left the file positioned at offset 1 above. - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); // Check that all the other FDs still work too. Use them in random order for // good measure. vfiles.as_mut_slice().shuffle(&mut thread_rng()); for vfile in vfiles.iter_mut() { - assert_eq!("OOBAR", vfile.read_string_at(1, 5).await?); + assert_eq!("OOBAR", vfile.read_string_at(1, 5, &ctx).await?); } Ok(()) @@ -942,6 +1513,7 @@ mod tests { const THREADS: usize = 100; const SAMPLE: [u8; SIZE] = [0xADu8; SIZE]; + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency"); std::fs::create_dir_all(&testdir)?; @@ -955,8 +1527,12 @@ mod tests { // Open the file many times. 
let mut files = Vec::new(); for _ in 0..VIRTUAL_FILES { - let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true)) - .await?; + let f = VirtualFile::open_with_options( + &test_file_path, + OpenOptions::new().read(true), + &ctx, + ) + .await?; files.push(f); } let files = Arc::new(files); @@ -970,12 +1546,17 @@ mod tests { let mut hdls = Vec::new(); for _threadno in 0..THREADS { let files = files.clone(); + let ctx = ctx.detached_child(TaskKind::UnitTest, DownloadBehavior::Error); let hdl = rt.spawn(async move { - let mut buf = [0u8; SIZE]; + let mut buf = vec![0u8; SIZE]; let mut rng = rand::rngs::OsRng; for _ in 1..1000 { let f = &files[rng.gen_range(0..files.len())]; - f.read_exact_at(&mut buf, 0).await.unwrap(); + buf = f + .read_exact_at(buf.slice_full(), 0, &ctx) + .await + .unwrap() + .into_inner(); assert!(buf == SAMPLE); } }); @@ -991,26 +1572,27 @@ mod tests { #[tokio::test] async fn test_atomic_overwrite_basic() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic"); std::fs::create_dir_all(&testdir).unwrap(); let path = testdir.join("myfile"); let tmp_path = testdir.join("myfile.tmp"); - VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo") + VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); - let post = file.read_string().await.unwrap(); + let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); + let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); drop(file); - VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar") + VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); - let post = file.read_string().await.unwrap(); + let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); + let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "bar"); assert!(!tmp_path.exists()); drop(file); @@ -1018,6 +1600,7 @@ mod tests { #[tokio::test] async fn test_atomic_overwrite_preexisting_tmp() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp"); std::fs::create_dir_all(&testdir).unwrap(); @@ -1028,12 +1611,12 @@ mod tests { std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap(); assert!(tmp_path.exists()); - VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo") + VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); - let post = file.read_string().await.unwrap(); + let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); + let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); drop(file); diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs new file mode 100644 index 0000000000..ccde90ee1a --- /dev/null +++ b/pageserver/src/virtual_file/io_engine.rs @@ -0,0 +1,368 @@ +//! [`super::VirtualFile`] supports different IO engines. +//! +//! 
The [`IoEngineKind`] enum identifies them.
+//!
+//! The choice of IO engine is global.
+//! Initialize using [`init`].
+//!
+//! Then use [`get`] and [`super::OpenOptions`].
+//!
+//!
+
+#[cfg(target_os = "linux")]
+pub(super) mod tokio_epoll_uring_ext;
+
+use tokio_epoll_uring::IoBuf;
+use tracing::Instrument;
+
+pub(crate) use super::api::IoEngineKind;
+#[derive(Clone, Copy)]
+#[repr(u8)]
+pub(crate) enum IoEngine {
+    NotSet,
+    StdFs,
+    #[cfg(target_os = "linux")]
+    TokioEpollUring,
+}
+
+impl From<IoEngineKind> for IoEngine {
+    fn from(value: IoEngineKind) -> Self {
+        match value {
+            IoEngineKind::StdFs => IoEngine::StdFs,
+            #[cfg(target_os = "linux")]
+            IoEngineKind::TokioEpollUring => IoEngine::TokioEpollUring,
+        }
+    }
+}
+
+impl TryFrom<u8> for IoEngine {
+    type Error = u8;
+
+    fn try_from(value: u8) -> Result<Self, Self::Error> {
+        Ok(match value {
+            v if v == (IoEngine::NotSet as u8) => IoEngine::NotSet,
+            v if v == (IoEngine::StdFs as u8) => IoEngine::StdFs,
+            #[cfg(target_os = "linux")]
+            v if v == (IoEngine::TokioEpollUring as u8) => IoEngine::TokioEpollUring,
+            x => return Err(x),
+        })
+    }
+}
+
+static IO_ENGINE: AtomicU8 = AtomicU8::new(IoEngine::NotSet as u8);
+
+pub(crate) fn set(engine_kind: IoEngineKind) {
+    let engine: IoEngine = engine_kind.into();
+    IO_ENGINE.store(engine as u8, std::sync::atomic::Ordering::Relaxed);
+    #[cfg(not(test))]
+    {
+        let metric = &crate::metrics::virtual_file_io_engine::KIND;
+        metric.reset();
+        metric
+            .with_label_values(&[&format!("{engine_kind}")])
+            .set(1);
+    }
+}
+
+#[cfg(not(test))]
+pub(super) fn init(engine_kind: IoEngineKind) {
+    set(engine_kind);
+}
+
+/// Longer-term, this API should only be used by [`super::VirtualFile`].
+pub(crate) fn get() -> IoEngine {
+    let cur = IoEngine::try_from(IO_ENGINE.load(Ordering::Relaxed)).unwrap();
+    if cfg!(test) {
+        let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE";
+        match cur {
+            IoEngine::NotSet => {
+                let kind = match std::env::var(env_var_name) {
+                    Ok(v) => match v.parse::<IoEngineKind>() {
+                        Ok(engine_kind) => engine_kind,
+                        Err(e) => {
+                            panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}")
+                        }
+                    },
+                    Err(std::env::VarError::NotPresent) => {
+                        #[cfg(target_os = "linux")]
+                        {
+                            IoEngineKind::TokioEpollUring
+                        }
+                        #[cfg(not(target_os = "linux"))]
+                        {
+                            IoEngineKind::StdFs
+                        }
+                    }
+                    Err(std::env::VarError::NotUnicode(_)) => {
+                        panic!("env var {env_var_name} is not unicode");
+                    }
+                };
+                self::set(kind);
+                self::get()
+            }
+            x => x,
+        }
+    } else {
+        cur
+    }
+}
+
+use std::{
+    os::unix::prelude::FileExt,
+    sync::atomic::{AtomicU8, Ordering},
+};
+
+use super::{
+    owned_buffers_io::{io_buf_ext::FullSlice, slice::SliceMutExt},
+    FileGuard, Metadata,
+};
+
+#[cfg(target_os = "linux")]
+fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error<std::io::Error>) -> std::io::Error {
+    match e {
+        tokio_epoll_uring::Error::Op(e) => e,
+        tokio_epoll_uring::Error::System(system) => {
+            std::io::Error::new(std::io::ErrorKind::Other, system)
+        }
+    }
+}
+
+impl IoEngine {
+    pub(super) async fn read_at<Buf>(
+        &self,
+        file_guard: FileGuard,
+        offset: u64,
+        mut slice: tokio_epoll_uring::Slice<Buf>,
+    ) -> (
+        (FileGuard, tokio_epoll_uring::Slice<Buf>),
+        std::io::Result<usize>,
+    )
+    where
+        Buf: tokio_epoll_uring::IoBufMut + Send,
+    {
+        match self {
+            IoEngine::NotSet => panic!("not initialized"),
+            IoEngine::StdFs => {
+                let rust_slice = slice.as_mut_rust_slice_full_zeroed();
+                let res = file_guard.with_std_file(|std_file| std_file.read_at(rust_slice, offset));
+                ((file_guard, slice), res)
+            }
+            #[cfg(target_os = "linux")]
IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring_ext::thread_local_system().await; + let (resources, res) = system.read(file_guard, offset, slice).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + pub(super) async fn sync_all(&self, file_guard: FileGuard) -> (FileGuard, std::io::Result<()>) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let res = file_guard.with_std_file(|std_file| std_file.sync_all()); + (file_guard, res) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring_ext::thread_local_system().await; + let (resources, res) = system.fsync(file_guard).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + pub(super) async fn sync_data( + &self, + file_guard: FileGuard, + ) -> (FileGuard, std::io::Result<()>) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let res = file_guard.with_std_file(|std_file| std_file.sync_data()); + (file_guard, res) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring_ext::thread_local_system().await; + let (resources, res) = system.fdatasync(file_guard).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + pub(super) async fn metadata( + &self, + file_guard: FileGuard, + ) -> (FileGuard, std::io::Result) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let res = + file_guard.with_std_file(|std_file| std_file.metadata().map(Metadata::from)); + (file_guard, res) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring_ext::thread_local_system().await; + let (resources, res) = system.statx(file_guard).await; + ( + resources, + res.map_err(epoll_uring_error_to_std).map(Metadata::from), + ) + } + } + } + pub(super) async fn write_at( + &self, + file_guard: FileGuard, + offset: u64, + buf: FullSlice, + ) -> ((FileGuard, FullSlice), std::io::Result) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let result = file_guard.with_std_file(|std_file| std_file.write_at(&buf, offset)); + ((file_guard, buf), result) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring_ext::thread_local_system().await; + let ((file_guard, slice), res) = + system.write(file_guard, offset, buf.into_raw_slice()).await; + ( + (file_guard, FullSlice::must_new(slice)), + res.map_err(epoll_uring_error_to_std), + ) + } + } + } + + /// If we switch a user of [`tokio::fs`] to use [`super::io_engine`], + /// they'd start blocking the executor thread if [`IoEngine::StdFs`] is configured + /// whereas before the switch to [`super::io_engine`], that wasn't the case. + /// This method helps avoid such a regression. + /// + /// Panics if the `spawn_blocking` fails, see [`tokio::task::JoinError`] for reasons why that can happen. 
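+    ///
+    /// Sketch of the intended call pattern (the closure body here is a hypothetical
+    /// example, not part of this patch):
+    /// ```ignore
+    /// io_engine::get()
+    ///     .spawn_blocking_and_block_on_if_std(async move { tokio::fs::remove_file(path).await })
+    ///     .await?;
+    /// ```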
+    pub(crate) async fn spawn_blocking_and_block_on_if_std<Fut, R>(&self, work: Fut) -> R
+    where
+        Fut: 'static + Send + std::future::Future<Output = R>,
+        R: 'static + Send,
+    {
+        match self {
+            IoEngine::NotSet => panic!("not initialized"),
+            IoEngine::StdFs => {
+                let span = tracing::info_span!("spawn_blocking_block_on_if_std");
+                tokio::task::spawn_blocking({
+                    move || tokio::runtime::Handle::current().block_on(work.instrument(span))
+                })
+                .await
+                .expect("failed to join blocking code most likely it panicked, panicking as well")
+            }
+            #[cfg(target_os = "linux")]
+            IoEngine::TokioEpollUring => work.await,
+        }
+    }
+}
+
+pub enum FeatureTestResult {
+    PlatformPreferred(IoEngineKind),
+    Worse {
+        engine: IoEngineKind,
+        remark: String,
+    },
+}
+
+impl FeatureTestResult {
+    #[cfg(target_os = "linux")]
+    const PLATFORM_PREFERRED: IoEngineKind = IoEngineKind::TokioEpollUring;
+    #[cfg(not(target_os = "linux"))]
+    const PLATFORM_PREFERRED: IoEngineKind = IoEngineKind::StdFs;
+}
+
+impl From<FeatureTestResult> for IoEngineKind {
+    fn from(val: FeatureTestResult) -> Self {
+        match val {
+            FeatureTestResult::PlatformPreferred(e) => e,
+            FeatureTestResult::Worse { engine, .. } => engine,
+        }
+    }
+}
+
+/// Somewhat costly under the hood, do only once.
+/// Panics if we can't set up the feature test.
+pub fn feature_test() -> anyhow::Result<FeatureTestResult> {
+    std::thread::spawn(|| {
+
+        #[cfg(not(target_os = "linux"))]
+        {
+            Ok(FeatureTestResult::PlatformPreferred(
+                FeatureTestResult::PLATFORM_PREFERRED,
+            ))
+        }
+        #[cfg(target_os = "linux")]
+        {
+            let rt = tokio::runtime::Builder::new_current_thread()
+                .enable_all()
+                .build()
+                .unwrap();
+            Ok(match rt.block_on(tokio_epoll_uring::System::launch()) {
+                Ok(_) => FeatureTestResult::PlatformPreferred({
+                    assert!(matches!(
+                        IoEngineKind::TokioEpollUring,
+                        FeatureTestResult::PLATFORM_PREFERRED
+                    ));
+                    FeatureTestResult::PLATFORM_PREFERRED
+                }),
+                Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) => {
+                    let remark = match e.raw_os_error() {
+                        Some(nix::libc::EPERM) => {
+                            // fall back
+                            "creating tokio-epoll-uring fails with EPERM, assuming it's admin-disabled "
+                                .to_string()
+                        }
+                        Some(nix::libc::EFAULT) => {
+                            // fail feature test
+                            anyhow::bail!(
+                                "creating tokio-epoll-uring fails with EFAULT, might have corrupted memory"
+                            );
+                        }
+                        Some(_) | None => {
+                            // fall back
+                            format!("creating tokio-epoll-uring fails with error: {e:#}")
+                        }
+                    };
+                    FeatureTestResult::Worse {
+                        engine: IoEngineKind::StdFs,
+                        remark,
+                    }
+                }
+            })
+        }
+    })
+    .join()
+    .unwrap()
+}
+
+/// For use in benchmark binaries only.
+///
+/// Benchmarks which initialize `virtual_file` need to know what engine to use, but we also
+/// don't want to silently fall back to slower I/O engines in a benchmark: this could waste
+/// developer time trying to figure out why it's slow.
+///
+/// In practice, this method will either return IoEngineKind::TokioEpollUring, or panic.
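+///
+/// Sketch of a bench setup using it (illustrative; `10_000` slots is an assumed value):
+/// ```ignore
+/// let engine = io_engine_for_bench();
+/// virtual_file::init(10_000, engine, DEFAULT_IO_BUFFER_ALIGNMENT);
+/// ```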
+pub fn io_engine_for_bench() -> IoEngineKind { + #[cfg(not(target_os = "linux"))] + { + panic!("This benchmark does I/O and can only give a representative result on Linux"); + } + #[cfg(target_os = "linux")] + { + match feature_test().unwrap() { + FeatureTestResult::PlatformPreferred(engine) => engine, + FeatureTestResult::Worse { + engine: _engine, + remark, + } => { + panic!("This benchmark does I/O and requires the preferred I/O engine: {remark}"); + } + } + } +} diff --git a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs new file mode 100644 index 0000000000..6ea19d6b2d --- /dev/null +++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs @@ -0,0 +1,194 @@ +//! Like [`::tokio_epoll_uring::thread_local_system()`], but with pageserver-specific +//! handling in case the instance can't be launched. +//! +//! This is primarily necessary due to ENOMEM aka OutOfMemory errors during io_uring creation +//! on older kernels, such as some (but not all) older kernels in the Linux 5.10 series. +//! See the tracking issue for more details. + +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; +use std::sync::Arc; + +use tokio_util::sync::CancellationToken; +use tracing::{error, info, info_span, warn, Instrument}; +use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; + +use tokio_epoll_uring::{System, SystemHandle}; + +use crate::virtual_file::on_fatal_io_error; + +use crate::metrics::tokio_epoll_uring as metrics; + +#[derive(Clone)] +struct ThreadLocalState(Arc<ThreadLocalStateInner>); + +struct ThreadLocalStateInner { + cell: tokio::sync::OnceCell<SystemHandle>, + launch_attempts: AtomicU32, + /// populated through fetch_add from [`THREAD_LOCAL_STATE_ID`] + thread_local_state_id: u64, +} + +impl ThreadLocalState { + pub fn new() -> Self { + Self(Arc::new(ThreadLocalStateInner { + cell: tokio::sync::OnceCell::default(), + launch_attempts: AtomicU32::new(0), + thread_local_state_id: THREAD_LOCAL_STATE_ID.fetch_add(1, Ordering::Relaxed), + })) + } + + pub fn make_id_string(&self) -> String { + format!("{}", self.0.thread_local_state_id) + } +} + +static THREAD_LOCAL_STATE_ID: AtomicU64 = AtomicU64::new(0); + +thread_local! { + static THREAD_LOCAL: ThreadLocalState = ThreadLocalState::new(); +} + +/// Panics if we cannot [`System::launch`]. +pub async fn thread_local_system() -> Handle { + let fake_cancel = CancellationToken::new(); + loop { + let thread_local_state = THREAD_LOCAL.with(|arc| arc.clone()); + let inner = &thread_local_state.0; + let get_or_init_res = inner + .cell + .get_or_try_init(|| async { + let attempt_no = inner + .launch_attempts + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + let span = info_span!("tokio_epoll_uring_ext::thread_local_system", thread_local=%thread_local_state.make_id_string(), %attempt_no); + async { + // Rate-limit retries per thread-local. + // NB: doesn't yield to executor at attempt_no=0.
+ utils::backoff::exponential_backoff( + attempt_no, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + &fake_cancel, + ) + .await; + let res = System::launch() + // this might move us to another executor thread => loop outside the get_or_try_init, not inside it + .await; + match res { + Ok(system) => { + info!("successfully launched system"); + metrics::THREAD_LOCAL_LAUNCH_SUCCESSES.inc(); + Ok(system) + } + Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => { + warn!("not enough locked memory to launch tokio-epoll-uring, will retry"); + info_span!("stats").in_scope(|| { + emit_launch_failure_process_stats(); + }); + metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc(); + Err(()) + } + // abort the process instead of panicking because pageserver usually becomes half-broken if we panic somewhere. + // This is equivalent to a fatal IO error. + Err(ref e @ tokio_epoll_uring::LaunchResult::IoUringBuild(ref inner)) => { + error!(error=%e, "failed to launch thread-local tokio-epoll-uring, this should not happen, aborting process"); + info_span!("stats").in_scope(|| { + emit_launch_failure_process_stats(); + }); + on_fatal_io_error(inner, "launch thread-local tokio-epoll-uring"); + }, + } + } + .instrument(span) + .await + }) + .await; + if get_or_init_res.is_ok() { + return Handle(thread_local_state); + } + } +} + +fn emit_launch_failure_process_stats() { + // tokio-epoll-uring stats + // vmlck + rlimit + // number of threads + // rss / system memory usage generally + + let tokio_epoll_uring::metrics::Metrics { + systems_created, + systems_destroyed, + } = tokio_epoll_uring::metrics::global(); + info!(systems_created, systems_destroyed, "tokio-epoll-uring"); + + match procfs::process::Process::myself() { + Ok(myself) => { + match myself.limits() { + Ok(limits) => { + info!(?limits.max_locked_memory, "/proc/self/limits"); + } + Err(error) => { + info!(%error, "no limit stats due to error"); + } + } + + match myself.status() { + Ok(status) => { + let procfs::process::Status { + vmsize, + vmlck, + vmpin, + vmrss, + rssanon, + rssfile, + rssshmem, + vmdata, + vmstk, + vmexe, + vmlib, + vmpte, + threads, + ..
+ } = status; + info!( + vmsize, + vmlck, + vmpin, + vmrss, + rssanon, + rssfile, + rssshmem, + vmdata, + vmstk, + vmexe, + vmlib, + vmpte, + threads, + "/proc/self/status" + ); + } + Err(error) => { + info!(%error, "no status stats due to error"); + } + } + } + Err(error) => { + info!(%error, "no process stats due to error"); + } + }; +} + +#[derive(Clone)] +pub struct Handle(ThreadLocalState); + +impl std::ops::Deref for Handle { + type Target = SystemHandle; + + fn deref(&self) -> &Self::Target { + self.0 + .0 + .cell + .get() + .expect("must be already initialized when using this") + } +} diff --git a/pageserver/src/virtual_file/metadata.rs b/pageserver/src/virtual_file/metadata.rs new file mode 100644 index 0000000000..f530c50988 --- /dev/null +++ b/pageserver/src/virtual_file/metadata.rs @@ -0,0 +1,30 @@ +use std::fs; + +pub enum Metadata { + StdFs(fs::Metadata), + #[cfg(target_os = "linux")] + TokioEpollUring(Box<tokio_epoll_uring::ops::statx::statx>), +} + +#[cfg(target_os = "linux")] +impl From<Box<tokio_epoll_uring::ops::statx::statx>> for Metadata { + fn from(value: Box<tokio_epoll_uring::ops::statx::statx>) -> Self { + Metadata::TokioEpollUring(value) + } +} + +impl From<std::fs::Metadata> for Metadata { + fn from(value: std::fs::Metadata) -> Self { + Metadata::StdFs(value) + } +} + +impl Metadata { + pub fn len(&self) -> u64 { + match self { + Metadata::StdFs(metadata) => metadata.len(), + #[cfg(target_os = "linux")] + Metadata::TokioEpollUring(statx) => statx.stx_size, + } + } +} diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs new file mode 100644 index 0000000000..7f951270d1 --- /dev/null +++ b/pageserver/src/virtual_file/open_options.rs @@ -0,0 +1,139 @@ +//! Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`]. + +use super::io_engine::IoEngine; +use std::{os::fd::OwnedFd, path::Path}; + +#[derive(Debug, Clone)] +pub enum OpenOptions { + StdFs(std::fs::OpenOptions), + #[cfg(target_os = "linux")] + TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions), +} + +impl Default for OpenOptions { + fn default() -> Self { + match super::io_engine::get() { + IoEngine::NotSet => panic!("io engine not set"), + IoEngine::StdFs => Self::StdFs(std::fs::OpenOptions::new()), + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + Self::TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions::new()) + } + } + } +} + +impl OpenOptions { + pub fn new() -> OpenOptions { + Self::default() + } + + pub fn read(&mut self, read: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.read(read); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.read(read); + } + } + self + } + + pub fn write(&mut self, write: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.write(write); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.write(write); + } + } + self + } + + pub fn create(&mut self, create: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.create(create); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.create(create); + } + } + self + } + + pub fn create_new(&mut self, create_new: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.create_new(create_new); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.create_new(create_new); + } + } + self + } + + pub fn truncate(&mut self, truncate: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _
= x.truncate(truncate); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.truncate(truncate); + } + } + self + } + + pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result<OwnedFd> { + match self { + OpenOptions::StdFs(x) => x.open(path).map(|file| file.into()), + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await; + system.open(path, x).await.map_err(|e| match e { + tokio_epoll_uring::Error::Op(e) => e, + tokio_epoll_uring::Error::System(system) => { + std::io::Error::new(std::io::ErrorKind::Other, system) + } + }) + } + } + } +} + +impl std::os::unix::prelude::OpenOptionsExt for OpenOptions { + fn mode(&mut self, mode: u32) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.mode(mode); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.mode(mode); + } + } + self + } + + fn custom_flags(&mut self, flags: i32) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.custom_flags(flags); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.custom_flags(flags); + } + } + self + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs new file mode 100644 index 0000000000..7c773b6b21 --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -0,0 +1,78 @@ +//! See [`FullSlice`]. + +use bytes::{Bytes, BytesMut}; +use std::ops::{Deref, Range}; +use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; + +/// The true owned equivalent for Rust [`slice`]. Use this for the write path. +/// +/// Unlike [`tokio_epoll_uring::Slice`], which we unfortunately inherited from `tokio-uring`, +/// [`FullSlice`] is guaranteed to have all its bytes initialized. This means that +/// the length of its [`Deref<Target = [u8]>`] view is equal to [`Slice::bytes_init`] and [`Slice::bytes_total`]. +/// +pub struct FullSlice<B> { + slice: Slice<B>, +} + +impl<B> FullSlice<B> +where + B: IoBuf, +{ + pub(crate) fn must_new(slice: Slice<B>) -> Self { + assert_eq!(slice.bytes_init(), slice.bytes_total()); + FullSlice { slice } + } + pub(crate) fn into_raw_slice(self) -> Slice<B> { + let FullSlice { slice: s } = self; + s + } +} + +impl<B> Deref for FullSlice<B> +where + B: IoBuf, +{ + type Target = [u8]; + + fn deref(&self) -> &[u8] { + let rust_slice = &self.slice[..]; + assert_eq!(rust_slice.len(), self.slice.bytes_init()); + assert_eq!(rust_slice.len(), self.slice.bytes_total()); + rust_slice + } +} + +pub(crate) trait IoBufExt { + /// Get a [`FullSlice`] for the entire buffer, i.e., `self[..]` or `self[0..self.len()]`. + fn slice_len(self) -> FullSlice<Self> + where + Self: Sized; +} + +macro_rules! impl_io_buf_ext { + ($T:ty) => { + impl IoBufExt for $T { + #[inline(always)] + fn slice_len(self) -> FullSlice<$T> { + let len = self.len(); + let s = if len == 0 { + // `BoundedBuf::slice(0..len)` or `BoundedBuf::slice(..)` has an incorrect assertion, + // causing a panic if len == 0. + // The Slice::from_buf_bounds has the correct assertion (<= instead of <).
+ // => https://github.com/neondatabase/tokio-epoll-uring/issues/46 + let slice = self.slice_full(); + let mut bounds: Range<_> = slice.bounds(); + bounds.end = bounds.start; + Slice::from_buf_bounds(slice.into_inner(), bounds) + } else { + self.slice(0..len) + }; + FullSlice::must_new(s) + } + } + }; +} + +impl_io_buf_ext!(Bytes); +impl_io_buf_ext!(BytesMut); +impl_io_buf_ext!(Vec<u8>); diff --git a/pageserver/src/virtual_file/owned_buffers_io/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/slice.rs new file mode 100644 index 0000000000..6100593663 --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/slice.rs @@ -0,0 +1,121 @@ +use tokio_epoll_uring::BoundedBuf; +use tokio_epoll_uring::BoundedBufMut; +use tokio_epoll_uring::IoBufMut; +use tokio_epoll_uring::Slice; + +pub(crate) trait SliceMutExt { + /// Get a `&mut [0..self.bytes_total()]` slice, for when you need to do borrow-based IO. + /// + /// See the test case `test_slice_full_zeroed` for the difference to just doing `&slice[..]` + fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8]; +} + +impl<B> SliceMutExt for Slice<B> +where + B: IoBufMut, +{ + #[inline(always)] + fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8] { + // zero-initialize the uninitialized parts of the buffer so we can create a Rust slice + // + // SAFETY: we own `slice`, don't write outside the bounds + unsafe { + let to_init = self.bytes_total() - self.bytes_init(); + self.stable_mut_ptr() + .add(self.bytes_init()) + .write_bytes(0, to_init); + self.set_init(self.bytes_total()); + }; + let bytes_total = self.bytes_total(); + &mut self[0..bytes_total] + } +} + +#[cfg(test)] +mod tests { + use std::io::Read; + + use super::*; + use bytes::Buf; + use tokio_epoll_uring::Slice; + + #[test] + fn test_slice_full_zeroed() { + let make_fake_file = || bytes::BytesMut::from(&b"12345"[..]).reader(); + + // before we start the test, let's make sure we have a shared understanding of what slice_full does + { + let buf = Vec::<u8>::with_capacity(3); + let slice: Slice<_> = buf.slice_full(); + assert_eq!(slice.bytes_init(), 0); + assert_eq!(slice.bytes_total(), 3); + let rust_slice = &slice[..]; + assert_eq!( + rust_slice.len(), + 0, + "Slice only derefs to a &[u8] of the initialized part" + ); + } + + // and also let's establish a shared understanding of .slice() + { + let buf = Vec::<u8>::with_capacity(3); + let slice: Slice<_> = buf.slice(0..2); + assert_eq!(slice.bytes_init(), 0); + assert_eq!(slice.bytes_total(), 2); + let rust_slice = &slice[..]; + assert_eq!( + rust_slice.len(), + 0, + "Slice only derefs to a &[u8] of the initialized part" + ); + } + + // the above leads to the easy mistake of using slice[..] for borrow-based IO like so: + { + let buf = Vec::<u8>::with_capacity(3); + let mut slice: Slice<_> = buf.slice_full(); + assert_eq!(slice[..].len(), 0); + let mut file = make_fake_file(); + file.read_exact(&mut slice[..]).unwrap(); // one might think this reads 3 bytes but it reads 0 + assert_eq!(&slice[..] as &[u8], &[][..] as &[u8]); + } + + // With owned-buffers IO like with VirtualFile, you could totally + // pass in a `Slice` with bytes_init()=0 but bytes_total()=5 + // and it will read 5 bytes into the slice, and return a slice that has bytes_init()=5. + { + // TODO: demo + } + + // + // Ok, now that we have a shared understanding let's demo how to use the extension trait.
+ // + + // slice_full() + { + let buf = Vec::<u8>::with_capacity(3); + let mut slice: Slice<_> = buf.slice_full(); + let rust_slice = slice.as_mut_rust_slice_full_zeroed(); + assert_eq!(rust_slice.len(), 3); + assert_eq!(rust_slice, &[0, 0, 0]); + let mut file = make_fake_file(); + file.read_exact(rust_slice).unwrap(); + assert_eq!(rust_slice, b"123"); + assert_eq!(&slice[..], b"123"); + } + + // .slice(..) + { + let buf = Vec::<u8>::with_capacity(3); + let mut slice: Slice<_> = buf.slice(0..2); + let rust_slice = slice.as_mut_rust_slice_full_zeroed(); + assert_eq!(rust_slice.len(), 2); + assert_eq!(rust_slice, &[0, 0]); + let mut file = make_fake_file(); + file.read_exact(rust_slice).unwrap(); + assert_eq!(rust_slice, b"12"); + assert_eq!(&slice[..], b"12"); + } + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs new file mode 100644 index 0000000000..efcb61ba65 --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -0,0 +1,50 @@ +use crate::{ + context::RequestContext, + virtual_file::owned_buffers_io::{io_buf_ext::FullSlice, write::OwnedAsyncWriter}, +}; +use tokio_epoll_uring::IoBuf; + +pub struct Writer<W> { + dst: W, + bytes_amount: u64, +} + +impl<W> Writer<W> { + pub fn new(dst: W) -> Self { + Self { + dst, + bytes_amount: 0, + } + } + + pub fn bytes_written(&self) -> u64 { + self.bytes_amount + } + + pub fn as_inner(&self) -> &W { + &self.dst + } + + /// Returns the wrapped `VirtualFile` object as well as the number + /// of bytes that were written to it through this object. + #[cfg_attr(target_os = "macos", allow(dead_code))] + pub fn into_inner(self) -> (u64, W) { + (self.bytes_amount, self.dst) + } +} + +impl<W> OwnedAsyncWriter for Writer<W> +where + W: OwnedAsyncWriter, +{ + #[inline(always)] + async fn write_all<Buf: IoBuf + Send>( + &mut self, + buf: FullSlice<Buf>, + ctx: &RequestContext, + ) -> std::io::Result<(usize, FullSlice<Buf>)> { + let (nwritten, buf) = self.dst.write_all(buf, ctx).await?; + self.bytes_amount += u64::try_from(nwritten).unwrap(); + Ok((nwritten, buf)) + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs new file mode 100644 index 0000000000..568cf62e56 --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -0,0 +1,345 @@ +use bytes::BytesMut; +use tokio_epoll_uring::IoBuf; + +use crate::context::RequestContext; + +use super::io_buf_ext::{FullSlice, IoBufExt}; + +/// A trait for doing owned-buffer write IO. +/// Think [`tokio::io::AsyncWrite`] but with owned buffers. +pub trait OwnedAsyncWriter { + async fn write_all<Buf: IoBuf + Send>( + &mut self, + buf: FullSlice<Buf>, + ctx: &RequestContext, + ) -> std::io::Result<(usize, FullSlice<Buf>)>; +} + +/// A wrapper around an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch +/// small writes into larger writes of size [`Buffer::cap`]. +/// +/// # Passthrough Of Large Writes +/// +/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`] +/// cause the internal buffer to be flushed prematurely so that the large +/// buffered write is passed through to the underlying [`OwnedAsyncWriter`]. +/// +/// This pass-through is generally beneficial for throughput, but if +/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource, +/// unlimited large writes may cause latency or fairness issues.
+/// +/// In such cases, a different implementation that always buffers in memory +/// may be preferable. +pub struct BufferedWriter<B, W> { + writer: W, + /// invariant: always remains Some(buf) except + /// - while IO is ongoing => goes back to Some() once the IO completed successfully + /// - after an IO error => stays `None` forever + /// + /// In these exceptional cases, it's `None`. + buf: Option<B>, +} + +impl<B, Buf, W> BufferedWriter<B, W> +where + B: Buffer<IoBuf = Buf> + Send, + Buf: IoBuf + Send, + W: OwnedAsyncWriter, +{ + pub fn new(writer: W, buf: B) -> Self { + Self { + writer, + buf: Some(buf), + } + } + + pub fn as_inner(&self) -> &W { + &self.writer + } + + /// Panics if used after any of the write paths returned an error + pub fn inspect_buffer(&self) -> &B { + self.buf() + } + + #[cfg_attr(target_os = "macos", allow(dead_code))] + pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result<W> { + self.flush(ctx).await?; + + let Self { buf, writer } = self; + assert!(buf.is_some()); + Ok(writer) + } + + #[inline(always)] + fn buf(&self) -> &B { + self.buf + .as_ref() + .expect("must not use after we returned an error") + } + + /// Guarantees that if Ok() is returned, all bytes in `chunk` have been accepted. + #[cfg_attr(target_os = "macos", allow(dead_code))] + pub async fn write_buffered( + &mut self, + chunk: FullSlice<Buf>, + ctx: &RequestContext, + ) -> std::io::Result<(usize, FullSlice<Buf>)> { + let chunk = chunk.into_raw_slice(); + + let chunk_len = chunk.len(); + // avoid memcpy for the middle of the chunk + if chunk.len() >= self.buf().cap() { + self.flush(ctx).await?; + // do a big write, bypassing `buf` + assert_eq!( + self.buf + .as_ref() + .expect("must not use after an error") + .pending(), + 0 + ); + let (nwritten, chunk) = self + .writer + .write_all(FullSlice::must_new(chunk), ctx) + .await?; + assert_eq!(nwritten, chunk_len); + return Ok((nwritten, chunk)); + } + // in-memory copy the < BUFFER_SIZED tail of the chunk + assert!(chunk.len() < self.buf().cap()); + let mut slice = &chunk[..]; + while !slice.is_empty() { + let buf = self.buf.as_mut().expect("must not use after an error"); + let need = buf.cap() - buf.pending(); + let have = slice.len(); + let n = std::cmp::min(need, have); + buf.extend_from_slice(&slice[..n]); + slice = &slice[n..]; + if buf.pending() >= buf.cap() { + assert_eq!(buf.pending(), buf.cap()); + self.flush(ctx).await?; + } + } + assert!(slice.is_empty(), "by now we should have drained the chunk"); + Ok((chunk_len, FullSlice::must_new(chunk))) + } + + /// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data. + /// + /// It is less performant because we always have to copy the borrowed data into the internal buffer + /// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant + /// for large writes.
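(Editor's sketch of intended usage, assuming only items from this file; `Vec<u8>` works as the sink because it implements `OwnedAsyncWriter` further down.)

    async fn demo(ctx: &RequestContext) -> std::io::Result<Vec<u8>> {
        // 8 KiB buffer: smaller writes are coalesced, larger ones bypass the buffer
        let mut writer = BufferedWriter::new(Vec::new(), BytesMut::with_capacity(8192));
        writer.write_buffered_borrowed(b"small write", ctx).await?;
        writer.flush_and_into_inner(ctx).await // flushes the tail, returns the sink
    }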
+ pub async fn write_buffered_borrowed( + &mut self, + mut chunk: &[u8], + ctx: &RequestContext, + ) -> std::io::Result<usize> { + let chunk_len = chunk.len(); + while !chunk.is_empty() { + let buf = self.buf.as_mut().expect("must not use after an error"); + let need = buf.cap() - buf.pending(); + let have = chunk.len(); + let n = std::cmp::min(need, have); + buf.extend_from_slice(&chunk[..n]); + chunk = &chunk[n..]; + if buf.pending() >= buf.cap() { + assert_eq!(buf.pending(), buf.cap()); + self.flush(ctx).await?; + } + } + Ok(chunk_len) + } + + async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> { + let buf = self.buf.take().expect("must not use after an error"); + let buf_len = buf.pending(); + if buf_len == 0 { + self.buf = Some(buf); + return Ok(()); + } + let slice = buf.flush(); + let (nwritten, slice) = self.writer.write_all(slice, ctx).await?; + assert_eq!(nwritten, buf_len); + self.buf = Some(Buffer::reuse_after_flush( + slice.into_raw_slice().into_inner(), + )); + Ok(()) + } +} + +/// A [`Buffer`] is used by [`BufferedWriter`] to batch smaller writes into larger ones. +pub trait Buffer { + type IoBuf: IoBuf; + + /// Capacity of the buffer. Must not change over the lifetime of `self`. + fn cap(&self) -> usize; + + /// Add data to the buffer. + /// Panics if there is not enough room to accommodate `other`'s content, i.e., + /// panics if `other.len() > self.cap() - self.pending()`. + fn extend_from_slice(&mut self, other: &[u8]); + + /// Number of bytes in the buffer. + fn pending(&self) -> usize; + + /// Turns `self` into a [`FullSlice`] of the pending data + /// so we can use [`tokio_epoll_uring`] to write it to disk. + fn flush(self) -> FullSlice<Self::IoBuf>; + + /// After the write to disk is done and we have gotten back the slice, + /// [`BufferedWriter`] uses this method to re-use the io buffer. + fn reuse_after_flush(iobuf: Self::IoBuf) -> Self; +} + +impl Buffer for BytesMut { + type IoBuf = BytesMut; + + #[inline(always)] + fn cap(&self) -> usize { + self.capacity() + } + + fn extend_from_slice(&mut self, other: &[u8]) { + BytesMut::extend_from_slice(self, other) + } + + #[inline(always)] + fn pending(&self) -> usize { + self.len() + } + + fn flush(self) -> FullSlice<BytesMut> { + self.slice_len() + } + + fn reuse_after_flush(mut iobuf: BytesMut) -> Self { + iobuf.clear(); + iobuf + } +} + +impl OwnedAsyncWriter for Vec<u8> { + async fn write_all<Buf: IoBuf + Send>( + &mut self, + buf: FullSlice<Buf>, + _: &RequestContext, + ) -> std::io::Result<(usize, FullSlice<Buf>)> { + self.extend_from_slice(&buf[..]); + Ok((buf.len(), buf)) + } +} + +#[cfg(test)] +mod tests { + use bytes::BytesMut; + + use super::*; + use crate::context::{DownloadBehavior, RequestContext}; + use crate::task_mgr::TaskKind; + + #[derive(Default)] + struct RecorderWriter { + writes: Vec<Vec<u8>>, + } + impl OwnedAsyncWriter for RecorderWriter { + async fn write_all<Buf: IoBuf + Send>( + &mut self, + buf: FullSlice<Buf>, + _: &RequestContext, + ) -> std::io::Result<(usize, FullSlice<Buf>)> { + self.writes.push(Vec::from(&buf[..])); + Ok((buf.len(), buf)) + } + } + + fn test_ctx() -> RequestContext { + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) + } + + macro_rules!
write { + ($writer:ident, $data:literal) => {{ + $writer + .write_buffered(::bytes::Bytes::from_static($data).slice_len(), &test_ctx()) + .await?; + }}; + } + + #[tokio::test] + async fn test_buffered_writes_only() -> std::io::Result<()> { + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + write!(writer, b"a"); + write!(writer, b"b"); + write!(writer, b"c"); + write!(writer, b"d"); + write!(writer, b"e"); + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; + assert_eq!( + recorder.writes, + vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")] + ); + Ok(()) + } + + #[tokio::test] + async fn test_passthrough_writes_only() -> std::io::Result<()> { + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + write!(writer, b"abc"); + write!(writer, b"de"); + write!(writer, b""); + write!(writer, b"fghijk"); + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; + assert_eq!( + recorder.writes, + vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")] + ); + Ok(()) + } + + #[tokio::test] + async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> { + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + write!(writer, b"a"); + write!(writer, b"bc"); + write!(writer, b"d"); + write!(writer, b"e"); + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; + assert_eq!( + recorder.writes, + vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")] + ); + Ok(()) + } + + #[tokio::test] + async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> { + let ctx = test_ctx(); + let ctx = &ctx; + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + + writer.write_buffered_borrowed(b"abc", ctx).await?; + writer.write_buffered_borrowed(b"d", ctx).await?; + writer.write_buffered_borrowed(b"e", ctx).await?; + writer.write_buffered_borrowed(b"fg", ctx).await?; + writer.write_buffered_borrowed(b"hi", ctx).await?; + writer.write_buffered_borrowed(b"j", ctx).await?; + writer.write_buffered_borrowed(b"klmno", ctx).await?; + + let recorder = writer.flush_and_into_inner(ctx).await?; + assert_eq!( + recorder.writes, + { + let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"]; + expect + } + .iter() + .map(|v| v[..].to_vec()) + .collect::>() + ); + Ok(()) + } +} diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 3183608862..2d3841881b 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -21,19 +21,25 @@ //! redo Postgres process, but some records it can handle directly with //! bespoken Rust code. 
+use std::time::Duration; +use std::time::SystemTime; + use pageserver_api::shard::ShardIdentity; use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes; use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment; +use postgres_ffi::TimestampTz; use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; use anyhow::{bail, Context, Result}; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; use utils::failpoint_support; +use utils::rate_limit::RateLimit; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::{DatadirModification, Version}; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use crate::walrecord::*; @@ -51,8 +57,16 @@ use utils::lsn::Lsn; pub struct WalIngest { shard: ShardIdentity, + pg_version: u32, checkpoint: CheckPoint, checkpoint_modified: bool, + warn_ingest_lag: WarnIngestLag, +} + +struct WarnIngestLag { + lag_msg_ratelimit: RateLimit, + future_lsn_msg_ratelimit: RateLimit, + timestamp_invalid_msg_ratelimit: RateLimit, } impl WalIngest { @@ -69,8 +83,14 @@ impl WalIngest { Ok(WalIngest { shard: *timeline.get_shard_identity(), + pg_version: timeline.pg_version, checkpoint, checkpoint_modified: false, + warn_ingest_lag: WarnIngestLag { + lag_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)), + future_lsn_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)), + timestamp_invalid_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)), + }, }) } @@ -86,10 +106,9 @@ impl WalIngest { /// pub async fn ingest_record( &mut self, - recdata: Bytes, + decoded: DecodedWALRecord, lsn: Lsn, modification: &mut DatadirModification<'_>, - decoded: &mut DecodedWALRecord, ctx: &RequestContext, ) -> anyhow::Result { WAL_INGEST.records_received.inc(); @@ -97,7 +116,12 @@ impl WalIngest { let prev_len = modification.len(); modification.set_lsn(lsn)?; - decode_wal_record(recdata, decoded, pg_version)?; + + if decoded.is_dbase_create_copy(self.pg_version) { + // Records of this type should always be preceded by a commit(), as they + // rely on reading data pages back from the Timeline. + assert!(!modification.has_dirty_data_pages()); + } let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -109,15 +133,17 @@ impl WalIngest { self.checkpoint_modified = true; } + failpoint_support::sleep_millis_async!("wal-ingest-record-sleep"); + match decoded.xl_rmid { pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { // Heap AM records need some special handling, because they modify VM pages // without registering them with the standard mechanism. 
- self.ingest_heapam_record(&mut buf, modification, decoded, ctx) + self.ingest_heapam_record(&mut buf, modification, &decoded, ctx) .await?; } pg_constants::RM_NEON_ID => { - self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx) + self.ingest_neonrmgr_record(&mut buf, modification, &decoded, ctx) .await?; } // Handle other special record types @@ -232,6 +258,7 @@ modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, + decoded.origin_id, ctx, ) .await?; @@ -244,6 +271,7 @@ modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, + decoded.origin_id, ctx, ) .await?; @@ -303,7 +331,7 @@ } pg_constants::RM_RELMAP_ID => { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(modification, &xlrec, decoded, ctx) + self.ingest_relmap_page(modification, &xlrec, &decoded, ctx) .await?; } pg_constants::RM_XLOG_ID => { @@ -334,6 +362,38 @@ { self.checkpoint.oldestXid = xlog_checkpoint.oldestXid; } + trace!( + "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}", + xlog_checkpoint.oldestActiveXid, + self.checkpoint.oldestActiveXid + ); + + // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionId`, + // because at shutdown, all in-progress transactions will implicitly + // end. Postgres startup code knows that, and allows hot standby to start + // immediately from a shutdown checkpoint. + // + // In Neon, Postgres hot standby startup always behaves as if starting from + // an online checkpoint. It needs a valid `oldestActiveXid` value, so + // instead of overwriting self.checkpoint.oldestActiveXid with + // InvalidTransactionId from the checkpoint WAL record, update it to a + // proper value, knowing that there are no in-progress transactions at this + // point, except for prepared transactions. + // + // See also the neon code changes in the InitWalRecovery() function. + if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID + && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN + { + let mut oldest_active_xid = self.checkpoint.nextXid.value as u32; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? { + if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 { + oldest_active_xid = xid; + } + } + self.checkpoint.oldestActiveXid = oldest_active_xid; + } else { + self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid; + } // Write a new checkpoint key-value pair on every checkpoint record, even // if nothing really changed.
Not strictly required, but it seems nice to @@ -346,7 +406,7 @@ impl WalIngest { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_LOGICAL_MESSAGE { - let xlrec = XlLogicalMessage::decode(&mut buf); + let xlrec = crate::walrecord::XlLogicalMessage::decode(&mut buf); let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?; let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size]; if prefix == "neon-test" { @@ -360,6 +420,26 @@ impl WalIngest { } } } + pg_constants::RM_STANDBY_ID => { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_RUNNING_XACTS { + let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf); + self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid; + self.checkpoint_modified = true; + } + } + pg_constants::RM_REPLORIGIN_ID => { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_REPLORIGIN_SET { + let xlrec = crate::walrecord::XlReploriginSet::decode(&mut buf); + modification + .set_replorigin(xlrec.node_id, xlrec.remote_lsn) + .await? + } else if info == pg_constants::XLOG_REPLORIGIN_DROP { + let xlrec = crate::walrecord::XlReploriginDrop::decode(&mut buf); + modification.drop_replorigin(xlrec.node_id).await? + } + } _x => { // TODO: should probably log & fail here instead of blindly // doing something without understanding the protocol @@ -388,7 +468,7 @@ impl WalIngest { ); if !key_is_local { - if self.shard.is_zero() { + if self.shard.is_shard_zero() { // Shard 0 tracks relation sizes. Although we will not store this block, we will observe // its blkno in case it implicitly extends a relation. self.observe_decoded_block(modification, blk, ctx).await?; @@ -396,7 +476,7 @@ impl WalIngest { continue; } - self.ingest_decoded_block(modification, lsn, decoded, blk, ctx) + self.ingest_decoded_block(modification, lsn, &decoded, blk, ctx) .await?; } @@ -412,6 +492,8 @@ impl WalIngest { // until commit() is called to flush the data into the repository and update // the latest LSN. + modification.on_record_end(); + Ok(modification.len() > prev_len) } @@ -459,7 +541,7 @@ impl WalIngest { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)? 
+ && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version) // do not materialize null pages because them most likely be soon replaced with real data && blk.bimg_len != 0 { @@ -483,6 +565,7 @@ page_set_lsn(&mut image, lsn) } assert_eq!(image.len(), BLCKSZ as usize); + self.put_rel_page_image(modification, rel, blk.blkno, image.freeze(), ctx) .await?; } else { @@ -562,7 +645,7 @@ // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::<u16>() * xlrec.ntuples as usize + size_of::<u16>() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); @@ -629,7 +712,7 @@ // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::<u16>() * xlrec.ntuples as usize + size_of::<u16>() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); @@ -696,7 +779,7 @@ // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::<u16>() * xlrec.ntuples as usize + size_of::<u16>() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); @@ -864,7 +947,7 @@ // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set 0 } else { - std::mem::size_of::<u16>() * xlrec.ntuples as usize + size_of::<u16>() * xlrec.ntuples as usize }; assert_eq!(offset_array_len, buf.remaining()); @@ -1019,7 +1102,7 @@ let nblocks = modification .tline - .get_rel_size(src_rel, Version::Modified(modification), true, ctx) + .get_rel_size(src_rel, Version::Modified(modification), ctx) .await?; let dst_rel = RelTag { spcnode: tablespace_id, @@ -1033,17 +1116,27 @@ // Copy content debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks); for blknum in 0..nblocks { - debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel); + // Sharding: + // - src and dst are always on the same shard, because they differ only by dbNode, and + // dbNode is not included in the hash inputs for sharding. + // - This WAL command is replayed on all shards, but each shard only copies the blocks + // that belong to it. + let src_key = rel_block_to_key(src_rel, blknum); + if !self.shard.is_key_local(&src_key) { + debug!( + "Skipping non-local key {} during XLOG_DBASE_CREATE", + src_key + ); + continue; + } + debug!( + "copying block {} from {} ({}) to {}", + blknum, src_rel, src_key, dst_rel + ); let content = modification .tline - .get_rel_page_at_lsn( - src_rel, - blknum, - Version::Modified(modification), - true, - ctx, - ) + .get_rel_page_at_lsn(src_rel, blknum, Version::Modified(modification), ctx) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -1111,7 +1204,7 @@ if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 { // Tail of last remaining FSM page has to be zeroed. // We are not precise here and instead of digging in FSM bitmap format just clear the whole page. - modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?; + modification.put_rel_page_image_zero(rel, fsm_physical_page_no); + fsm_physical_page_no += 1; } let nblocks = get_relsize(modification, rel, ctx).await?; @@ -1133,7 +1226,7 @@ if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 { // Tail of last remaining vm page has to be zeroed. // We are not precise here and instead of digging in VM bitmap format just clear the whole page.
- modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?; + modification.put_rel_page_image_zero(rel, vm_page_no); vm_page_no += 1; } let nblocks = get_relsize(modification, rel, ctx).await?; @@ -1146,6 +1239,48 @@ Ok(()) } + fn warn_on_ingest_lag( + &mut self, + conf: &crate::config::PageServerConf, + wal_timestamp: TimestampTz, + ) { + debug_assert_current_span_has_tenant_and_timeline_id(); + let now = SystemTime::now(); + let rate_limits = &mut self.warn_ingest_lag; + match try_from_pg_timestamp(wal_timestamp) { + Ok(ts) => { + match now.duration_since(ts) { + Ok(lag) => { + if lag > conf.wait_lsn_timeout { + rate_limits.lag_msg_ratelimit.call2(|rate_limit_stats| { + let lag = humantime::format_duration(lag); + warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout"); + }) + } + }, + Err(e) => { + let delta_t = e.duration(); + // determined by prod victoriametrics query: 1000 * (timestamp(node_time_seconds{neon_service="pageserver"}) - node_time_seconds) + // => https://www.robustperception.io/time-metric-from-the-node-exporter/ + const IGNORED_DRIFT: Duration = Duration::from_millis(100); + if delta_t > IGNORED_DRIFT { + let delta_t = humantime::format_duration(delta_t); + rate_limits.future_lsn_msg_ratelimit.call2(|rate_limit_stats| { + warn!(%rate_limit_stats, %delta_t, "ingesting record with timestamp from future"); + }) + } + } + }; + + } + Err(error) => { + rate_limits.timestamp_invalid_msg_ratelimit.call2(|rate_limit_stats| { + warn!(%rate_limit_stats, %error, "ingesting record with invalid timestamp, cannot calculate lag and will fail find-lsn-for-timestamp type queries"); + }) + } + } + } + /// Subroutine of ingest_record(), to handle an XLOG_XACT_* records. /// async fn ingest_xact_record( @@ -1153,6 +1288,7 @@ modification: &mut DatadirModification<'_>, parsed: &XlXactParsedRecord, is_commit: bool, + origin_id: u16, ctx: &RequestContext, ) -> anyhow::Result<()> { // Record update of CLOG pages @@ -1161,6 +1297,8 @@ let mut rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; let mut page_xids: Vec<TransactionId> = vec![parsed.xid]; + self.warn_on_ingest_lag(modification.tline.conf, parsed.xact_time); + for subxact in &parsed.subxacts { let subxact_pageno = subxact / pg_constants::CLOG_XACTS_PER_PAGE; if subxact_pageno != pageno { @@ -1211,13 +1349,18 @@ }; if modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { self.put_rel_drop(modification, rel, ctx).await?; } } } + if origin_id != 0 { + modification + .set_replorigin(origin_id, parsed.origin_lsn) + .await?; + } Ok(()) } @@ -1232,13 +1375,10 @@ xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db ); - // Here we treat oldestXid and oldestXidDB - // differently from postgres redo routines. - // In postgres checkpoint.oldestXid lags behind xlrec.oldest_xid - // until checkpoint happens and updates the value. - // Here we can use the most recent value. - // It's just an optimization, though and can be deleted. - // TODO Figure out if there will be any issues with replica. + // In Postgres, oldestXid and oldestXidDB are updated in memory when the CLOG is + // truncated, but a checkpoint record with the updated values isn't written until + // later. In Neon, a server can start at any LSN, not just on a checkpoint record, + // so we keep the oldestXid and oldestXidDB up-to-date.
self.checkpoint.oldestXid = xlrec.oldest_xid; self.checkpoint.oldestXidDB = xlrec.oldest_xid_db; self.checkpoint_modified = true; @@ -1339,24 +1479,47 @@ // Note: The multixact members can wrap around, even within one WAL record. offset = offset.wrapping_add(n_this_page as u32); } - if xlrec.mid >= self.checkpoint.nextMulti { - self.checkpoint.nextMulti = xlrec.mid + 1; + let next_offset = offset; + assert!(xlrec.moff.wrapping_add(xlrec.nmembers) == next_offset); + + // Update next-multi-xid and next-offset + // + // NB: In PostgreSQL, the next-multi-xid stored in the control file is allowed to + // go to 0, and it's fixed up by skipping to FirstMultiXactId in functions that + // read it, like GetNewMultiXactId(). This is different from how nextXid is + // incremented! nextXid skips over < FirstNormalTransactionId when the value + // is stored, so it's never 0 in a checkpoint. + // + // I don't know why it's done that way, it seems less error-prone to skip over 0 + // when the value is stored rather than when it's read. But let's do it the same + // way here. + let next_multi_xid = xlrec.mid.wrapping_add(1); + + if self + .checkpoint + .update_next_multixid(next_multi_xid, next_offset) + { self.checkpoint_modified = true; } - if xlrec.moff + xlrec.nmembers > self.checkpoint.nextMultiOffset { - self.checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers; - self.checkpoint_modified = true; - } - let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| { - if mbr.xid.wrapping_sub(acc) as i32 > 0 { - mbr.xid + + // Also update the next-xid with the highest member. According to the comments in + // multixact_redo(), this shouldn't be necessary, but let's do the same here. + let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| { + if let Some(max_xid) = acc { + if mbr.xid.wrapping_sub(max_xid) as i32 > 0 { + Some(mbr.xid) + } else { + acc + } } else { - acc + Some(mbr.xid) } }); - if self.checkpoint.update_next_xid(max_mbr_xid) { - self.checkpoint_modified = true; + if let Some(max_xid) = max_mbr_xid { + if self.checkpoint.update_next_xid(max_xid) { + self.checkpoint_modified = true; + } } Ok(()) } @@ -1504,7 +1667,7 @@ nblocks } else if !modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { // create it with 0 size initially, the logic below will extend it @@ -1516,7 +1679,7 @@ } else { modification .tline - .get_rel_size(rel, Version::Modified(modification), true, ctx) + .get_rel_size(rel, Version::Modified(modification), ctx) .await? }; @@ -1533,7 +1696,7 @@ continue; } - modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?; + modification.put_rel_page_image_zero(rel, gap_blknum); } } Ok(()) @@ -1599,7 +1762,7 @@ // fill the gap with zeros for gap_blknum in old_nblocks..blknum { - modification.put_slru_page_image(kind, segno, gap_blknum, ZERO_PAGE.clone())?; + modification.put_slru_page_image_zero(kind, segno, gap_blknum); } } Ok(()) } @@ -1610,17 +1773,17 @@ async fn get_relsize( modification: &DatadirModification<'_>, rel: RelTag, ctx: &RequestContext, -) -> anyhow::Result<BlockNumber> { +) -> Result<BlockNumber, PageReconstructError> { let nblocks = if !modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await?
{ 0 } else { modification .tline - .get_rel_size(rel, Version::Modified(modification), true, ctx) + .get_rel_size(rel, Version::Modified(modification), ctx) .await? }; Ok(nblocks) @@ -1632,8 +1795,6 @@ mod tests { use super::*; use crate::tenant::harness::*; use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH}; - use crate::tenant::Timeline; - use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; use crate::DEFAULT_PG_VERSION; @@ -1664,7 +1825,7 @@ mod tests { #[tokio::test] async fn test_relsize() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_relsize").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -1673,23 +1834,27 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x20)); walingest.put_rel_creation(&mut m, TESTREL_A, &ctx).await?; walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x30)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x40)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x50)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; assert_current_logical_size(&tline, Lsn(0x50)); @@ -1697,29 +1862,29 @@ mod tests { // The relation was created at LSN 2, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, 1 ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, 3 ); @@ -1727,48 +1892,48 @@ mod tests { // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), &ctx) .await?, - TEST_IMG("foo blk 0 at 2") + test_img("foo blk 0 at 2") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), &ctx) .await?, - TEST_IMG("foo blk 0 at 3") + test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), &ctx) .await?, - TEST_IMG("foo blk 0 at 3") + test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), &ctx) .await?, - TEST_IMG("foo blk 1 at 4") + test_img("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), &ctx) .await?, - TEST_IMG("foo blk 0 at 3") + test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), &ctx) .await?, - TEST_IMG("foo blk 1 at 4") + test_img("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) .await?, - TEST_IMG("foo blk 2 at 5") + test_img("foo blk 2 at 5") ); // Truncate last block @@ -1782,35 +1947,35 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), &ctx) .await?, - TEST_IMG("foo blk 0 at 3") + test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), &ctx) .await?, - TEST_IMG("foo blk 1 at 4") + test_img("foo blk 1 at 4") ); // should still see the truncated block with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, 3 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, 
Version::Lsn(Lsn(0x50)), &ctx) .await?, - TEST_IMG("foo blk 2 at 5") + test_img("foo blk 2 at 5") ); // Truncate to zero length @@ -1821,7 +1986,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), &ctx) .await?, 0 ); @@ -1829,53 +1994,55 @@ mod tests { // Extend from 0 to 2 blocks, leaving a gap let mut m = tline.begin_modification(Lsn(0x70)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), &ctx) .await?, ZERO_PAGE ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), &ctx) .await?, - TEST_IMG("foo blk 1") + test_img("foo blk 1") ); // Extend a lot more, leaving a big gap that spans across segments let mut m = tline.begin_modification(Lsn(0x80)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, 1501 ); for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), &ctx) .await?, ZERO_PAGE ); } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), &ctx) .await?, - TEST_IMG("foo blk 1500") + test_img("foo blk 1500") ); Ok(()) @@ -1885,7 +2052,10 @@ mod tests { // and then created it again within the same layer. #[tokio::test] async fn test_drop_extend() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_drop_extend") + .await? 
+ .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -1893,20 +2063,20 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x20)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx) .await?; m.commit(&ctx).await?; // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, 1 ); @@ -1919,7 +2089,7 @@ mod tests { // Check that rel is not visible anymore assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), &ctx) .await?, false ); @@ -1930,20 +2100,20 @@ mod tests { // Re-create it let mut m = tline.begin_modification(Lsn(0x40)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 4"), &ctx) .await?; m.commit(&ctx).await?; // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx) .await?, 1 ); @@ -1956,7 +2126,10 @@ mod tests { // and then extended it again within the same layer. #[tokio::test] async fn test_truncate_extend() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_truncate_extend") + .await? + .load() + .await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -1968,7 +2141,7 @@ mod tests { for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); walingest - .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx) .await?; } m.commit(&ctx).await?; @@ -1976,24 +2149,24 @@ mod tests { // The relation was created at LSN 20, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, relsize ); @@ -2004,9 +2177,9 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), &ctx) .await?, - TEST_IMG(&data) + test_img(&data) ); } @@ -2021,7 +2194,7 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx) .await?, 1 ); @@ -2031,16 +2204,16 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), &ctx) .await?, - TEST_IMG(&data) + test_img(&data) ); } // should still see all blocks with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, relsize ); @@ -2049,9 +2222,9 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), &ctx) .await?, - TEST_IMG(&data) + test_img(&data) ); } @@ -2062,20 +2235,20 @@ mod tests { for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, lsn); walingest - .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx) .await?; } m.commit(&ctx).await?; assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, relsize ); @@ -2085,9 +2258,9 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), &ctx) .await?, - TEST_IMG(&data) + test_img(&data) ); } @@ -2098,7 +2271,7 @@ mod tests { /// split into multiple 1 GB segments in Postgres. 
#[tokio::test] async fn test_large_rel() -> Result<()> { - let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await; + let (tenant, ctx) = TenantHarness::create("test_large_rel").await?.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -2108,7 +2281,7 @@ mod tests { for blknum in 0..RELSEG_SIZE + 1 { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); - let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); + let img = test_img(&format!("foo blk {} at {}", blknum, Lsn(lsn))); walingest .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx) .await?; @@ -2119,7 +2292,7 @@ mod tests { assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE + 1 ); @@ -2133,7 +2306,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE ); @@ -2148,7 +2321,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE - 1 ); @@ -2166,7 +2339,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, size as BlockNumber ); @@ -2206,7 +2379,10 @@ mod tests { let startpoint = Lsn::from_hex("14AEC08").unwrap(); let _endpoint = Lsn::from_hex("1FFFF98").unwrap(); - let harness = TenantHarness::create("test_ingest_real_wal").unwrap(); + let harness = TenantHarness::create("test_ingest_real_wal").await.unwrap(); + let span = harness + .span() + .in_scope(|| info_span!("timeline_span", timeline_id=%TIMELINE_ID)); let (tenant, ctx) = harness.load().await; let remote_initdb_path = @@ -2248,7 +2424,6 @@ mod tests { .await .unwrap(); let mut modification = tline.begin_modification(startpoint); - let mut decoded = DecodedWALRecord::default(); println!("decoding {} bytes", bytes.len() - xlogoff); // Decode and ingest wal. We process the wal in chunks because @@ -2256,8 +2431,11 @@ mod tests { for chunk in bytes[xlogoff..].chunks(50) { decoder.feed_bytes(chunk); while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() { + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, modification.tline.pg_version).unwrap(); walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) + .ingest_record(decoded, lsn, &mut modification, &ctx) + .instrument(span.clone()) .await .unwrap(); } diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index ff6bc9194b..0c4d575de8 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -9,10 +9,10 @@ use postgres_ffi::pg_constants; use postgres_ffi::BLCKSZ; use postgres_ffi::{BlockNumber, TimestampTz}; use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; -use postgres_ffi::{XLogRecord, XLOG_SIZE_OF_XLOG_RECORD}; +use postgres_ffi::{RepOriginId, XLogRecord, XLOG_SIZE_OF_XLOG_RECORD}; use serde::{Deserialize, Serialize}; use tracing::*; -use utils::bin_ser::DeserializeError; +use utils::{bin_ser::DeserializeError, lsn::Lsn}; /// Each update to a page is represented by a NeonWalRecord. 
It can be a wrapper /// around a PostgreSQL WAL record, or a custom neon-specific "record". @@ -44,19 +44,66 @@ pub enum NeonWalRecord { moff: MultiXactOffset, members: Vec<MultiXactMember>, }, + /// Update the map of AUX files, either writing or dropping an entry + AuxFile { + file_path: String, + content: Option<Bytes>, + }, + + /// A testing record for unit testing purposes. It supports appending data to an existing image, or clearing it. + #[cfg(test)] + Test { + /// Append a string to the image. + append: String, + /// Clear the image before appending. + clear: bool, + /// Treat this record as an init record. `clear` should be set to true if this field is set + /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and + /// its references in `timeline.rs`. + will_init: bool, + }, } impl NeonWalRecord { /// Does replaying this WAL record initialize the page from scratch, or does /// it need to be applied over the previous image of the page? pub fn will_init(&self) -> bool { + // If you change this function, you'll also need to change ValueBytes::will_init match self { NeonWalRecord::Postgres { will_init, rec: _ } => *will_init, - + #[cfg(test)] + NeonWalRecord::Test { will_init, .. } => *will_init, // None of the special neon record types currently initialize the page _ => false, } } + + #[cfg(test)] + pub(crate) fn wal_append(s: impl AsRef<str>) -> Self { + Self::Test { + append: s.as_ref().to_string(), + clear: false, + will_init: false, + } + } + + #[cfg(test)] + pub(crate) fn wal_clear() -> Self { + Self::Test { + append: "".to_string(), + clear: true, + will_init: false, + } + } + + #[cfg(test)] + pub(crate) fn wal_init() -> Self { + Self::Test { + append: "".to_string(), + clear: true, + will_init: true, + } + } } /// DecodedBkpBlock represents per-page data contained in a WAL record. @@ -110,6 +157,31 @@ pub struct DecodedWALRecord { pub blocks: Vec<DecodedBkpBlock>, pub main_data_offset: usize, + pub origin_id: u16, +} + +impl DecodedWALRecord { + /// Check if this WAL record represents a legacy "copy" database creation, which populates new relations + /// by reading other existing relations' data blocks. This is more complex to apply than new-style database + /// creations which simply include all the desired blocks in the WAL, so we need a helper function to detect this case.
+ pub(crate) fn is_dbase_create_copy(&self, pg_version: u32) -> bool { + if self.xl_rmid == pg_constants::RM_DBASE_ID { + let info = self.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + match pg_version { + 14 => { + // Postgres 14 database creations are always the legacy kind + info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE + } + 15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, + 16 => info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, + _ => { + panic!("Unsupported postgres version {pg_version}") + } + } + } else { + false + } + } } #[repr(C)] @@ -567,6 +639,7 @@ pub struct XlXactParsedRecord { pub subxacts: Vec<TransactionId>, pub xnodes: Vec<RelFileNode>, + pub origin_lsn: Lsn, } impl XlXactParsedRecord { @@ -645,6 +718,11 @@ impl XlXactParsedRecord { debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid); } + let origin_lsn = if xinfo & pg_constants::XACT_XINFO_HAS_ORIGIN != 0 { + Lsn(buf.get_u64_le()) + } else { + Lsn::INVALID + }; XlXactParsedRecord { xid, info, @@ -654,6 +732,7 @@ ts_id, subxacts, xnodes, + origin_lsn, } } } @@ -768,6 +847,72 @@ impl XlLogicalMessage { } } +#[repr(C)] +#[derive(Debug)] +pub struct XlRunningXacts { + pub xcnt: u32, + pub subxcnt: u32, + pub subxid_overflow: bool, + pub next_xid: TransactionId, + pub oldest_running_xid: TransactionId, + pub latest_completed_xid: TransactionId, + pub xids: Vec<TransactionId>, +} + +impl XlRunningXacts { + pub fn decode(buf: &mut Bytes) -> XlRunningXacts { + let xcnt = buf.get_u32_le(); + let subxcnt = buf.get_u32_le(); + let subxid_overflow = buf.get_u32_le() != 0; + let next_xid = buf.get_u32_le(); + let oldest_running_xid = buf.get_u32_le(); + let latest_completed_xid = buf.get_u32_le(); + let mut xids = Vec::new(); + for _ in 0..(xcnt + subxcnt) { + xids.push(buf.get_u32_le()); + } + XlRunningXacts { + xcnt, + subxcnt, + subxid_overflow, + next_xid, + oldest_running_xid, + latest_completed_xid, + xids, + } + } +} + +#[repr(C)] +#[derive(Debug)] +pub struct XlReploriginDrop { + pub node_id: RepOriginId, +} + +impl XlReploriginDrop { + pub fn decode(buf: &mut Bytes) -> XlReploriginDrop { + XlReploriginDrop { + node_id: buf.get_u16_le(), + } + } +} + +#[repr(C)] +#[derive(Debug)] +pub struct XlReploriginSet { + pub remote_lsn: Lsn, + pub node_id: RepOriginId, +} + +impl XlReploriginSet { + pub fn decode(buf: &mut Bytes) -> XlReploriginSet { + XlReploriginSet { + remote_lsn: Lsn(buf.get_u64_le()), + node_id: buf.get_u16_le(), + } + } +} + /// Main routine to decode a WAL record and figure out which blocks are modified // // See xlogrecord.h for details @@ -802,6 +947,7 @@ pub fn decode_wal_record( let mut rnode_dbnode: u32 = 0; let mut rnode_relnode: u32 = 0; let mut got_rnode = false; + let mut origin_id: u16 = 0; let mut buf = record.clone(); @@ -849,7 +995,7 @@ pg_constants::XLR_BLOCK_ID_ORIGIN => { // RepOriginId is uint16 - buf.advance(2); + origin_id = buf.get_u16_le(); } pg_constants::XLR_BLOCK_ID_TOPLEVEL_XID => { @@ -896,7 +1042,7 @@ ); let blk_img_is_compressed = - postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)?; + postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version); if blk_img_is_compressed { debug!("compressed block image , pg_version = {}", pg_version); @@ -1046,6 +1192,7 @@ decoded.xl_info = xlogrec.xl_info; decoded.xl_rmid = xlogrec.xl_rmid; decoded.record = record; + decoded.origin_id = origin_id; decoded.main_data_offset = main_data_offset; Ok(()) } diff --git
a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index cfb8052cf1..a36955fa21 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -17,98 +17,120 @@ //! records. It achieves it by dropping privileges before replaying //! any WAL records, so that even if an attacker hijacks the Postgres //! process, he cannot escape out of it. -//! -use anyhow::Context; -use byteorder::{ByteOrder, LittleEndian}; -use bytes::{BufMut, Bytes, BytesMut}; -use nix::poll::*; -use pageserver_api::shard::TenantShardId; -use serde::Serialize; -use std::collections::VecDeque; -use std::io; -use std::io::prelude::*; -use std::ops::{Deref, DerefMut}; -use std::os::unix::io::AsRawFd; -use std::os::unix::prelude::CommandExt; -use std::process::Stdio; -use std::process::{Child, ChildStdin, ChildStdout, Command}; -use std::sync::{Arc, Mutex, MutexGuard, RwLock}; -use std::time::Duration; -use std::time::Instant; -use tracing::*; -use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock}; -#[cfg(feature = "testing")] -use std::sync::atomic::{AtomicUsize, Ordering}; +/// Process lifecycle and abstraction for the IPC protocol. +mod process; + +/// Code to apply [`NeonWalRecord`]s. +pub(crate) mod apply_neon; use crate::config::PageServerConf; use crate::metrics::{ - WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS, - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, - WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, + WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, + WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME, }; use crate::repository::Key; use crate::walrecord::NeonWalRecord; +use anyhow::Context; +use bytes::{Bytes, BytesMut}; +use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; +use pageserver_api::shard::TenantShardId; +use std::sync::Arc; +use std::time::Duration; +use std::time::Instant; +use tracing::*; +use utils::lsn::Lsn; +use utils::sync::gate::GateError; +use utils::sync::heavier_once_cell; -use pageserver_api::key::{key_to_rel_block, key_to_slru_block}; -use pageserver_api::reltag::{RelTag, SlruKind}; -use postgres_ffi::pg_constants; -use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; -use postgres_ffi::v14::nonrelfile_utils::{ - mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, - transaction_id_set_status, -}; -use postgres_ffi::BLCKSZ; - -/// -/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster. -/// -/// In Postgres `BufferTag` structure is used for exactly the same purpose. -/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91). -/// -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)] -pub(crate) struct BufferTag { - pub rel: RelTag, - pub blknum: u32, -} - -struct ProcessInput { - stdin: ChildStdin, - n_requests: usize, -} - -struct ProcessOutput { - stdout: ChildStdout, - pending_responses: VecDeque<Option<Bytes>>, - n_processed_responses: usize, -} - -/// -/// This is the real implementation that uses a Postgres process to -/// perform WAL replay. Only one thread can use the process at a time, -/// that is controlled by the Mutex. In the future, we might want to -/// launch a pool of processes to allow concurrent replay of multiple -/// records. +/// The real implementation that uses a Postgres process to +/// perform WAL replay.
/// +/// Only one thread can use the process at a time, that is controlled by the +/// Mutex. In the future, we might want to launch a pool of processes to allow +/// concurrent replay of multiple records. pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex<Option<Instant>>, - redo_process: RwLock<Option<Arc<WalRedoProcess>>>, + /// We use [`heavier_once_cell`] for + /// + /// 1. coalescing the lazy spawning of walredo processes ([`ProcessOnceCell::Spawned`]) + /// 2. preventing new processes from being spawned on [`Self::shutdown`] (=> [`ProcessOnceCell::ManagerShutDown`]). + /// + /// # Spawning + /// + /// Redo requests use the once cell to coalesce onto one call to [`process::WalRedoProcess::launch`]. + /// + /// Notably, requests don't use the [`heavier_once_cell::Guard`] to keep hold of + /// their process object; we use [`Arc::clone`] for that. + /// + /// This is primarily because earlier implementations that didn't use [`heavier_once_cell`] + /// had that behavior; it's probably unnecessary. + /// The only merit of it is that if one walredo process encounters an error, + /// it can take it out of rotation (= using [`heavier_once_cell::Guard::take_and_deinit`]) + /// and retry redo, thereby starting the new process, while other redo tasks might + /// still be using the old redo process. But, those other tasks will most likely + /// encounter an error as well, and errors are an unexpected condition anyway. + /// So, probably we could get rid of the `Arc` in the future. + /// + /// # Shutdown + /// + /// See [`Self::launched_processes`]. + redo_process: heavier_once_cell::OnceCell<ProcessOnceCell>, + + /// Gate that is entered when launching a walredo process and held open + /// until the process has been `kill()`ed and `wait()`ed upon. + /// + /// Manager shutdown waits for this gate to close after setting the + /// [`ProcessOnceCell::ManagerShutDown`] state in [`Self::redo_process`]. + /// + /// This type of usage is a bit unusual because gates usually keep track of + /// concurrent operations, e.g., every [`Self::request_redo`] that is inflight. + /// But we use it here to keep track of the _processes_ that we have launched, + /// which may outlive any individual redo request because + /// - we keep the walredo process around until it's quiesced to amortize spawn cost and + /// - the Arc may be held by multiple concurrent redo requests, so, just because + /// you replace the [`Self::redo_process`] cell's content doesn't mean the + /// process gets killed immediately. + /// + /// We could simplify this by getting rid of the [`Arc`]. + /// See the comment on [`Self::redo_process`] for more details. + launched_processes: utils::sync::gate::Gate, } -/// Can this request be served by neon redo functions -/// or we need to pass it to wal-redo postgres process? -fn can_apply_in_neon(rec: &NeonWalRecord) -> bool { - // Currently, we don't have bespoken Rust code to replay any - // Postgres WAL records. But everything else is handled in neon. - #[allow(clippy::match_like_matches_macro)] - match rec { - NeonWalRecord::Postgres { - will_init: _, - rec: _, - } => false, - _ => true, +/// See [`PostgresRedoManager::redo_process`]. +enum ProcessOnceCell { + Spawned(Arc<Process>), + ManagerShutDown, +} + +struct Process { + process: process::WalRedoProcess, + /// This field is last in this struct so the guard gets dropped _after_ [`Self::process`]. + /// (Reminder: dropping [`Self::process`] synchronously sends SIGKILL and then `wait()`s for it to exit).
+ _launched_processes_guard: utils::sync::gate::GateGuard, +} + +impl std::ops::Deref for Process { + type Target = process::WalRedoProcess; + + fn deref(&self) -> &Self::Target { + &self.process + } +} + +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("cancelled")] + Cancelled, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +macro_rules! bail { + ($($arg:tt)*) => { + return Err($crate::walredo::Error::Other(::anyhow::anyhow!($($arg)*))); + } } @@ -132,17 +154,17 @@ impl PostgresRedoManager { base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, pg_version: u32, - ) -> anyhow::Result<Bytes> { + ) -> Result<Bytes, Error> { if records.is_empty() { - anyhow::bail!("invalid WAL redo request with no records"); + bail!("invalid WAL redo request with no records"); } let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID); let mut img = base_img.map(|p| p.1); - let mut batch_neon = can_apply_in_neon(&records[0].1); + let mut batch_neon = apply_neon::can_apply_in_neon(&records[0].1); let mut batch_start = 0; for (i, record) in records.iter().enumerate().skip(1) { - let rec_neon = can_apply_in_neon(&record.1); + let rec_neon = apply_neon::can_apply_in_neon(&record.1); if rec_neon != batch_neon { let result = if batch_neon { @@ -157,6 +179,7 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, ) + .await }; img = Some(result?); @@ -177,6 +200,24 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, ) + .await + } + } + + pub fn status(&self) -> WalRedoManagerStatus { + WalRedoManagerStatus { + last_redo_at: { + let at = *self.last_redo_at.lock().unwrap(); + at.and_then(|at| { + let age = at.elapsed(); + // map any chrono errors silently to None here + chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?) + }) + }, + process: self.redo_process.get().and_then(|p| match &*p { + ProcessOnceCell::Spawned(p) => Some(WalRedoManagerProcessStatus { pid: p.id() }), + ProcessOnceCell::ManagerShutDown => None, + }), } } } @@ -194,10 +235,53 @@ impl PostgresRedoManager { tenant_shard_id, conf, last_redo_at: std::sync::Mutex::default(), - redo_process: RwLock::new(None), + redo_process: heavier_once_cell::OnceCell::default(), + launched_processes: utils::sync::gate::Gate::default(), } } + /// Shut down the WAL redo manager. + /// + /// Returns `true` if this call was the one that initiated shutdown. + /// `true` may be observed by no caller if the first caller stops polling. + /// + /// After this future completes + /// - no redo process is running + /// - no new redo process will be spawned + /// - redo requests that need walredo process will fail with [`Error::Cancelled`] + /// - [`apply_neon`]-only redo requests may still work, but this may change in the future + /// + /// # Cancel-Safety + /// + /// This method is cancellation-safe.
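(The shutdown sequence documented above follows a common two-phase pattern: first flip a marker that refuses new spawns, then wait on a gate that drains everything already launched. A free-standing sketch of that ordering, using plain atomics and a tokio Notify as stand-ins for heavier_once_cell and the Gate; illustrative only, and note the real code closes the race between the flag check and the spawn via the once-cell permit:) use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::Arc; use tokio::sync::Notify; struct Shutdown { no_new_work: AtomicBool, // plays the role of ProcessOnceCell::ManagerShutDown live: AtomicUsize, // plays the role of the launched_processes gate drained: Notify, } impl Shutdown { fn try_enter(self: &Arc<Self>) -> Option<Guard> { if self.no_new_work.load(Ordering::SeqCst) { return None; // shutdown already initiated, refuse new spawns } self.live.fetch_add(1, Ordering::SeqCst); Some(Guard(Arc::clone(self))) } async fn shutdown(&self) { self.no_new_work.store(true, Ordering::SeqCst); // phase 1: no new spawns while self.live.load(Ordering::SeqCst) != 0 { self.drained.notified().await; // phase 2: wait for the last guard to drop } } } struct Guard(Arc<Shutdown>); impl Drop for Guard { fn drop(&mut self) { self.0.live.fetch_sub(1, Ordering::SeqCst); self.0.drained.notify_one(); } }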
+ pub async fn shutdown(&self) -> bool { + // prevent new processes from being spawned + let maybe_permit = match self.redo_process.get_or_init_detached().await { + Ok(guard) => { + if matches!(&*guard, ProcessOnceCell::ManagerShutDown) { + None + } else { + let (proc, permit) = guard.take_and_deinit(); + drop(proc); // this just drops the Arc, its refcount may not be zero yet + Some(permit) + } + } + Err(permit) => Some(permit), + }; + let it_was_us = if let Some(permit) = maybe_permit { + self.redo_process + .set(ProcessOnceCell::ManagerShutDown, permit); + true + } else { + false + }; + // wait for ongoing requests to drain and the refcounts of all Arc that + // we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s + // for the underlying process. + self.launched_processes.close().await; + it_was_us + } + /// This type doesn't have its own background task to check for idleness: we /// rely on our owner calling this function periodically in its own housekeeping /// loops. @@ -206,8 +290,7 @@ impl PostgresRedoManager { if let Some(last_redo_at) = *g { if last_redo_at.elapsed() >= idle_timeout { drop(g); - let mut guard = self.redo_process.write().unwrap(); - *guard = None; + drop(self.redo_process.get().map(|guard| guard.take_and_deinit())); } } } @@ -216,8 +299,11 @@ /// /// Process one request for WAL redo using wal-redo postgres /// + /// # Cancel-Safety + /// + /// Cancellation safe. #[allow(clippy::too_many_arguments)] - fn apply_batch_postgres( + async fn apply_batch_postgres( &self, key: Key, lsn: Lsn, @@ -226,50 +312,58 @@ records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, pg_version: u32, - ) -> anyhow::Result<Bytes> { + ) -> Result<Bytes, Error> { *(self.last_redo_at.lock().unwrap()) = Some(Instant::now()); - let (rel, blknum) = key_to_rel_block(key).context("invalid record")?; + let (rel, blknum) = key.to_rel_block().context("invalid record")?; const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - // launch the WAL redo process on first use - let proc: Arc<WalRedoProcess> = { - let proc_guard = self.redo_process.read().unwrap(); - match &*proc_guard { - None => { - // "upgrade" to write lock to launch the process - drop(proc_guard); - let mut proc_guard = self.redo_process.write().unwrap(); - match &*proc_guard { - None => { - let timer = - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer(); - let proc = Arc::new( - WalRedoProcess::launch( - self.conf, - self.tenant_shard_id, - pg_version, - ) - .context("launch walredo process")?, - ); - timer.observe_duration(); - *proc_guard = Some(Arc::clone(&proc)); - proc - } - Some(proc) => Arc::clone(proc), - } + let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await { + Ok(guard) => match &*guard { + ProcessOnceCell::Spawned(proc) => Arc::clone(proc), + ProcessOnceCell::ManagerShutDown => { + return Err(Error::Cancelled); } - Some(proc) => Arc::clone(proc), + }, + Err(permit) => { + let start = Instant::now(); + // acquire guard before spawning process, so that we don't spawn new processes + // if the gate is already closed.
+ let _launched_processes_guard = match self.launched_processes.enter() { + Ok(guard) => guard, + Err(GateError::GateClosed) => unreachable!( + "shutdown sets the once cell to `ManagerShutDown` state before closing the gate" + ), + }; + let proc = Arc::new(Process { + process: process::WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) + .context("launch walredo process")?, + _launched_processes_guard, + }); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + duration_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); + self.redo_process + .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit); + proc } }; let started_at = std::time::Instant::now(); // Relational WAL records are applied using wal-redo-postgres - let buf_tag = BufferTag { rel, blknum }; let result = proc - .apply_wal_records(buf_tag, &base_img, records, wal_redo_timeout) + .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout) + .await .context("apply_wal_records"); let duration = started_at.elapsed(); @@ -299,7 +393,7 @@ impl PostgresRedoManager { // next request will launch a new one. if let Err(e) = result.as_ref() { error!( - "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", + "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", records.len(), records.first().map(|p| p.0).unwrap_or(Lsn(0)), records.last().map(|p| p.0).unwrap_or(Lsn(0)), @@ -309,41 +403,46 @@ impl PostgresRedoManager { n_attempts, e, ); - // Avoid concurrent callers hitting the same issue. - // We can't prevent it from happening because we want to enable parallelism. - { - let mut guard = self.redo_process.write().unwrap(); - match &*guard { - Some(current_field_value) => { - if Arc::ptr_eq(current_field_value, &proc) { - // We're the first to observe an error from `proc`, it's our job to take it out of rotation. - *guard = None; - } - } - None => { - // Another thread was faster to observe the error, and already took the process out of rotation. - } - } - } + // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation. + // Note that there may be other tasks concurrent with us that also hold `proc`. + // We have to deal with that here. + // Also read the doc comment on field `self.redo_process`. + // // NB: there may still be other concurrent threads using `proc`. // The last one will send SIGKILL when the underlying Arc reaches refcount 0. - // NB: it's important to drop(proc) after drop(guard). Otherwise we'd keep - // holding the lock while waiting for the process to exit. - // NB: the drop impl blocks the current threads with a wait() system call for - // the child process. We dropped the `guard` above so that other threads aren't - // affected. But, it's good that the current thread _does_ block to wait. - // If we instead deferred the waiting into the background / to tokio, it could - // happen that if walredo always fails immediately, we spawn processes faster + // + // NB: the drop impl blocks the dropping thread with a wait() system call for + // the child process. 
In some ways the blocking is actually good: if we + deferred the waiting into the background / to tokio if we used `tokio::process`, + it could happen that if walredo always fails immediately, we spawn processes faster + than we can SIGKILL & `wait` for them to exit. By doing it the way we do here, // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads. // This probably needs revisiting at some later point. + match self.redo_process.get() { + None => (), + Some(guard) => { + match &*guard { + ProcessOnceCell::ManagerShutDown => {} + ProcessOnceCell::Spawned(guard_proc) => { + if Arc::ptr_eq(&proc, guard_proc) { + // We're the first to observe an error from `proc`, it's our job to take it out of rotation. + guard.take_and_deinit(); + } else { + // Another task already spawned another redo process (further up in this method) + // and put it into `redo_process`. Do nothing, our view of the world is behind. + } + } + } + } + } + // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall. drop(proc); } else if n_attempts != 0 { info!(n_attempts, "retried walredo succeeded"); } n_attempts += 1; if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() { - return result; + return result.map_err(Error::Other); } } } @@ -357,7 +456,7 @@ lsn: Lsn, base_img: Option<Bytes>, records: &[(Lsn, NeonWalRecord)], - ) -> anyhow::Result<Bytes> { + ) -> Result<Bytes, Error> { let start_time = Instant::now(); let mut page = BytesMut::new(); @@ -366,7 +465,7 @@ page.extend_from_slice(&fpi[..]); } else { // All the current WAL record types that we can handle require a base image. - anyhow::bail!("invalid neon WAL redo request with no base image"); + bail!("invalid neon WAL redo request with no base image"); } // Apply all the WAL records in the batch @@ -393,767 +492,15 @@ &self, key: Key, page: &mut BytesMut, - _record_lsn: Lsn, + record_lsn: Lsn, record: &NeonWalRecord, ) -> anyhow::Result<()> { - match record { - NeonWalRecord::Postgres { - will_init: _, - rec: _, - } => { - anyhow::bail!("tried to pass postgres wal record to neon WAL redo"); - } - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno, - old_heap_blkno, - flags, - } => { - // sanity check that this is modifying the correct relation - let (rel, blknum) = key_to_rel_block(key).context("invalid record")?; - assert!( - rel.forknum == VISIBILITYMAP_FORKNUM, - "ClearVisibilityMapFlags record on unexpected rel {}", - rel - ); - if let Some(heap_blkno) = *new_heap_blkno { - // Calculate the VM block and offset that corresponds to the heap block. - let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno); - let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno); - let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno); - - // Check that we're modifying the correct VM block.
- assert!(map_block == blknum); - - // equivalent to PageGetContents(page) - let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; - - map[map_byte as usize] &= !(flags << map_offset); - } - - // Repeat for 'old_heap_blkno', if any - if let Some(heap_blkno) = *old_heap_blkno { - let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno); - let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno); - let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno); - - assert!(map_block == blknum); - - let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; - - map[map_byte as usize] &= !(flags << map_offset); - } - } - // Non-relational WAL records are handled here, with custom code that has the - // same effects as the corresponding Postgres WAL redo function. - NeonWalRecord::ClogSetCommitted { xids, timestamp } => { - let (slru_kind, segno, blknum) = - key_to_slru_block(key).context("invalid record")?; - assert_eq!( - slru_kind, - SlruKind::Clog, - "ClogSetCommitted record with unexpected key {}", - key - ); - for &xid in xids { - let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; - let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - - // Check that we're modifying the correct CLOG block. - assert!( - segno == expected_segno, - "ClogSetCommitted record for XID {} with unexpected key {}", - xid, - key - ); - assert!( - blknum == expected_blknum, - "ClogSetCommitted record for XID {} with unexpected key {}", - xid, - key - ); - - transaction_id_set_status( - xid, - pg_constants::TRANSACTION_STATUS_COMMITTED, - page, - ); - } - - // Append the timestamp - if page.len() == BLCKSZ as usize + 8 { - page.truncate(BLCKSZ as usize); - } - if page.len() == BLCKSZ as usize { - page.extend_from_slice(×tamp.to_be_bytes()); - } else { - warn!( - "CLOG blk {} in seg {} has invalid size {}", - blknum, - segno, - page.len() - ); - } - } - NeonWalRecord::ClogSetAborted { xids } => { - let (slru_kind, segno, blknum) = - key_to_slru_block(key).context("invalid record")?; - assert_eq!( - slru_kind, - SlruKind::Clog, - "ClogSetAborted record with unexpected key {}", - key - ); - for &xid in xids { - let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; - let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - - // Check that we're modifying the correct CLOG block. - assert!( - segno == expected_segno, - "ClogSetAborted record for XID {} with unexpected key {}", - xid, - key - ); - assert!( - blknum == expected_blknum, - "ClogSetAborted record for XID {} with unexpected key {}", - xid, - key - ); - - transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page); - } - } - NeonWalRecord::MultixactOffsetCreate { mid, moff } => { - let (slru_kind, segno, blknum) = - key_to_slru_block(key).context("invalid record")?; - assert_eq!( - slru_kind, - SlruKind::MultiXactOffsets, - "MultixactOffsetCreate record with unexpected key {}", - key - ); - // Compute the block and offset to modify. - // See RecordNewMultiXact in PostgreSQL sources. - let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; - let entryno = mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; - let offset = (entryno * 4) as usize; - - // Check that we're modifying the correct multixact-offsets block. 
- let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - assert!( - segno == expected_segno, - "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", - mid, - key - ); - assert!( - blknum == expected_blknum, - "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", - mid, - key - ); - - LittleEndian::write_u32(&mut page[offset..offset + 4], *moff); - } - NeonWalRecord::MultixactMembersCreate { moff, members } => { - let (slru_kind, segno, blknum) = - key_to_slru_block(key).context("invalid record")?; - assert_eq!( - slru_kind, - SlruKind::MultiXactMembers, - "MultixactMembersCreate record with unexpected key {}", - key - ); - for (i, member) in members.iter().enumerate() { - let offset = moff + i as u32; - - // Compute the block and offset to modify. - // See RecordNewMultiXact in PostgreSQL sources. - let pageno = offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32; - let memberoff = mx_offset_to_member_offset(offset); - let flagsoff = mx_offset_to_flags_offset(offset); - let bshift = mx_offset_to_flags_bitshift(offset); - - // Check that we're modifying the correct multixact-members block. - let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - assert!( - segno == expected_segno, - "MultiXactMembersCreate record for offset {} with unexpected key {}", - moff, - key - ); - assert!( - blknum == expected_blknum, - "MultiXactMembersCreate record for offset {} with unexpected key {}", - moff, - key - ); - - let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]); - flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); - flagsval |= member.status << bshift; - LittleEndian::write_u32(&mut page[flagsoff..flagsoff + 4], flagsval); - LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid); - } - } - } + apply_neon::apply_in_neon(record, record_lsn, key, page)?; Ok(()) } } -/// -/// Command with ability not to give all file descriptors to child process -/// -trait CloseFileDescriptors: CommandExt { - /// - /// Close file descriptors (other than stdin, stdout, stderr) in child process - /// - fn close_fds(&mut self) -> &mut Command; -} - -impl<C: CommandExt> CloseFileDescriptors for C { - fn close_fds(&mut self) -> &mut Command { - // SAFETY: Code executed inside pre_exec should have async-signal-safety, - // which means it should be safe to execute inside a signal handler. - // The precise meaning depends on platform. See `man signal-safety` - // for the linux definition. - // - // The set_fds_cloexec_threadsafe function is documented to be - // async-signal-safe. - // - // Aside from this function, the rest of the code is re-entrant and - // doesn't make any syscalls. We're just passing constants. - // - // NOTE: It's easy to indirectly cause a malloc or lock a mutex, - // which is not async-signal-safe. Be careful. - unsafe { - self.pre_exec(move || { - close_fds::set_fds_cloexec_threadsafe(3, &[]); - Ok(()) - }) - } - } -} - -struct WalRedoProcess { - #[allow(dead_code)] - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - // Some() on construction, only becomes None on Drop. - child: Option<NoLeakChild>, - stdout: Mutex<ProcessOutput>, - stdin: Mutex<ProcessInput>, - /// Counter to separate same sized walredo inputs failing at the same millisecond.
- #[cfg(feature = "testing")] - dump_sequence: AtomicUsize, -} - -impl WalRedoProcess { - // - // Start postgres binary in special WAL redo mode. - // - #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))] - fn launch( - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - pg_version: u32, - ) -> anyhow::Result<Self> { - let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. - let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; - - // Start postgres itself - let child = Command::new(pg_bin_dir_path.join("postgres")) - .arg("--wal-redo") - .stdin(Stdio::piped()) - .stderr(Stdio::piped()) - .stdout(Stdio::piped()) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir_path) - .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - // The redo process is not trusted, and runs in seccomp mode that - // doesn't allow it to open any files. We have to also make sure it - // doesn't inherit any file descriptors from the pageserver, that - // would allow an attacker to read any files that happen to be open - // in the pageserver. - // - // The Rust standard library makes sure to mark any file descriptors with - // as close-on-exec by default, but that's not enough, since we use - // libraries that directly call libc open without setting that flag. - .close_fds() - .spawn_no_leak_child(tenant_shard_id) - .context("spawn process")?; - WAL_REDO_PROCESS_COUNTERS.started.inc(); - let mut child = scopeguard::guard(child, |child| { - error!("killing wal-redo-postgres process due to a problem during launch"); - child.kill_and_wait(WalRedoKillCause::Startup); - }); - - let stdin = child.stdin.take().unwrap(); - let stdout = child.stdout.take().unwrap(); - let stderr = child.stderr.take().unwrap(); - let stderr = tokio::process::ChildStderr::from_std(stderr) - .context("convert to tokio::ChildStderr")?; - macro_rules! set_nonblock_or_log_err { - ($file:ident) => {{ - let res = set_nonblock($file.as_raw_fd()); - if let Err(e) = &res { - error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); - } - res - }}; - } - set_nonblock_or_log_err!(stdin)?; - set_nonblock_or_log_err!(stdout)?; - - // all fallible operations post-spawn are complete, so get rid of the guard - let child = scopeguard::ScopeGuard::into_inner(child); - - tokio::spawn( - async move { - scopeguard::defer! { - debug!("wal-redo-postgres stderr_logger_task finished"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); - } - debug!("wal-redo-postgres stderr_logger_task started"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); - - use tokio::io::AsyncBufReadExt; - let mut stderr_lines = tokio::io::BufReader::new(stderr); - let mut buf = Vec::new(); - let res = loop { - buf.clear(); - // TODO we don't trust the process to cap its stderr length. - // Currently it can do unbounded Vec allocation.
- match stderr_lines.read_until(b'\n', &mut buf).await { - Ok(0) => break Ok(()), // eof - Ok(num_bytes) => { - let output = String::from_utf8_lossy(&buf[..num_bytes]); - error!(%output, "received output"); - } - Err(e) => { - break Err(e); - } - } - }; - match res { - Ok(()) => (), - Err(e) => { - error!(error=?e, "failed to read from walredo stderr"); - } - } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) - ); - - Ok(Self { - conf, - tenant_shard_id, - child: Some(child), - stdin: Mutex::new(ProcessInput { - stdin, - n_requests: 0, - }), - stdout: Mutex::new(ProcessOutput { - stdout, - pending_responses: VecDeque::new(), - n_processed_responses: 0, - }), - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize::default(), - }) - } - - fn id(&self) -> u32 { - self.child - .as_ref() - .expect("must not call this during Drop") - .id() - } - - // Apply given WAL records ('records') over an old page image. Returns - // new page image. - // - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] - fn apply_wal_records( - &self, - tag: BufferTag, - base_img: &Option<Bytes>, - records: &[(Lsn, NeonWalRecord)], - wal_redo_timeout: Duration, - ) -> anyhow::Result<Bytes> { - let input = self.stdin.lock().unwrap(); - - // Serialize all the messages to send the WAL redo process first. - // - // This could be problematic if there are millions of records to replay, - // but in practice the number of records is usually so small that it doesn't - // matter, and it's better to keep this code simple. - // - // Most requests start with a before-image with BLCKSZ bytes, followed by - // by some other WAL records. Start with a buffer that can hold that - // comfortably. - let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3); - build_begin_redo_for_block_msg(tag, &mut writebuf); - if let Some(img) = base_img { - build_push_page_msg(tag, img, &mut writebuf); - } - for (lsn, rec) in records.iter() { - if let NeonWalRecord::Postgres { - will_init: _, - rec: postgres_rec, - } = rec - { - build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); - } else { - anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); - } - } - build_get_page_msg(tag, &mut writebuf); - WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - - let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout); - - if res.is_err() { - // not all of these can be caused by this particular input, however these are so rare - // in tests so capture all. - self.record_and_log(&writebuf); - } - - res - } - - fn apply_wal_records0( - &self, - writebuf: &[u8], - input: MutexGuard<ProcessInput>, - wal_redo_timeout: Duration, - ) -> anyhow::Result<Bytes> { - let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. - let mut nwrite = 0usize; - - while nwrite < writebuf.len() { - let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)]; - let n = loop { - match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); - } - - // If 'stdin' is writeable, do write.
- let in_revents = stdin_pollfds[0].revents().unwrap(); - if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { - nwrite += proc.stdin.write(&writebuf[nwrite..])?; - } - if in_revents.contains(PollFlags::POLLHUP) { - // We still have more data to write, but the process closed the pipe. - anyhow::bail!("WAL redo process closed its stdin unexpectedly"); - } - } - let request_no = proc.n_requests; - proc.n_requests += 1; - drop(proc); - - // To improve walredo performance we separate sending requests and receiving - // responses. Them are protected by different mutexes (output and input). - // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process - // then there is not warranty that T1 will first granted output mutex lock. - // To address this issue we maintain number of sent requests, number of processed - // responses and ring buffer with pending responses. After sending response - // (under input mutex), threads remembers request number. Then it releases - // input mutex, locks output mutex and fetch in ring buffer all responses until - // its stored request number. The it takes correspondent element from - // pending responses ring buffer and truncate all empty elements from the front, - // advancing processed responses number. - - let mut output = self.stdout.lock().unwrap(); - let n_processed_responses = output.n_processed_responses; - while n_processed_responses + output.pending_responses.len() <= request_no { - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. - let mut resultbuf = vec![0; BLCKSZ.into()]; - let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far - while nresult < BLCKSZ.into() { - let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)]; - // We do two things simultaneously: reading response from stdout - // and forward any logging information that the child writes to its stderr to the page server's log. - let n = loop { - match nix::poll::poll( - &mut stdout_pollfds[..], - wal_redo_timeout.as_millis() as i32, - ) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); - } - - // If we have some data in stdout, read it to the result buffer. - let out_revents = stdout_pollfds[0].revents().unwrap(); - if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { - nresult += output.stdout.read(&mut resultbuf[nresult..])?; - } - if out_revents.contains(PollFlags::POLLHUP) { - anyhow::bail!("WAL redo process closed its stdout unexpectedly"); - } - } - output - .pending_responses - .push_back(Some(Bytes::from(resultbuf))); - } - // Replace our request's response with None in `pending_responses`. - // Then make space in the ring buffer by clearing out any seqence of contiguous - // `None`'s from the front of `pending_responses`. 
- // NB: We can't pop_front() because other requests' responses because another - // requester might have grabbed the output mutex before us: - // T1: grab input mutex - // T1: send request_no 23 - // T1: release input mutex - // T2: grab input mutex - // T2: send request_no 24 - // T2: release input mutex - // T2: grab output mutex - // T2: n_processed_responses + output.pending_responses.len() <= request_no - // 23 0 24 - // T2: enters poll loop that reads stdout - // T2: put response for 23 into pending_responses - // T2: put response for 24 into pending_resposnes - // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back - // T2: takes its response_24 - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: releases output mutex - // T1: grabs output mutex - // T1: n_processed_responses + output.pending_responses.len() > request_no - // 23 2 23 - // T1: skips poll loop that reads stdout - // T1: takes its response_23 - // pending_responses now looks like this: Front None None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Back - // n_processed_responses now has value 25 - let res = output.pending_responses[request_no - n_processed_responses] - .take() - .expect("we own this request_no, nobody else is supposed to take it"); - while let Some(front) = output.pending_responses.front() { - if front.is_none() { - output.pending_responses.pop_front(); - output.n_processed_responses += 1; - } else { - break; - } - } - Ok(res) - } - - #[cfg(feature = "testing")] - fn record_and_log(&self, writebuf: &[u8]) { - let millis = std::time::SystemTime::now() - .duration_since(std::time::SystemTime::UNIX_EPOCH) - .unwrap() - .as_millis(); - - let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); - - // these files will be collected to an allure report - let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - - let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); - - let res = std::fs::OpenOptions::new() - .write(true) - .create_new(true) - .read(true) - .open(path) - .and_then(|mut f| f.write_all(writebuf)); - - // trip up allowed_errors - if let Err(e) = res { - tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); - } else { - tracing::error!(filename, "erroring walredo input saved"); - } - } - - #[cfg(not(feature = "testing"))] - fn record_and_log(&self, _: &[u8]) {} -} - -impl Drop for WalRedoProcess { - fn drop(&mut self) { - self.child - .take() - .expect("we only do this once") - .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); - // no way to wait for stderr_logger_task from Drop because that is async only - } -} - -/// Wrapper type around `std::process::Child` which guarantees that the child -/// will be killed and waited-for by this process before being dropped. 
-struct NoLeakChild { - tenant_id: TenantShardId, - child: Option<Child>, -} - -impl Deref for NoLeakChild { - type Target = Child; - - fn deref(&self) -> &Self::Target { - self.child.as_ref().expect("must not use from drop") - } -} - -impl DerefMut for NoLeakChild { - fn deref_mut(&mut self) -> &mut Self::Target { - self.child.as_mut().expect("must not use from drop") - } -} - -impl NoLeakChild { - fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> { - let child = command.spawn()?; - Ok(NoLeakChild { - tenant_id, - child: Some(child), - }) - } - - fn kill_and_wait(mut self, cause: WalRedoKillCause) { - let child = match self.child.take() { - Some(child) => child, - None => return, - }; - Self::kill_and_wait_impl(child, cause); - } - - #[instrument(skip_all, fields(pid=child.id(), ?cause))] - fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) { - scopeguard::defer! { - WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc(); - } - let res = child.kill(); - if let Err(e) = res { - // This branch is very unlikely because: - // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it. - // - This is the only place that calls .kill() - // - We consume `self`, so, .kill() can't be called twice. - // - If the process exited by itself or was killed by someone else, - // .kill() will still succeed because we haven't wait()'ed yet. - // - // So, if we arrive here, we have really no idea what happened, - // whether the PID stored in self.child is still valid, etc. - // If this function were fallible, we'd return an error, but - // since it isn't, all we can do is log an error and proceed - // with the wait(). - error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process"); - } - - match child.wait() { - Ok(exit_status) => { - info!(exit_status = %exit_status, "wait successful"); - } - Err(e) => { - error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)"); - } - } - } -} - -impl Drop for NoLeakChild { - fn drop(&mut self) { - let child = match self.child.take() { - Some(child) => child, - None => return, - }; - let tenant_shard_id = self.tenant_id; - // Offload the kill+wait of the child process into the background. - // If someone stops the runtime, we'll leak the child process. - // We can ignore that case because we only stop the runtime on pageserver exit. - tokio::runtime::Handle::current().spawn(async move { - tokio::task::spawn_blocking(move || { - // Intentionally don't inherit the tracing context from whoever is dropping us. - // This thread here is going to outlive of our dropper. - let span = tracing::info_span!( - "walredo", - tenant_id = %tenant_shard_id.tenant_id, - shard_id = %tenant_shard_id.shard_slug() - ); - let _entered = span.enter(); - Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop); - }) - .await - }); - } -} - -trait NoLeakChildCommandExt { - fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>; -} - -impl NoLeakChildCommandExt for Command { - fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> { - NoLeakChild::spawn(tenant_id, self) - } -} - -// Functions for constructing messages to send to the postgres WAL redo -// process. See pgxn/neon_walredo/walredoproc.c for -// explanation of the protocol.
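(For orientation, the builders below, which this diff relocates into the new process.rs module, together form one redo round-trip on the pipe: a 'B' begin message carrying the BufferTag, an optional 'P' message pushing the base page image, one 'A' message per WAL record, and a final 'G' get-page message; the reply is a single BLCKSZ-byte page image, as read back by apply_wal_records0 above. A hedged sketch of the framing derived from these builders, not part of the patch; the u32 length is big-endian and counts everything after the command byte, including the length field itself, and TAG_LEN of 17 assumes the serialized BufferTag is a u8 forknum plus four u32s:) const TAG_LEN: usize = 1 + 4 * 4; // serialized BufferTag size, as implied by `4 + 1 + 4 * 4` fn frame(cmd: u8, payload: &[u8], out: &mut Vec<u8>) { out.push(cmd); out.extend_from_slice(&((4 + payload.len()) as u32).to_be_bytes()); out.extend_from_slice(payload); } fn build_request(tag: &[u8; TAG_LEN], base_img: Option<&[u8]>, recs: &[(u64, Vec<u8>)]) -> Vec<u8> { let mut buf = Vec::new(); frame(b'B', tag, &mut buf); // begin redo for this buffer if let Some(img) = base_img { let mut payload = tag.to_vec(); payload.extend_from_slice(img); // 8192-byte base page follows the tag frame(b'P', &payload, &mut buf); } for (end_lsn, rec) in recs { let mut payload = end_lsn.to_be_bytes().to_vec(); payload.extend_from_slice(rec); // apply one WAL record at its end LSN frame(b'A', &payload, &mut buf); } frame(b'G', tag, &mut buf); // request the resulting page image buf }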
- -fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) { - let len = 4 + 1 + 4 * 4; - - buf.put_u8(b'B'); - buf.put_u32(len as u32); - - tag.ser_into(buf) - .expect("serialize BufferTag should always succeed"); -} - -fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec<u8>) { - assert!(base_img.len() == 8192); - - let len = 4 + 1 + 4 * 4 + base_img.len(); - - buf.put_u8(b'P'); - buf.put_u32(len as u32); - tag.ser_into(buf) - .expect("serialize BufferTag should always succeed"); - buf.put(base_img); -} - -fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec<u8>) { - let len = 4 + 8 + rec.len(); - - buf.put_u8(b'A'); - buf.put_u32(len as u32); - buf.put_u64(endlsn.0); - buf.put(rec); -} - -fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) { - let len = 4 + 1 + 4 * 4; - - buf.put_u8(b'G'); - buf.put_u32(len as u32); - tag.ser_into(buf) - .expect("serialize BufferTag should always succeed"); -} - #[cfg(test)] mod tests { use super::PostgresRedoManager; @@ -1162,6 +509,7 @@ use bytes::Bytes; use pageserver_api::shard::TenantShardId; use std::str::FromStr; + use tracing::Instrument; use utils::{id::TenantId, lsn::Lsn}; #[tokio::test] @@ -1186,6 +534,7 @@ short_records(), 14, ) + .instrument(h.span()) .await .unwrap(); @@ -1213,6 +562,7 @@ short_records(), 14, ) + .instrument(h.span()) .await .unwrap(); @@ -1233,6 +583,7 @@ short_records(), 16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */ ) + .instrument(h.span()) .await .unwrap_err(); } @@ -1261,6 +612,7 @@ // underscored because unused, except for removal at drop _repo_dir: camino_tempfile::Utf8TempDir, manager: PostgresRedoManager, + tenant_shard_id: TenantShardId, } impl RedoHarness { @@ -1277,7 +629,11 @@ Ok(RedoHarness { _repo_dir: repo_dir, manager, + tenant_shard_id, }) } + fn span(&self) -> tracing::Span { + tracing::info_span!("RedoHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) + } } } diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs new file mode 100644 index 0000000000..facf01004c --- /dev/null +++ b/pageserver/src/walredo/apply_neon.rs @@ -0,0 +1,319 @@ +use crate::pgdatadir_mapping::AuxFilesDirectory; +use crate::walrecord::NeonWalRecord; +use anyhow::Context; +use byteorder::{ByteOrder, LittleEndian}; +use bytes::{BufMut, BytesMut}; +use pageserver_api::key::Key; +use pageserver_api::reltag::SlruKind; +use postgres_ffi::pg_constants; +use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; +use postgres_ffi::v14::nonrelfile_utils::{ + mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, + transaction_id_set_status, +}; +use postgres_ffi::BLCKSZ; +use tracing::*; +use utils::bin_ser::BeSer; +use utils::lsn::Lsn; + +/// Can this request be served by neon redo functions +/// or we need to pass it to wal-redo postgres process? +pub(crate) fn can_apply_in_neon(rec: &NeonWalRecord) -> bool { + // Currently, we don't have bespoke Rust code to replay any + // Postgres WAL records. But everything else is handled in neon.
+ #[allow(clippy::match_like_matches_macro)] + match rec { + NeonWalRecord::Postgres { + will_init: _, + rec: _, + } => false, + _ => true, + } +} + +pub(crate) fn apply_in_neon( + record: &NeonWalRecord, + lsn: Lsn, + key: Key, + page: &mut BytesMut, +) -> Result<(), anyhow::Error> { + match record { + NeonWalRecord::Postgres { + will_init: _, + rec: _, + } => { + anyhow::bail!("tried to pass postgres wal record to neon WAL redo"); + } + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno, + old_heap_blkno, + flags, + } => { + // sanity check that this is modifying the correct relation + let (rel, blknum) = key.to_rel_block().context("invalid record")?; + assert!( + rel.forknum == VISIBILITYMAP_FORKNUM, + "ClearVisibilityMapFlags record on unexpected rel {}", + rel + ); + if let Some(heap_blkno) = *new_heap_blkno { + // Calculate the VM block and offset that corresponds to the heap block. + let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno); + let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno); + let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno); + + // Check that we're modifying the correct VM block. + assert!(map_block == blknum); + + // equivalent to PageGetContents(page) + let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; + + map[map_byte as usize] &= !(flags << map_offset); + postgres_ffi::page_set_lsn(page, lsn); + } + + // Repeat for 'old_heap_blkno', if any + if let Some(heap_blkno) = *old_heap_blkno { + let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno); + let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno); + let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno); + + assert!(map_block == blknum); + + let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; + + map[map_byte as usize] &= !(flags << map_offset); + postgres_ffi::page_set_lsn(page, lsn); + } + } + // Non-relational WAL records are handled here, with custom code that has the + // same effects as the corresponding Postgres WAL redo function. + NeonWalRecord::ClogSetCommitted { xids, timestamp } => { + let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?; + assert_eq!( + slru_kind, + SlruKind::Clog, + "ClogSetCommitted record with unexpected key {}", + key + ); + for &xid in xids { + let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; + let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + + // Check that we're modifying the correct CLOG block. 
+                assert!(
+                    segno == expected_segno,
+                    "ClogSetCommitted record for XID {} with unexpected key {}",
+                    xid,
+                    key
+                );
+                assert!(
+                    blknum == expected_blknum,
+                    "ClogSetCommitted record for XID {} with unexpected key {}",
+                    xid,
+                    key
+                );
+
+                transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_COMMITTED, page);
+            }
+
+            // Append the timestamp, stored in the 8 bytes past the 8 KiB block.
+            if page.len() == BLCKSZ as usize + 8 {
+                page.truncate(BLCKSZ as usize);
+            }
+            if page.len() == BLCKSZ as usize {
+                page.extend_from_slice(&timestamp.to_be_bytes());
+            } else {
+                warn!(
+                    "CLOG blk {} in seg {} has invalid size {}",
+                    blknum,
+                    segno,
+                    page.len()
+                );
+            }
+        }
+        NeonWalRecord::ClogSetAborted { xids } => {
+            let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
+            assert_eq!(
+                slru_kind,
+                SlruKind::Clog,
+                "ClogSetAborted record with unexpected key {}",
+                key
+            );
+            for &xid in xids {
+                let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
+                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+
+                // Check that we're modifying the correct CLOG block.
+                assert!(
+                    segno == expected_segno,
+                    "ClogSetAborted record for XID {} with unexpected key {}",
+                    xid,
+                    key
+                );
+                assert!(
+                    blknum == expected_blknum,
+                    "ClogSetAborted record for XID {} with unexpected key {}",
+                    xid,
+                    key
+                );
+
+                transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page);
+            }
+        }
+        NeonWalRecord::MultixactOffsetCreate { mid, moff } => {
+            let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
+            assert_eq!(
+                slru_kind,
+                SlruKind::MultiXactOffsets,
+                "MultixactOffsetCreate record with unexpected key {}",
+                key
+            );
+            // Compute the block and offset to modify.
+            // See RecordNewMultiXact in PostgreSQL sources.
+            let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
+            let entryno = mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
+            let offset = (entryno * 4) as usize;
+
+            // Check that we're modifying the correct multixact-offsets block.
+            let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+            let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+            assert!(
+                segno == expected_segno,
+                "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
+                mid,
+                key
+            );
+            assert!(
+                blknum == expected_blknum,
+                "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
+                mid,
+                key
+            );
+
+            LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
+        }
+        NeonWalRecord::MultixactMembersCreate { moff, members } => {
+            let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
+            assert_eq!(
+                slru_kind,
+                SlruKind::MultiXactMembers,
+                "MultixactMembersCreate record with unexpected key {}",
+                key
+            );
+            for (i, member) in members.iter().enumerate() {
+                let offset = moff + i as u32;
+
+                // Compute the block and offset to modify.
+                // See RecordNewMultiXact in PostgreSQL sources.
+                let pageno = offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
+                let memberoff = mx_offset_to_member_offset(offset);
+                let flagsoff = mx_offset_to_flags_offset(offset);
+                let bshift = mx_offset_to_flags_bitshift(offset);
+
+                // Check that we're modifying the correct multixact-members block.
+ let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + assert!( + segno == expected_segno, + "MultiXactMembersCreate record for offset {} with unexpected key {}", + moff, + key + ); + assert!( + blknum == expected_blknum, + "MultiXactMembersCreate record for offset {} with unexpected key {}", + moff, + key + ); + + let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]); + flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); + flagsval |= member.status << bshift; + LittleEndian::write_u32(&mut page[flagsoff..flagsoff + 4], flagsval); + LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid); + } + } + NeonWalRecord::AuxFile { file_path, content } => { + let mut dir = AuxFilesDirectory::des(page)?; + dir.upsert(file_path.clone(), content.clone()); + + page.clear(); + let mut writer = page.writer(); + dir.ser_into(&mut writer)?; + } + #[cfg(test)] + NeonWalRecord::Test { + append, + clear, + will_init, + } => { + if *will_init { + assert!(*clear, "init record must be clear to ensure correctness"); + } + if *clear { + page.clear(); + } + page.put_slice(append.as_bytes()); + } + } + Ok(()) +} + +#[cfg(test)] +mod test { + use bytes::Bytes; + use pageserver_api::key::AUX_FILES_KEY; + + use super::*; + use std::collections::HashMap; + + /// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile + #[test] + fn apply_aux_file_deltas() -> anyhow::Result<()> { + let base_dir = AuxFilesDirectory { + files: HashMap::from([ + ("two".to_string(), Bytes::from_static(b"content0")), + ("three".to_string(), Bytes::from_static(b"contentX")), + ]), + }; + let base_image = AuxFilesDirectory::ser(&base_dir)?; + + let deltas = vec![ + // Insert + NeonWalRecord::AuxFile { + file_path: "one".to_string(), + content: Some(Bytes::from_static(b"content1")), + }, + // Update + NeonWalRecord::AuxFile { + file_path: "two".to_string(), + content: Some(Bytes::from_static(b"content99")), + }, + // Delete + NeonWalRecord::AuxFile { + file_path: "three".to_string(), + content: None, + }, + ]; + + let file_path = AUX_FILES_KEY; + let mut page = BytesMut::from_iter(base_image); + + for record in deltas { + apply_in_neon(&record, Lsn(8), file_path, &mut page)?; + } + + let reconstructed = AuxFilesDirectory::des(&page)?; + let expect = HashMap::from([ + ("one".to_string(), Bytes::from_static(b"content1")), + ("two".to_string(), Bytes::from_static(b"content99")), + ]); + + assert_eq!(reconstructed.files, expect); + + Ok(()) + } +} diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs new file mode 100644 index 0000000000..9140d4f6aa --- /dev/null +++ b/pageserver/src/walredo/process.rs @@ -0,0 +1,382 @@ +mod no_leak_child; +/// The IPC protocol that pageserver and walredo process speak over their shared pipe. 
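+/// Each message is framed as a 1-byte tag ('B', 'P', 'A' or 'G'), followed by
+/// a big-endian u32 length that counts itself and the payload (but not the
+/// tag byte), followed by the payload.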
+mod protocol;
+
+use self::no_leak_child::NoLeakChild;
+use crate::{
+    config::PageServerConf,
+    metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
+    span::debug_assert_current_span_has_tenant_id,
+    walrecord::NeonWalRecord,
+};
+use anyhow::Context;
+use bytes::Bytes;
+use pageserver_api::{reltag::RelTag, shard::TenantShardId};
+use postgres_ffi::BLCKSZ;
+#[cfg(feature = "testing")]
+use std::sync::atomic::AtomicUsize;
+use std::{
+    collections::VecDeque,
+    process::{Command, Stdio},
+    time::Duration,
+};
+use tokio::io::{AsyncReadExt, AsyncWriteExt};
+use tracing::{debug, error, instrument, Instrument};
+use utils::{lsn::Lsn, poison::Poison};
+
+pub struct WalRedoProcess {
+    #[allow(dead_code)]
+    conf: &'static PageServerConf,
+    #[cfg(feature = "testing")]
+    tenant_shard_id: TenantShardId,
+    // Some() on construction, only becomes None on Drop.
+    child: Option<NoLeakChild>,
+    stdout: tokio::sync::Mutex<Poison<ProcessOutput>>,
+    stdin: tokio::sync::Mutex<Poison<ProcessInput>>,
+    /// Counter to separate same sized walredo inputs failing at the same millisecond.
+    #[cfg(feature = "testing")]
+    dump_sequence: AtomicUsize,
+}
+
+struct ProcessInput {
+    stdin: tokio::process::ChildStdin,
+    n_requests: usize,
+}
+
+struct ProcessOutput {
+    stdout: tokio::process::ChildStdout,
+    pending_responses: VecDeque<Option<Bytes>>,
+    n_processed_responses: usize,
+}
+
+impl WalRedoProcess {
+    //
+    // Start postgres binary in special WAL redo mode.
+    //
+    #[instrument(skip_all,fields(pg_version=pg_version))]
+    pub(crate) fn launch(
+        conf: &'static PageServerConf,
+        tenant_shard_id: TenantShardId,
+        pg_version: u32,
+    ) -> anyhow::Result<Self> {
+        crate::span::debug_assert_current_span_has_tenant_id();
+
+        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
+        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
+
+        use no_leak_child::NoLeakChildCommandExt;
+        // Start postgres itself
+        let child = Command::new(pg_bin_dir_path.join("postgres"))
+            // the first arg must be --wal-redo so the child process enters walredo mode
+            .arg("--wal-redo")
+            // the child doesn't process this arg, but having it in the argv helps identify the
+            // walredo process for a particular tenant when debugging a pageserver
+            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
+            .stdin(Stdio::piped())
+            .stderr(Stdio::piped())
+            .stdout(Stdio::piped())
+            .env_clear()
+            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
+            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
+            // NB: The redo process is not trusted after we sent it the first
+            // walredo work. Before that, it is trusted. Specifically, we trust
+            // it to
+            // 1. close all file descriptors except stdin, stdout, stderr because
+            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
+            //    the files it opens, and
+            // 2. to use seccomp to sandbox itself before processing the first
+            //    walredo request.
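+            // NB: spawn_no_leak_child() wraps the child in NoLeakChild (see
+            // no_leak_child.rs), which kills and waits for the process if it
+            // is dropped without an explicit kill_and_wait().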
+            .spawn_no_leak_child(tenant_shard_id)
+            .context("spawn process")?;
+        WAL_REDO_PROCESS_COUNTERS.started.inc();
+        let mut child = scopeguard::guard(child, |child| {
+            error!("killing wal-redo-postgres process due to a problem during launch");
+            child.kill_and_wait(WalRedoKillCause::Startup);
+        });
+
+        let stdin = child.stdin.take().unwrap();
+        let stdout = child.stdout.take().unwrap();
+        let stderr = child.stderr.take().unwrap();
+        let stderr = tokio::process::ChildStderr::from_std(stderr)
+            .context("convert to tokio::ChildStderr")?;
+        let stdin =
+            tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?;
+        let stdout = tokio::process::ChildStdout::from_std(stdout)
+            .context("convert to tokio::ChildStdout")?;
+
+        // all fallible operations post-spawn are complete, so get rid of the guard
+        let child = scopeguard::ScopeGuard::into_inner(child);
+
+        tokio::spawn(
+            async move {
+                scopeguard::defer! {
+                    debug!("wal-redo-postgres stderr_logger_task finished");
+                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
+                }
+                debug!("wal-redo-postgres stderr_logger_task started");
+                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
+
+                use tokio::io::AsyncBufReadExt;
+                let mut stderr_lines = tokio::io::BufReader::new(stderr);
+                let mut buf = Vec::new();
+                let res = loop {
+                    buf.clear();
+                    // TODO we don't trust the process to cap its stderr length.
+                    // Currently it can do unbounded Vec allocation.
+                    match stderr_lines.read_until(b'\n', &mut buf).await {
+                        Ok(0) => break Ok(()), // eof
+                        Ok(num_bytes) => {
+                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
+                            error!(%output, "received output");
+                        }
+                        Err(e) => {
+                            break Err(e);
+                        }
+                    }
+                };
+                match res {
+                    Ok(()) => (),
+                    Err(e) => {
+                        error!(error=?e, "failed to read from walredo stderr");
+                    }
+                }
+            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
+        );
+
+        Ok(Self {
+            conf,
+            #[cfg(feature = "testing")]
+            tenant_shard_id,
+            child: Some(child),
+            stdin: tokio::sync::Mutex::new(Poison::new(
+                "stdin",
+                ProcessInput {
+                    stdin,
+                    n_requests: 0,
+                },
+            )),
+            stdout: tokio::sync::Mutex::new(Poison::new(
+                "stdout",
+                ProcessOutput {
+                    stdout,
+                    pending_responses: VecDeque::new(),
+                    n_processed_responses: 0,
+                },
+            )),
+            #[cfg(feature = "testing")]
+            dump_sequence: AtomicUsize::default(),
+        })
+    }
+
+    pub(crate) fn id(&self) -> u32 {
+        self.child
+            .as_ref()
+            .expect("must not call this during Drop")
+            .id()
+    }
+
+    /// Apply the given WAL records ('records') over an old page image. Returns
+    /// the new page image.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// Cancellation safe.
+    #[instrument(skip_all, fields(pid=%self.id()))]
+    pub(crate) async fn apply_wal_records(
+        &self,
+        rel: RelTag,
+        blknum: u32,
+        base_img: &Option<Bytes>,
+        records: &[(Lsn, NeonWalRecord)],
+        wal_redo_timeout: Duration,
+    ) -> anyhow::Result<Bytes> {
+        debug_assert_current_span_has_tenant_id();
+
+        let tag = protocol::BufferTag { rel, blknum };
+
+        // Serialize all the messages to send to the WAL redo process first.
+        //
+        // This could be problematic if there are millions of records to replay,
+        // but in practice the number of records is usually so small that it doesn't
+        // matter, and it's better to keep this code simple.
+        //
+        // Most requests start with a before-image with BLCKSZ bytes, followed
+        // by some other WAL records. Start with a buffer that can hold that
+        // comfortably.
+        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
+        protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
+        if let Some(img) = base_img {
+            protocol::build_push_page_msg(tag, img, &mut writebuf);
+        }
+        for (lsn, rec) in records.iter() {
+            if let NeonWalRecord::Postgres {
+                will_init: _,
+                rec: postgres_rec,
+            } = rec
+            {
+                protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
+            } else {
+                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
+            }
+        }
+        protocol::build_get_page_msg(tag, &mut writebuf);
+        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
+
+        let Ok(res) =
+            tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await
+        else {
+            anyhow::bail!("WAL redo timed out");
+        };
+
+        if res.is_err() {
+            // not all of these can be caused by this particular input, however
+            // they are so rare in tests that we capture all of them.
+            self.record_and_log(&writebuf);
+        }
+
+        res
+    }
+
+    /// # Cancel-Safety
+    ///
+    /// When not polled to completion (e.g. because in `tokio::select!` another
+    /// branch becomes ready before this future), concurrent and subsequent
+    /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls.
+    /// Dispose of this process instance and create a new one.
+    async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result<Bytes> {
+        let request_no = {
+            let mut lock_guard = self.stdin.lock().await;
+            let mut poison_guard = lock_guard.check_and_arm()?;
+            let input = poison_guard.data_mut();
+            input
+                .stdin
+                .write_all(writebuf)
+                .await
+                .context("write to walredo stdin")?;
+            let request_no = input.n_requests;
+            input.n_requests += 1;
+            poison_guard.disarm();
+            request_no
+        };
+
+        // To improve walredo performance we separate sending requests and receiving
+        // responses. They are protected by different mutexes (input and output).
+        // If threads T1, T2, T3 send requests D1, D2, D3 to the walredo process,
+        // there is no guarantee that T1 will be granted the output mutex lock first.
+        // To address this issue we maintain the number of sent requests, the number
+        // of processed responses, and a ring buffer of pending responses. After
+        // sending a request (under the input mutex), a thread remembers its request
+        // number. Then it releases the input mutex, locks the output mutex, and
+        // fetches into the ring buffer all responses up to its stored request
+        // number. Then it takes the corresponding element from the pending-responses
+        // ring buffer and truncates all empty elements from the front, advancing
+        // the processed-responses counter.
+
+        let mut lock_guard = self.stdout.lock().await;
+        let mut poison_guard = lock_guard.check_and_arm()?;
+        let output = poison_guard.data_mut();
+        let n_processed_responses = output.n_processed_responses;
+        while n_processed_responses + output.pending_responses.len() <= request_no {
+            // We expect the WAL redo process to respond with an 8k page image. We read it
+            // into this buffer.
+            let mut resultbuf = vec![0; BLCKSZ.into()];
+            output
+                .stdout
+                .read_exact(&mut resultbuf)
+                .await
+                .context("read walredo stdout")?;
+            output
+                .pending_responses
+                .push_back(Some(Bytes::from(resultbuf)));
+        }
+        // Replace our request's response with None in `pending_responses`.
+        // Then make space in the ring buffer by clearing out any sequence of
+        // contiguous `None`'s from the front of `pending_responses`.
+        // NB: We can't just pop_front() other requests' responses, because
+        // another requester might have grabbed the output mutex before us:
+        // T1: grab input mutex
+        // T1: send request_no 23
+        // T1: release input mutex
+        // T2: grab input mutex
+        // T2: send request_no 24
+        // T2: release input mutex
+        // T2: grab output mutex
+        // T2: n_processed_responses + output.pending_responses.len() <= request_no
+        //            23                                0                   24
+        // T2: enters poll loop that reads stdout
+        // T2: put response for 23 into pending_responses
+        // T2: put response for 24 into pending_responses
+        //     pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
+        // T2: takes its response_24
+        //     pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: does the while loop below
+        //     pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: releases output mutex
+        // T1: grabs output mutex
+        // T1: n_processed_responses + output.pending_responses.len() > request_no
+        //            23                                2                   23
+        // T1: skips poll loop that reads stdout
+        // T1: takes its response_23
+        //     pending_responses now looks like this: Front None None Back
+        // T1: does the while loop below
+        //     pending_responses now looks like this: Front Back
+        //     n_processed_responses now has value 25
+        let res = output.pending_responses[request_no - n_processed_responses]
+            .take()
+            .expect("we own this request_no, nobody else is supposed to take it");
+        while let Some(front) = output.pending_responses.front() {
+            if front.is_none() {
+                output.pending_responses.pop_front();
+                output.n_processed_responses += 1;
+            } else {
+                break;
+            }
+        }
+        poison_guard.disarm();
+        Ok(res)
+    }
+
+    #[cfg(feature = "testing")]
+    fn record_and_log(&self, writebuf: &[u8]) {
+        use std::sync::atomic::Ordering;
+
+        let millis = std::time::SystemTime::now()
+            .duration_since(std::time::SystemTime::UNIX_EPOCH)
+            .unwrap()
+            .as_millis();
+
+        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
+
+        // these files will be collected into an allure report
+        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
+
+        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
+
+        use std::io::Write;
+        let res = std::fs::OpenOptions::new()
+            .write(true)
+            .create_new(true)
+            .read(true)
+            .open(path)
+            .and_then(|mut f| f.write_all(writebuf));
+
+        // trip up allowed_errors
+        if let Err(e) = res {
+            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
+        } else {
+            tracing::error!(filename, "erroring walredo input saved");
+        }
+    }
+
+    #[cfg(not(feature = "testing"))]
+    fn record_and_log(&self, _: &[u8]) {}
+}
+
+impl Drop for WalRedoProcess {
+    fn drop(&mut self) {
+        self.child
+            .take()
+            .expect("we only do this once")
+            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
+        // no way to wait for stderr_logger_task from Drop because that is async only
+    }
+}
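The ring-buffer bookkeeping in `apply_wal_records0` is easier to follow in isolation. Below is a minimal, self-contained model of it: `Ring` is a hypothetical type invented for illustration, with plain `u32` frames standing in for page images, and the stdout-reading loop reduced to explicit `put()` calls; it is not the patch's actual code.

```rust
use std::collections::VecDeque;

struct Ring {
    pending: VecDeque<Option<u32>>, // Some(frame) until its owner takes it
    n_processed: usize,             // count of slots already trimmed off the front
}

impl Ring {
    // Corresponds to the poll loop pushing one response read from stdout.
    fn put(&mut self, frame: u32) {
        self.pending.push_back(Some(frame));
    }

    // Corresponds to a requester collecting its own response.
    fn take(&mut self, request_no: usize) -> u32 {
        let frame = self.pending[request_no - self.n_processed]
            .take()
            .expect("each request_no is taken exactly once");
        // Trim only *leading* Nones, so request_no - n_processed stays a
        // valid index for requesters that haven't collected theirs yet.
        while let Some(None) = self.pending.front() {
            self.pending.pop_front();
            self.n_processed += 1;
        }
        frame
    }
}

fn main() {
    let mut ring = Ring { pending: VecDeque::new(), n_processed: 0 };
    ring.put(23); // response for request 0
    ring.put(24); // response for request 1
    assert_eq!(ring.take(1), 24); // T2 gets the output mutex first
    assert_eq!(ring.take(0), 23); // T1's turn; both slots are now trimmed
    assert_eq!(ring.n_processed, 2);
}
```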
diff --git a/pageserver/src/walredo/process/no_leak_child.rs b/pageserver/src/walredo/process/no_leak_child.rs
new file mode 100644
index 0000000000..1a0d7039df
--- /dev/null
+++ b/pageserver/src/walredo/process/no_leak_child.rs
@@ -0,0 +1,124 @@
+use tracing::instrument;
+use tracing::{error, info};
+
+use crate::metrics::WalRedoKillCause;
+use crate::metrics::WAL_REDO_PROCESS_COUNTERS;
+
+use std::io;
+use std::ops::Deref;
+use std::ops::DerefMut;
+use std::process::Child;
+use std::process::Command;
+
+use pageserver_api::shard::TenantShardId;
+
+/// Wrapper type around `std::process::Child` which guarantees that the child
+/// will be killed and waited-for by this process before being dropped.
+pub(crate) struct NoLeakChild {
+    pub(crate) tenant_id: TenantShardId,
+    pub(crate) child: Option<Child>,
+}
+
+impl Deref for NoLeakChild {
+    type Target = Child;
+
+    fn deref(&self) -> &Self::Target {
+        self.child.as_ref().expect("must not use from drop")
+    }
+}
+
+impl DerefMut for NoLeakChild {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.child.as_mut().expect("must not use from drop")
+    }
+}
+
+impl NoLeakChild {
+    pub(crate) fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> {
+        let child = command.spawn()?;
+        Ok(NoLeakChild {
+            tenant_id,
+            child: Some(child),
+        })
+    }
+
+    pub(crate) fn kill_and_wait(mut self, cause: WalRedoKillCause) {
+        let child = match self.child.take() {
+            Some(child) => child,
+            None => return,
+        };
+        Self::kill_and_wait_impl(child, cause);
+    }
+
+    #[instrument(skip_all, fields(pid=child.id(), ?cause))]
+    pub(crate) fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) {
+        scopeguard::defer! {
+            WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc();
+        }
+        let res = child.kill();
+        if let Err(e) = res {
+            // This branch is very unlikely because:
+            // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it.
+            // - This is the only place that calls .kill()
+            // - We consume `self`, so, .kill() can't be called twice.
+            // - If the process exited by itself or was killed by someone else,
+            //   .kill() will still succeed because we haven't wait()'ed yet.
+            //
+            // So, if we arrive here, we have really no idea what happened,
+            // whether the PID stored in self.child is still valid, etc.
+            // If this function were fallible, we'd return an error, but
+            // since it isn't, all we can do is log an error and proceed
+            // with the wait().
+            error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process");
+        }
+
+        match child.wait() {
+            Ok(exit_status) => {
+                info!(exit_status = %exit_status, "wait successful");
+            }
+            Err(e) => {
+                error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");
+            }
+        }
+    }
+}
+
+impl Drop for NoLeakChild {
+    fn drop(&mut self) {
+        let child = match self.child.take() {
+            Some(child) => child,
+            None => return,
+        };
+        let tenant_shard_id = self.tenant_id;
+        // Offload the kill+wait of the child process into the background.
+        // If someone stops the runtime, we'll leak the child process.
+        // We can ignore that case because we only stop the runtime on pageserver exit.
+        tokio::runtime::Handle::current().spawn(async move {
+            tokio::task::spawn_blocking(move || {
+                // Intentionally don't inherit the tracing context from whoever is dropping us.
+                // This thread here is going to outlive our dropper.
+                let span = tracing::info_span!(
+                    "walredo",
+                    tenant_id = %tenant_shard_id.tenant_id,
+                    shard_id = %tenant_shard_id.shard_slug()
+                );
+                let _entered = span.enter();
+                Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
+            })
+            .await
+        });
+    }
+}
+
+pub(crate) trait NoLeakChildCommandExt {
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>;
+}
+
+impl NoLeakChildCommandExt for Command {
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> {
+        NoLeakChild::spawn(tenant_id, self)
+    }
+}
diff --git a/pageserver/src/walredo/process/protocol.rs b/pageserver/src/walredo/process/protocol.rs
new file mode 100644
index 0000000000..b703344cc8
--- /dev/null
+++ b/pageserver/src/walredo/process/protocol.rs
@@ -0,0 +1,57 @@
+use bytes::BufMut;
+use pageserver_api::reltag::RelTag;
+use serde::Serialize;
+use utils::bin_ser::BeSer;
+use utils::lsn::Lsn;
+
+///
+/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
+///
+/// In Postgres, the `BufferTag` structure is used for exactly the same purpose.
+/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
+///
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)]
+pub(crate) struct BufferTag {
+    pub rel: RelTag,
+    pub blknum: u32,
+}
+
+pub(crate) fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {
+    let len = 4 + 1 + 4 * 4;
+
+    buf.put_u8(b'B');
+    buf.put_u32(len as u32);
+
+    tag.ser_into(buf)
+        .expect("serialize BufferTag should always succeed");
+}
+
+pub(crate) fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec<u8>) {
+    assert!(base_img.len() == 8192);
+
+    let len = 4 + 1 + 4 * 4 + base_img.len();
+
+    buf.put_u8(b'P');
+    buf.put_u32(len as u32);
+    tag.ser_into(buf)
+        .expect("serialize BufferTag should always succeed");
+    buf.put(base_img);
+}
+
+pub(crate) fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec<u8>) {
+    let len = 4 + 8 + rec.len();
+
+    buf.put_u8(b'A');
+    buf.put_u32(len as u32);
+    buf.put_u64(endlsn.0);
+    buf.put(rec);
+}
+
+pub(crate) fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
+    let len = 4 + 1 + 4 * 4;
+
+    buf.put_u8(b'G');
+    buf.put_u32(len as u32);
+    tag.ser_into(buf)
+        .expect("serialize BufferTag should always succeed");
+}
diff --git a/pageserver/test_data/indices/mixed_workload/README.md b/pageserver/test_data/indices/mixed_workload/README.md
new file mode 100644
index 0000000000..724274fcd9
--- /dev/null
+++ b/pageserver/test_data/indices/mixed_workload/README.md
@@ -0,0 +1,7 @@
+
+This was captured from one shard of a large tenant in staging.
+
+It has a mixture of deltas and image layers, >1000 layers in total.
+
+This is suitable for general smoke tests that want an index which is not
+trivially small, but doesn't contain weird/pathological cases.
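For reference, a standalone sketch of the 'G' message framing that `build_get_page_msg` in protocol.rs produces. It assumes `BufferTag` serializes big-endian in declaration order (`forknum: u8`, then `spcnode`/`dbnode`/`relnode`/`blknum` as `u32`); the mirror function below is illustrative only, not the patch's code.

```rust
// Mirror of build_get_page_msg()'s wire format (illustrative sketch).
fn get_page_msg(forknum: u8, spcnode: u32, dbnode: u32, relnode: u32, blknum: u32) -> Vec<u8> {
    let len: u32 = 4 + 1 + 4 * 4; // the length field itself + serialized BufferTag
    let mut buf = Vec::new();
    buf.push(b'G'); // message tag, not counted in `len`
    buf.extend_from_slice(&len.to_be_bytes());
    buf.push(forknum);
    for field in [spcnode, dbnode, relnode, blknum] {
        buf.extend_from_slice(&field.to_be_bytes());
    }
    buf
}

fn main() {
    let msg = get_page_msg(0, 1663, 16384, 16385, 7);
    assert_eq!(msg[0], b'G');
    assert_eq!(msg.len(), 22); // 1 tag byte + 21 counted bytes
}
```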
diff --git a/pageserver/test_data/indices/mixed_workload/index_part.json b/pageserver/test_data/indices/mixed_workload/index_part.json new file mode 100644 index 0000000000..cb4bfc4726 --- /dev/null +++ b/pageserver/test_data/indices/mixed_workload/index_part.json @@ -0,0 +1 @@ +{"version":7,"layer_metadata":{"000000067F00004005000060F300069883DB-000000067F00004005000060F300069D13FA__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300039A4000-000000067F00004005000060F300039C0000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300039FC000-000000067F00004005000060F30003A0F066__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000082C0F1-000000067F000040050081DB43000086E169__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000478000-000000067F00004005000060F3000047C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000012C000-000000067F00004005000060F300001F0000__0000018624969468":{"file_size":134422528,"generation":7,"shard":"0008"},"000000067F00004005000060F700019E8000-000000067F00004005000060F700019EC000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300018E0FE6-000000067F00004005000060F3000193A10B__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016E85370000004000-030000000000000000000000000000000002__0000018613F0A050":{"file_size":14172160,"generation":3,"shard":"0008"},"000000067F00004005000060F300034847BD-000000067F00004005000060F300034BD86C__000000EBC9213D59-000000EFA7EAA9E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C80000-000000067F000040050081DB430000C84000__000000BDAFECFC00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100000CCBA0-000000067F00004005000060F20100000000__0000000D80565628":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CA4000-000000067F00004005016EA00C0000CE0000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB00013BC000-000000067F00004005000060FB0001400000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001240000-000000067F00004005016EA00C0001244000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004EC52E9-000000067F00004005000060F30004F1638A__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E10000-000000067F000040050081DB430000E14000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000007F0F-000000067F0000400500EB4A480000037E20__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004FE8000-000000067F00004005000060F3000502905D__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000072C000-000000067F000040050081DB430000768000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005E3B48F-000000067F00004005000060F30005EF454F__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"00
0000067F0000400500E3A2A100000B7E04-030000000000000000000000000000000002__000000E7C2F1B249-000000EBC9213D59":{"file_size":30146560,"generation":2,"shard":"0008"},"000000067F0000400501025D90000009029B-000000067F0000400501025D950100000000__0000011B688FEDC8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A10000-000000067F000040050081DB430000A14000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002F5105E-000000067F00004005000060F30002F9A0EB__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000187FE22-000000067F000040050081D80C0100000000__00000075E5D2A930":{"file_size":59138048,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001E8000-000000067F000040050081DB4300001EC000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000184C000-000000067F00004005000060FB000187FE22__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005A16504-000000067F00004005000060F30005A57691__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005C0000-000000067F00004005000060F100005C821A__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-000000067F00004005000060F00300000000__000001BCB572A4E0":{"file_size":2310144,"generation":17,"shard":"0008"},"000000067F00004005000060F30002214000-000000067F00004005000060F30002264247__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000110000-000000067F0000400500E3A2A10000114000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006864000-000000067F00004005000060F30006868000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000D0000-000000067F0000400500DBCED500000D4000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000274C000-000000067F00004005000060F30002790000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00009274AB-030000000000000000000000000000000002__000001935283F9B9-00000196C9018F59":{"file_size":60104704,"generation":11,"shard":"0008"},"000000067F0000400500C782E4000023D359-000000067F0000400500C782E400002A5E4B__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001780DB7-000000067F00004005000060F700017E1391__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004E4000-000000067F000040050081DB4300004F8000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018C0000-000000067F00004005016EA00C00018C4000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F300056DC000-000000067F00004005000060F300056E0000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001F14230-000000067F000040050081D80C0100000000__0000018613F0A050":{"file_size":59138048,"generation":3,"shard":"0008"},"000000067F00004005010F9F120000004000-030000000000000000000000000000000002__0000012E77D3BF00":{"file_size":105775104,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D80000-0000
00067F00004005000060F30002D84000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000122BBF-000000067F00004005000060F7000013B18E__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002B10000-000000067F00004005000060F30002B88FF2__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006320C60-000000067F00004005000060F30006349DA2__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000079E393-000000067F00004005016EA00C00009BF728__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500F67839000005C000-000000067F0000400500F67839000006AEF4__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D7F71A-030000000000000000000000000000000002__000001BA93C39481-000001BCB572A4E1":{"file_size":50880512,"generation":17,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572C481-000001BCB572C5D9":{"file_size":24576,"generation":20,"shard":"0008"},"000000067F00004005000060F70001570000-000000067F00004005000060F70001574000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000042C000-000000067F00004005000060F30000478000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572C5D9-000001BCB572DFF9":{"file_size":24576,"generation":22,"shard":"0008"},"000000067F00004005000060FB00015FCD31-030000000000000000000000000000000002__000000698F2C3A38":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30005C841ED-000000067F00004005000060F30005C95225__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001B4A119-000000067F00004005000060F30100000000__0000008196C976A1-0000008625CF2891":{"file_size":200990720,"generation":2,"shard":"0008"},"000000067F00004005000060F300019790A2-000000067F00004005000060F300019C2056__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001838000-000000067F00004005000060FB000183C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001C00FE1-000000067F00004005000060F30001C0A0A3__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300056E0000-000000067F00004005000060F300056E4000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000BBD532-000000067F00004005000060F80100000000__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":96477184,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F9B026-000000067F00004005000060F30100000000__00000047E31D98D1-0000004C49155071":{"file_size":173834240,"generation":2,"shard":"0008"},"000000067F000040050081DB430000500000-000000067F000040050081DB430000504000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004971675-000000067F00004005000060F300049B26A8__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003102107-000000067F00004005000060F300031130BC__000000DE2A8E4FC9-000000E1CD2FBBE9":{
"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300048A4000-000000067F00004005000060F30004900000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004B8000-000000067F00004005016EA00C00004BC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001A71688-000000067F00004005000060FB0001A8A1CD__0000007E3A9BFD29-0000008196C976A1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E60000-000000067F00004005000060F30000E64000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300023B0FF7-000000067F00004005000060F300024020ED__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003F8000-000000067F00004005016EA00C00003FC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004B2B250-000000067F00004005000060F30004B5431C__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000050000-000000067F00004005000060F700000885C5__000000044854EBD1-00000008B6B51879":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB000097168A-030000000000000000000000000000000002__00000028C365FBE1-0000002D2A8E0B81":{"file_size":120299520,"generation":2,"shard":"0008"},"000000067F00004005000060F3000625C000-000000067F00004005000060F30006270000__0000017171761D90":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001BA8000-000000067F00004005000060FB0001BC0B44__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003344134-000000067F00004005000060F3000336D193__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B10FFF-000000067F00004005000060F30006B22072__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E34000-000000067F00004005000060F30006E70000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000008238C-000000067F00004005000060F60100000000__00000139CF156B58":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000A30000-000000067F00004005000060F70100000000__0000009DF02C1241-000000A173C00489":{"file_size":269688832,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001CE16ED-000000067F000040050081D80C0100000000__0000008DDCD70B68":{"file_size":59138048,"generation":2,"shard":"0008"},"000000067F000040050081DB4300011B0000-000000067F000040050081DB4300011B4000__000000DBD29DC248":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000010C0D1-000000067F0000400500F3A25C000011E137__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000004000-000000067F00004005000060F70000029ED0__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F60000058F73-000000067F00004005000060F60100000000__000000E4D847F4E0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001C3F636-000000067F00004005016EA00C0001CC74D7__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000101089-000000067F0000400500EB4A480
00012798C__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300007A8000-000000067F000040050081DB4300007AC000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F1000010043F-000000067F00004005000060F20100000000__0000000D55A212C9-000000114A805939":{"file_size":182878208,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001EAC000-000000067F00004005000060FB0001F14230__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000616F6B2-000000067F00004005000060F300061B8705__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005C9E3C4-000000067F00004005000060F30005CCF3C5__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AA0000-000000067F00004005000060F70001AB05CB__0000015304A396B9-0000015670D6AFD9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F3000073C000-000000067F00004005000060F30000775A02__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003AE21D-000000067F000040050081DB43000045029C__0000008DBE2855F9-000000923719A971":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B04000-000000067F00004005000060F70001B18000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E74000-000000067F00004005000060F30000E78000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000182C000-000000067F00004005000060F700018871D6__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000DE8B45-000000067F00004005000060FB0000DF968A__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E78000-000000067F00004005000060F30000E7C000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000140C000-030000000000000000000000000000000002__000000603CA8F2F0":{"file_size":89522176,"generation":2,"shard":"0008"},"000000067F00004005000060FB00011CA1CD-000000067F00004005000060FB00011F2D11__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000144FB4E-000000067F00004005016EA00C00014B79E7__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F700015A195C-000000067F00004005000060F80100000000__0000012E77D3BF00":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FC0000-000000067F00004005000060F70000FC4000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000012798C-000000067F0000400500EB4A48000013F89B__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CE4000-000000067F00004005016EA00C0001D18000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30005FC519A-000000067F00004005000060F30005FE621A__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000370000-000000067F00004005016EA00C0000374000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0001
760000-000000067F00004005016EA00C0001764000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F100003A0000-000000067F00004005000060F100003B8214__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300006B0000-000000067F00004005000060F300006B4000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00004E1FF6-030000000000000000000000000000000002__000000174479FC18":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F3000502905D-000000067F00004005000060F300050321C0__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AB05CB-000000067F00004005000060F70001AB8B97__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000151F7C5-000000067F00004005016EA00C000158F667__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70000B9C000-000000067F00004005000060F80100000000__000000AFE87558B0":{"file_size":83533824,"generation":2,"shard":"0008"},"000000067F00004005000060F7000141882A-000000067F00004005000060F80100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000018F5CD-000000067F0000400500EB4A48000019F4DD__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000196C000-000000067F00004005000060F70001990000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300029C623C-000000067F00004005000060F30100000000__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":81313792,"generation":2,"shard":"0008"},"000000067F00004005000060F300027C0000-000000067F00004005000060F300027C4000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300000001487-000000067F0000400500FB3D300100000000__0000010FB1BE19B9-00000113456156F1":{"file_size":24428544,"generation":2,"shard":"0008"},"000000067F00004005000060F300056D8000-000000067F00004005000060F300056DC000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700003C0000-000000067F00004005000060F700003C4000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000664E3CA-000000067F00004005000060F30100000000__000001715E483C79-000001751A7D7589":{"file_size":288645120,"generation":2,"shard":"0008"},"000000067F000040050100D04D000004B5AD-000000067F000040050100D04D00000634BB__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500DBCED5000002C000-000000067F0000400500DBCED50000078000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000C20000-000000067F00004005016EA00C0000C24000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70001B30000-000000067F00004005000060F70001B34000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700009C035C-000000067F00004005000060F80100000000__0000009A1ABDE921-0000009DF02C1241":{"file_size":264159232,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B33945-000000067F00004005000060F30100000000__0000010FB1BE19B9-00000113456156F1":{"file_size":155344896,"generation":2,"shard":"0008"},"0
00000067F00004005016EA00C000079FCFA-000000067F00004005016EA00C00007C7B9C__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000218000-000000067F0000400500EB4A48000021C000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D1D0DC-000000067F00004005000060F30005D76250__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000149B774-000000067F00004005000060FB00014A42B8__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D0B155-000000067F00004005000060F30003D14206__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300020FC052-000000067F00004005000060F300021050B0__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002268000-000000067F00004005000060F300022B9050__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004FC000-000000067F000040050081DB430000500000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060A93B5-000000067F00004005000060F300060C2210__0000016834A3FC91-0000016B49A934C1":{"file_size":263479296,"generation":2,"shard":"0008"},"000000067F00004005000060F3000674C000-000000067F00004005000060F30006798000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300007F913A-030000000000000000000000000000000002__000000A5A3F27398":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000F4000-030000000000000000000000000000000002__000000E4D847F4E0":{"file_size":103907328,"generation":2,"shard":"0008"},"000000067F00004005000060F70001348000-000000067F00004005000060F70100000000__0000011B632CC319-0000011F1A40FA69":{"file_size":270753792,"generation":2,"shard":"0008"},"000000067F00004005000060F10000030000-000000067F00004005000060F20100000000__000000021DC73119-000000044854EBD1":{"file_size":267771904,"generation":2,"shard":"0008"},"000000067F000040050107B54701FFFFFFFF-000000067F000040050107B5470300000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006674000-000000067F00004005000060F30006690000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050107B54701FFFFFFFF-000000067F000040050107B5470300000000__0000011B632CC319-0000011F1A40FA69":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30000298000-000000067F00004005000060F3000029C000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F185D4-000000067F00004005000060F80100000000__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":249135104,"generation":2,"shard":"0008"},"000000067F00004005000060F300049CB712-000000067F00004005000060F30004A048A8__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700004B1E77-000000067F00004005000060F80100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B00000-000000067F00004005000060F30004B1111A__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D14000-000000067F00004005000060F30006D30000__0
00001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00002D77AE-030000000000000000000000000000000002__000001880F984A29-0000018C496B6DB1":{"file_size":81018880,"generation":11,"shard":"0008"},"000000067F00004005000060F300002D0000-000000067F00004005000060F30000370FD1__0000000D55A212C9-000000114A805939":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D790000028000-000000067F0000400500D69D79000002C000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002170000-000000067F00004005000060F30002174000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F59017-000000067F00004005000060F30000F91FFF__00000047E31D98D1-0000004C49155071":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000006A37A-000000067F00004005000060F60100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F6000002F012-000000067F00004005000060F60100000000__00000081AA3C40F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30005614000-000000067F00004005000060F30005688000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300036C8000-000000067F00004005000060F300036F91FE__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001ADF63C-030000000000000000000000000000000002__000001B3E1B95181-000001B6FFE46BC9":{"file_size":64421888,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000057D31-000000067F0000400500EB4A48000008FC41__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F58000-000000067F00004005016EA00C0000F5C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000908000-000000067F000040050081DB43000094A076__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000471200E-000000067F00004005000060F3000474302B__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300000403DA-030000000000000000000000000000000002__00000075E5D2A930":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F60000079C4E-000000067F00004005000060F60100000000__0000012E77D3BF00":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500F67839000003C000-000000067F0000400500F678390000058000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001C80000-000000067F00004005000060FB0001C84000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300047F5138-000000067F00004005000060F3000480620C__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B5C09E-000000067F00004005000060F30006BAD108__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001410F57-000000067F00004005000060F70001429534__00000122A7BB7B29-0000012694E36301":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006B4000-000000067F00004005016EA00C00006E0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700009605D8-000000067F000040
05000060F80100000000__000000923719A971-00000096262826C9":{"file_size":251338752,"generation":2,"shard":"0008"},"000000067F00004005000060F70000C8CD0C-000000067F00004005000060F80100000000__000000BAC0041E18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700012B8000-000000067F00004005000060F80100000000__00000113456156F1-00000117EDA82C11":{"file_size":265781248,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000049C000-000000067F00004005016EA00C00004A8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000C78000-000000067F00004005000060F70000C7C000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B4B0BB-000000067F00004005000060F30006B5C09E__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001844000-000000067F00004005000060FB0001848000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300067F0000-000000067F00004005000060F300067F4000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C80000-000000067F00004005000060F30004C84000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A4C000-000000067F00004005000060F30002A98000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002480000-000000067F00004005000060F30002484000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000306A02D-000000067F00004005000060F30100000000__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":191299584,"generation":2,"shard":"0008"},"000000067F00004005000060F70001510000-000000067F00004005000060F70001514000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005BDB15B-000000067F00004005000060F30005C841ED__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E98000-000000067F00004005000060FB0001E9C000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300057942F4-000000067F00004005000060F300057DD292__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005698000-000000067F00004005000060F3000569C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002983166-000000067F00004005000060F3000299C28F__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000C24000-000000067F00004005016EA00C0000CA0000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F300033D7D7C-000000067F00004005000060F30003458D42__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A1C000-000000067F000040050081DB430000A30379__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D93639-000000067F00004005000060F50100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029C195-000000067F00004005016EA00C000029C196__000001BA93C39481-000001BCB572A4E1":{"file_size":32768,"generation":17,"shard":"0008"},"000000067F00004005000060F30000A5F9BB-000000067F0000400500006
0F60100000000__000000321AA80270":{"file_size":81657856,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D84000-000000067F00004005000060F30002D93639__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D1C000-000000067F00004005000060F30005D70000__000001684518AF20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010C8000-000000067F000040050081DB4300010E2072__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000058AF5E-000000067F000040050081DB4300005BCFD7__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000034611E-000000067F00004005000060F80100000000__000000321AA80270":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300000C1095-000000067F00004005000060F60100000000__000000021DC73119-000000044854EBD1":{"file_size":220635136,"generation":2,"shard":"0008"},"000000067F00004005000060FB000183C000-000000067F00004005000060FB0001840000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C8729E-000000067F00004005000060F30006C98340__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005138000-000000067F00004005000060F3000513C000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300053E30C3-000000067F00004005000060F300053F40CC__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000002C000-000000067F000040050081DB4300000403DA__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004970000-000000067F00004005000060F30004974000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C08000-000000067F00004005000060F30003C0C000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000103AD12-000000067F00004005000060FB000104B856__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004AC000-000000067F00004005016EA00C00004B8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000DB7D33-000000067F00004005016EA00C0000E47BD2__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30001F30000-000000067F00004005000060F30001F34000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050109FFA2000000C000-030000000000000000000000000000000002__000001180B3FF408":{"file_size":70516736,"generation":2,"shard":"0008"},"000000067F00004005000060F700017405D4-000000067F00004005000060F70001758B92__000001398B56A519-0000013C9C0E3339":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300030B0000-000000067F00004005000060F300030C0FE5__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010660F501FFFFFFFF-000000067F00004005010660F50300000000__00000122A7BB7B29-0000012694E36301":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002168000-000000067F00004005000060F3000216C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F60000046A83-000000067F00
004005000060F60100000000__000000BAC0041E18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001368000-000000067F00004005000060FB000136C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000184000-000000067F00004005000060F80100000000__000000174479FC18":{"file_size":93143040,"generation":2,"shard":"0008"},"000000067F00004005000060FB00012A8000-000000067F00004005000060FB0100000000__00000057593D8169-0000005C01565329":{"file_size":273711104,"generation":2,"shard":"0008"},"000000067F00004005000060F700007B0000-000000067F00004005000060F700007D05C8__00000075CC373F31-00000079F2A2F311":{"file_size":268468224,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001680B45-000000067F00004005000060FB000169968A__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300050CC000-000000067F00004005000060F300050E8000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-000000067F00004005000060F00300000000__0000018613F0A050":{"file_size":2310144,"generation":3,"shard":"0008"},"000000067F00004005000060F70001B1C000-000000067F00004005000060F70001B30000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F50000-000000067F00004005000060F70000F705D6__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050109CD330100000000-000000067F000040050109FFA2000000C000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800001FC000-000000067F0000400500EB4A480000200000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000240B12A-000000067F00004005000060F300024440AE__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000008228D-000000067F00004005000060F60100000000__000000027AF9D7D0":{"file_size":24576,"generation":1,"shard":"0008"},"000000067F00004005016EA00C000042C000-000000067F00004005016EA00C0000478000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0000FF8000-000000067F00004005000060FB0001000B44__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000169968A-000000067F00004005000060FB00016D21CF__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005F821C-000000067F00004005000060F20100000000__000000636DE92159-000000663565F8C9":{"file_size":149954560,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D7C000-000000067F00004005016EA00C0001E03DD8__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F0000400500F678390000058000-000000067F0000400500F67839000005C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800003A7E20-000000067F0000400500EB4A4800003BFD31__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001228000-000000067F00004005016EA00C000122C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000F0C0E9-000000067F000040050081DB430000F4E15B__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000758000-0
00000067F00004005000060F80100000000__0000006DDB29D589-000000722F474369":{"file_size":264781824,"generation":2,"shard":"0008"},"000000067F00004005000060F300068640AF-000000067F00004005000060F3000686D0DE__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000047C000-000000067F00004005016EA00C0000498000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30006166575-000000067F00004005000060F3000616F6B2__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B18000-000000067F00004005000060F70001B1C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700016EC000-000000067F00004005000060F70001708000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005CCF3C5-000000067F00004005000060F30005D184F6__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002848000-000000067F00004005000060F3000285901B__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300039C0000-000000067F00004005000060F300039C4000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002464000-000000067F00004005000060F30002480000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00011D0000-000000067F00004005016EA00C00011D4000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003D44283-000000067F00004005000060F30003D952B0__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480100000000-000000067F0000400500EE16BC0000044000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000533205E-000000067F00004005000060F300053E30C3__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000009A255-000000067F00004005000060F60300000000__0000017CC2FD7288":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B00000-000000067F00004005000060F70001B04000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004958000-000000067F00004005000060F3000495C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000518000-000000067F00004005000060F80100000000__0000004C49155071-0000004F31878919":{"file_size":262373376,"generation":2,"shard":"0008"},"000000067F00004005000060F300064D8000-000000067F00004005000060F3000658113F__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FDA1F80000014000-000000067F0000400500FDA1F80000020D42__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000284000-000000067F00004005000060FB00002D4B6A__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000CDBB9C-000000067F00004005000060F80100000000__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":148865024,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001298000-000000067F00004005016EA00C000129C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F0000400500
0060FB0001DD8000-000000067F00004005000060FB0001DF0B43__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001220000-000000067F00004005000060F70001224000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002908000-000000067F00004005000060F30002920FA0__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F5C000-000000067F00004005016EA00C0000F90000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0001E03DD8-030000000000000000000000000000000002__000001BCB572A4E0":{"file_size":139264,"generation":17,"shard":"0008"},"000000067F00004005000060F30003998000-000000067F00004005000060F3000399C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00014E75C6-030000000000000000000000000000000002__000001A931C135B1-000001AC25760149":{"file_size":51486720,"generation":11,"shard":"0008"},"000000067F00004005010660F500000F44CB-000000067F00004005010660F70100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003FC000-000000067F00004005016EA00C0000400000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003810000-000000067F00004005000060F30003849093__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B00000-000000067F00004005000060F30006B10FFF__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001541688-000000067F00004005000060FB000154A1CD__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001098000-000000067F00004005000060FB000109C000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700011912D4-000000067F00004005000060F80100000000__00000104BD37F348":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A40000-000000067F00004005000060F30002A44000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001448000-000000067F00004005000060F300014B0F7B__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001009688-000000067F00004005000060FB000102A1CE__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001A4000-000000067F0000400500EE16BC00001E0000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B58B45-000000067F00004005000060FB0000B6168A__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000AC000-000000067F0000400500D69D7900000BDAF5__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000193A10B-000000067F00004005000060F30100000000__00000075CC373F31-00000079F2A2F311":{"file_size":198148096,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00005A0000-000000067F00004005016EA00C00005A4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700000E0000-000000067F00004005000060F80100000000__0000000D80565628":{"file_size":112009216,"generation":2,"shard":"0008"},"00000006
7F00004005000060F3000690F2FD-000000067F00004005000060F300069883DB__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300004C6B83-000000067F00004005000060F60100000000__000000174479FC18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001E18000-000000067F00004005000060F30001E50FF3__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300043B4000-000000067F00004005000060F300043B8000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100006C0000-000000067F00004005000060F20100000000__000000722F474369-00000075CC373F31":{"file_size":267665408,"generation":2,"shard":"0008"},"000000067F00004005000060F70000A78000-000000067F00004005000060F70000A7C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00011C1688-000000067F00004005000060FB00011CA1CD__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004E8000-000000067F00004005016EA00C00004EC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000257A6F-000000067F00004005016EA00C000029F90B__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001590000-000000067F00004005000060FB0001594000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000193189A-030000000000000000000000000000000002__000001B3F17FE4E0":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005000060F300027C4000-000000067F00004005000060F30002828000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000B40000-000000067F00004005016EA00C0000B44000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30006694000-000000067F00004005000060F300066F0000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015C8000-000000067F00004005000060FB00015CC000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B84000-000000067F00004005000060F30003B90000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006704000-000000067F00004005000060F30006748000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000793506-030000000000000000000000000000000002__0000002427BD8BD0":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30004F1638A-000000067F00004005000060F30100000000__000001440D3D0C69-0000014784964B91":{"file_size":93708288,"generation":2,"shard":"0008"},"000000067F00004005000060F80100000000-000000067F00004005000060FB0000014000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F70000180000-000000067F00004005000060F70000184000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A2693B-000000067F00004005000060F30004A7F98F__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C71F27-000000067F00004005000060F30002C9AFB8__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003
8075AF-000000067F00004005000060F30100000000__000000FF8B261599-000001048B25A8E9":{"file_size":49823744,"generation":2,"shard":"0008"},"000000067F0000400500DBCED50000028000-000000067F0000400500DBCED5000002C000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004188000-000000067F00004005000060F300041D9101__0000012694E36301-0000012A3F140591":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30006868000-000000067F00004005000060F50100000000__00000178C5D5D3A8":{"file_size":116645888,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A789A0-000000067F00004005000060F30003AB9907__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000368000-000000067F0000400500EB4A48000036FF11__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300047EC0CA-000000067F00004005000060F300047F5138__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AB8B97-000000067F00004005000060F70001AC115C__0000015304A396B9-0000015670D6AFD9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D61283-000000067F00004005000060F70000D8985C__000000C462B3C2A9-000000C824C09619":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300011D1111-000000067F00004005000060F3000122A1D5__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001967D34-000000067F00004005016EA00C000197FBD0__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500FA2AD3000004D85C-000000067F0000400500FB3D300100000000__0000010D77B487A0":{"file_size":31309824,"generation":2,"shard":"0008"},"000000067F000040050081DB4300005BCFD7-000000067F000040050081DB4300005D704F__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000004000-000000067F00004005000060F100000260F2__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F0000400500EE16BC00000F8000-000000067F0000400500EE16BC000014158C__000000F901689359-000000FCCD5238B1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000921E8A-000000067F00004005000060F60100000000__00000028C365FBE1-0000002D2A8E0B81":{"file_size":228564992,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001190000-000000067F00004005000060FB0001198B44__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300067A0000-000000067F00004005000060F300067A4000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000200000-000000067F00004005000060F10000204000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FF0FBB-000000067F00004005000060F3000407201D__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000001C000-000000067F00004005000060F3000008228D__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0001CD7376-030000000000000000000000000000000002__000001B6FFE46BC9-000001BA93C39481":{"file_size":70238208,"generation":11,"shard":"0008"},"000000067F00004005000060FB0000EBC000-000000067F00004005000060FB0000EC8000__
00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000293210E-000000067F00004005000060F30002983166__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000151F271-000000067F00004005000060F30100000000__000000636DE92159-000000663565F8C9":{"file_size":41271296,"generation":2,"shard":"0008"},"000000067F00004005000060F30004880000-000000067F00004005000060F30004884000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000518222-000000067F00004005000060F20100000000__0000005413AB3641-00000057593D8169":{"file_size":169492480,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003E0000-000000067F00004005016EA00C00003E4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30000775A02-000000067F00004005000060F60100000000__0000002427BD8BD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000197FBD0-000000067F00004005016EA00C00019C7A6A__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000067114B-000000067F00004005000060F60100000000__0000001B59EEB909-0000001FFBC01501":{"file_size":232669184,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001408000-000000067F00004005000060FB000140C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800001F8000-000000067F0000400500EB4A4800001FC000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000290000-000000067F0000400500EB4A480000294000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003061089-000000067F00004005000060F3000306A02D__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CE4000-000000067F00004005000060F30001CF0197__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000E20000-000000067F00004005000060F70000E24000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001D0000-000000067F000040050081DB4300001D4000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D184F6-000000067F00004005000060F30100000000__0000016143292911-00000164DEE06671":{"file_size":200163328,"generation":2,"shard":"0008"},"000000067F00004005000060F300066F4000-000000067F00004005000060F30006700000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A38000-000000067F000040050081DB430000A4A074__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F38000-000000067F00004005000060F30000F59017__00000047E31D98D1-0000004C49155071":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C0C000-000000067F00004005000060FB0000C18000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D34000-000000067F00004005000060F30006D60000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005010660F501FFFFFFFF-000000067F00004005010660F50300000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700013E85D1-000000067F00004005000060F7
0001410BBC__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000538B44-000000067F00004005000060FB0000551689__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001410000-000000067F00004005000060F70001414000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300032F1113-000000067F00004005000060F3000330A1C8__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004974000-000000067F00004005000060F3000498DC49__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000625EB45-000000067F00004005000060F30006277C61__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700019E8E81-000000067F00004005000060F80100000000__0000014EC58A4A79-0000015304A396B9":{"file_size":246792192,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB5730259-000001BCB5732691":{"file_size":24576,"generation":187,"shard":"0008"},"000000067F000040050081DB4300001CC000-000000067F000040050081DB4300001D0000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C00000-000000067F00004005000060F30002C18FAE__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FC4000-000000067F00004005000060F70000FCD85E__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000107C39B-030000000000000000000000000000000002__0000004C49155071-0000004F31878919":{"file_size":133349376,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F90000-000000067F00004005016EA00C0000F94000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000F98000-000000067F00004005016EA00C0000F9C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700019EC000-000000067F00004005000060F80100000000__0000014EDD256548":{"file_size":7421952,"generation":2,"shard":"0008"},"000000067F00004005000060F300069FA3F6-000000067F00004005000060F30006A0B44C__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003AC000-000000067F000040050081DB4300003B27DA__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005A57691-000000067F00004005000060F30005B00697__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300060CB2C8-000000067F00004005000060F300060D4415__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000495C000-000000067F00004005000060F30004970000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000D1C5F-000000067F0000400500D69D7900000F1B5B__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001358000-030000000000000000000000000000000002__000001A95031E5B8":{"file_size":21110784,"generation":11,"shard":"0008"},"000000067F00004005000060F3000430C000-000000067F00004005000060F30004370000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00
004005000060F30004904000-000000067F00004005000060F30004958000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000008000-000000067F00004005000060F30000378000__00000186146441F1-0000018624969469":{"file_size":33357824,"generation":6,"shard":"0008"},"000000067F00004005000060F700005C0000-000000067F00004005000060F700005C85CE__00000057593D8169-0000005C01565329":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000B04000-000000067F00004005016EA00C0000B40000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002920FA0-000000067F00004005000060F3000293210E__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002058000-000000067F00004005000060F30002070F71__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000686D0DE-000000067F00004005000060F3000689E295__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FA2AD30000004000-000000067F0000400500FA2AD30000030000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00009BF728-000000067F00004005016EA00C0000A575C7__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30004374000-000000067F00004005000060F300043B0000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300051F0000-000000067F00004005000060F300051F4000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006B22072-000000067F00004005000060F30006B4B0BB__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000328FA4E-000000067F00004005000060F50100000000__000000E4D847F4E0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000000FEA0-000000067F00004005016EA00C000001FD3E__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A48000019F4DD-030000000000000000000000000000000002__000000F6661C9241-000000F901689359":{"file_size":59498496,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003EC000-000000067F00004005016EA00C00003F8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000073C000-000000067F00004005016EA00C000074F43B__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003542BFF-000000067F00004005000060F50100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001771169-000000067F00004005000060F80100000000__000001398B56A519-0000013C9C0E3339":{"file_size":263454720,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003B27DA-030000000000000000000000000000000002__0000008DDCD70B68":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F3000542AFB0-000000067F00004005000060F30005474062__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000057C94F-000000067F00004005000060F80100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300055861F2-000000067F00004005000060F30100000000__0000015304A396B9-0000015670D6AFD9":{"file_size":127393792,"ge
neration":2,"shard":"0008"},"000000067F00004005000060F30001D79136-000000067F00004005000060F30100000000__0000008DBE2855F9-000000923719A971":{"file_size":227958784,"generation":2,"shard":"0008"},"000000067F00004005000060F10000218000-000000067F00004005000060F1000021C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CD4000-000000067F00004005016EA00C0001CE0000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F300017EC000-000000067F00004005000060F30001886B2A__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001188000-000000067F00004005000060F300011D1111__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000ECC000-000000067F00004005000060FB0000F050F2__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300018C0000-000000067F00004005000060F300018E0FE6__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006E4000-000000067F00004005016EA00C0000738000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002790000-000000067F00004005000060F30002794000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00001B850B-000000067F0000400500F56D510100000000__0000011B688FEDC8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F100001F8000-000000067F00004005000060F100001FC000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000810000-000000067F00004005000060F80100000000__00000079F2A2F311-0000007E3A9BFD29":{"file_size":263454720,"generation":2,"shard":"0008"},"000000067F00004005000060F100006CBF87-000000067F00004005000060F20100000000__000000A5A3F27398":{"file_size":15851520,"generation":2,"shard":"0008"},"000000067F0000400500F7D2DD0100000000-000000067F0000400500F8E3A50000014000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700010AABC7-000000067F00004005000060F80100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B80000-000000067F00004005000060F30003B84000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000078000-000000067F000040050081DB4300000AA080__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002618000-000000067F00004005000060F30002680F9D__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A48000-000000067F00004005000060F30002A4C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001994000-000000067F00004005000060F700019E8000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B6168A-000000067F00004005000060FB0000B6A1D0__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000147A0EC-000000067F00004005000060FB000148AC30__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000060000-000000067F0000400500EE16BC0000064000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0
008"},"000000067F00004005000060F30003458D42-000000067F00004005000060F30003481DDB__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E30000-000000067F00004005000060F30006E34000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700017F8000-000000067F00004005000060F700017FC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C50000-000000067F00004005000060F30004C54000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001720000-000000067F00004005000060F80100000000__00000139CF156B58":{"file_size":63463424,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A8E15E-000000067F000040050081DB430000A98000__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":265404416,"generation":2,"shard":"0008"},"000000067F00004005000060F30004BAE526-000000067F00004005000060F30004BE7584__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001ADF97B-000000067F00004005016EA00C0001B0FD2A__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F60000014000-000000067F00004005000060F60100000000__0000003D2AB09B68":{"file_size":83329024,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C1C000-000000067F00004005000060FB0000C70000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005240000-000000067F00004005000060F30005244000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000077C000-000000067F000040050081DB430000790000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D60000-000000067F00004005000060F30006D64000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C54000-000000067F00004005000060F30004C60000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000000000000000001-000000067F0000400500000A690000000002__00000186146441F1-0000018624969469":{"file_size":57344,"generation":6,"shard":"0008"},"000000067F00004005000060F30005688000-000000067F00004005000060F3000568C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004370000-000000067F00004005000060F30004374000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300051F4000-000000067F00004005000060F30005210000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DD8000-000000067F00004005000060F30004DDC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E400001AFD31-000000067F0000400500C782E400001B7C41__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000BB103B-000000067F00004005000060F60000014C3A__0000003579F03331-0000003959DA2DE9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500D19D030100000000-000000067F0000400500D69D790000024000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000028B253-030000000000000000000000000000000002__0000008196C976A1-0000008625CF2891":{"file_size":151224320,"generation":2,"shard":"0008"},"000000067F000
04005000060F30004DD8000-000000067F00004005000060F30004E40FFC__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010F44EB0100000000-000000067F00004005010F57CB000000C000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003BCC000-000000067F00004005000060F30003C08000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005B80000-000000067F00004005000060F30005B89170__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000135FCAD-000000067F00004005016EA00C000144FB4E__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005010660F500000B0000-000000067F00004005010660F500000B4000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000D31030-000000067F00004005000060F30100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":233791488,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C18FAE-000000067F00004005000060F30002C71F27__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000041FB53-000000067F0000400500EB4A480000447A64__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000048000-000000067F0000400500EE16BC000004C000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00009D0000-000000067F00004005000060FB00009D4000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100004365FE-000000067F00004005000060F20100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006BAD108-000000067F00004005000060F30006C0E146__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300006B4000-000000067F00004005000060F300006E0000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000327C000-000000067F00004005000060F3000328FA4E__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B94000-000000067F00004005000060F30003BC8000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003CB8FCF-000000067F00004005000060F30003CCA0B9__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003EA902F-000000067F00004005000060F30003F72201__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C64000-000000067F00004005000060F30004C80000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000194000-000000067F000040050081DB4300001C8000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB01FFFFFFFF-000000067F00004005000060FB0300000000__0000018613A0DEA9-00000186146441F1":{"file_size":73728,"generation":5,"shard":"0008"},"000000067F00004005000060F300038B5F5B-000000067F00004005000060F300038FF04F__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001C8000-000000067F000040050081DB4300001CC000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"
shard":"0008"},"000000067F0000400500C782E40000137F10-000000067F0000400500C782E40000177E20__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000139C000-000000067F00004005000060FB00013B8000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000447A64-000000067F0000400500EB4A480100000000__000000FCCD5238B1-000000FF8B261599":{"file_size":40550400,"generation":2,"shard":"0008"},"000000067F00004005000060F70000418000-000000067F00004005000060F700004405CF__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB430000728000-000000067F000040050081DB43000072C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014B0F7B-000000067F00004005000060F30100000000__000000601F43CF09-000000636DE92159":{"file_size":83951616,"generation":2,"shard":"0008"},"000000067F00004005000060F30005F3303F-000000067F00004005000060F30005FA40AD__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300012442A9-000000067F00004005000060F3000129D29A__00000057593D8169-0000005C01565329":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010B14AB-000000067F000040050081DB430100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00014CF88D-000000067F00004005016EA00C00014D7727__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006A0B44C-000000067F00004005000060F30006A7C566__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000062EE46-000000067F00004005000060F20100000000__000000698F2C3A38":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CE0000-000000067F00004005016EA00C0001CE4000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30000250000-000000067F00004005000060F30000254000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300050E8000-000000067F00004005000060F300050EC000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000259F4A3-000000067F00004005000060F30100000000__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":44433408,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A640EA-000000067F000040050081DB430000A8E15E__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003050000-000000067F00004005000060F30003061089__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C0000158000-000000067F0000400500F3A25C000016A065__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010A4000-000000067F000040050081DB4300010B14AB__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001E0000-000000067F0000400500EE16BC00001E4000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300055B8000-000000067F00004005000060F300055BC000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CE4000-000000067F00004005016EA00C0000D30000__0000019E7001E
460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003640000-000000067F00004005000060F30003644000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000014F7AC-000000067F0000400500EB4A4800001876BD__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CD338E-000000067F00004005016EA00C0001CE79E0__000001BA93C39481-000001BCB572A4E1":{"file_size":268451840,"generation":17,"shard":"0008"},"000000067F00004005000060FB0001530B44-000000067F00004005000060FB0001541688__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300031D516C-000000067F00004005000060F30100000000__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":137863168,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00019C7A6A-000000067F00004005016EA00C00019F7907__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000E7F7A7-000000067F00004005016EA00C0000F3F647__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300032C0000-000000067F00004005000060F300032F1113__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006E0000-000000067F00004005016EA00C00006E4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F7000019EA78-000000067F00004005000060F80100000000__0000001737D88379-0000001B59EEB909":{"file_size":50946048,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001B4FBC9-000000067F00004005016EA00C0001BBFA66__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001660000-000000067F00004005000060FB0001680B45__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002BAA1DD-000000067F00004005000060F30100000000__000000C462B3C2A9-000000C824C09619":{"file_size":203554816,"generation":2,"shard":"0008"},"000000067F00004005000060F300049B26A8-000000067F00004005000060F300049CB712__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000CCB5CD-000000067F00004005000060F70000CDBB9C__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000EEA075-000000067F000040050081DB430000F0C0E9__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300003E0000-000000067F00004005000060F300003E8FBC__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C9C000-000000067F00004005000060F30006CA0000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000C7C000-000000067F00004005000060F70000C8CD0C__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001148000-000000067F00004005000060FB000114C000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001232ACF-000000067F00004005000060F80100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FE8000-000000067F00004005000060F700010105DB__000000E4C63CFA21-000000E7C2F1B249":{"file_size":26846003
2,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000355928-000000067F0000400500EB4A480100000000__000000FCD84FE628":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700003FE341-000000067F00004005000060F80100000000__0000003D2AB09B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000244D189-000000067F00004005000060F30100000000__000000A9EB8C4489-000000ACA44C8E99":{"file_size":212566016,"generation":2,"shard":"0008"},"000000067F00004005000060F700003B85C7-000000067F00004005000060F80100000000__0000003579F03331-0000003959DA2DE9":{"file_size":208945152,"generation":2,"shard":"0008"},"000000067F00004005000060F100005A2B80-000000067F00004005000060F20100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB000070C000-000000067F00004005000060FB0000718000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB01FFFFFFFF-000000067F00004005000060FB0300000000__00000186146441F1-0000018624969469":{"file_size":24576,"generation":6,"shard":"0008"},"000000067F00004005000060FB000180C000-000000067F00004005000060FB0001838000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000044000-000000067F0000400500EE16BC0000048000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10100000000-000000067F00004005000060F10300000000__000000A583FBFB91-000000A9EB8C4489":{"file_size":483328,"generation":2,"shard":"0008"},"000000067F00004005000060F30004EA41A5-000000067F00004005000060F30004EC52E9__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003AB9907-000000067F00004005000060F30003AF28CB__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000974000-000000067F00004005000060FB00009D0000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038720A2-000000067F00004005000060F300038A3082__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000452BA1-000000067F000040050081DB4300004C4C1E__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300017AA0CE-000000067F00004005000060F30100000000__0000006DDB29D589-000000722F474369":{"file_size":202719232,"generation":2,"shard":"0008"},"000000067F000040050081DB430000504000-000000067F000040050081DB430000560000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B5431C-000000067F00004005000060F30004B654F6__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000C20000-000000067F00004005000060F30000C24000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300028920E4-000000067F00004005000060F30100000000__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":200351744,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004C4C1E-030000000000000000000000000000000002__000000923719A971-00000096262826C9":{"file_size":192356352,"generation":2,"shard":"0008"},"000000067F000040050081DB430000190000-000000067F000040050081DB430000194000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E88000-000000067F000040050081DB430000E8C000__00000
0C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000738000-000000067F00004005016EA00C000073C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000578EE6-000000067F000040050081DB43000058AF5E__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001C38000-000000067F00004005000060F30001C3C000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000B7C0EA-030000000000000000000000000000000002__000000B2B5C4E8F9-000000B768469051":{"file_size":133464064,"generation":2,"shard":"0008"},"000000067F00004005000060F3000625B8F0-000000067F00004005000060F30100000000__0000016B49A934C1-0000016E1FBB7B99":{"file_size":139640832,"generation":2,"shard":"0008"},"000000067F00004005000060FB000109C000-000000067F00004005000060FB0001110000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572DFF9-000001BCB5730259":{"file_size":24576,"generation":41,"shard":"0008"},"000000067F00004005000060FB0000AA8000-000000067F00004005000060FB0000AD0B45__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300043F8000-000000067F00004005000060F300043FC000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800003C7C42-000000067F0000400500EB4A48000041FB53__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005BA213F-000000067F00004005000060F30005BDB15B__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300063FE10E-000000067F00004005000060F30100000000__0000016E1FBB7B99-000001715E483C79":{"file_size":111067136,"generation":2,"shard":"0008"},"000000067F00004005000060F30000F91FFF-000000067F00004005000060F30000F9B026__00000047E31D98D1-0000004C49155071":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003650000-000000067F00004005000060F30003654000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300050A412B-000000067F00004005000060F300050B5199__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D78000-000000067F00004005016EA00C0001D7C000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005016EA00C0001244000-000000067F00004005016EA00C0001298000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F100001FC000-000000067F00004005000060F10000200000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CA0000-000000067F00004005016EA00C0000CA4000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F3000498DC49-000000067F00004005000060F50100000000__00000139CF156B58":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F60000036EA0-000000067F00004005000060F60100000000__0000009A24DF6768":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000928B45-000000067F00004005000060FB000097168A__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006854000-000000067F0000
4005000060F30006858000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050109FFA2000000C3F5-030000000000000000000000000000000002__00000117EDA82C11-0000011B632CC319":{"file_size":226066432,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A6D1B3-000000067F00004005000060F30100000000__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":117620736,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D2C000-000000067F00004005000060F30002D80000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A31FB6-000000067F00004005000060F30003A3B020__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000160723E-000000067F00004005016EA00C00016570D9__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500FB3D310000018000-000000067F0000400500FB3D31000001C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001708000-000000067F00004005000060F7000170C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000283C3E7-000000067F00004005000060F50100000000__000000BAC0041E18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00018F0000-000000067F00004005000060FB0100000000__00000075CC373F31-00000079F2A2F311":{"file_size":268959744,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EC8000-000000067F00004005000060FB0000ECC000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F9C000-000000067F00004005016EA00C0000FF0000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002680F9D-000000067F00004005000060F3000274A080__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000679C000-000000067F00004005000060F300067A0000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000428313F-000000067F00004005000060F300042CC1BD__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00000FFFFFFFF-030000000000000000000000000000000002__00000186146441F1-0000018624969469":{"file_size":24576,"generation":6,"shard":"0008"},"000000067F00004005000060FB00017D8000-000000067F00004005000060FB00017DC000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700017FC000-000000067F00004005000060F70001828000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002FD317C-000000067F00004005000060F30002FF427D__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001701588-000000067F00004005000060FB00017120CE__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500000A3000FFFFFFFF-000000067F0000400500000A690000000002__000001BA93C39481-000001BCB572A4E1":{"file_size":40960,"generation":17,"shard":"0008"},"000000067F00004005000060FB0000638B45-030000000000000000000000000000000002__0000001B59EEB909-0000001FFBC01501":{"file_size":252010496,"generation":2,"shard":"0008"},"000000067F000040050081DB430000394000-000000067F000040050081DB4300003A8000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"0000000
67F00004005000060F30001CF0197-000000067F00004005000060F50100000000__0000008DDCD70B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800000DFB51-000000067F0000400500EB4A4800000E7A62__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000014C000-000000067F00004005000060F70000180000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005948000-000000067F00004005000060F300059790CD__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000853115-000000067F00004005000060F60100000000__00000023FEF9F321-00000028C365FBE1":{"file_size":176136192,"generation":2,"shard":"0008"},"000000067F00004005000060F30004884000-000000067F00004005000060F30004888000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000513C000-000000067F00004005000060F30005160000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000017C000-000000067F0000400500F3A25C00001B850B__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006382F14-000000067F00004005000060F3000638C06D__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000017F02-000000067F0000400500E3A2A100000B7E04__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001000B44-000000067F00004005000060FB0001009688__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D790100000000-000000067F0000400500DBCED50000024000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010A0000-000000067F000040050081DB4300010A4000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000310000-000000067F00004005000060FB0000348B45__0000000D55A212C9-000000114A805939":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F60000060038-000000067F00004005000060F60100000000__000000F91FE84F08":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CE0000-000000067F00004005000060F30001CE4000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300000AA080-000000067F000040050081DB4300000D40FF__00000075CC373F31-00000079F2A2F311":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000551689-030000000000000000000000000000000002__0000001737D88379-0000001B59EEB909":{"file_size":227418112,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000D90000-000000067F00004005000060FB0100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":272769024,"generation":2,"shard":"0008"},"000000067F00004005000060F300059CC403-000000067F00004005000060F300059F53C6__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F2C000-000000067F00004005000060F30001F30000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000014000-000000067F00004005000060FB0000084772__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F30004B654F6-000000067F00004005000060F30004BAE526__0000013C9C0E3339-0000013FEFA7D709":{"file_size":2684
51840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002450000-000000067F00004005000060F30002454000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A0F066-000000067F00004005000060F50100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F60000032EBE-000000067F00004005000060F60100000000__0000008DDCD70B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00001D8000-000000067F00004005000060FB00001DC000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000670000-000000067F00004005016EA00C0000674000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0001344000-000000067F00004005016EA00C0001358000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000D30000-000000067F00004005016EA00C0000D34000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000012FE9A-000000067F00004005016EA00C00001F7D38__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70000BF0000-000000067F00004005000060F70100000000__000000B2B5C4E8F9-000000B768469051":{"file_size":273809408,"generation":2,"shard":"0008"},"000000067F00004005000060F300005A0000-000000067F00004005000060F3000067114B__0000001B59EEB909-0000001FFBC01501":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000021C000-000000067F0000400500EB4A480000290000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F3C000-000000067F00004005016EA00C0000F58000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000074F43B-030000000000000000000000000000000002__000001936E73D028":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005010F57CB000000C000-000000067F00004005010F99A50100000000__00000126C3C69FC0":{"file_size":22978560,"generation":2,"shard":"0008"},"000000067F00004005000060F700017E1391-000000067F00004005000060F80100000000__0000013C9C0E3339-0000013FEFA7D709":{"file_size":232677376,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CC74D7-000000067F00004005016EA00C0001CD7376__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F700005C85CE-000000067F00004005000060F700005E8B9D__00000057593D8169-0000005C01565329":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FCD352-000000067F00004005000060F30100000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":124788736,"generation":2,"shard":"0008"},"000000067F0000400500C782E400002A5E4B-000000067F0000400500C782E400002CDD5C__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700018871D6-000000067F00004005000060F80100000000__000001444EB7FC10":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D252C8-000000067F00004005000060F30100000000__00000117EDA82C11-0000011B632CC319":{"file_size":205963264,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001408A62-000000067F00004005000060FB00014195A7__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E400001B7C41-000000067F0000400500C782E400001C7B51__000000D01F399709-000000D3
1E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000110000-000000067F00004005000060FB0100000000__000000044854EBD1-00000008B6B51879":{"file_size":272613376,"generation":2,"shard":"0008"},"000000067F00004005000060F300004E8000-000000067F00004005000060F60100000000__0000001737D88379-0000001B59EEB909":{"file_size":260579328,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DF4000-000000067F00004005000060F30006E30000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C84000-030000000000000000000000000000000002__000000BAC0041E18":{"file_size":59998208,"generation":2,"shard":"0008"},"000000067F00004005000060F30002B88FF2-000000067F00004005000060F30002BAA1DD__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000434000-000000067F00004005000060FB00004A0000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DA8000-000000067F00004005000060F30004DAC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004E0000-000000067F000040050081DB4300004E4000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001E4000-000000067F0000400500EE16BC0000201716__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C440EA-000000067F000040050081DB430000C5E15B__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000BDAF5-000000067F0000400500D69D790100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A9C000-000000067F00004005000060F30002AEED02__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DAC000-000000067F00004005000060F30004DD8000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B94000-000000067F00004005000060F70000B98000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002454000-000000067F00004005000060F30002460000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100001059CB-000000067F00004005000060F10000125BF2__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000D362CA-000000067F00004005016EA00C0000DB7D33__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30001C0A0A3-000000067F00004005000060F30100000000__0000008625CF2891-00000089F4693119":{"file_size":203063296,"generation":2,"shard":"0008"},"000000067F00004005000060F300066F0000-000000067F00004005000060F300066F4000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001414000-000000067F00004005000060F70001428000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014CC16D-000000067F00004005000060F300014D5280__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000172AC12-030000000000000000000000000000000002__0000006DDB29D589-000000722F474369":{"file_size":186875904,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E4C000-000000067F000040050081DB430000E88000__000000C483
D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300063A50CD-000000067F00004005000060F300063FE10E__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005419E9C-000000067F00004005000060F3000542AFB0__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC000014158C-030000000000000000000000000000000002__000000F901689359-000000FCCD5238B1":{"file_size":67854336,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00015FF3A0-000000067F00004005016EA00C000160723E__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00008E760F-000000067F00004005016EA00C00009274AB__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70000B98000-000000067F00004005000060F70000B9C000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00004A4000-000000067F00004005000060FB00004E1FF6__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006670000-000000067F00004005000060F30006674000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000185EE9-000000067F00004005000060F7000018E4B6__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D030000067CA9-030000000000000000000000000000000002__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":29319168,"generation":2,"shard":"0008"},"000000067F0000400500FF2A51000000BFFB-030000000000000000000000000000000002__0000010D77B487A0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A048A8-000000067F00004005000060F30004A1D870__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300004BC000-000000067F00004005000060F300004C6B83__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005290FC9-000000067F00004005000060F3000533205E__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300031130BC-000000067F00004005000060F300031C40D1__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D030000047EE2-000000067F0000400500D19D03000004FDC6__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A44000-000000067F00004005000060F30002A48000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003DAE2DC-000000067F00004005000060F30003DD734C__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F8E3A50000014000-000000067F0000400500F8E3A5000004A25C__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100002F03E9-000000067F00004005000060F20100000000__000000321AA80270":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001138000-000000067F00004005000060F80100000000__000000FCCD5238B1-000000FF8B261599":{"file_size":72695808,"generation":2,"shard":"0008"},"000000067F00004005000060F300056E4000-000000067F00004005000060F50100000000__00000159B010F6C0":{"file_size":13393920,"generation":2,"shard":"0008"},"000000067F00004005000060
F70000A7C000-000000067F00004005000060F70000ABD9C4__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000CC6E51-030000000000000000000000000000000002__0000003D2AB09B68":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F60000091EFF-000000067F00004005000060F60100000000__0000014EDD256548":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000008FC41-000000067F0000400500EB4A4800000DFB51__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F363B4-000000067F00004005000060F30001F574A6__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001CD0000-000000067F00004005016EA00C0001CD4000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F300059B324D-000000067F00004005000060F300059CC403__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002530000-000000067F00004005000060F30002534000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000004B633-000000067F00004005000060F60100000000__000000C483D0D6B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700011E0000-000000067F00004005000060F80100000000__0000010779A7F551-0000010A5E65DF39":{"file_size":262922240,"generation":2,"shard":"0008"},"000000067F00004005000060F30006690000-000000067F00004005000060F30006694000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000100E18-000000067F00004005000060F700001213F2__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500FF2A510000004000-000000067F0000400500FF2A51000000BFFB__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EB8000-000000067F00004005000060FB0000EBC000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000674000-000000067F00004005016EA00C00006B0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000EF85D6-000000067F00004005000060F80100000000__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":262897664,"generation":2,"shard":"0008"},"000000067F00004005000060F700005E8B9D-000000067F00004005000060F700005F9158__00000057593D8169-0000005C01565329":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30004E40FFC-000000067F00004005000060F30004E7A062__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000037E20-000000067F0000400500EB4A480000057D31__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400501101C0901FFFFFFFF-030000000000000000000000000000000002__0000012E71CF31F9-000001334140FC21":{"file_size":65060864,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B10000-000000067F00004005000060F70100000000__000000A583FBFB91-000000A9EB8C4489":{"file_size":272646144,"generation":2,"shard":"0008"},"000000067F00004005000060F300056E104B-000000067F00004005000060F3000570A19E__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300059790CD-000000067F00004005000060F300059AA115__0000015DD1D3C809-0000016143292911":{"file_size":268451
840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B54000-000000067F00004005000060F70000B90000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300041D9101-000000067F00004005000060F3000424A099__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700000E085E-000000067F00004005000060F70000100E18__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300051B0000-000000067F00004005000060F300051B4000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572A4E1-000001BCB572C329":{"file_size":24576,"generation":17,"shard":"0008"},"000000067F00004005000060F30006D30000-000000067F00004005000060F30006D34000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FDA1F80000020D42-000000067F0000400500FDA1F80100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081D80C0100000000-000000067F000040050081DB430000024000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F600000235B4-000000067F00004005000060F60100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500C782E400000A0000-000000067F0000400500C782E400000A4000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002264247-000000067F00004005000060F50100000000__000000A5A3F27398":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000302C2D6-000000067F00004005000060F50100000000__000000DBD29DC248":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000129C000-000000067F00004005016EA00C0001340000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700016E8000-000000067F00004005000060F700016EC000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300023A0000-000000067F00004005000060F300023B0FF7__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F20100000000-000000067F00004005000060F3000000C000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0000374000-000000067F00004005016EA00C00003E0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000368000-000000067F00004005000060F80100000000__0000003203FB5749-0000003579F03331":{"file_size":263249920,"generation":2,"shard":"0008"},"000000067F000040050081DB4300006310C9-030000000000000000000000000000000002__0000009A1ABDE921-0000009DF02C1241":{"file_size":208953344,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000DC8000-000000067F00004005000060FB0000DE8B45__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000530000-000000067F00004005000060FB0000538B44__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000024000-000000067F000040050081DB430000028000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000488C000-000000067F00004005000060F30004898000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008
"},"000000067F00004005000060F300044D3639-000000067F00004005000060F50100000000__0000012E77D3BF00":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005010450640000000570-000000067F0000400501046F39000000BDD2__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300021050B0-000000067F00004005000060F3000212E160__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700010DD440-000000067F00004005000060F80100000000__000000F309FCDD19-000000F6661C9241":{"file_size":91758592,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000AD0B45-000000067F00004005000060FB0000AE168A__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000013B18E-000000067F00004005000060F7000014B73D__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001938000-000000067F00004005016EA00C000193FE9D__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500C782E400000A4000-000000067F0000400500C782E4000012A71E__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001A40000-000000067F00004005000060F30001A44000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00008578D4-000000067F00004005016EA00C00008CF772__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30001CC0000-000000067F00004005000060F30001CC4000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004D20000-000000067F00004005000060F30004D24000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00003E8000-000000067F00004005016EA00C00003EC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F300039C4000-000000067F00004005000060F300039F8000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005164000-000000067F00004005000060F300051B0000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300039F8000-000000067F00004005000060F300039FC000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010F46BD-000000067F000040050081DB430100000000__000000D31E48D7C9-000000D74E29AAD1":{"file_size":113999872,"generation":2,"shard":"0008"},"000000067F00004005000060F30002E630CF-000000067F00004005000060F30100000000__000000D31E48D7C9-000000D74E29AAD1":{"file_size":171999232,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000ACF305-000000067F00004005016EA00C0000ADF1AB__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006748000-000000067F00004005000060F3000674C000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003810000-000000067F00004005000060F50100000000__00000104BD37F348":{"file_size":11739136,"generation":2,"shard":"0008"},"000000067F00004005000060F1000021C000-000000067F00004005000060F20100000000__0000002427BD8BD0":{"file_size":132448256,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017EC000-000000067F00004005016EA00C00018C0000__000001B3F17FE4E0":{"file_size":134422528,"generation
":11,"shard":"0008"},"000000067F00004005000060F7000025DA3C-000000067F00004005000060F80100000000__0000002427BD8BD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00007F0000-000000067F00004005000060FB0000860B45__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FF0000-000000067F00004005000060F30003FF4000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000E0AD15-000000067F00004005000060FB0000E1B859__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010ADFA80000004000-000000067F00004005010F2BD40100000000__00000126C3C69FC0":{"file_size":13369344,"generation":2,"shard":"0008"},"000000067F00004005000060F30004898000-000000067F00004005000060F3000489C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D2B1B0-000000067F00004005000060F30003D44283__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000FF4000-000000067F00004005016EA00C0001188000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005010F99A50100000000-000000067F00004005010F9F120000004000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F34000-000000067F00004005000060F30001F38F48__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700018A0000-000000067F00004005000060F700018D85CA__000001440D3D0C69-0000014784964B91":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300029A526C-000000067F00004005000060F300029C623C__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00017DC000-000000067F00004005000060FB0001808000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED50000024000-000000067F0000400500DBCED50000028000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000201716-000000067F0000400500EE16C40100000000__0000012A77C1B0B0":{"file_size":32768,"generation":2,"shard":"0008"},"000000067F00004005000060F30006D10000-000000067F00004005000060F30006D14000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430001064000-000000067F000040050081DB4300010A0000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C01FFFFFFFF-000000067F0000400500F3A25C0300000000__0000011B632CC319-0000011F1A40FA69":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001340000-000000067F00004005000060F30001344000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003E98000-000000067F00004005000060F30003EA902F__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C0E146-000000067F00004005000060F30006C8729E__0000017C9F5597E1-0000018022640391":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F600000166C4-000000067F00004005000060F60100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":54165504,"generation":2,"shard":"0008"},"000000067F00004005000060F10000180000-000000067F00004005000060F1000018821A__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"gen
eration":2,"shard":"0008"},"000000067F00004005016EA00C000193FE9D-000000067F00004005016EA00C0001967D34__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F000040050081DB43000076C000-000000067F000040050081DB430000778000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300050321C0-000000067F00004005000060F30005063187__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000D4000-000000067F0000400500DBCED500000F0000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300004B8000-000000067F00004005000060F300004BC000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000022C000-000000067F00004005000060FB0000280000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000DF968A-000000067F00004005000060FB0000E021D0__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000228000-000000067F00004005000060FB000022C000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015D8000-000000067F00004005000060FB00015DC000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005B89170-000000067F00004005000060F30005BA213F__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300043B0000-000000067F00004005000060F300043B4000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300004F8000-000000067F000040050081DB4300004FC000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006860000-000000067F00004005000060F30006864000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000ADA0D0-000000067F00004005000060F30000B0300C__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FF2A510000000000-000000067F000040050100D04D000004369C__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00000BB439-030000000000000000000000000000000002__00000104BD37F348":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001C078FA-000000067F00004005016EA00C0001C0F79A__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F000040050081DB430000B4A075-000000067F000040050081DB430000B7C0EA__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000117C10C-000000067F00004005000060F50100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000E47BD2-000000067F00004005016EA00C0000E67A6E__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30005D23BB5-000000067F00004005000060F50100000000__00000164EA9EC9A8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000336D193-000000067F00004005000060F3000337DCF3__000000E4C63CFA21-000000E7C2F1B249":{"file_size":259473408,"generation":2,"shard":"0008"},"000000067F00004005000060F300001F0000-000000067F00004005000060F300001F4000__0000000D80565628":{"file_size":134
422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000084772-030000000000000000000000000000000002__000000027AF9D7D0":{"file_size":147456,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0001CE79E0-000000067F00004005016EA00C0001D1F87B__000001BA93C39481-000001BCB572A4E1":{"file_size":268451840,"generation":17,"shard":"0008"},"000000067F0000400500EB4A4800FFFFFFFF-000000067F0000400500EB4A480100000000__000000FF8B261599-000001048B25A8E9":{"file_size":1318912,"generation":2,"shard":"0008"},"000000067F00004005000060F70000488000-000000067F00004005000060F7000048C000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000ADF1AB-000000067F00004005016EA00C0100000000__00000196C9018F59-0000019A2EAFE7A9":{"file_size":282132480,"generation":11,"shard":"0008"},"000000067F00004005000060FB000071C000-000000067F00004005000060FB0000793506__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006850000-000000067F00004005000060F30006854000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000390000-000000067F000040050081DB430000394000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000020C000-000000067F00004005000060F30000250000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001398000-000000067F00004005000060FB000139C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003648000-000000067F00004005000060F3000364C000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E400001C7B51-000000067F0000400500C782E4000023FA62__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001788000-000000067F00004005016EA00C000178C000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000C3A075-000000067F000040050081DB430000C440EA__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300036FE561-000000067F00004005000060F300038075AF__000000FF8B261599-000001048B25A8E9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D03000004FDC6-000000067F0000400500D19D030000067CA9__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C00000-000000067F00004005000060FB0000C04000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000282C000-000000067F00004005000060F3000283C3E7__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00006B0000-000000067F00004005016EA00C00006B4000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30001789027-000000067F00004005000060F300017AA0CE__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004558000-000000067F00004005000060F300045C1062__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C08000-000000067F00004005000060FB0000C0C000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DCC000-000000067F00004005000060F30006DF0000__000001848D082B20":{"file_size":13
4422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B221FE-000000067F00004005000060F30004B2B250__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018C4000-000000067F00004005016EA00C00018E0000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000564000-000000067F000040050081DB430000578000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000274A080-000000067F00004005000060F30100000000__000000B2B5C4E8F9-000000B768469051":{"file_size":199057408,"generation":2,"shard":"0008"},"000000067F00004005000060F300046D0EA8-000000067F00004005000060F3000471200E__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001114000-000000067F00004005000060FB0001120000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FEC000-000000067F00004005000060F30003FF0000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000368000-000000067F00004005000060F10100000000__0000003959DA2DE9-0000003D03FCCDB9":{"file_size":269967360,"generation":2,"shard":"0008"},"000000067F0000400500C782E4000012A71E-030000000000000000000000000000000002__000000D037B2DBD0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F30006C98000-000000067F00004005000060F30006C9C000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300055BC000-000000067F00004005000060F30005610000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000F050F2-030000000000000000000000000000000002__00000047F1F2B800":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30002484000-000000067F00004005000060F300024D8000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FE8000-000000067F00004005000060F30003FEC000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000A8000-000000067F0000400500DBCED500000AC000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700006C3D76-000000067F00004005000060F80100000000__000000663565F8C9-000000698AF6E809":{"file_size":139821056,"generation":2,"shard":"0008"},"000000067F00004005000060F30002534000-000000067F00004005000060F3000253B7A3__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000412D27C-000000067F00004005000060F30004156457__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000910000-000000067F00004005000060F700009385D4__0000008DBE2855F9-000000923719A971":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30002510000-000000067F00004005000060F30002514000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002210000-000000067F00004005000060F30002214000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FF4000-000000067F00004005000060F30004070000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001BBFA66-000000067F00004005016EA00C0001C078FA__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation"
:11,"shard":"0008"},"000000067F00004005000060F3000424A099-000000067F00004005000060F3000428313F__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300036F91FE-000000067F00004005000060F30100000000__000000FCCD5238B1-000000FF8B261599":{"file_size":164118528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000718000-000000067F00004005000060FB000071C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005010F44EB000000C000-000000067F00004005010F44EB0100000000__00000126C3C69FC0":{"file_size":70696960,"generation":2,"shard":"0008"},"000000067F00004005000060F30005214000-000000067F00004005000060F30005240000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000A7AF6E-030000000000000000000000000000000002__000000321AA80270":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30005063187-000000067F00004005000060F300050A412B__0000014784964B91-0000014B000D1821":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005E8000-000000067F00004005000060F100005F821C__000000636DE92159-000000663565F8C9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300020830BE-000000067F00004005000060F300020FC052__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300065BB235-000000067F00004005000060F300065F42B4__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FA2AD30000034000-000000067F0000400500FA2AD3000004D85C__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017A8000-000000067F00004005016EA00C00017AC000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB00008D8000-000000067F00004005000060FB0000928B45__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000798000-000000067F00004005000060F300007C1007__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D19D030000040000-000000067F0000400500D19D030000047EE2__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30001AB1583-000000067F00004005000060F50100000000__00000081AA3C40F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001AD8000-000000067F00004005000060F30001B09104__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000E1B859-030000000000000000000000000000000002__000000417D21ACF9-00000044B4679349":{"file_size":156844032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E9C000-000000067F00004005000060FB0001EA8000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001374000-000000067F00004005000060FB0001398000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000155C000-000000067F00004005000060FB0001590000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00000EA069-000000067F0000400500F3A25C000010C0D1__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000568C000-000000067F00004005000060F30005698000__000001
59B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C74000-000000067F00004005000060FB0000C98000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700004F0000-000000067F00004005000060F80100000000__00000047E31D98D1-0000004C49155071":{"file_size":264921088,"generation":2,"shard":"0008"},"000000067F00004005000060F30005598000-000000067F00004005000060F3000559C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001429534-000000067F00004005000060F80100000000__00000122A7BB7B29-0000012694E36301":{"file_size":231964672,"generation":2,"shard":"0008"},"000000067F00004005000060F70000780000-000000067F00004005000060F80100000000__000000722F474369-00000075CC373F31":{"file_size":263340032,"generation":2,"shard":"0008"},"000000067F00004005000060F300019F31AA-000000067F00004005000060F30100000000__00000079F2A2F311-0000007E3A9BFD29":{"file_size":168484864,"generation":2,"shard":"0008"},"000000067F000040050081DB430000822079-000000067F000040050081DB43000082C0F1__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300007AC000-000000067F000040050081DB4300007F913A__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005847319-000000067F00004005000060F300058C8000__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":261505024,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E21687-000000067F00004005000060FB0100000000__000000923719A971-00000096262826C9":{"file_size":224403456,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C98000-000000067F00004005000060F30003CB8FCF__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB43000045029C-030000000000000000000000000000000002__0000008DBE2855F9-000000923719A971":{"file_size":89505792,"generation":2,"shard":"0008"},"000000067F00004005000060F3000559C000-000000067F00004005000060F300055B8000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000285901B-000000067F00004005000060F300028920E4__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E64000-000000067F00004005000060F30000E70000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015FB022-000000067F00004005000060F3000160410C__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006FDA081-000000067F00004005000060F30100000000__00000184624E5741-000001860C80A151":{"file_size":202276864,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000107973-000000067F0000400500EE16BC0100000000__000000F309FCDD19-000000F6661C9241":{"file_size":275456000,"generation":2,"shard":"0008"},"000000067F00004005000060F300031C40D1-000000067F00004005000060F300031D516C__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00001F7D38-000000067F00004005016EA00C000020FBCF__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500FDA1F80100000000-000000067F0000400500FF2A510000004000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001182EC9-000000067F00004005000060F80100000000__000000FF8B261599-000001048B25A8E9":{"file_size":17
4284800,"generation":2,"shard":"0008"},"000000067F00004005000060F700011528FB-000000067F00004005000060F70001182EC9__000000FF8B261599-000001048B25A8E9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300024DC000-000000067F00004005000060F30002510000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00000B0000-030000000000000000000000000000000002__000000021DC73119-000000044854EBD1":{"file_size":259375104,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001DF0B43-000000067F00004005000060FB0001E21687__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000088000-000000067F00004005000060F10000090000__00000008B6B51879-0000000D55A212C9":{"file_size":264142848,"generation":2,"shard":"0008"},"000000067F00004005000060F30003968000-000000067F00004005000060F3000396C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017AC000-000000067F00004005016EA00C00017E8000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F1000019C73D-000000067F00004005000060F20100000000__0000001B59EEB909-0000001FFBC01501":{"file_size":124698624,"generation":2,"shard":"0008"},"000000067F00004005000060F700001F8000-000000067F00004005000060F700002005D2__0000001B59EEB909-0000001FFBC01501":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001110000-000000067F00004005000060FB0001114000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F1000019842A-000000067F00004005000060F20100000000__0000001737D88379-0000001B59EEB909":{"file_size":145137664,"generation":2,"shard":"0008"},"000000067F00004005000060F700003BC000-000000067F00004005000060F700003C0000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000280000-000000067F00004005000060FB0000284000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED5000007C000-000000067F0000400500DBCED500000A8000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB5732691-000001BCB5734CD9":{"file_size":24576,"generation":239,"shard":"0008"},"000000067F00004005010660F70100000000-000000067F000040050107B547000006C000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000C24000-000000067F00004005000060F30000CA0000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000569C000-000000067F00004005000060F300056D8000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00000C7A73-030000000000000000000000000000000002__0000018624969469-000001880F984A29":{"file_size":40566784,"generation":11,"shard":"0008"},"000000067F00004005000060F30001344000-000000067F00004005000060F30001358000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F38F48-000000067F00004005000060F50100000000__0000009A24DF6768":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001760000-000000067F00004005000060F30001789027__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000018821A-000000067F00004005000060F1000019842A__0000001737D88379-00
00001B59EEB909":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300059AA115-000000067F00004005000060F300059B324D__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001400000-000000067F00004005000060FB0001404000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800000E7A62-000000067F0000400500EB4A480000107973__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000498000-000000067F00004005000060F3000049C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D24000-000000067F00004005000060F70000D38000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000120E409-000000067F000040050081DB430300000000__0000018613F0A050":{"file_size":24576,"generation":3,"shard":"0008"},"000000067F00004005000060FB0001A8A1CD-000000067F00004005000060FB0100000000__0000007E3A9BFD29-0000008196C976A1":{"file_size":199622656,"generation":2,"shard":"0008"},"000000067F00004005000060F30006270000-000000067F00004005000060F50100000000__0000016E41E03CA0":{"file_size":71114752,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000BAAD15-030000000000000000000000000000000002__0000003579F03331-0000003959DA2DE9":{"file_size":182321152,"generation":2,"shard":"0008"},"000000067F00004005000060F700016205B5-000000067F00004005000060F80100000000__0000012E71CF31F9-000001334140FC21":{"file_size":266862592,"generation":2,"shard":"0008"},"000000067F00004005000060F300030C0FE5-000000067F00004005000060F30003102107__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004BC000-000000067F00004005016EA00C00004E8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F10000440000-000000067F00004005000060F1000046821B__00000047E31D98D1-0000004C49155071":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F000040050081DB4300009C8000-000000067F000040050081DB4300009CC000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000106C000-000000067F00004005000060F700010AABC7__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000367733F-000000067F00004005000060F50100000000__000000F91FE84F08":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000478000-000000067F00004005016EA00C000047C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002E4104A-000000067F00004005000060F30002E4A157__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001370000-000000067F00004005000060FB0001374000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004B1111A-000000067F00004005000060F30004B221FE__0000013C9C0E3339-0000013FEFA7D709":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029C000-000000067F00004005016EA00C00002D0000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30001C3C000-000000067F00004005000060F30001CC0000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000136C000-000000067F00004005000060FB0001370000__000000
603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000488000-000000067F00004005000060F10100000000__0000004C49155071-0000004F31878919":{"file_size":268754944,"generation":2,"shard":"0008"},"000000067F00004005000060F30000B0300C-000000067F00004005000060F60100000000__0000003203FB5749-0000003579F03331":{"file_size":212885504,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001C0F79A-000000067F00004005016EA00C0001C3F636__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000399C000-000000067F00004005000060F300039A0000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001574000-000000067F00004005000060F700015A195C__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005B00697-000000067F00004005000060F30100000000__0000015DD1D3C809-0000016143292911":{"file_size":282025984,"generation":2,"shard":"0008"},"000000067F00004005000060F300050C8000-000000067F00004005000060F300050CC000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700000885C5-000000067F00004005000060F80100000000__000000044854EBD1-00000008B6B51879":{"file_size":253878272,"generation":2,"shard":"0008"},"000000067F00004005000060F30001407F7A-000000067F00004005000060F50100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B90000-000000067F00004005000060F70000B94000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000560000-000000067F000040050081DB430000564000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001720000-000000067F00004005000060F700017405D4__000001398B56A519-0000013C9C0E3339":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300043CC000-000000067F00004005000060F300043F8000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000129D29A-000000067F00004005000060F30100000000__00000057593D8169-0000005C01565329":{"file_size":110788608,"generation":2,"shard":"0008"},"000000067F00004005000060F300003F9F83-000000067F00004005000060F30000402F4A__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001940000-000000067F00004005000060F700019685CE__0000014784964B91-0000014B000D1821":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300043B8000-000000067F00004005000060F300043BC000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000370FD1-000000067F00004005000060F60100000000__0000000D55A212C9-000000114A805939":{"file_size":232144896,"generation":2,"shard":"0008"},"000000067F00004005000060F30003849093-000000067F00004005000060F300038720A2__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100003C0432-000000067F00004005000060F20100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":262701056,"generation":2,"shard":"0008"},"000000067F00004005000060F700014F85DF-000000067F00004005000060F70001510BBE__0000012694E36301-0000012A3F140591":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F3000253B7A3-000000067F00004005000060F50100000000__000000AFE87558B0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005
000060FB0001404000-000000067F00004005000060FB0001408000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003F942CF-000000067F00004005000060F30003FCD352__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B38000-000000067F00004005000060FB0000B58B45__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B505C8-000000067F00004005000060F80100000000__000000A9EB8C4489-000000ACA44C8E99":{"file_size":226459648,"generation":2,"shard":"0008"},"000000067F00004005000060F3000612D506-000000067F00004005000060F30006166575__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700000DC000-000000067F00004005000060F700000E0000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D31000000C000-000000067F0000400500FB3D310000018000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000001BCB572C329-000001BCB572C481":{"file_size":24576,"generation":19,"shard":"0008"},"000000067F00004005000060F30002828000-000000067F00004005000060F3000282C000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015B0000-000000067F00004005000060F300015B4000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED50000078000-000000067F0000400500DBCED5000007C000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000086E169-030000000000000000000000000000000002__000000A583FBFB91-000000A9EB8C4489":{"file_size":77471744,"generation":2,"shard":"0008"},"000000067F0000400501046F39000000BDD2-000000067F00004005010660F500000161F7__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D3101FFFFFFFF-000000067F0000400500FB3D310300000000__00000122A7BB7B29-0000012694E36301":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00000F28ED-030000000000000000000000000000000002__000000F91FE84F08":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30004E9307A-000000067F00004005000060F30004EA41A5__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00016D21CF-030000000000000000000000000000000002__000000698AF6E809-0000006DDB29D589":{"file_size":226353152,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800001876BD-000000067F0000400500EB4A48000018F5CD__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E400002E5B84-030000000000000000000000000000000002__000000DBD29DC248":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D8985C-000000067F00004005000060F70000DA1E38__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C28000-000000067F000040050081DB430000C3A075__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000407201D-000000067F00004005000060F300040E319D__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000002B3CE-000000067F00004005000060F60100000000__00000075E5D2A930":
{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D60000-000000067F00004005000060F80100000000__000000C483D0D6B8":{"file_size":133947392,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F705D6-000000067F00004005000060F80100000000__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":259842048,"generation":2,"shard":"0008"},"000000067F00004005000060F30004E7A062-000000067F00004005000060F30004E9307A__000001440D3D0C69-0000014784964B91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006810000-000000067F00004005000060F30006814000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700007D05C8-000000067F00004005000060F80100000000__00000075CC373F31-00000079F2A2F311":{"file_size":251740160,"generation":2,"shard":"0008"},"000000067F00004005000000000000000001-000000067F0000400500000A690000000002__0000018624969469-000001880F984A29":{"file_size":40960,"generation":11,"shard":"0008"},"000000067F00004005000060FB00014D8000-000000067F00004005000060FB0001530B44__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001EA8000-000000067F00004005000060FB0001EAC000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000230A0C7-000000067F00004005000060F30100000000__000000A583FBFB91-000000A9EB8C4489":{"file_size":213680128,"generation":2,"shard":"0008"},"000000067F00004005000060F30000A98000-000000067F00004005000060F30000AC9024__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003F72201-000000067F00004005000060F30003F7B254__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000498000-000000067F00004005016EA00C000049C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004CB8000-000000067F00004005000060F30004CBC000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300042CC1BD-000000067F00004005000060F300042D51D6__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D310000028681-000000067F0000400500FB3D320100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000474302B-000000067F00004005000060F300047EC0CA__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003204000-000000067F00004005000060F30003278000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300024020ED-000000067F00004005000060F3000240B12A__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000216C000-000000067F00004005000060F30002170000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000005DD43-000000067F00004005000060F60100000000__000000EFDE07FFD8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000348B45-000000067F00004005000060FB000037968A__0000000D55A212C9-000000114A805939":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000778000-000000067F000040050081DB43000077C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300011B4000-000000067F0000400
50081DB43000120E409__000000DBD29DC248":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003CCA0B9-000000067F00004005000060F30003D0B155__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00009D4000-000000067F00004005000060FB0000A7AF6E__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700008F0000-000000067F00004005000060F80100000000__00000089F4693119-0000008DBE2855F9":{"file_size":262905856,"generation":2,"shard":"0008"},"000000067F00004005000060F30006CA0000-000000067F00004005000060F30006CA4000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000E021D0-000000067F00004005000060FB0000E0AD15__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003654000-000000067F00004005000060F3000367733F__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000DC0000-000000067F00004005000060F70000DE05C8__000000C824C09619-000000CC13D2E549":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F700018D85CA-000000067F00004005000060F80100000000__000001440D3D0C69-0000014784964B91":{"file_size":260775936,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EAC000-000000067F00004005000060FB0000EB8000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E70000-000000067F00004005000060F30000E74000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005FE621A-000000067F00004005000060F30005FFF23F__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D20000-000000067F00004005000060F70000D24000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005244000-000000067F00004005000060F3000525C065__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400501025D9001FFFFFFFF-000000067F0000400501025D900300000000__0000011B632CC319-0000011F1A40FA69":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CD4000-000000067F00004005000060F30001CE0000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000E77906-000000067F00004005016EA00C0000E7F7A7__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300046B41AA-000000067F00004005000060F30100000000__0000012E71CF31F9-000001334140FC21":{"file_size":199688192,"generation":2,"shard":"0008"},"000000067F000040050100D04D00000634BB-030000000000000000000000000000000002__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":173744128,"generation":2,"shard":"0008"},"000000067F00004005000060F30000CA4000-000000067F00004005000060F30000CB16B6__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004DDC000-000000067F00004005000060F30004DF086C__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D7F2DE-000000067F00004005000060F30005DA03A8__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300048A0000-000000067F00004005000060F300048A4000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000
060F100003954D3-000000067F00004005000060F20100000000__0000003D2AB09B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300043BC000-000000067F00004005000060F300043C8000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D1C000-000000067F00004005016EA00C0001D78000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F100000D8000-000000067F00004005000060F100000E021B__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F300060A0282-000000067F00004005000060F300060A93B5__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000021D8F8-000000067F00004005000060F20100000000__00000023FEF9F321-00000028C365FBE1":{"file_size":88227840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000018000-000000067F00004005000060F3000001C000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F000040050081DB430000E48000-000000067F000040050081DB430000E4C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300003E8FBC-000000067F00004005000060F300003F9F83__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004868000-000000067F00004005000060F3000486C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700013D0000-000000067F00004005000060F700013E85D1__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001203856-030000000000000000000000000000000002__0000005413AB3641-00000057593D8169":{"file_size":157130752,"generation":2,"shard":"0008"},"000000067F00004005000060F3000029C000-000000067F00004005000060F300002C4887__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005160000-000000067F00004005000060F30005164000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D31000001C000-000000067F0000400500FB3D310000028681__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029F90B-000000067F00004005016EA00C00002D77AE__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30003620000-000000067F00004005000060F30100000000__000000F309FCDD19-000000F6661C9241":{"file_size":249372672,"generation":2,"shard":"0008"},"000000067F00004005000060F30003B90000-000000067F00004005000060F30003B94000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300001F4000-000000067F00004005000060F30000208000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001BB8000-000000067F00004005000060F30001C00FE1__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005210000-000000067F00004005000060F30005214000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002070F71-000000067F00004005000060F30002079FDE__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000B40000-000000067F00004005000060F30000BB103B__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shar
d":"0008"},"000000067F00004005000060F10000290000-000000067F00004005000060F10000298000__00000028C365FBE1-0000002D2A8E0B81":{"file_size":264134656,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00007C7B9C-000000067F00004005016EA00C0000807A34__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001548000-000000067F00004005000060FB000154C000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100005FC000-000000067F00004005000060F1000062EE46__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001A0000-000000067F0000400500EE16BC00001A4000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000F94000-000000067F00004005016EA00C0000F98000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000290000-000000067F00004005000060F80100000000__00000023FEF9F321-00000028C365FBE1":{"file_size":265764864,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001BC0B44-000000067F00004005000060FB0001BD1689__0000008625CF2891-00000089F4693119":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000337DCF2-000000067F00004005000060F30003386D10__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300045C1062-000000067F00004005000060F3000460202F__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006814000-000000067F00004005000060F30006850000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000073DFA8-000000067F00004005016EA00C000079FCFA__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000178C000-000000067F00004005016EA00C00017A8000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F1000051D1AE-000000067F00004005000060F20100000000__00000057593D8169-0000005C01565329":{"file_size":103145472,"generation":2,"shard":"0008"},"000000067F00004005000060F300034BD86C-000000067F00004005000060F30100000000__000000EBC9213D59-000000EFA7EAA9E1":{"file_size":95617024,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000008000-000000067F00004005016EA00C000000FEA0__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F1000014C000-000000067F00004005000060F1000015F545__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300000000EAB-000000067F0000400500FB3D300100000000__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":12976128,"generation":2,"shard":"0008"},"000000067F000040050081DB430000028000-000000067F000040050081DB43000002C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001BD1689-000000067F00004005000060FB0100000000__0000008625CF2891-00000089F4693119":{"file_size":223690752,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000000000-000000067F0000400500EB4A480000000001__000000FF8B261599-000001048B25A8E9":{"file_size":32768,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D952B0-000000067F00004005000060F30003DAE2DC__0000011B632CC319-0000011F1A40FA69":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B30000-000000067F000
04005000060F70000B505C8__000000A9EB8C4489-000000ACA44C8E99":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F3000549D0A6-000000067F00004005000060F300055861F2__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F1000046821B-000000067F00004005000060F20100000000__00000047E31D98D1-0000004C49155071":{"file_size":266969088,"generation":2,"shard":"0008"},"000000067F00004005000060F300043C8000-000000067F00004005000060F300043CC000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001E720A2-000000067F00004005000060F30100000000__000000923719A971-00000096262826C9":{"file_size":141344768,"generation":2,"shard":"0008"},"000000067F000040050081DB4300003A8000-000000067F000040050081DB4300003AC000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700006AB7A6-000000067F00004005000060F700006C3D76__000000663565F8C9-000000698AF6E809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000570A19E-000000067F00004005000060F3000573B206__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003AF28CB-000000067F00004005000060F30003B33945__0000010FB1BE19B9-00000113456156F1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015CC000-000000067F00004005000060FB00015D8000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000A9CFB-000000067F0000400500D69D7900000D1C5F__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A30000-000000067F00004005000060F30002A34000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000047C000-000000067F00004005000060F30000498000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005FFF23F-000000067F00004005000060F300060A0282__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000029C194-000000067F00004005016EA00C00004EF809__0000018EC67807C9-000001935283F9B9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006D64000-000000067F00004005000060F30006DC8000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001340000-000000067F00004005016EA00C0001344000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000BB0000-000000067F00004005016EA00C0000BB4000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F0000400500EB4A480000000000-000000067F0000400500EB4A480000007F0F__000000F309FCDD19-000000F6661C9241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000114000-000000067F0000400500E3A2A1000016321A__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000578000-030000000000000000000000000000000002__0000009A24DF6768":{"file_size":107642880,"generation":2,"shard":"0008"},"000000067F00004005000060F30006798000-000000067F00004005000060F3000679C000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100000E021B-000000067F00004005000060F1000010043F__0000000D55A212C9-000000114A805939":{"file_size":268460032,"generation":2,"shard":"00
08"},"000000067F000040050081DB430000DA8000-030000000000000000000000000000000002__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":233201664,"generation":2,"shard":"0008"},"000000067F00004005000060F100004EC079-000000067F00004005000060F20100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F7000170C000-000000067F00004005000060F70001720000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000FCD85E-000000067F00004005000060F80100000000__000000E4D847F4E0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00015B74FF-000000067F00004005016EA00C00015FF3A0__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30000AC9024-000000067F00004005000060F30000ADA0D0__0000003203FB5749-0000003579F03331":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16C40100000000-000000067F0000400500F3A25C000006C000__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000F1B5B-000000067F0000400500D69D790100000000__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":233275392,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C0C000-000000067F00004005000060F30003C257AD__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E44000-000000067F00004005000060F30000E60000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000018E4B6-000000067F00004005000060F7000019EA78__0000001737D88379-0000001B59EEB909":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00017E8000-000000067F00004005016EA00C00017EC000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003A4C09C-000000067F00004005000060F30003A6D1B3__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100000260F2-000000067F00004005000060F20100000000__000000027AF9D7D0":{"file_size":24576,"generation":1,"shard":"0008"},"000000067F00004005016EA00C0000097BDA-000000067F00004005016EA00C00000C7A73__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500C782E400002CDD5C-030000000000000000000000000000000002__000000D31E48D7C9-000000D74E29AAD1":{"file_size":90923008,"generation":2,"shard":"0008"},"000000067F00004005000060F3000685C000-000000067F00004005000060F30006860000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001C84000-000000067F00004005000060FB0001CE16ED__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000CC4BC2-000000067F000040050081DB430000CD6C36__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006349DA2-000000067F00004005000060F30006382F14__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000212E160-000000067F00004005000060F30100000000__0000009DF02C1241-000000A173C00489":{"file_size":224731136,"generation":2,"shard":"0008"},"000000067F00004005000060F30001FF8691-000000067F00004005000060F30100000000__0000009A1ABDE921-0000009DF02C1241":{"file_size":256114688,"generation":2,"shard":"0008"},"000000067F00004005000060F300067F4000-000000067F00004005000060F30006810000__00000178C5D5D3A8":{"file
_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700015A8000-000000067F00004005000060F700016205B5__0000012E71CF31F9-000001334140FC21":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500D69D790000024000-000000067F0000400500D69D790000028000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700007AE010-000000067F00004005000060F80100000000__00000075E5D2A930":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000428000-000000067F00004005016EA00C000042C000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30001E74000-000000067F00004005000060F30001F28000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038FF04F-000000067F00004005000060F30100000000__0000010779A7F551-0000010A5E65DF39":{"file_size":45359104,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001B0FD2A-000000067F00004005016EA00C0001B4FBC9__000001B6FFE46BC9-000001BA93C39481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30006858000-000000067F00004005000060F3000685C000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002F9A0EB-000000067F00004005000060F30002FD317C__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000808000-000000067F000040050081DB430000822079__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015DC000-000000067F00004005000060FB00015F0000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000021C000-000000067F00004005000060F7000025DA3C__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D79000007C000-000000067F0000400500D69D7900000A8000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000001EE3D-000000067F00004005000060F60100000000__00000054161C34B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB430000F4E15B-030000000000000000000000000000000002__000000C462B3C2A9-000000C824C09619":{"file_size":73662464,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F28000-000000067F00004005000060F30001F2C000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001F1DA6-030000000000000000000000000000000002__00000081AA3C40F0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F70001758B92-000000067F00004005000060F70001771169__000001398B56A519-0000013C9C0E3339":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A10000010000-000000067F0000400500E3A2A10000017F02__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002A98000-000000067F00004005000060F30002A9C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000573B206-000000067F00004005000060F300057942F4__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000860B45-030000000000000000000000000000000002__00000023FEF9F321-00000028C365FBE1":{"file_size":252788736,"generation":2,"shard":"0008"},"000000067F00004005000060F7000090B929-000000067F00004005000060F80100000000__0000008DDCD70B68
":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F7000014B73D-000000067F00004005000060F80100000000__000000114A805939-00000013FB921C81":{"file_size":146432000,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D3C000-000000067F00004005000060F70000D60000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001514000-000000067F00004005000060F70001528000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001764000-000000067F00004005016EA00C0001788000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30001358000-000000067F00004005000060F3000135C000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001594000-000000067F00004005000060FB00015C8000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300004AC000-000000067F00004005000060F300004B8000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005610000-000000067F00004005000060F30005614000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002794000-000000067F00004005000060F300027C0000__000000BAC0041E18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C60000-000000067F00004005000060F30004C64000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700003A0000-000000067F00004005000060F700003B85C7__0000003579F03331-0000003959DA2DE9":{"file_size":268468224,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000F1034-030000000000000000000000000000000002__000000E4C63CFA21-000000E7C2F1B249":{"file_size":247480320,"generation":2,"shard":"0008"},"000000067F00004005000060F300051B4000-000000067F00004005000060F300051F0000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000003C77D-000000067F00004005000060F60100000000__000000A5A3F27398":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005010660F500000161F7-030000000000000000000000000000000002__0000010FB1BE19B9-00000113456156F1":{"file_size":64757760,"generation":2,"shard":"0008"},"000000067F00004005000060F30003F7B254-000000067F00004005000060F30003F942CF__0000011F1A40FA69-00000122A7BB7B29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004900000-000000067F00004005000060F30004904000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006F18000-000000067F00004005000060F30006F1C000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A21037-000000067F00004005000060F30003A31FB6__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000DB0000-000000067F00004005000060F30000E40F86__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001A60B43-000000067F00004005000060FB0001A71688__0000007E3A9BFD29-0000008196C976A1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DC8000-000000067F00004005000060F30006DCC000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700006E38F6-000000067F00004005000060F80100000000__000000698F2C3A38":{"file_size":24576,"g
eneration":2,"shard":"0008"},"000000067F00004005000060F3000122B1C9-000000067F00004005000060F300012442A9__00000057593D8169-0000005C01565329":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000EA8000-000000067F00004005000060FB0000EAC000__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B5A072-000000067F00004005000060F80100000000__00000159B010F6C0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000144DCA3-000000067F00004005016EA00C000151F7C5__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F600000711FF-000000067F00004005000060F60100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300050EC000-000000067F00004005000060F30005138000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005260000-000000067F00004005000060F30005290FC9__0000014EC58A4A79-0000015304A396B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700012DE407-000000067F00004005000060F80100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F10000-000000067F00004005000060F70000F185D4__000000DBBFA87AE1-000000DE2A8E4FC9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000D38000-000000067F00004005000060F70000D3C000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000006671F-000000067F00004005000060F60100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300059F53C6-000000067F00004005000060F30005A16504__0000015DD1D3C809-0000016143292911":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000B08000-000000067F000040050081DB430000B4A075__000000B2B5C4E8F9-000000B768469051":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000152C000-000000067F00004005000060F70001570000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000128000-000000067F00004005000060F3000012C000__0000018624969468":{"file_size":134422528,"generation":7,"shard":"0008"},"000000067F00004005000060F70000E24000-000000067F00004005000060F70000E387D6__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB4300002791D8-000000067F000040050081DB43000028B253__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F600000500F7-000000067F00004005000060F60100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000ABD9C4-000000067F00004005000060F80100000000__000000A5A3F27398":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB4300009CC000-000000067F000040050081DB430000A10000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700002005D2-000000067F00004005000060F80100000000__0000001B59EEB909-0000001FFBC01501":{"file_size":261169152,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001AA656E-000000067F000040050081D80C0100000000__00000081AA3C40F0":{"file_size":59138048,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E14000-000000067F000040050081DB430000E48000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067
F00004005000060F30003DD734C-000000067F00004005000060F30003E40000__0000011B632CC319-0000011F1A40FA69":{"file_size":261046272,"generation":2,"shard":"0008"},"000000067F0000400500D19D0300FFFFFFFF-030000000000000000000000000000000002__000000DE2A8E4FC9-000000E1CD2FBBE9":{"file_size":5373952,"generation":2,"shard":"0008"},"000000067F00004005000060F30001588000-000000067F00004005000060F3000158C000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000AC000-000000067F0000400500DBCED500000D0000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EB4A48000013F89B-000000067F0000400500EB4A48000014F7AC__000000F6661C9241-000000F901689359":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300005D704F-000000067F000040050081DB4300006310C9__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A14000-000000067F000040050081DB430000A18000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001F574A6-000000067F00004005000060F30001FF8691__0000009A1ABDE921-0000009DF02C1241":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D320100000000-000000067F0000400500FDA1F80000014000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001B09104-000000067F00004005000060F30001B4A119__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005011035750100000000-030000000000000000000000000000000002__00000159B010F6C0":{"file_size":78626816,"generation":2,"shard":"0008"},"000000067F00004005000060F1000015F545-000000067F00004005000060F20100000000__000000174479FC18":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000638C06D-000000067F00004005000060F300063A50CD__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000299C28F-000000067F00004005000060F300029A526C__000000BD9A7C56D9-000000C0C9EB88E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000364C000-000000067F00004005000060F30003650000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000CE0000-000000067F00004005016EA00C0000CE4000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB430000794000-000000067F000040050081DB4300007A8000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A18000-000000067F000040050081DB430000A1C000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000000C000-000000067F00004005000060F30000018000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F000040050081DB4300000D40FF-030000000000000000000000000000000002__00000075CC373F31-00000079F2A2F311":{"file_size":78061568,"generation":2,"shard":"0008"},"000000067F00004005000060F60000099FD8-000000067F00004005000060F60100000000__00000159B010F6C0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000330A1C8-000000067F00004005000060F3000332B1B6__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006FA900D-000000067F00004005000060F30006FDA081__00000184624E5741-000001860C80A151":{"file_size":268451840,"generation":2,"
shard":"0008"},"000000067F00004005000060FB000148AC30-000000067F00004005000060FB000149B774__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C01FFFFFFFF-000000067F0000400500F3A25C0300000000__0000011F1A40FA69-00000122A7BB7B29":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30000EF1FC3-000000067F00004005000060F50100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006A7C566-000000067F00004005000060F30100000000__00000178B8B10551-0000017C9F5597E1":{"file_size":173072384,"generation":2,"shard":"0008"},"000000067F00004005000060FB000104B856-000000067F00004005000060FB000107C39B__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000030000-000000067F00004005000060F80100000000__000000021DC73119-000000044854EBD1":{"file_size":261341184,"generation":2,"shard":"0008"},"000000067F00004005000060F30003580FD3-000000067F00004005000060F30100000000__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":228188160,"generation":2,"shard":"0008"},"000000067F00004005000060F70001224000-000000067F00004005000060F70001232ACF__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300022B9050-000000067F00004005000060F3000230A0C7__000000A583FBFB91-000000A9EB8C4489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006654000-000000067F00004005000060F30006670000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700010D0000-000000067F00004005000060F700010D85CF__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000FD8000-030000000000000000000000000000000002__000000C824C09619-000000CC13D2E549":{"file_size":237559808,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015F0000-000000067F00004005000060FB00015F4000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F60100000000-000000067F00004005000060F70000004000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F00004005000060F70000DA1E38-000000067F00004005000060F80100000000__000000C462B3C2A9-000000C824C09619":{"file_size":209821696,"generation":2,"shard":"0008"},"000000067F00004005000060F30005D76250-000000067F00004005000060F30005D7F2DE__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000418000-000000067F00004005000060F10100000000__00000044B4679349-00000047E31D98D1":{"file_size":269148160,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B61000-000000067F00004005000060F80100000000__0000018613F0A050":{"file_size":65150976,"generation":3,"shard":"0008"},"000000067F00004005000060F300008C8000-000000067F00004005000060F300008E0F49__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300002D8000-030000000000000000000000000000000002__0000008625CF2891-00000089F4693119":{"file_size":231907328,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C04000-000000067F00004005000060FB0000C08000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001808000-000000067F00004005000060FB000180C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A30379-03000000000000000
0000000000000000002__000000AFE87558B0":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F700010D85CF-000000067F00004005000060F80100000000__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":164970496,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C70000-000000067F00004005000060FB0000C74000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001188000-000000067F00004005016EA00C000118C000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70000CB85B3-000000067F00004005000060F70000CC8B74__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A1D870-000000067F00004005000060F30004A2693B__000001398B56A519-0000013C9C0E3339":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00008CF772-000000067F00004005016EA00C00008E760F__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000D34000-000000067F00004005016EA00C0000D5D1E9__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00014B79E7-000000067F00004005016EA00C00014CF88D__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300040E319D-000000067F00004005000060F300040F41F4__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002FF427D-000000067F00004005000060F30100000000__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":156073984,"generation":2,"shard":"0008"},"000000067F00004005000060F30005E0A466-000000067F00004005000060F30005E3B48F__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700005F9158-000000067F00004005000060F80100000000__00000057593D8169-0000005C01565329":{"file_size":230768640,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018E4000-000000067F00004005016EA00C000193189A__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30005F0202C-000000067F00004005000060F30005F3303F__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000148000-000000067F00004005000060F1000014C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060C0000-000000067F00004005000060F300060C4000__0000016E41E03CA0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C9C000-000000067F00004005000060FB0000CC6E51__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050107B54700000A0EB1-000000067F000040050109CD330100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004EC000-000000067F00004005016EA00C00005A0000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C0000A9F465-000000067F00004005016EA00C0000ACF305__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F30000208000-000000067F00004005000060F3000020C000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000011E137-000000067F0000400500F67839000003E09B__000001048B25A8E9-0000010779A7F551":{"file_size":268460032,"generation":2,"shard":"0008"},"0
00000067F00004005000060F30000402F4A-000000067F00004005000060F60100000000__000000114A805939-00000013FB921C81":{"file_size":166469632,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00004A8000-000000067F00004005016EA00C00004AC000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F70001968000-000000067F00004005000060F7000196C000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006EF8000-000000067F00004005000060F30006EFC000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000BB4000-000000067F00004005016EA00C0000C20000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700009C0000-000000067F00004005000060F80100000000__0000009A24DF6768":{"file_size":37371904,"generation":2,"shard":"0008"},"000000067F00004005000060F30004C84000-000000067F00004005000060F30004CB8000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002514000-000000067F00004005000060F30002530000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000DE05C8-000000067F00004005000060F80100000000__000000C824C09619-000000CC13D2E549":{"file_size":259473408,"generation":2,"shard":"0008"},"000000067F00004005000060F301FFFFFFFF-000000067F00004005000060F30300000000__00000186146441F1-0000018624969469":{"file_size":57344,"generation":6,"shard":"0008"},"000000067F00004005000060F30001886B2A-000000067F00004005000060F50100000000__00000075E5D2A930":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700006A8000-000000067F00004005000060F80100000000__000000636DE92159-000000663565F8C9":{"file_size":117022720,"generation":2,"shard":"0008"},"000000067F00004005000060FB000154C000-000000067F00004005000060FB0001558000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300053F40CC-000000067F00004005000060F30100000000__0000014EC58A4A79-0000015304A396B9":{"file_size":223453184,"generation":2,"shard":"0008"},"000000067F00004005000060F30005C95225-000000067F00004005000060F30005C9E3C4__0000016143292911-00000164DEE06671":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000558C000-000000067F00004005000060F30005598000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003FFA699-000000067F00004005000060F50100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006F1C000-000000067F00004005000060F50100000000__000001848D082B20":{"file_size":24117248,"generation":2,"shard":"0008"},"000000067F00004005000060F3000486C000-000000067F00004005000060F30004878000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300019C2056-000000067F00004005000060F300019F31AA__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC000004C000-000000067F0000400500EE16BC0000060000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000046EAB9-000000067F00004005000060F80100000000__000000417D21ACF9-00000044B4679349":{"file_size":48717824,"generation":2,"shard":"0008"},"000000067F000040050081DB430000790000-000000067F000040050081DB430000794000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D79000
002C000-000000067F0000400500D69D790000078000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F60000026C90-000000067F00004005000060F60100000000__000000698F2C3A38":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30000738000-000000067F00004005000060F3000073C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000204000-000000067F00004005000060F10000218000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000177E20-000000067F0000400500C782E400001AFD31__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000048C000-000000067F00004005000060F700004B1E77__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015F8000-000000067F00004005000060F50100000000__000000698F2C3A38":{"file_size":131276800,"generation":2,"shard":"0008"},"000000067F00004005000060F30000428000-000000067F00004005000060F3000042C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB43000038C000-000000067F000040050081DB430000390000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB000102A1CE-000000067F00004005000060FB000103AD12__0000004C49155071-0000004F31878919":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001848000-000000067F00004005000060FB000184C000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00001DC000-000000067F00004005000060FB0000228000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00011D4000-000000067F00004005016EA00C0001228000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005016EA00C000011775B-030000000000000000000000000000000002__0000018820A34650":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005000060F700011B8000-000000067F00004005000060F80100000000__000001048B25A8E9-0000010779A7F551":{"file_size":263897088,"generation":2,"shard":"0008"},"000000067F00004005000060F3000660D31F-000000067F00004005000060F3000664E3CA__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC0000064000-000000067F0000400500EE16BC00000F28ED__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000525C065-000000067F00004005000060F50100000000__0000014EDD256548":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30004A7F98F-000000067F00004005000060F30100000000__000001398B56A519-0000013C9C0E3339":{"file_size":47595520,"generation":2,"shard":"0008"},"000000067F000040050100D04D000004369C-000000067F000040050100D04D000004B5AD__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F6000001A6E2-000000067F00004005000060F60100000000__00000047F1F2B800":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F700004405CF-000000067F00004005000060F80100000000__0000003D03FCCDB9-000000417D21ACF9":{"file_size":198836224,"generation":2,"shard":"0008"},"000000067F00004005000060F30002D28000-000000067F00004005000060F30002D2C000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F56D510100000000-000000067F0000400500F67839000003C000__
0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000E387D6-000000067F00004005000060F80100000000__000000D037B2DBD0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F3000213C000-000000067F00004005000060F30002168000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060D4415-000000067F00004005000060F3000612D506__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D3100000546CB-000000067F0000400500FB3D320100000000__00000122E1129DA0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F000040050081DB430000D18CA9-030000000000000000000000000000000002__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":210288640,"generation":2,"shard":"0008"},"000000067F00004005000060F60000062E4F-000000067F00004005000060F60100000000__00000104BD37F348":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000016A065-000000067F0000400500F3A25C000017C0CB__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001AD0000-000000067F00004005000060FB0001B28B44__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000254000-000000067F00004005000060F30000298000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000E8C000-000000067F000040050081DB430000EA0000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300040F41F4-000000067F00004005000060F3000412D27C__00000122A7BB7B29-0000012694E36301":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00013B8000-000000067F00004005000060FB00013BC000__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700000D8000-000000067F00004005000060F700000DC000__0000000D80565628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000958000-000000067F00004005000060F700009605D8__000000923719A971-00000096262826C9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB00004A0000-000000067F00004005000060FB00004A4000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700001213F2-000000067F00004005000060F80100000000__0000000D55A212C9-000000114A805939":{"file_size":55320576,"generation":2,"shard":"0008"},"000000067F00004005000060F30004156457-000000067F00004005000060F30100000000__00000122A7BB7B29-0000012694E36301":{"file_size":96927744,"generation":2,"shard":"0008"},"000000067F00004005000060F30003278000-000000067F00004005000060F3000327C000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000158F667-000000067F00004005016EA00C00015B74FF__000001AC25760149-000001AFC313C819":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001D50000-000000067F00004005000060FB0001D88B43__0000008DBE2855F9-000000923719A971":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F60000054AE8-000000067F00004005000060F60100000000__000000DBD29DC248":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300002C4887-000000067F00004005000060F60100000000__0000000D80565628":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001B34000-000000067F00004005000060F70001B5A072__00
000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F600000416A8-000000067F00004005000060F60100000000__000000AFE87558B0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F10000050000-000000067F00004005000060F10000058000__000000044854EBD1-00000008B6B51879":{"file_size":264011776,"generation":2,"shard":"0008"},"000000067F00004005000060F300043FC000-000000067F00004005000060F300044D3639__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30004878000-000000067F00004005000060F3000487C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000396C000-000000067F00004005000060F30003998000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00019F7907-000000067F00004005016EA00C0001A477A4__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268443648,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00014D7727-000000067F00004005016EA00C00014E75C6__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005016EA00C00016570D9-030000000000000000000000000000000002__000001AC25760149-000001AFC313C819":{"file_size":86335488,"generation":11,"shard":"0008"},"000000067F00004005000060F70001270000-000000067F00004005000060F80100000000__0000010FB1BE19B9-00000113456156F1":{"file_size":265363456,"generation":2,"shard":"0008"},"000000067F0000400500EB4A4800003BFD31-000000067F0000400500EB4A4800003C7C42__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300014B31F8-000000067F00004005000060F300014CC16D__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000D5D1E9-030000000000000000000000000000000002__0000019E7001E460":{"file_size":139264,"generation":11,"shard":"0008"},"000000067F00004005000060F100003B8214-000000067F00004005000060F100003C0432__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001346854-000000067F00004005016EA00C000135FCAD__000001A931C135B1-000001AC25760149":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000160410C-000000067F00004005000060F3000165515A__000000698AF6E809-0000006DDB29D589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000118B12B-030000000000000000000000000000000002__00000054161C34B8":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30006DF0000-000000067F00004005000060F30006DF4000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700003C4000-000000067F00004005000060F700003FE341__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000FF0000-000000067F00004005000060F30100000000__0000004C49155071-0000004F31878919":{"file_size":256286720,"generation":2,"shard":"0008"},"000000067F00004005000060FB00015F4000-000000067F00004005000060FB00015FCD31__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005816253-000000067F00004005000060F30005847319__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002460000-000000067F00004005000060F30002464000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001
13A337-000000067F00004005000060F700011528FB__000000FF8B261599-000001048B25A8E9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB000037968A-030000000000000000000000000000000002__0000000D55A212C9-000000114A805939":{"file_size":226426880,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000128000-000000067F00004005016EA00C000012FE9A__000001880F984A29-0000018C496B6DB1":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F0000400500EB4A48000036FF11-000000067F0000400500EB4A4800003A7E20__000000FCCD5238B1-000000FF8B261599":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000658113F-000000067F00004005000060F3000659A203__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D18000-000000067F00004005016EA00C0001D1C000__000001BCB572A4E0":{"file_size":134422528,"generation":17,"shard":"0008"},"000000067F00004005000060F30001A44000-000000067F00004005000060F30001AB1583__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F10000138000-000000067F00004005000060F1000013C000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300009BC000-000000067F00004005000060F30000A50000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F7000110E30C-000000067F00004005000060F80100000000__000000F91FE84F08":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F50100000000-000000067F00004005000060F60000014000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006F18000-000000067F00004005000060F30006FA900D__00000184624E5741-000001860C80A151":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001D88B43-000000067F00004005000060FB0100000000__0000008DBE2855F9-000000923719A971":{"file_size":249028608,"generation":2,"shard":"0008"},"000000067F00004005000060F3000122A1D5-000000067F00004005000060F30100000000__0000005413AB3641-00000057593D8169":{"file_size":48783360,"generation":2,"shard":"0008"},"000000067F00004005000060F30006277C61-000000067F00004005000060F30006320C60__0000016E1FBB7B99-000001715E483C79":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000388000-000000067F000040050081DB43000038C000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000E67A6E-000000067F00004005016EA00C0000E77906__0000019E2C5DCEE1-000001A1DD8B4481":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F300009B8000-000000067F00004005000060F300009BC000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400501025D900000068000-000000067F00004005010450640000000570__0000010FB1BE19B9-00000113456156F1":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB00002D4B6A-030000000000000000000000000000000002__0000000D80565628":{"file_size":147456,"generation":2,"shard":"0008"},"000000067F00004005000060F30001E50FF3-000000067F00004005000060F30001E720A2__000000923719A971-00000096262826C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00005A4000-000000067F00004005016EA00C0000670000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060FB0000C18000-000000067F00004005000060FB0000C1C000__0000003D2AB09B68":{"file_size":134422528,"generation":
2,"shard":"0008"},"000000067F00004005000060F70000BA4F5B-000000067F00004005000060F70000BBD532__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70001AC115C-000000067F00004005000060F80100000000__0000015304A396B9-0000015670D6AFD9":{"file_size":237248512,"generation":2,"shard":"0008"},"000000067F00004005000060F30004D24000-000000067F00004005000060F30004DA8000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30006CA4000-000000067F00004005000060F30006D10000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500EE16BC00001433D0-030000000000000000000000000000000002__000000FCCD5238B1-000000FF8B261599":{"file_size":146407424,"generation":2,"shard":"0008"},"000000067F00004005000060F3000165515A-000000067F00004005000060F30100000000__000000698AF6E809-0000006DDB29D589":{"file_size":112680960,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000118C000-000000067F00004005016EA00C00011D0000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F000040050081DB43000094A076-030000000000000000000000000000000002__000000A9EB8C4489-000000ACA44C8E99":{"file_size":176054272,"generation":2,"shard":"0008"},"000000067F00004005000060F70001528000-000000067F00004005000060F7000152C000__0000012E77D3BF00":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C82B50-000000067F000040050081DB430000CC4BC2__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001EF15A-000000067F000040050081DB4300002791D8__0000008196C976A1-0000008625CF2891":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F10000125BF2-000000067F00004005000060F20100000000__000000114A805939-00000013FB921C81":{"file_size":78782464,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E40F86-000000067F00004005000060F30100000000__000000417D21ACF9-00000044B4679349":{"file_size":111108096,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000FF0000-000000067F00004005016EA00C0000FF4000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30000CB16B6-000000067F00004005000060F50100000000__0000003D2AB09B68":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70001990000-000000067F00004005000060F70001994000__0000014EDD256548":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000A54000-000000067F00004005000060F30000A5F9BB__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300061B8705-000000067F00004005000060F300061D9774__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F7000084C000-000000067F00004005000060F70000858000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000848000-000000067F00004005000060F7000084C000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001D18000-000000067F00004005000060F30001D79136__0000008DBE2855F9-000000923719A971":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001558000-000000067F00004005000060FB000155C000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300024440AE-000000067F00004005000060F3000244D189__000000A9EB8C4489-0000
00ACA44C8E99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002CFC020-000000067F00004005000060F30100000000__000000C824C09619-000000CC13D2E549":{"file_size":150708224,"generation":2,"shard":"0008"},"000000067F000040050081DB430000A4A074-000000067F000040050081DB430000A640EA__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000C98000-000000067F00004005000060FB0000C9C000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001840000-000000067F00004005000060FB0001844000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30000802123-000000067F00004005000060F30000853115__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000029ED0-000000067F00004005000060F80100000000__000000027AF9D7D0":{"file_size":24576,"generation":1,"shard":"0008"},"000000067F00004005016EA00C00003E4000-000000067F00004005016EA00C00003E8000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004CBC000-000000067F00004005000060F30004D20000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000122C000-000000067F00004005016EA00C0001240000__000001A95031E5B8":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30004DF086C-000000067F00004005000060F50100000000__000001444EB7FC10":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F300050B5199-000000067F00004005000060F30100000000__0000014784964B91-0000014B000D1821":{"file_size":126124032,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001A477A4-000000067F00004005016EA00C0001ADF63C__000001B3E1B95181-000001B6FFE46BC9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F70001828000-000000067F00004005000060F7000182C000__000001444EB7FC10":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F100004F0000-000000067F00004005000060F10000518222__0000005413AB3641-00000057593D8169":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060F30005EFD576-000000067F00004005000060F30100000000__00000164DEE06671-0000016834A3FC91":{"file_size":193077248,"generation":2,"shard":"0008"},"000000067F0000400500F8E3A50100000000-000000067F0000400500FA2AD30000004000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000258E3A9-000000067F00004005000060F3000259F4A3__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000C90000-000000067F00004005000060F70000CB85B3__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005000060FB000114C000-000000067F00004005000060FB000118B12B__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003644000-000000067F00004005000060F30003648000__000000F91FE84F08":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001A50000-000000067F00004005000060FB0001A60B43__0000007E3A9BFD29-0000008196C976A1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003C257AD-000000067F00004005000060F50100000000__000001180B3FF408":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002DE8000-000000067F00004005000060F30002E4
104A__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C00000C8000-000000067F0000400500F3A25C00000EA069__000001048B25A8E9-0000010779A7F551":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002174000-000000067F00004005000060F30002210000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014D5280-000000067F00004005000060F300014E6333__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000332B1B6-000000067F00004005000060F30003344134__000000E4C63CFA21-000000E7C2F1B249":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300065F42B4-000000067F00004005000060F3000660D31F__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010E264A-000000067F000040050081DB4300010F46BD__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300069D13FA-000000067F00004005000060F300069FA3F6__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300061D9774-000000067F00004005000060F30006222843__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100005C821A-000000067F00004005000060F20100000000__000000601F43CF09-000000636DE92159":{"file_size":265183232,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000200000-000000067F0000400500EB4A480000204000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70001690000-000000067F00004005000060F70100000000__000001334140FC21-00000137115BE4D9":{"file_size":273965056,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000A575C7-000000067F00004005016EA00C0000A9F465__00000196C9018F59-0000019A2EAFE7A9":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001E6C000-000000067F00004005000060FB0001E98000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00014195A7-000000067F00004005000060FB000147A0EC__000000601F43CF09-000000636DE92159":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000AE168A-030000000000000000000000000000000002__0000003203FB5749-0000003579F03331":{"file_size":223379456,"generation":2,"shard":"0008"},"000000067F00004005000060F30000CA0000-000000067F00004005000060F30000CA4000__0000003D2AB09B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300006E4000-000000067F00004005000060F30000738000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300006E0000-000000067F00004005000060F300006E4000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001124000-000000067F00004005000060FB0001148000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500D69D7900000A8000-000000067F0000400500D69D7900000AC000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500C782E40000130000-000000067F0000400500C782E40000137F10__000000D01F399709-000000D31E48D7C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000020FBCF-000000067F00004005016EA00C0000257A6F__000001880F984A29-0000018C496B6DB1":{"file_size":268451
840,"generation":11,"shard":"0008"},"000000067F00004005000060FB0001B28B44-000000067F00004005000060FB0100000000__0000008196C976A1-0000008625CF2891":{"file_size":249454592,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001120000-000000067F00004005000060FB0001124000__00000054161C34B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005474062-000000067F00004005000060F3000549D0A6__0000015304A396B9-0000015670D6AFD9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500C782E4000023FA62-030000000000000000000000000000000002__000000D01F399709-000000D31E48D7C9":{"file_size":245366784,"generation":2,"shard":"0008"},"000000067F000040050081DB430000160484-030000000000000000000000000000000002__00000079F2A2F311-0000007E3A9BFD29":{"file_size":226582528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038A4FB4-000000067F00004005000060F300038B5F5B__0000010779A7F551-0000010A5E65DF39":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300017E8000-000000067F00004005000060F300017EC000__00000075E5D2A930":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300100000000-000000067F0000400500FB3D31000000C000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700010105DB-000000067F00004005000060F80100000000__000000E4C63CFA21-000000E7C2F1B249":{"file_size":254935040,"generation":2,"shard":"0008"},"000000067F00004005000060F70000858570-000000067F00004005000060F80100000000__0000008196C976A1-0000008625CF2891":{"file_size":252985344,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001D4000-000000067F000040050081DB4300001E8000__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00005E0000-000000067F00004005000060FB0000638B45__0000001B59EEB909-0000001FFBC01501":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050107B547000006C000-000000067F000040050107B54700000A0EB1__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000430000-000000067F00004005000060FB0000434000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300014E6333-000000067F00004005000060F3000151F271__000000636DE92159-000000663565F8C9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500FB3D300100000000-000000067F0000400500FB3D300300000000__00000117EDA82C11-0000011B632CC319":{"file_size":65536,"generation":2,"shard":"0008"},"000000067F00004005000060F30004BE7584-000000067F00004005000060F30100000000__0000013C9C0E3339-0000013FEFA7D709":{"file_size":58204160,"generation":2,"shard":"0008"},"000000067F00004005000060F70001068000-000000067F00004005000060F80100000000__000000E7C2F1B249-000000EBC9213D59":{"file_size":168730624,"generation":2,"shard":"0008"},"000000067F00004005000060F1000013C000-000000067F00004005000060F10000148000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000659A203-000000067F00004005000060F300065BB235__000001715E483C79-000001751A7D7589":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F70000EC0000-000000067F00004005000060F70000EF85D6__000000D74E29AAD1-000000DBBFA87AE1":{"file_size":268460032,"generation":2,"shard":"0008"},"000000067F00004005010660F500000B4000-000000067F00004005010660F500000F44CB__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000
67A4000-000000067F00004005000060F300067F0000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500DBCED500000F0000-000000067F0000400500DBCED500000F4000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000768000-000000067F000040050081DB43000076C000__000000A5A3F27398":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C00018E0000-000000067F00004005016EA00C00018E4000__000001B3F17FE4E0":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30000A50000-000000067F00004005000060F30000A54000__000000321AA80270":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001E68000-000000067F00004005000060FB0001E6C000__0000009A24DF6768":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001960000-000000067F00004005000060F300019790A2__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000B6A1D0-000000067F00004005000060FB0000BAAD15__0000003579F03331-0000003959DA2DE9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002E4A157-000000067F00004005000060F30002E630CF__000000D31E48D7C9-000000D74E29AAD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E70000-000000067F00004005000060F30006E74000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F700004464DD-000000067F00004005000060F7000046EAB9__000000417D21ACF9-00000044B4679349":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000204000-000000067F0000400500EB4A480000218000__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300042D51D6-000000067F00004005000060F3000430E1E9__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000F30000-000000067F00004005000060FB0100000000__00000047E31D98D1-0000004C49155071":{"file_size":272302080,"generation":2,"shard":"0008"},"000000067F000040050081DB4300006F8000-030000000000000000000000000000000002__0000009DF02C1241-000000A173C00489":{"file_size":235110400,"generation":2,"shard":"0008"},"000000067F000040050081DB4300001EC000-000000067F000040050081DB4300001F1DA6__00000081AA3C40F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300038A3082-000000067F00004005000060F30100000000__000001048B25A8E9-0000010779A7F551":{"file_size":76644352,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000400000-000000067F00004005016EA00C0000404000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30003481DDB-000000067F00004005000060F30100000000__000000E7C2F1B249-000000EBC9213D59":{"file_size":107814912,"generation":2,"shard":"0008"},"000000067F00004005000060F3000489C000-000000067F00004005000060F300048A0000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000CD6C36-000000067F000040050081DB430000D18CA9__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30004888000-000000067F00004005000060F3000488C000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300008E0F49-000000067F00004005000060F30000921E8A__00000028C365FBE1-0000002D2A8E0B81":{"file_size":268451840,"generation":2,"shard":"
0008"},"000000067F0000400500C782E40000074000-000000067F0000400500C782E400000A0000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00011F2D11-000000067F00004005000060FB0001203856__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300046330B1-000000067F00004005000060F300046B41AA__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003548000-000000067F00004005000060F30003580FD3__000000EFA7EAA9E1-000000F309FCDD19":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0001198B44-000000067F00004005000060FB00011C1688__0000005413AB3641-00000057593D8169":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000049C000-000000067F00004005000060F300004A8000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000B44000-000000067F00004005016EA00C0000BB0000__0000019E7001E460":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F700014F0000-000000067F00004005000060F700014F85DF__0000012694E36301-0000012A3F140591":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000C5E15B-000000067F000040050081DB430000C801D1__000000B768469051-000000BAB1E56C91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A10000-000000067F00004005000060F30003A21037__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006EFC000-000000067F00004005000060F30006F18000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0001D1F87B-000000067F00004005016EA00C0001D7F71A__000001BA93C39481-000001BCB572A4E1":{"file_size":268451840,"generation":17,"shard":"0008"},"000000067F00004005000060F30002A34000-000000067F00004005000060F30002A40000__000000C483D0D6B8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000F0AA88-000000067F00004005000060F80100000000__000000DBD29DC248":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30006700000-000000067F00004005000060F30006704000__00000178C5D5D3A8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CC4000-000000067F00004005000060F30001CD0000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000858000-000000067F00004005000060F80100000000__00000081AA3C40F0":{"file_size":48439296,"generation":2,"shard":"0008"},"000000067F000040050081DB4300000D6407-000000067F000040050081DB430000160484__00000079F2A2F311-0000007E3A9BFD29":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300057DD292-000000067F00004005000060F30005816253__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006222843-000000067F00004005000060F3000625B8F0__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000410000-000000067F00004005000060FB0000430B46__000000114A805939-00000013FB921C81":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F100006A8000-000000067F00004005000060F100006B0000__0000006DDB29D589-000000722F474369":{"file_size":264110080,"generation":2,"shard":"0008"},"000000067F00004005000060F3000460202F-000000067F00004005000060F300046330B1
__0000012E71CF31F9-000001334140FC21":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30006E74000-000000067F00004005000060F30006EF8000__000001848D082B20":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003A3B020-000000067F00004005000060F30003A4C09C__0000010D5DC42EF9-0000010FB1BE19B9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30002535462-000000067F00004005000060F3000258E3A9__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500EB4A480000294000-000000067F0000400500EB4A480000355928__000000FCD84FE628":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016E85370000000000-030000000000000000000000000000000002__00000159A7EC8CB9-0000015DD1D3C809":{"file_size":152190976,"generation":2,"shard":"0008"},"000000067F00004005000060F3000158C000-000000067F00004005000060F300015B0000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003386D10-000000067F00004005000060F300033D7D7C__000000E7C2F1B249-000000EBC9213D59":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000E7C000-000000067F00004005000060F30000EF1FC3__00000047F1F2B800":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500FA2AD30000030000-000000067F0000400500FA2AD30000034000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005588000-000000067F00004005000060F3000558C000__00000159B010F6C0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300039A0000-000000067F00004005000060F300039A4000__0000010D77B487A0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F6000008A13D-000000067F00004005000060F60100000000__000001444EB7FC10":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060FB00017120CE-000000067F00004005000060FB000172AC12__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30003200000-000000067F00004005000060F30003204000__000000E4D847F4E0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300007C1007-000000067F00004005000060F30000802123__00000023FEF9F321-00000028C365FBE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F3A25C000006C000-000000067F0000400500F3A25C00000BB439__00000104BD37F348":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300015B4000-000000067F00004005000060F300015F8000__000000698F2C3A38":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F300060C220F-000000067F00004005000060F300060CB2C8__0000016B49A934C1-0000016E1FBB7B99":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500F8E3A5000004A25C-000000067F0000400500F8E3A50100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002C9AFB8-000000067F00004005000060F30002CFC020__000000C824C09619-000000CC13D2E549":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005010F2BD40100000000-000000067F00004005010F44EB000000C000__00000126C3C69FC0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30002AEED02-000000067F00004005000060F50100000000__000000C483D0D6B8":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30002EB8000-000000067F00004005000060F30002F5105E__000000D7
4E29AAD1-000000DBBFA87AE1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500E3A2A1000016321A-030000000000000000000000000000000002__000000EFDE07FFD8":{"file_size":139264,"generation":2,"shard":"0008"},"000000067F00004005000060F3000135C000-000000067F00004005000060F30001407F7A__000000603CA8F2F0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F0000400500F67839000006AEF4-000000067F0000400500F7D2DD0100000000__0000010D77B487A0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F30005DA03A8-000000067F00004005000060F30005DC93F1__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB4300010E2072-000000067F000040050081DB430100000000__000000D01F399709-000000D31E48D7C9":{"file_size":15392768,"generation":2,"shard":"0008"},"000000067F00004005000060F300004A8000-000000067F00004005000060F300004AC000__000000174479FC18":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060FB00016E0A44-000000067F00004005000060FB0001701588__0000006DDB29D589-000000722F474369":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F300024D8000-000000067F00004005000060F300024DC000__000000AFE87558B0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003BC8000-000000067F00004005000060F30003BCC000__000001180B3FF408":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F00100000000-000000067F00004005000060F10000004000__000000027AF9D7D0":{"file_size":134422528,"generation":1,"shard":"0008"},"000000067F000040050081DB430100000000-000000067F0000400500C782E40000074000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30003D14206-000000067F00004005000060F30003D252C8__00000117EDA82C11-0000011B632CC319":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700006479E7-000000067F00004005000060F80100000000__000000603CA8F2F0":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000B9C988-000000067F00004005000060F70000BA4F5B__000000AFD23C27B9-000000B2B5C4E8F9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F0000400500D69D790000078000-000000067F0000400500D69D79000007C000__000000EFDE07FFD8":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F70000CC8B74-000000067F00004005000060F80100000000__000000BAB1E56C91-000000BD9A7C56D9":{"file_size":95657984,"generation":2,"shard":"0008"},"000000067F00004005000060FB0000708000-000000067F00004005000060FB000070C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F000040050081DB430000EA0000-000000067F000040050081DB430000EEA075__000000C462B3C2A9-000000C824C09619":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005016EA00C000001FD3E-000000067F00004005016EA00C0000097BDA__0000018624969469-000001880F984A29":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F00004005000060F3000689E295-000000067F00004005000060F3000690F2FD__00000178B8B10551-0000017C9F5597E1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30000CE0000-000000067F00004005000060F30000D31030__0000003D03FCCDB9-000000417D21ACF9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F000040050081DB430000EA0000-030000000000000000000000000000000002__000000C483D0D6B8":{"file_size":20307968,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000807A34-000000067F00004005016EA
00C00008578D4__000001935283F9B9-00000196C9018F59":{"file_size":268451840,"generation":11,"shard":"0008"},"000000067F000040050081DB430001060000-000000067F000040050081DB430001064000__000000D037B2DBD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F3000480F32C-000000067F00004005000060F3000486837F__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F700009385D4-000000067F00004005000060F80100000000__0000008DBE2855F9-000000923719A971":{"file_size":252207104,"generation":2,"shard":"0008"},"000000067F00004005000060F30000090000-000000067F00004005000060F300000C1095__000000021DC73119-000000044854EBD1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000480620C-000000067F00004005000060F3000480F32C__000001334140FC21-00000137115BE4D9":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005FA40AD-000000067F00004005000060F30005FC519A__0000016834A3FC91-0000016B49A934C1":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060FB00014A42B8-030000000000000000000000000000000002__000000601F43CF09-000000636DE92159":{"file_size":137322496,"generation":2,"shard":"0008"},"000000067F00004005000060F30001CD0000-000000067F00004005000060F30001CD4000__0000008DDCD70B68":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005016EA00C0000404000-000000067F00004005016EA00C0000428000__000001936E73D028":{"file_size":134422528,"generation":11,"shard":"0008"},"000000067F00004005000060F30002079FDE-000000067F00004005000060F300020830BE__0000009DF02C1241-000000A173C00489":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F3000487C000-000000067F00004005000060F30004880000__00000139CF156B58":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005010A188401FFFFFFFF-000000067F00004005010A18840300000000__00000137115BE4D9-000001398B56A519":{"file_size":24576,"generation":2,"shard":"0008"},"000000067F00004005000060F70000218000-000000067F00004005000060F7000021C000__0000002427BD8BD0":{"file_size":134422528,"generation":2,"shard":"0008"},"000000067F00004005000060F30005EF454F-000000067F00004005000060F30005EFD576__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"},"000000067F00004005000060F30005DC93F1-000000067F00004005000060F30005E0A466__00000164DEE06671-0000016834A3FC91":{"file_size":268451840,"generation":2,"shard":"0008"}},"disk_consistent_lsn":"1BC/B5734CD8","metadata_bytes":{"disk_consistent_lsn":"1BC/B5734CD8","prev_record_lsn":"1BC/B5734CB0","ancestor_timeline":null,"ancestor_lsn":"0/0","latest_gc_cutoff_lsn":"1BC/B5732690","initdb_lsn":"0/14EE150","pg_version":16},"lineage":{}} diff --git a/patches/pg_anon.patch b/patches/pg_anon.patch new file mode 100644 index 0000000000..15dfd3c5a0 --- /dev/null +++ b/patches/pg_anon.patch @@ -0,0 +1,223 @@ +commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f +Author: Alexey Masterov +Date: Fri May 31 06:34:26 2024 +0000 + + These alternative expected files were added to consider the neon features + +diff --git a/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out +new file mode 100644 +index 0000000..2539cfd +--- /dev/null ++++ b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out +@@ -0,0 +1,101 @@ ++BEGIN; ++CREATE EXTENSION anon CASCADE; ++NOTICE: installing required extension "pgcrypto" ++SELECT anon.init(); ++ init ++------ ++ t 
++(1 row) ++ ++CREATE ROLE mallory_the_masked_user; ++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED'; ++CREATE TABLE t1(i INT); ++ALTER TABLE t1 ADD COLUMN t TEXT; ++SECURITY LABEL FOR anon ON COLUMN t1.t ++IS 'MASKED WITH VALUE NULL'; ++INSERT INTO t1 VALUES (1,'test'); ++-- ++-- We're checking the owner's permissions ++-- ++-- see ++-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions ++-- ++SET ROLE mallory_the_masked_user; ++SELECT anon.pseudo_first_name(0) IS NOT NULL; ++ ?column? ++---------- ++ t ++(1 row) ++ ++-- SHOULD FAIL ++DO $$ ++BEGIN ++ PERFORM anon.init(); ++ EXCEPTION WHEN insufficient_privilege ++ THEN RAISE NOTICE 'insufficient_privilege'; ++END$$; ++NOTICE: insufficient_privilege ++-- SHOULD FAIL ++DO $$ ++BEGIN ++ PERFORM anon.anonymize_table('t1'); ++ EXCEPTION WHEN insufficient_privilege ++ THEN RAISE NOTICE 'insufficient_privilege'; ++END$$; ++NOTICE: insufficient_privilege ++-- SHOULD FAIL ++SAVEPOINT fail_start_engine; ++SELECT anon.start_dynamic_masking(); ++ERROR: Only supersusers can start the dynamic masking engine. ++CONTEXT: PL/pgSQL function anon.start_dynamic_masking(boolean) line 18 at RAISE ++ROLLBACK TO fail_start_engine; ++RESET ROLE; ++SELECT anon.start_dynamic_masking(); ++ start_dynamic_masking ++----------------------- ++ t ++(1 row) ++ ++SET ROLE mallory_the_masked_user; ++SELECT * FROM mask.t1; ++ i | t ++---+--- ++ 1 | ++(1 row) ++ ++-- SHOULD FAIL ++DO $$ ++BEGIN ++ SELECT * FROM public.t1; ++ EXCEPTION WHEN insufficient_privilege ++ THEN RAISE NOTICE 'insufficient_privilege'; ++END$$; ++NOTICE: insufficient_privilege ++-- SHOULD FAIL ++SAVEPOINT fail_stop_engine; ++SELECT anon.stop_dynamic_masking(); ++ERROR: Only supersusers can stop the dynamic masking engine. ++CONTEXT: PL/pgSQL function anon.stop_dynamic_masking() line 18 at RAISE ++ROLLBACK TO fail_stop_engine; ++RESET ROLE; ++SELECT anon.stop_dynamic_masking(); ++NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually. ++ stop_dynamic_masking ++---------------------- ++ t ++(1 row) ++ ++SET ROLE mallory_the_masked_user; ++SELECT COUNT(*)=1 FROM anon.pg_masking_rules; ++ ?column? ++---------- ++ t ++(1 row) ++ ++-- SHOULD FAIL ++SAVEPOINT fail_seclabel_on_role; ++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL; ++ERROR: permission denied ++DETAIL: The current user must have the CREATEROLE attribute. ++ROLLBACK TO fail_seclabel_on_role; ++ROLLBACK; +diff --git a/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out +new file mode 100644 +index 0000000..8b090fe +--- /dev/null ++++ b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out +@@ -0,0 +1,104 @@ ++BEGIN; ++CREATE EXTENSION anon CASCADE; ++NOTICE: installing required extension "pgcrypto" ++SELECT anon.init(); ++ init ++------ ++ t ++(1 row) ++ ++CREATE ROLE oscar_the_owner; ++ALTER DATABASE :DBNAME OWNER TO oscar_the_owner; ++CREATE ROLE mallory_the_masked_user; ++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED'; ++-- ++-- We're checking the owner's permissions ++-- ++-- see ++-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions ++-- ++SET ROLE oscar_the_owner; ++SELECT anon.pseudo_first_name(0) IS NOT NULL; ++ ?column? 
++---------- ++ t ++(1 row) ++ ++-- SHOULD FAIL ++DO $$ ++BEGIN ++ PERFORM anon.init(); ++ EXCEPTION WHEN insufficient_privilege ++ THEN RAISE NOTICE 'insufficient_privilege'; ++END$$; ++NOTICE: insufficient_privilege ++CREATE TABLE t1(i INT); ++ALTER TABLE t1 ADD COLUMN t TEXT; ++SECURITY LABEL FOR anon ON COLUMN t1.t ++IS 'MASKED WITH VALUE NULL'; ++INSERT INTO t1 VALUES (1,'test'); ++SELECT anon.anonymize_table('t1'); ++ anonymize_table ++----------------- ++ t ++(1 row) ++ ++SELECT * FROM t1; ++ i | t ++---+--- ++ 1 | ++(1 row) ++ ++UPDATE t1 SET t='test' WHERE i=1; ++-- SHOULD FAIL ++SAVEPOINT fail_start_engine; ++SELECT anon.start_dynamic_masking(); ++ start_dynamic_masking ++----------------------- ++ t ++(1 row) ++ ++ROLLBACK TO fail_start_engine; ++RESET ROLE; ++SELECT anon.start_dynamic_masking(); ++ start_dynamic_masking ++----------------------- ++ t ++(1 row) ++ ++SET ROLE oscar_the_owner; ++SELECT * FROM t1; ++ i | t ++---+------ ++ 1 | test ++(1 row) ++ ++--SELECT * FROM mask.t1; ++-- SHOULD FAIL ++SAVEPOINT fail_stop_engine; ++SELECT anon.stop_dynamic_masking(); ++ERROR: permission denied for schema mask ++CONTEXT: SQL statement "DROP VIEW mask.t1;" ++PL/pgSQL function anon.mask_drop_view(oid) line 3 at EXECUTE ++SQL statement "SELECT anon.mask_drop_view(oid) ++ FROM pg_catalog.pg_class ++ WHERE relnamespace=quote_ident(pg_catalog.current_setting('anon.sourceschema'))::REGNAMESPACE ++ AND relkind IN ('r','p','f')" ++PL/pgSQL function anon.stop_dynamic_masking() line 22 at PERFORM ++ROLLBACK TO fail_stop_engine; ++RESET ROLE; ++SELECT anon.stop_dynamic_masking(); ++NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually. ++ stop_dynamic_masking ++---------------------- ++ t ++(1 row) ++ ++SET ROLE oscar_the_owner; ++-- SHOULD FAIL ++SAVEPOINT fail_seclabel_on_role; ++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL; ++ERROR: permission denied ++DETAIL: The current user must have the CREATEROLE attribute. 
++ROLLBACK TO fail_seclabel_on_role; ++ROLLBACK; diff --git a/patches/pg_cron.patch new file mode 100644 index 0000000000..c2b648c20c --- /dev/null +++ b/patches/pg_cron.patch @@ -0,0 +1,19 @@ +commit b3ea51ee158f113f2f82d0b97c12c54343c9a695 (HEAD -> master) +Author: Alexey Masterov +Date: Fri Jun 7 19:23:42 2024 +0000 + + Disable the REGRESS_OPTS that trigger initdb + +diff --git a/ext-src/pg_cron-src/Makefile b/ext-src/pg_cron-src/Makefile +index 053314c..fbd5fb5 100644 +--- a/ext-src/pg_cron-src/Makefile ++++ b/ext-src/pg_cron-src/Makefile +@@ -5,7 +5,7 @@ EXTENSION = pg_cron + DATA_built = $(EXTENSION)--1.0.sql + DATA = $(wildcard $(EXTENSION)--*--*.sql) + +-REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check ++#REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check + REGRESS = pg_cron-test + + # compilation configuration diff --git a/patches/pg_hint_plan.patch new file mode 100644 index 0000000000..4039a036df --- /dev/null +++ b/patches/pg_hint_plan.patch @@ -0,0 +1,53 @@ +diff --git a/expected/ut-A.out b/expected/ut-A.out +index da723b8..5328114 100644 +--- a/expected/ut-A.out ++++ b/expected/ut-A.out +@@ -9,13 +9,16 @@ SET search_path TO public; + ---- + -- No.A-1-1-3 + CREATE EXTENSION pg_hint_plan; ++LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan + -- No.A-1-2-3 + DROP EXTENSION pg_hint_plan; + -- No.A-1-1-4 + CREATE SCHEMA other_schema; + CREATE EXTENSION pg_hint_plan SCHEMA other_schema; ++LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan + ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan" + CREATE EXTENSION pg_hint_plan; ++LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan + DROP SCHEMA other_schema; + ---- + ---- No.
A-5-1 comment pattern +@@ -3175,6 +3178,7 @@ SELECT s.query, s.calls + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d + ON (s.dbid = d.oid) ++ WHERE s.query LIKE 'SELECT * FROM s1.t1%' OR s.query LIKE '%pg_stat_statements_reset%' + ORDER BY 1; + query | calls + --------------------------------------+------- +diff --git a/expected/ut-fdw.out b/expected/ut-fdw.out +index d372459..6282afe 100644 +--- a/expected/ut-fdw.out ++++ b/expected/ut-fdw.out +@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on; + SET client_min_messages TO LOG; + SET pg_hint_plan.enable_hint TO on; + CREATE EXTENSION file_fdw; ++LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw + CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw; + CREATE USER MAPPING FOR PUBLIC SERVER file_server; + CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename'); +diff --git a/sql/ut-A.sql b/sql/ut-A.sql +index 7c7d58a..4fd1a07 100644 +--- a/sql/ut-A.sql ++++ b/sql/ut-A.sql +@@ -963,6 +963,7 @@ SELECT s.query, s.calls + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d + ON (s.dbid = d.oid) ++ WHERE s.query LIKE 'SELECT * FROM s1.t1%' OR s.query LIKE '%pg_stat_statements_reset%' + ORDER BY 1; + + ---- diff --git a/patches/pgvector.patch b/patches/pgvector.patch new file mode 100644 index 0000000000..3e1ffcaaaf --- /dev/null +++ b/patches/pgvector.patch @@ -0,0 +1,62 @@ +diff --git a/src/hnswbuild.c b/src/hnswbuild.c +index dcfb2bd..d5189ee 100644 +--- a/src/hnswbuild.c ++++ b/src/hnswbuild.c +@@ -860,9 +860,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) + + hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false); + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(RelationGetSmgr(indexRel)); ++#endif ++ + /* Perform inserts */ + HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel)); ++#endif ++ + /* Close relations within worker */ + index_close(indexRel, indexLockmode); + table_close(heapRel, heapLockmode); +@@ -1117,12 +1125,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, + SeedRandom(42); + #endif + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(RelationGetSmgr(index)); ++#endif ++ + InitBuildState(buildstate, heap, index, indexInfo, forkNum); + + BuildGraph(buildstate, forkNum); + +- if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); ++#endif ++ ++ if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) { + log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocksInFork(index, forkNum), true); ++#ifdef NEON_SMGR ++ { ++#if PG_VERSION_NUM >= 160000 ++ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator; ++#else ++ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; ++#endif ++ ++ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, ++ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); ++ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM); ++ } ++#endif ++ } ++ ++#ifdef NEON_SMGR ++ smgr_end_unlogged_build(RelationGetSmgr(index)); ++#endif + + FreeBuildState(buildstate); + } diff --git a/patches/rum.patch b/patches/rum.patch new file mode 100644 index 0000000000..3041f8df81 --- /dev/null +++ b/patches/rum.patch @@ -0,0 +1,54 @@ +commit 68f3b3b0d594f08aacc4a082ee210749ed5677eb +Author: Anastasia Lubennikova +Date: Mon Jul 15 
12:31:56 2024 +0100 + + Neon: fix unlogged index build patch + +diff --git a/src/ruminsert.c b/src/ruminsert.c +index e8b209d..e89bf2a 100644 +--- a/src/ruminsert.c ++++ b/src/ruminsert.c +@@ -628,6 +628,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(index->rd_smgr); ++#endif ++ + initRumState(&buildstate.rumstate, index); + buildstate.rumstate.isBuild = true; + buildstate.indtuples = 0; +@@ -693,6 +697,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) + buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); + rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(index->rd_smgr); ++#endif ++ + /* + * Write index to xlog + */ +@@ -713,6 +721,21 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) + UnlockReleaseBuffer(buffer); + } + ++#ifdef NEON_SMGR ++ { ++#if PG_VERSION_NUM >= 160000 ++ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator; ++#else ++ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; ++#endif ++ ++ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); ++ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM); ++ ++ smgr_end_unlogged_build(index->rd_smgr); ++ } ++#endif ++ + /* + * Return statistics + */ diff --git a/pgxn/.dir-locals.el b/pgxn/.dir-locals.el new file mode 100644 index 0000000000..ab6208b698 --- /dev/null +++ b/pgxn/.dir-locals.el @@ -0,0 +1,19 @@ +;; see also src/tools/editors/emacs.samples for more complete settings + +((c-mode . ((c-basic-offset . 4) + (c-file-style . "bsd") + (fill-column . 78) + (indent-tabs-mode . t) + (tab-width . 4))) + (nxml-mode . ((fill-column . 78) + (indent-tabs-mode . nil))) + (perl-mode . ((perl-indent-level . 4) + (perl-continued-statement-offset . 2) + (perl-continued-brace-offset . -2) + (perl-brace-offset . 0) + (perl-brace-imaginary-offset . 0) + (perl-label-offset . -2) + (indent-tabs-mode . t) + (tab-width . 4))) + (sgml-mode . ((fill-column . 78) + (indent-tabs-mode . 
nil)))) diff --git a/pgxn/.editorconfig b/pgxn/.editorconfig new file mode 100644 index 0000000000..d69a3d1dc4 --- /dev/null +++ b/pgxn/.editorconfig @@ -0,0 +1,14 @@ +root = true + +[*.{c,h,l,y,pl,pm}] +indent_style = tab +indent_size = tab +tab_width = 4 + +[*.{sgml,xml}] +indent_style = space +indent_size = 1 + +[*.xsl] +indent_style = space +indent_size = 2 diff --git a/pgxn/hnsw/hnsw.c b/pgxn/hnsw/hnsw.c index 45bf78ed3b..e624cb831f 100644 --- a/pgxn/hnsw/hnsw.c +++ b/pgxn/hnsw/hnsw.c @@ -149,7 +149,7 @@ hnsw_check_available_memory(Size requested) struct sysinfo si; Size total; if (sysinfo(&si) < 0) - elog(ERROR, "Failed to get amount of RAM: %n"); + elog(ERROR, "Failed to get amount of RAM: %m"); total = si.totalram*si.mem_unit; if ((Size)NBuffers*BLCKSZ + requested >= total) diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index c6b224a14d..3b755bb042 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -6,6 +6,7 @@ OBJS = \ $(WIN32RES) \ extension_server.o \ file_cache.o \ + hll.o \ libpagestore.o \ neon.o \ neon_utils.o \ @@ -14,14 +15,15 @@ OBJS = \ relsize_cache.o \ walproposer.o \ walproposer_pg.o \ - control_plane_connector.o + control_plane_connector.o \ + walsender_hooks.o PG_CPPFLAGS = -I$(libpq_srcdir) SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql +DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index f6f006cba4..de023da5c4 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -35,15 +35,17 @@ #include "utils/memutils.h" #include "utils/jsonb.h" +#include "control_plane_connector.h" +#include "neon_utils.h" + static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; +static const char *jwt_token = NULL; + /* GUCs */ static char *ConsoleURL = NULL; static bool ForwardDDL = true; - -/* Curl structures for sending the HTTP requests */ -static CURL *CurlHandle; -static struct curl_slist *ContentHeader = NULL; +static bool RegressTestMode = false; /* * CURL docs say that this buffer must exist until we call curl_easy_cleanup @@ -113,6 +115,8 @@ ConstructDeltaMessage() if (RootTable.db_table) { JsonbValue dbs; + HASH_SEQ_STATUS status; + DbEntry *entry; dbs.type = jbvString; dbs.val.string.val = "dbs"; @@ -120,9 +124,6 @@ ConstructDeltaMessage() pushJsonbValue(&state, WJB_KEY, &dbs); pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL); - HASH_SEQ_STATUS status; - DbEntry *entry; - hash_seq_init(&status, RootTable.db_table); while ((entry = hash_seq_search(&status)) != NULL) { @@ -168,8 +169,9 @@ ConstructDeltaMessage() #else const char *logdetail; #endif + char *encrypted_password; PushKeyValue(&state, "password", (char *) entry->password); - char *encrypted_password = get_role_password(entry->name, &logdetail); + encrypted_password = get_role_password(entry->name, &logdetail); if (encrypted_password) { @@ -226,6 +228,8 @@ ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata) static void SendDeltasToControlPlane() { + static CURL *handle = NULL; + if (!RootTable.db_table && !RootTable.role_table) return; if (!ConsoleURL) @@ -236,29 +240,57 @@ SendDeltasToControlPlane() if (!ForwardDDL) return; - char *message = ConstructDeltaMessage(); - ErrorString str = {}; + if (handle 
== NULL) + { + struct curl_slist *headers = NULL; - curl_easy_setopt(CurlHandle, CURLOPT_CUSTOMREQUEST, "PATCH"); - curl_easy_setopt(CurlHandle, CURLOPT_HTTPHEADER, ContentHeader); - curl_easy_setopt(CurlHandle, CURLOPT_POSTFIELDS, message); - curl_easy_setopt(CurlHandle, CURLOPT_URL, ConsoleURL); - curl_easy_setopt(CurlHandle, CURLOPT_ERRORBUFFER, CurlErrorBuf); - curl_easy_setopt(CurlHandle, CURLOPT_TIMEOUT, 3L /* seconds */ ); - curl_easy_setopt(CurlHandle, CURLOPT_WRITEDATA, &str); - curl_easy_setopt(CurlHandle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback); + headers = curl_slist_append(headers, "Content-Type: application/json"); + if (headers == NULL) + { + elog(ERROR, "Failed to set Content-Type header"); + } + + if (jwt_token) + { + char auth_header[8192]; + + snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", jwt_token); + headers = curl_slist_append(headers, auth_header); + if (headers == NULL) + { + elog(ERROR, "Failed to set Authorization header"); + } + } + + handle = alloc_curl_handle(); + + curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "PATCH"); + curl_easy_setopt(handle, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(handle, CURLOPT_URL, ConsoleURL); + curl_easy_setopt(handle, CURLOPT_ERRORBUFFER, CurlErrorBuf); + curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ ); + curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback); + } + + char *message = ConstructDeltaMessage(); + ErrorString str; + + str.size = 0; + + curl_easy_setopt(handle, CURLOPT_POSTFIELDS, message); + curl_easy_setopt(handle, CURLOPT_WRITEDATA, &str); const int num_retries = 5; - int curl_status; + CURLcode curl_status; for (int i = 0; i < num_retries; i++) { - if ((curl_status = curl_easy_perform(CurlHandle)) == 0) + if ((curl_status = curl_easy_perform(handle)) == 0) break; elog(LOG, "Curl request failed on attempt %d: %s", i, CurlErrorBuf); pg_usleep(1000 * 1000); } - if (curl_status != 0) + if (curl_status != CURLE_OK) { elog(ERROR, "Failed to perform curl request: %s", CurlErrorBuf); } @@ -266,13 +298,11 @@ SendDeltasToControlPlane() { long response_code; - if (curl_easy_getinfo(CurlHandle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION) + if (curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION) { - bool error_exists = str.size != 0; - if (response_code != 200) { - if (error_exists) + if (str.size != 0) { elog(ERROR, "Received HTTP code %ld from control plane: %s", @@ -773,6 +803,14 @@ NeonProcessUtility( case T_DropRoleStmt: HandleDropRole(castNode(DropRoleStmt, parseTree)); break; + case T_CreateTableSpaceStmt: + if (!RegressTestMode) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("CREATE TABLESPACE is not supported on Neon"))); + } + break; default: break; } @@ -803,7 +841,7 @@ NeonProcessUtility( } } -extern void +void InitControlPlaneConnector() { PreviousProcessUtilityHook = ProcessUtility_hook; @@ -835,34 +873,22 @@ InitControlPlaneConnector() NULL, NULL); - const char *jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN"); + DefineCustomBoolVariable( + "neon.regress_test_mode", + "Controls whether we are running in the regression test mode", + NULL, + &RegressTestMode, + false, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN"); if (!jwt_token) { elog(LOG, "Missing NEON_CONTROL_PLANE_TOKEN environment variable, forwarding will not be authenticated"); } - if (curl_global_init(CURL_GLOBAL_DEFAULT)) - { - elog(ERROR, "Failed to initialize 
curl"); - } - if ((CurlHandle = curl_easy_init()) == NULL) - { - elog(ERROR, "Failed to initialize curl handle"); - } - if ((ContentHeader = curl_slist_append(ContentHeader, "Content-Type: application/json")) == NULL) - { - elog(ERROR, "Failed to initialize content header"); - } - - if (jwt_token) - { - char auth_header[8192]; - - snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", jwt_token); - if ((ContentHeader = curl_slist_append(ContentHeader, auth_header)) == NULL) - { - elog(ERROR, "Failed to initialize authorization header"); - } - } } diff --git a/pgxn/neon/control_plane_connector.h b/pgxn/neon/control_plane_connector.h index 12d6a97562..7eed449200 100644 --- a/pgxn/neon/control_plane_connector.h +++ b/pgxn/neon/control_plane_connector.h @@ -1,6 +1,6 @@ #ifndef CONTROL_PLANE_CONNECTOR_H #define CONTROL_PLANE_CONNECTOR_H -void InitControlPlaneConnector(); +void InitControlPlaneConnector(void); #endif diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index d9a75142f1..e38af08f89 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -14,6 +14,9 @@ #include "utils/guc.h" +#include "extension_server.h" +#include "neon_utils.h" + static int extension_server_port = 0; static download_extension_file_hook_type prev_download_extension_file_hook = NULL; @@ -31,15 +34,18 @@ static download_extension_file_hook_type prev_download_extension_file_hook = NUL static bool neon_download_extension_file_http(const char *filename, bool is_library) { - CURL *curl; + static CURL *handle = NULL; + CURLcode res; char *compute_ctl_url; - char *postdata; bool ret = false; - if ((curl = curl_easy_init()) == NULL) + if (handle == NULL) { - elog(ERROR, "Failed to initialize curl handle"); + handle = alloc_curl_handle(); + + curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST"); + curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ ); } compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s", @@ -47,28 +53,22 @@ neon_download_extension_file_http(const char *filename, bool is_library) elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url); - curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST"); - curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url); - curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */ ); + curl_easy_setopt(handle, CURLOPT_URL, compute_ctl_url); - if (curl) + /* Perform the request, res will get the return code */ + res = curl_easy_perform(handle); + /* Check for errors */ + if (res == CURLE_OK) { - /* Perform the request, res will get the return code */ - res = curl_easy_perform(curl); - /* Check for errors */ - if (res == CURLE_OK) - { - ret = true; - } - else - { - /* Don't error here because postgres will try to find the file */ - /* and will fail with some proper error message if it's not found. */ - elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res)); - } - - /* always cleanup */ - curl_easy_cleanup(curl); + ret = true; + } + else + { + /* + * Don't error here because postgres will try to find the file and will + * fail with some proper error message if it's not found. 
+	 */
+	elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
 	}
 
 	return ret;
diff --git a/pgxn/neon/extension_server.h b/pgxn/neon/extension_server.h
new file mode 100644
index 0000000000..3e67708b85
--- /dev/null
+++ b/pgxn/neon/extension_server.h
@@ -0,0 +1,17 @@
+/*-------------------------------------------------------------------------
+ *
+ * extension_server.h
+ *	  Request compute_ctl to download extension files.
+ *
+ * IDENTIFICATION
+ *	  contrib/neon/extension_server.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef EXTENSION_SERVER_H
+#define EXTENSION_SERVER_H
+
+void pg_init_extension_server(void);
+
+#endif							/* EXTENSION_SERVER_H */
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 21db666caa..479209a537 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -25,6 +25,7 @@
 #include "funcapi.h"
 #include "miscadmin.h"
 #include "pagestore_client.h"
+#include "common/hashfn.h"
 #include "pgstat.h"
 #include "postmaster/bgworker.h"
 #include RELFILEINFO_HDR
@@ -38,6 +39,10 @@
 #include "utils/dynahash.h"
 #include "utils/guc.h"
 
+#include "hll.h"
+
+#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)
+
 /*
  * Local file cache is used to temporary store relations pages in local file system.
  * All blocks of all relations are stored inside one file and addressed using shared hash map.
 *
 * Cache is always reconstructed at node startup, so we do not need to save mapping somewhere and worry about
 * its consistency.
+
+ *
+ * ## Holes
+ *
+ * The LFC can be resized on the fly, up to a maximum size that's determined
+ * at server startup (neon.max_file_cache_size). After server startup, we
+ * expand the underlying file when needed, until it reaches the soft limit
+ * (neon.file_cache_size_limit). If the soft limit is later reduced, we shrink
+ * the LFC by punching holes in the underlying file with a
+ * fallocate(FALLOC_FL_PUNCH_HOLE) call. The nominal size of the file doesn't
+ * shrink, but the disk space it uses does.
+ *
+ * Each hole is tracked by a dummy FileCacheEntry, which is kept in the
+ * 'holes' linked list. They are entered into the chunk hash table, with a
+ * special key where the blockNumber is used to store the 'offset' of the
+ * hole, and all other fields are zero. Holes are never looked up in the hash
+ * table, we only enter them there to have a FileCacheEntry that we can keep
+ * in the linked list. If the soft limit is raised again, we reuse the holes
+ * before extending the nominal size of the file.
  */
 
 /* Local file storage allocation chunk.
- * Should be power of two and not less than 32. Using larger than page chunks can
+ * Should be power of two. Using chunks larger than a page can
  * 1. Reduce hash-map memory footprint: 8TB database contains billion pages
  * and size of hash entry is 40 bytes, so we need 40Gb just for hash map.
  * 1Mb chunks can reduce hash map size to 320Mb.
  * 2.
Improve access locality, subsequent pages will be allocated together improving seqscan speed */ #define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */ +/* + * Smaller chunk seems to be better for OLTP workload + */ +// #define BLOCKS_PER_CHUNK 8 /* 64kb chunk */ #define MB ((uint64)1024*1024) #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) +#define CHUNK_BITMAP_SIZE ((BLOCKS_PER_CHUNK + 31) / 32) typedef struct FileCacheEntry { @@ -68,8 +97,8 @@ typedef struct FileCacheEntry uint32 hash; uint32 offset; uint32 access_count; - uint32 bitmap[BLOCKS_PER_CHUNK / 32]; - dlist_node lru_node; /* LRU list node */ + uint32 bitmap[CHUNK_BITMAP_SIZE]; + dlist_node list_node; /* LRU/holes list node */ } FileCacheEntry; typedef struct FileCacheControl @@ -84,6 +113,8 @@ typedef struct FileCacheControl uint64 writes; dlist_head lru; /* double linked list for LRU replacement * algorithm */ + dlist_head holes; /* double linked list of punched holes */ + HyperLogLogState wss_estimation; /* estimation of working set size */ } FileCacheControl; static HTAB *lfc_hash; @@ -131,6 +162,7 @@ lfc_disable(char const *op) lfc_ctl->used = 0; lfc_ctl->limit = 0; dlist_init(&lfc_ctl->lru); + dlist_init(&lfc_ctl->holes); if (lfc_desc > 0) { @@ -210,18 +242,18 @@ lfc_shmem_startup(void) if (!found) { int fd; - uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size); + uint32 n_chunks = SIZE_MB_TO_CHUNKS(lfc_max_size); lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock"); info.keysize = sizeof(BufferTag); info.entrysize = sizeof(FileCacheEntry); /* - * lfc_size+1 because we add new element to hash table before eviction + * n_chunks+1 because we add new element to hash table before eviction * of victim */ lfc_hash = ShmemInitHash("lfc_hash", - lfc_size + 1, lfc_size + 1, + n_chunks + 1, n_chunks + 1, &info, HASH_ELEM | HASH_BLOBS); lfc_ctl->generation = 0; @@ -231,6 +263,10 @@ lfc_shmem_startup(void) lfc_ctl->misses = 0; lfc_ctl->writes = 0; dlist_init(&lfc_ctl->lru); + dlist_init(&lfc_ctl->holes); + + /* Initialize hyper-log-log structure for estimating working set size */ + initSHLL(&lfc_ctl->wss_estimation); /* Recreate file cache on restart */ fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); @@ -303,17 +339,37 @@ lfc_change_limit_hook(int newval, void *extra) * Shrink cache by throwing away least recently accessed chunks and * returning their space to file system */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); + FileCacheEntry *hole; + uint32 offset = victim->offset; + uint32 hash; + bool found; + BufferTag holetag; - Assert(victim->access_count == 0); + CriticalAssert(victim->access_count == 0); #ifdef FALLOC_FL_PUNCH_HOLE if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0) neon_log(LOG, "Failed to punch hole in file: %m"); #endif + /* We remove the old entry, and re-enter a hole to the hash table */ hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); + + memset(&holetag, 0, sizeof(holetag)); + holetag.blockNum = offset; + hash = get_hash_value(lfc_hash, &holetag); + hole = hash_search_with_hash_value(lfc_hash, &holetag, hash, HASH_ENTER, &found); + hole->hash = hash; + hole->offset = offset; + hole->access_count = 0; + CriticalAssert(!found); + dlist_push_tail(&lfc_ctl->holes, 
&hole->list_node); + lfc_ctl->used -= 1; } lfc_ctl->limit = new_size; + if (new_size == 0) { + lfc_ctl->generation += 1; + } neon_log(DEBUG1, "set local file cache limit to %d", new_size); LWLockRelease(lfc_lock); @@ -399,6 +455,8 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_SHARED); @@ -430,6 +488,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) tag.forkNum = forkNum; tag.blockNum = (blkno & ~(BLOCKS_PER_CHUNK - 1)); + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); @@ -460,7 +519,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) { bool has_remaining_pages; - for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) + for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) { if (entry->bitmap[i] != 0) { @@ -475,8 +534,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) */ if (!has_remaining_pages) { - dlist_delete(&entry->lru_node); - dlist_push_head(&lfc_ctl->lru, &entry->lru_node); + dlist_delete(&entry->list_node); + dlist_push_head(&lfc_ctl->lru, &entry->list_node); } } @@ -515,6 +574,8 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); @@ -526,16 +587,22 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + + /* Approximate working set */ + tag.blockNum = blkno; + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0) { /* Page is not cached */ lfc_ctl->misses += 1; + pgBufferUsage.file_cache.misses += 1; LWLockRelease(lfc_lock); return false; } /* Unlink entry from LRU list to pin it for the duration of IO operation */ if (entry->access_count++ == 0) - dlist_delete(&entry->lru_node); + dlist_delete(&entry->list_node); generation = lfc_ctl->generation; entry_offset = entry->offset; @@ -553,11 +620,12 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (lfc_ctl->generation == generation) { - Assert(LFC_ENABLED()); + CriticalAssert(LFC_ENABLED()); lfc_ctl->hits += 1; - Assert(entry->access_count > 0); + pgBufferUsage.file_cache.hits += 1; + CriticalAssert(entry->access_count > 0); if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); } else result = false; @@ -596,6 +664,8 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void tag.forkNum = forkNum; tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); CopyNRelFileInfoToBufTag(tag, rinfo); + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); @@ -615,7 +685,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void * operation */ if (entry->access_count++ == 0) - 
dlist_delete(&entry->lru_node); + dlist_delete(&entry->list_node); } else { @@ -638,13 +708,26 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru)) { /* Cache overflow: evict least recently used chunk */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); - Assert(victim->access_count == 0); + CriticalAssert(victim->access_count == 0); entry->offset = victim->offset; /* grab victim's chunk */ hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); neon_log(DEBUG2, "Swap file cache page"); } + else if (!dlist_is_empty(&lfc_ctl->holes)) + { + /* We can reuse a hole that was left behind when the LFC was shrunk previously */ + FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes)); + uint32 offset = hole->offset; + bool found; + + hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found); + CriticalAssert(found); + + lfc_ctl->used += 1; + entry->offset = offset; /* reuse the hole */ + } else { lfc_ctl->used += 1; @@ -672,11 +755,11 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void if (lfc_ctl->generation == generation) { - Assert(LFC_ENABLED()); + CriticalAssert(LFC_ENABLED()); /* Place entry to the head of LRU list */ - Assert(entry->access_count > 0); + CriticalAssert(entry->access_count > 0); if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31)); } @@ -691,7 +774,6 @@ typedef struct } NeonGetStatsCtx; #define NUM_NEON_GET_STATS_COLS 2 -#define NUM_NEON_GET_STATS_ROWS 3 PG_FUNCTION_INFO_V1(neon_get_lfc_stats); Datum @@ -727,7 +809,6 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) INT8OID, -1, 0); fctx->tupdesc = BlessTupleDesc(tupledesc); - funcctx->max_calls = NUM_NEON_GET_STATS_ROWS; funcctx->user_fctx = fctx; /* Return to original context when allocating transient memory */ @@ -761,6 +842,11 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) if (lfc_ctl) value = lfc_ctl->writes; break; + case 4: + key = "file_cache_size"; + if (lfc_ctl) + value = lfc_ctl->size; + break; default: SRF_RETURN_DONE(funcctx); } @@ -884,7 +970,7 @@ local_cache_pages(PG_FUNCTION_ARGS) hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { - for (int i = 0; i < BLOCKS_PER_CHUNK / 32; i++) + for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) n_pages += pg_popcount32(entry->bitmap[i]); } } @@ -962,3 +1048,39 @@ local_cache_pages(PG_FUNCTION_ARGS) else SRF_RETURN_DONE(funcctx); } + +PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds); + +Datum +approximate_working_set_size_seconds(PG_FUNCTION_ARGS) +{ + if (lfc_size_limit != 0) + { + int32 dc; + time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0); + LWLockAcquire(lfc_lock, LW_SHARED); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration); + LWLockRelease(lfc_lock); + PG_RETURN_INT32(dc); + } + PG_RETURN_NULL(); +} + +PG_FUNCTION_INFO_V1(approximate_working_set_size); + +Datum +approximate_working_set_size(PG_FUNCTION_ARGS) +{ + if (lfc_size_limit != 0) + { + int32 dc; + bool reset = PG_GETARG_BOOL(0); + LWLockAcquire(lfc_lock, reset ? 
LW_EXCLUSIVE : LW_SHARED);
+		dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1);
+		if (reset)
+			memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
+		LWLockRelease(lfc_lock);
+		PG_RETURN_INT32(dc);
+	}
+	PG_RETURN_NULL();
+}
diff --git a/pgxn/neon/hll.c b/pgxn/neon/hll.c
new file mode 100644
index 0000000000..f8496b3125
--- /dev/null
+++ b/pgxn/neon/hll.c
@@ -0,0 +1,193 @@
+/*-------------------------------------------------------------------------
+ *
+ * hll.c
+ *	  Sliding HyperLogLog cardinality estimator
+ *
+ * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
+ *
+ * Implements https://hal.science/hal-00465313/document
+ *
+ * Based on Hideaki Ohno's C++ implementation. This is probably not ideally
+ * suited to estimating the cardinality of very large sets; in particular, we
+ * have not attempted to further optimize the implementation as described in
+ * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
+ * Engineering of a State of The Art Cardinality Estimation Algorithm".
+ *
+ * A sparse representation of HyperLogLog state is used, with fixed space
+ * overhead.
+ *
+ * The copyright terms of Ohno's original version (the MIT license) follow.
+ *
+ * IDENTIFICATION
+ *	  src/backend/lib/hyperloglog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * Copyright (c) 2013 Hideaki Ohno
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the 'Software'), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <math.h>
+
+#include "postgres.h"
+#include "funcapi.h"
+#include "port/pg_bitutils.h"
+#include "utils/timestamp.h"
+#include "hll.h"
+
+
+#define POW_2_32 (4294967296.0)
+#define NEG_POW_2_32 (-4294967296.0)
+
+#define ALPHA_MM ((0.7213 / (1.0 + 1.079 / HLL_N_REGISTERS)) * HLL_N_REGISTERS * HLL_N_REGISTERS)
+
+/*
+ * Worker for addHyperLogLog().
+ *
+ * Calculates the position of the first set bit in first b bits of x argument
+ * starting from the first, reading from most significant to least significant
+ * bits.
+ *
+ * Example (when considering first 10 bits of x):
+ *
+ * rho(x = 0b1000000000) returns 1
+ * rho(x = 0b0010000000) returns 3
+ * rho(x = 0b0000000000) returns b + 1
+ *
+ * "The binary address determined by the first b bits of x"
+ *
+ * Return value "j" used to index bit pattern to watch.
+ */
+static inline uint8
+rho(uint32 x, uint8 b)
+{
+	uint8		j = 1;
+
+	if (x == 0)
+		return b + 1;
+
+	j = 32 - pg_leftmost_one_pos32(x);
+
+	if (j > b)
+		return b + 1;
+
+	return j;
+}
+
+/*
+ * Initialize HyperLogLog track state
+ */
+void
+initSHLL(HyperLogLogState *cState)
+{
+	memset(cState->regs, 0, sizeof(cState->regs));
+}
+
+/*
+ * Adds element to the estimator, from caller-supplied hash.
+ *
+ * It is critical that the hash value passed be an actual hash value, typically
+ * generated using hash_any(). The algorithm relies on a specific bit-pattern
+ * observable in conjunction with stochastic averaging. There must be a
+ * uniform distribution of bits in hash values for each distinct original value
+ * observed.
+ */
+void
+addSHLL(HyperLogLogState *cState, uint32 hash)
+{
+	uint8		count;
+	uint32		index;
+	size_t		i;
+	size_t		j;
+
+	TimestampTz now = GetCurrentTimestamp();
+	/* Use the first "k" (registerWidth) bits as a zero based index */
+	index = hash >> HLL_C_BITS;
+
+	/* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
+	count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS);
+
+	cState->regs[index][count] = now;
+}
+
+static uint8
+getMaximum(const TimestampTz* reg, TimestampTz since)
+{
+	uint8		max = 0;
+
+	for (size_t i = 0; i < HLL_C_BITS + 1; i++)
+	{
+		if (reg[i] >= since)
+		{
+			max = i;
+		}
+	}
+
+	return max;
+}
+
+
+/*
+ * Estimates cardinality, based on elements added so far
+ */
+double
+estimateSHLL(HyperLogLogState *cState, time_t duration)
+{
+	double		result;
+	double		sum = 0.0;
+	size_t		i;
+	uint8		R[HLL_N_REGISTERS];
+	/* 0 indicates uninitialized timestamp, so if we need to cover the whole range, start with 1 */
+	TimestampTz since = duration == (time_t)-1 ? 1 : GetCurrentTimestamp() - duration * USECS_PER_SEC;
+
+	for (i = 0; i < HLL_N_REGISTERS; i++)
+	{
+		R[i] = getMaximum(cState->regs[i], since);
+		sum += 1.0 / pow(2.0, R[i]);
+	}
+
+	/* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */
+	result = ALPHA_MM / sum;
+
+	if (result <= (5.0 / 2.0) * HLL_N_REGISTERS)
+	{
+		/* Small range correction */
+		int			zero_count = 0;
+
+		for (i = 0; i < HLL_N_REGISTERS; i++)
+		{
+			zero_count += R[i] == 0;
+		}
+
+		if (zero_count != 0)
+			result = HLL_N_REGISTERS * log((double) HLL_N_REGISTERS /
+										   zero_count);
+	}
+	else if (result > (1.0 / 30.0) * POW_2_32)
+	{
+		/* Large range correction */
+		result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32));
+	}
+
+	return result;
+}
+
diff --git a/pgxn/neon/hll.h b/pgxn/neon/hll.h
new file mode 100644
index 0000000000..9256cb9afa
--- /dev/null
+++ b/pgxn/neon/hll.h
@@ -0,0 +1,86 @@
+/*-------------------------------------------------------------------------
+ *
+ * hll.h
+ *	  Sliding HyperLogLog cardinality estimator
+ *
+ * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
+ *
+ * Implements https://hal.science/hal-00465313/document
+ *
+ * Based on Hideaki Ohno's C++ implementation. This is probably not ideally
+ * suited to estimating the cardinality of very large sets; in particular, we
+ * have not attempted to further optimize the implementation as described in
+ * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
+ * Engineering of a State of The Art Cardinality Estimation Algorithm".
+ *
+ * A sparse representation of HyperLogLog state is used, with fixed space
+ * overhead.
+ *
+ * The copyright terms of Ohno's original version (the MIT license) follow.
+ *
+ * IDENTIFICATION
+ *	  src/backend/lib/hyperloglog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * Copyright (c) 2013 Hideaki Ohno
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the 'Software'), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef HLL_H
+#define HLL_H
+
+#define HLL_BIT_WIDTH 10
+#define HLL_C_BITS (32 - HLL_BIT_WIDTH)
+#define HLL_N_REGISTERS (1 << HLL_BIT_WIDTH)
+
+/*
+ * HyperLogLog is an approximate technique for computing the number of distinct
+ * entries in a set. Importantly, it does this by using a fixed amount of
+ * memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal
+ * cardinality estimation algorithm" for more.
+ *
+ * Instead of a single counter for every bits register, we have a timestamp
+ * for every valid number of bits we can encounter. Every time we encounter
+ * a certain number of bits, we update the timestamp in those registers to
+ * the current timestamp.
+ *
+ * We can query the sketch's stored cardinality for the range of some timestamp
+ * up to now: For each register, we return the highest bits bucket that has a
+ * modified timestamp >= the query timestamp. This value is the number of bits
+ * for this register in the normal HLL calculation.
+ *
+ * The memory usage is 2^B * (C + 1) * sizeof(TimestampTz), or 184kiB.
+ * Usage could be halved if we decide to reduce the required time dimension
+ * precision, as 32 bits at second precision should be enough for statistics.
+ * However, that is not yet implemented.
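For a concrete picture of how this sliding-window estimator is meant to be driven, here is a hedged usage sketch written against the API declared in this header (initSHLL()/addSHLL()/estimateSHLL()) plus PostgreSQL's hash_bytes(); the loop and block numbers are invented, and in the actual patch it is lfc_read() that performs the addSHLL() call for every page access.

#include "postgres.h"
#include "common/hashfn.h"
#include "hll.h"

static HyperLogLogState wss;

static void
wss_example(void)
{
	double		last_minute;
	double		all_time;

	initSHLL(&wss);

	/* pretend we touched 1000 distinct pages */
	for (uint32 blkno = 0; blkno < 1000; blkno++)
		addSHLL(&wss, hash_bytes((const unsigned char *) &blkno, sizeof(blkno)));

	/* distinct pages touched in the last 60 seconds */
	last_minute = estimateSHLL(&wss, 60);
	/* distinct pages over the whole lifetime of the sketch */
	all_time = estimateSHLL(&wss, (time_t) -1);

	elog(LOG, "wss: last minute %.0f, all time %.0f", last_minute, all_time);
}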
+ */
+typedef struct HyperLogLogState
+{
+	TimestampTz regs[HLL_N_REGISTERS][HLL_C_BITS + 1];
+} HyperLogLogState;
+
+extern void initSHLL(HyperLogLogState *cState);
+extern void addSHLL(HyperLogLogState *cState, uint32 hash);
+extern double estimateSHLL(HyperLogLogState *cState, time_t duration);
+
+#endif
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 0eb1acbfb0..5126c26c5d 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -49,7 +49,8 @@ char	   *neon_auth_token;
 int			readahead_buffer_size = 128;
 int			flush_every_n_requests = 8;
 
-static int	n_reconnect_attempts = 0;
+int			neon_protocol_version = 2;
+
 static int	max_reconnect_attempts = 60;
 static int	stripe_size;
@@ -93,24 +94,44 @@ static shmem_startup_hook_type prev_shmem_startup_hook;
 static PagestoreShmemState *pagestore_shared;
 static uint64 pagestore_local_counter = 0;
 
+typedef enum PSConnectionState {
+	PS_Disconnected,			/* no connection yet */
+	PS_Connecting_Startup,		/* connection starting up */
+	PS_Connecting_PageStream,	/* negotiating pagestream */
+	PS_Connected,				/* connected, pagestream established */
+} PSConnectionState;
+
 /* This backend's per-shard connections */
 typedef struct
 {
-	PGconn	   *conn;
+	TimestampTz last_connect_time;	/* read-only debug value */
+	TimestampTz last_reconnect_time;
+	uint32		delay_us;
+	int			n_reconnect_attempts;
 
 	/*---
-	 * WaitEventSet containing:
-	 * - WL_SOCKET_READABLE on 'conn'
-	 * - WL_LATCH_SET on MyLatch, and
-	 * - WL_EXIT_ON_PM_DEATH.
+	 * Pageserver connection state, i.e.
+	 * disconnected: conn == NULL, wes_read == NULL;
+	 * conn_startup: connection initiated, waiting for it to be established
+	 * conn_ps:      PageStream query sent, waiting for confirmation
+	 * connected:    PageStream established
 	 */
-	WaitEventSet *wes;
+	PSConnectionState state;
+	PGconn	   *conn;
+	/*---
+	 * WaitEventSet containing:
+	 * - WL_SOCKET_READABLE on 'conn'
+	 * - WL_LATCH_SET on MyLatch, and
+	 * - WL_EXIT_ON_PM_DEATH.
+	 */
+	WaitEventSet *wes_read;
 } PageServer;
 
 static PageServer page_servers[MAX_SHARDS];
 
 static bool pageserver_flush(shardno_t shard_no);
 static void pageserver_disconnect(shardno_t shard_no);
+static void pageserver_disconnect_shard(shardno_t shard_no);
 
 static bool
 PagestoreShmemIsValid(void)
@@ -300,111 +321,288 @@ get_shard_number(BufferTag *tag)
 
 	return hash % n_shards;
 }
 
+static inline void
+CLEANUP_AND_DISCONNECT(PageServer *shard)
+{
+	if (shard->wes_read)
+	{
+		FreeWaitEventSet(shard->wes_read);
+		shard->wes_read = NULL;
+	}
+	if (shard->conn)
+	{
+		PQfinish(shard->conn);
+		shard->conn = NULL;
+	}
+
+	shard->state = PS_Disconnected;
+}
+
+/*
+ * Connect to a pageserver, or continue to try to connect if we're yet to
+ * complete the connection (e.g. due to receiving an earlier cancellation
+ * during connection start).
+ * Returns true if successfully connected; false if the connection failed.
+ *
+ * Throws errors in unrecoverable situations, or when this backend's query
+ * is canceled.
+ */
 static bool
 pageserver_connect(shardno_t shard_no, int elevel)
 {
-	char	   *query;
-	int			ret;
-	const char *keywords[3];
-	const char *values[3];
-	int			n;
-	PGconn	   *conn;
-	WaitEventSet *wes;
+	PageServer *shard = &page_servers[shard_no];
 	char		connstr[MAX_PAGESERVER_CONNSTRING_SIZE];
-	static TimestampTz last_connect_time = 0;
-	static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
-	TimestampTz now;
-	uint64_t	us_since_last_connect;
-
-	Assert(page_servers[shard_no].conn == NULL);
-
 	/*
	 * Get the connection string for this shard.
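The rewritten pageserver_connect() below replaces a blocking PQconnectdbParams() call with a resumable state machine built on PQconnectStartParams()/PQconnectPoll(). The essential libpq dance, reduced to a self-contained sketch — plain poll(2) stands in for the extension's WaitEventSet/latch handling, and the per-state reentry that the real function adds is omitted:

#include <poll.h>
#include <libpq-fe.h>

static PGconn *
connect_nonblocking(const char *const *keywords, const char *const *values)
{
	PGconn	   *conn = PQconnectStartParams(keywords, values, 1);

	/* libpq docs: behave as if the previous poll result was WRITING */
	PostgresPollingStatusType st = PGRES_POLLING_WRITING;

	if (conn == NULL || PQstatus(conn) == CONNECTION_BAD)
		goto fail;

	while (st != PGRES_POLLING_OK)
	{
		struct pollfd pfd;

		/* wait for the readiness the previous poll result asked for */
		pfd.fd = PQsocket(conn);
		pfd.events = (st == PGRES_POLLING_READING) ? POLLIN : POLLOUT;
		if (poll(&pfd, 1, -1) < 0)
			goto fail;

		st = PQconnectPoll(conn);
		if (st == PGRES_POLLING_FAILED)
			goto fail;
	}
	return conn;				/* connection established */

fail:
	PQfinish(conn);
	return NULL;
}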
If the shard map has been * updated since we last looked, this will also disconnect any existing * pageserver connections as a side effect. + * Note that connstr is used both during connection start, and when we + * log the successful connection. */ load_shard_map(shard_no, connstr, NULL); - now = GetCurrentTimestamp(); - us_since_last_connect = now - last_connect_time; - if (us_since_last_connect < delay_us) + switch (shard->state) { - pg_usleep(delay_us - us_since_last_connect); - delay_us *= 2; - if (delay_us > MAX_RECONNECT_INTERVAL_USEC) - delay_us = MAX_RECONNECT_INTERVAL_USEC; - last_connect_time = GetCurrentTimestamp(); + case PS_Disconnected: + { + const char *keywords[3]; + const char *values[3]; + int n_pgsql_params; + TimestampTz now; + int64 us_since_last_attempt; + + /* Make sure we start with a clean slate */ + CLEANUP_AND_DISCONNECT(shard); + + neon_shard_log(shard_no, DEBUG5, "Connection state: Disconnected"); + + now = GetCurrentTimestamp(); + us_since_last_attempt = (int64) (now - shard->last_reconnect_time); + shard->last_reconnect_time = now; + + /* + * Make sure we don't do exponential backoff with a constant multiplier + * of 0 us, as that doesn't really do much for timeouts... + * + * cf. https://github.com/neondatabase/neon/issues/7897 + */ + if (shard->delay_us == 0) + shard->delay_us = MIN_RECONNECT_INTERVAL_USEC; + + /* + * If we did other tasks between reconnect attempts, then we won't + * need to wait as long as a full delay. + */ + if (us_since_last_attempt < shard->delay_us) + { + pg_usleep(shard->delay_us - us_since_last_attempt); + } + + /* update the delay metric */ + shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC); + + /* + * Connect using the connection string we got from the + * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment + * variable was set, use that as the password. + * + * The connection options are parsed in the order they're given, so when + * we set the password before the connection string, the connection string + * can override the password from the env variable. Seems useful, although + * we don't currently use that capability anywhere. 
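The reconnect throttling above, isolated as a sketch. The two constants stand in for MIN_RECONNECT_INTERVAL_USEC and MAX_RECONNECT_INTERVAL_USEC (defined elsewhere in this file; their real values are not shown in this hunk), the struct mirrors the per-shard fields, and the zero-floor check is the guard referenced by issue #7897 — doubling a zero delay would never back off:

#include "postgres.h"
#include "miscadmin.h"
#include "utils/timestamp.h"

#define RECONNECT_FLOOR_US   1000		/* illustrative stand-in for MIN_RECONNECT_INTERVAL_USEC */
#define RECONNECT_CEILING_US 2000000	/* illustrative stand-in for MAX_RECONNECT_INTERVAL_USEC */

typedef struct
{
	TimestampTz last_reconnect_time;	/* previous attempt */
	uint32		delay_us;				/* current backoff step */
} ReconnectBackoff;

static void
backoff_before_attempt(ReconnectBackoff *b)
{
	TimestampTz now = GetCurrentTimestamp();
	int64		since_last = (int64) (now - b->last_reconnect_time);

	b->last_reconnect_time = now;

	/* never double a zero delay: the backoff would stay at zero forever */
	if (b->delay_us == 0)
		b->delay_us = RECONNECT_FLOOR_US;

	/* sleep off only the part of the delay that has not already elapsed */
	if (since_last < b->delay_us)
		pg_usleep(b->delay_us - since_last);

	b->delay_us = Min(b->delay_us * 2, RECONNECT_CEILING_US);
}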
+ */ + keywords[0] = "dbname"; + values[0] = connstr; + n_pgsql_params = 1; + + if (neon_auth_token) + { + keywords[1] = "password"; + values[1] = neon_auth_token; + n_pgsql_params++; + } + + keywords[n_pgsql_params] = NULL; + values[n_pgsql_params] = NULL; + + shard->conn = PQconnectStartParams(keywords, values, 1); + if (PQstatus(shard->conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(shard->conn)); + CLEANUP_AND_DISCONNECT(shard); + ereport(elevel, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), + errdetail_internal("%s", msg))); + pfree(msg); + return false; + } + shard->state = PS_Connecting_Startup; + /* fallthrough */ } - else + case PS_Connecting_Startup: { - delay_us = MIN_RECONNECT_INTERVAL_USEC; - last_connect_time = now; + char *pagestream_query; + int ps_send_query_ret; + bool connected = false; + int poll_result = PGRES_POLLING_WRITING; + neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_Startup"); + + do + { + WaitEvent event; + + switch (poll_result) + { + default: /* unknown/unused states are handled as a failed connection */ + case PGRES_POLLING_FAILED: + { + char *pqerr = PQerrorMessage(shard->conn); + char *msg = NULL; + neon_shard_log(shard_no, DEBUG5, "POLLING_FAILED"); + + if (pqerr) + msg = pchomp(pqerr); + + CLEANUP_AND_DISCONNECT(shard); + + if (msg) + { + neon_shard_log(shard_no, elevel, + "could not connect to pageserver: %s", + msg); + pfree(msg); + } + else + neon_shard_log(shard_no, elevel, + "could not connect to pageserver"); + + return false; + } + case PGRES_POLLING_READING: + /* Sleep until there's something to do */ + while (true) + { + int rc = WaitLatchOrSocket(MyLatch, + WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_READABLE, + PQsocket(shard->conn), + 0, + PG_WAIT_EXTENSION); + elog(DEBUG5, "PGRES_POLLING_READING=>%d", rc); + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + /* query cancellation, backend shutdown */ + CHECK_FOR_INTERRUPTS(); + } + if (rc & WL_SOCKET_READABLE) + break; + } + /* PQconnectPoll() handles the socket polling state updates */ + + break; + case PGRES_POLLING_WRITING: + /* Sleep until there's something to do */ + while (true) + { + int rc = WaitLatchOrSocket(MyLatch, + WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_WRITEABLE, + PQsocket(shard->conn), + 0, + PG_WAIT_EXTENSION); + elog(DEBUG5, "PGRES_POLLING_WRITING=>%d", rc); + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + /* query cancellation, backend shutdown */ + CHECK_FOR_INTERRUPTS(); + } + if (rc & WL_SOCKET_WRITEABLE) + break; + } + /* PQconnectPoll() handles the socket polling state updates */ + + break; + case PGRES_POLLING_OK: + neon_shard_log(shard_no, DEBUG5, "POLLING_OK"); + connected = true; + break; + } + poll_result = PQconnectPoll(shard->conn); + elog(DEBUG5, "PQconnectPoll=>%d", poll_result); + } + while (!connected); + + /* No more polling needed; connection succeeded */ + shard->last_connect_time = GetCurrentTimestamp(); + + shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3); + AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL); + + + switch (neon_protocol_version) + { + case 2: + pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline); + break; + default: + elog(ERROR, 
"unexpected neon_protocol_version %d", neon_protocol_version); + } + + if (PQstatus(shard->conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(shard->conn)); + + CLEANUP_AND_DISCONNECT(shard); + + ereport(elevel, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), + errdetail_internal("%s", msg))); + pfree(msg); + return false; + } + + ps_send_query_ret = PQsendQuery(shard->conn, pagestream_query); + pfree(pagestream_query); + if (ps_send_query_ret != 1) + { + CLEANUP_AND_DISCONNECT(shard); + + neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver"); + return false; + } + + shard->state = PS_Connecting_PageStream; + /* fallthrough */ } - - /* - * Connect using the connection string we got from the - * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment - * variable was set, use that as the password. - * - * The connection options are parsed in the order they're given, so when - * we set the password before the connection string, the connection string - * can override the password from the env variable. Seems useful, although - * we don't currently use that capability anywhere. - */ - n = 0; - if (neon_auth_token) + case PS_Connecting_PageStream: { - keywords[n] = "password"; - values[n] = neon_auth_token; - n++; - } - keywords[n] = "dbname"; - values[n] = connstr; - n++; - keywords[n] = NULL; - values[n] = NULL; - n++; - conn = PQconnectdbParams(keywords, values, 1); + neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_PageStream"); - if (PQstatus(conn) == CONNECTION_BAD) - { - char *msg = pchomp(PQerrorMessage(conn)); + if (PQstatus(shard->conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(shard->conn)); + CLEANUP_AND_DISCONNECT(shard); + ereport(elevel, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), + errdetail_internal("%s", msg))); + pfree(msg); + return false; + } - PQfinish(conn); - - ereport(elevel, - (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), - errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), - errdetail_internal("%s", msg))); - pfree(msg); - return false; - } - query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); - ret = PQsendQuery(conn, query); - pfree(query); - if (ret != 1) - { - PQfinish(conn); - neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver"); - return false; - } - - wes = CreateWaitEventSet(TopMemoryContext, 3); - AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET, - MyLatch, NULL); - AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, - NULL, NULL); - AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL); - - PG_TRY(); - { - while (PQisBusy(conn)) + while (PQisBusy(shard->conn)) { WaitEvent event; /* Sleep until there's something to do */ - (void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION); + (void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, PG_WAIT_EXTENSION); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); @@ -412,33 +610,37 @@ pageserver_connect(shardno_t shard_no, int elevel) /* Data available in socket? 
*/ if (event.events & WL_SOCKET_READABLE) { - if (!PQconsumeInput(conn)) + if (!PQconsumeInput(shard->conn)) { - char *msg = pchomp(PQerrorMessage(conn)); - - PQfinish(conn); - FreeWaitEventSet(wes); + char *msg = pchomp(PQerrorMessage(shard->conn)); + CLEANUP_AND_DISCONNECT(shard); neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s", msg); + pfree(msg); return false; } } } - } - PG_CATCH(); - { - PQfinish(conn); - FreeWaitEventSet(wes); - PG_RE_THROW(); - } - PG_END_TRY(); - neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr); - page_servers[shard_no].conn = conn; - page_servers[shard_no].wes = wes; + shard->state = PS_Connected; + /* fallthrough */ + } + case PS_Connected: + /* + * We successfully connected. Future connections to this PageServer + * will do fast retries again, with exponential backoff. + */ + shard->delay_us = MIN_RECONNECT_INTERVAL_USEC; - return true; + neon_shard_log(shard_no, DEBUG5, "Connection state: Connected"); + neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version); + return true; + default: + neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state); + } + /* This shouldn't be hit */ + Assert(false); } /* @@ -458,7 +660,7 @@ retry: WaitEvent event; /* Sleep until there's something to do */ - (void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION); + (void) WaitEventSetWait(page_servers[shard_no].wes_read, -1L, &event, 1, PG_WAIT_EXTENSION); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); @@ -482,49 +684,61 @@ retry: return ret; } - +/* + * Reset prefetch and drop connection to the shard. + * It also drops connection to all other shards involved in prefetch, through + * prefetch_on_ps_disconnect(). + */ static void pageserver_disconnect(shardno_t shard_no) { + /* + * If the connection to any pageserver is lost, we throw away the + * whole prefetch queue, even for other pageservers. It should not + * cause big problems, because connection loss is supposed to be a + * rare event. + */ + prefetch_on_ps_disconnect(); + + pageserver_disconnect_shard(shard_no); +} + +/* + * Disconnect from specified shard + */ +static void +pageserver_disconnect_shard(shardno_t shard_no) +{ + PageServer *shard = &page_servers[shard_no]; /* * If anything goes wrong while we were sending a request, it's not clear * what state the connection is in. For example, if we sent the request * but didn't receive a response yet, we might receive the response some * time later after we have already sent a new unrelated request. Close * the connection to avoid getting confused. + * Similarly, even when we're in PS_DISCONNECTED, we may have junk to + * clean up: It is possible that we encountered an error allocating any + * of the wait event sets or the psql connection, or failed when we tried + * to attach wait events to the WaitEventSets. */ - if (page_servers[shard_no].conn) - { - neon_shard_log(shard_no, LOG, "dropping connection to page server due to error"); - PQfinish(page_servers[shard_no].conn); - page_servers[shard_no].conn = NULL; + CLEANUP_AND_DISCONNECT(shard); - /* - * If the connection to any pageserver is lost, we throw away the - * whole prefetch queue, even for other pageservers. It should not - * cause big problems, because connection loss is supposed to be a - * rare event. 
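For reference, the asynchronous COPY-read convention that call_PQgetCopyData() above and pageserver_receive() further below rely on, as a self-contained sketch: with the async flag set, PQgetCopyData() returns 0 whenever no complete data row is buffered yet, and the caller must then wait for the socket and feed fresh bytes in with PQconsumeInput(). select(2) is used here to stay standalone; the extension instead waits on a WaitEventSet so it can also service the backend latch:

#include <sys/select.h>
#include <libpq-fe.h>

/* Returns message length (> 0), -1 on end-of-copy, -2 on error. */
static int
read_copy_message(PGconn *conn, char **buf)
{
	for (;;)
	{
		int			rc = PQgetCopyData(conn, buf, 1 /* async */ );

		if (rc != 0)
			return rc;			/* complete message, end-of-copy, or error */

		/* rc == 0: nothing buffered yet, wait for the socket to be readable */
		{
			fd_set		rfds;

			FD_ZERO(&rfds);
			FD_SET(PQsocket(conn), &rfds);
			if (select(PQsocket(conn) + 1, &rfds, NULL, NULL, NULL) < 0)
				return -2;
		}
		if (!PQconsumeInput(conn))
			return -2;			/* connection trouble */
	}
}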
- */ - prefetch_on_ps_disconnect(); - } - if (page_servers[shard_no].wes != NULL) - { - FreeWaitEventSet(page_servers[shard_no].wes); - page_servers[shard_no].wes = NULL; - } + shard->state = PS_Disconnected; } static bool pageserver_send(shardno_t shard_no, NeonRequest *request) { StringInfoData req_buff; - PGconn *pageserver_conn = page_servers[shard_no].conn; + PageServer *shard = &page_servers[shard_no]; + PGconn *pageserver_conn; /* If the connection was lost for some reason, reconnect */ - if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD) + if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD) { neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection"); pageserver_disconnect(shard_no); + pageserver_conn = NULL; } req_buff = nm_pack_request(request); @@ -538,17 +752,19 @@ pageserver_send(shardno_t shard_no, NeonRequest *request) * https://github.com/neondatabase/neon/issues/1138 So try to reestablish * connection in case of failure. */ - if (!page_servers[shard_no].conn) + if (shard->state != PS_Connected) { - while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR)) + while (!pageserver_connect(shard_no, shard->n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR)) { HandleMainLoopInterrupts(); - n_reconnect_attempts += 1; + shard->n_reconnect_attempts += 1; } - n_reconnect_attempts = 0; + shard->n_reconnect_attempts = 0; + } else { + Assert(shard->conn != NULL); } - pageserver_conn = page_servers[shard_no].conn; + pageserver_conn = shard->conn; /* * Send request. @@ -557,13 +773,17 @@ pageserver_send(shardno_t shard_no, NeonRequest *request) * should use async mode and check for interrupts while waiting. In * practice, our requests are small enough to always fit in the output and * TCP buffer. + * + * Note that this also will fail when the connection is in the + * PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this + * point, but on the grand scheme of things it's only a small issue. 
*/
 	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
 
 		pageserver_disconnect(shard_no);
-		neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
+		neon_shard_log(shard_no, LOG, "pageserver_send disconnected: failed to send page request (try to reconnect): %s", msg);
 		pfree(msg);
 		pfree(req_buff.data);
 		return false;
@@ -578,6 +798,7 @@
 		neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
 		pfree(msg);
 	}
+
 	return true;
 }
 
@@ -586,58 +807,68 @@
 pageserver_receive(shardno_t shard_no)
 {
 	StringInfoData resp_buff;
 	NeonResponse *resp;
-	PGconn	   *pageserver_conn = page_servers[shard_no].conn;
+	PageServer *shard = &page_servers[shard_no];
+	PGconn	   *pageserver_conn = shard->conn;
+	/* read response */
+	int			rc;
 
-	if (!pageserver_conn)
-		return NULL;
-
-	PG_TRY();
+	if (shard->state != PS_Connected)
 	{
-		/* read response */
-		int			rc;
+		neon_shard_log(shard_no, LOG,
+					   "pageserver_receive: returning NULL for non-connected pageserver connection: 0x%02x",
+					   shard->state);
+		return NULL;
+	}
 
-		rc = call_PQgetCopyData(shard_no, &resp_buff.data);
-		if (rc >= 0)
+	Assert(pageserver_conn);
+
+	rc = call_PQgetCopyData(shard_no, &resp_buff.data);
+	if (rc >= 0)
+	{
+		/* call_PQgetCopyData handles rc == 0 */
+		Assert(rc > 0);
+
+		PG_TRY();
 		{
 			resp_buff.len = rc;
 			resp_buff.cursor = 0;
 			resp = nm_unpack_response(&resp_buff);
 			PQfreemem(resp_buff.data);
-
-			if (message_level_is_interesting(PageStoreTrace))
-			{
-				char	   *msg = nm_to_string((NeonMessage *) resp);
-
-				neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
-				pfree(msg);
-			}
 		}
-		else if (rc == -1)
+		PG_CATCH();
 		{
-			neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
+			neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to malformed response");
 			pageserver_disconnect(shard_no);
-			resp = NULL;
+			PG_RE_THROW();
 		}
-		else if (rc == -2)
-		{
-			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+		PG_END_TRY();
 
-			pageserver_disconnect(shard_no);
-			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
-		}
-		else
+		if (message_level_is_interesting(PageStoreTrace))
 		{
-			pageserver_disconnect(shard_no);
-			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
+			char	   *msg = nm_to_string((NeonMessage *) resp);
+
+			neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
+			pfree(msg);
 		}
 	}
-	PG_CATCH();
+	else if (rc == -1)
 	{
-		neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
+		neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: libpq end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn)));
 		pageserver_disconnect(shard_no);
-		PG_RE_THROW();
+		resp = NULL;
+	}
+	else if (rc == -2)
+	{
+		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+
+		pageserver_disconnect(shard_no);
+		neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg);
+	}
+	else
+	{
+		pageserver_disconnect(shard_no);
+		neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
 	}
-	PG_END_TRY();
 
 	return (NeonResponse *) resp;
 }
@@ -648,7 +879,7 @@
 pageserver_flush(shardno_t shard_no)
 {
 	PGconn	   *pageserver_conn =
page_servers[shard_no].conn; - if (!pageserver_conn) + if (page_servers[shard_no].state != PS_Connected) { neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected"); } @@ -664,6 +895,7 @@ pageserver_flush(shardno_t shard_no) return false; } } + return true; } @@ -671,7 +903,8 @@ page_server_api api = { .send = pageserver_send, .flush = pageserver_flush, - .receive = pageserver_receive + .receive = pageserver_receive, + .disconnect = pageserver_disconnect_shard }; static bool @@ -822,6 +1055,16 @@ pg_init_libpagestore(void) PGC_USERSET, 0, /* no flags required */ NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL); + DefineCustomIntVariable("neon.protocol_version", + "Version of compute<->page server protocol", + NULL, + &neon_protocol_version, + 2, /* use protocol version 2 */ + 2, /* min */ + 2, /* max */ + PGC_SU_BACKEND, + 0, /* no flags required */ + NULL, NULL, NULL); relsize_hash_init(); @@ -847,5 +1090,7 @@ pg_init_libpagestore(void) dbsize_hook = neon_dbsize; } + memset(page_servers, 0, sizeof(page_servers)); + lfc_init(); } diff --git a/pgxn/neon/neon--1.1--1.0.sql b/pgxn/neon/neon--1.1--1.0.sql new file mode 100644 index 0000000000..e83e3104e8 --- /dev/null +++ b/pgxn/neon/neon--1.1--1.0.sql @@ -0,0 +1,6 @@ +-- the order of operations is important here +-- because the view depends on the function + +DROP VIEW IF EXISTS neon_lfc_stats CASCADE; + +DROP FUNCTION IF EXISTS neon_get_lfc_stats CASCADE; diff --git a/pgxn/neon/neon--1.1--1.2.sql b/pgxn/neon/neon--1.1--1.2.sql new file mode 100644 index 0000000000..5818b4ffe5 --- /dev/null +++ b/pgxn/neon/neon--1.1--1.2.sql @@ -0,0 +1,29 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.2'" to load this file. \quit + +-- Create a convenient view similar to pg_stat_database +-- that exposes all lfc stat values in one row. 
+CREATE OR REPLACE VIEW NEON_STAT_FILE_CACHE AS + WITH lfc_stats AS ( + SELECT + stat_name, + count + FROM neon_get_lfc_stats() AS t(stat_name text, count bigint) + ), + lfc_values AS ( + SELECT + MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE NULL END) AS file_cache_misses, + MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE NULL END) AS file_cache_hits, + MAX(CASE WHEN stat_name = 'file_cache_used' THEN count ELSE NULL END) AS file_cache_used, + MAX(CASE WHEN stat_name = 'file_cache_writes' THEN count ELSE NULL END) AS file_cache_writes, + -- Calculate the file_cache_hit_ratio within the same CTE for simplicity + CASE + WHEN MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE 0 END) + MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END) = 0 THEN NULL + ELSE ROUND((MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END)::DECIMAL / + (MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END) + MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE 0 END))) * 100, 2) + END AS file_cache_hit_ratio + FROM lfc_stats + ) +SELECT file_cache_misses, file_cache_hits, file_cache_used, file_cache_writes, file_cache_hit_ratio from lfc_values; + +-- externalize the view to all users in role pg_monitor +GRANT SELECT ON NEON_STAT_FILE_CACHE TO PG_MONITOR; \ No newline at end of file diff --git a/pgxn/neon/neon--1.2--1.1.sql b/pgxn/neon/neon--1.2--1.1.sql new file mode 100644 index 0000000000..c9f6a40f73 --- /dev/null +++ b/pgxn/neon/neon--1.2--1.1.sql @@ -0,0 +1 @@ +DROP VIEW IF EXISTS NEON_STAT_FILE_CACHE CASCADE; diff --git a/pgxn/neon/neon--1.2--1.3.sql b/pgxn/neon/neon--1.2--1.3.sql new file mode 100644 index 0000000000..9583008777 --- /dev/null +++ b/pgxn/neon/neon--1.2--1.3.sql @@ -0,0 +1,9 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.3'" to load this file. \quit + +CREATE FUNCTION approximate_working_set_size(reset bool) +RETURNS integer +AS 'MODULE_PATHNAME', 'approximate_working_set_size' +LANGUAGE C PARALLEL SAFE; + +GRANT EXECUTE ON FUNCTION approximate_working_set_size(bool) TO pg_monitor; + diff --git a/pgxn/neon/neon--1.3--1.2.sql b/pgxn/neon/neon--1.3--1.2.sql new file mode 100644 index 0000000000..2733a15c75 --- /dev/null +++ b/pgxn/neon/neon--1.3--1.2.sql @@ -0,0 +1 @@ +DROP FUNCTION IF EXISTS approximate_working_set_size(bool) CASCADE; diff --git a/pgxn/neon/neon--1.3--1.4.sql b/pgxn/neon/neon--1.3--1.4.sql new file mode 100644 index 0000000000..042effe346 --- /dev/null +++ b/pgxn/neon/neon--1.3--1.4.sql @@ -0,0 +1,9 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.4'" to load this file. 
\quit + +CREATE FUNCTION approximate_working_set_size_seconds(duration integer default null) +RETURNS integer +AS 'MODULE_PATHNAME', 'approximate_working_set_size_seconds' +LANGUAGE C PARALLEL SAFE; + +GRANT EXECUTE ON FUNCTION approximate_working_set_size_seconds(integer) TO pg_monitor; + diff --git a/pgxn/neon/neon--1.4--1.3.sql b/pgxn/neon/neon--1.4--1.3.sql new file mode 100644 index 0000000000..bea72d1a6b --- /dev/null +++ b/pgxn/neon/neon--1.4--1.3.sql @@ -0,0 +1 @@ +DROP FUNCTION IF EXISTS approximate_working_set_size_seconds(integer) CASCADE; diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index b930fdb3ca..fe8e276d1c 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -11,25 +11,621 @@ #include "postgres.h" #include "fmgr.h" +#include "miscadmin.h" +#include "access/subtrans.h" +#include "access/twophase.h" #include "access/xact.h" #include "access/xlog.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "catalog/pg_type.h" +#include "postmaster/bgworker.h" +#include "postmaster/interrupt.h" +#include "replication/logical.h" +#include "replication/slot.h" #include "replication/walsender.h" +#include "storage/proc.h" +#include "storage/procsignal.h" +#include "tcop/tcopprot.h" #include "funcapi.h" #include "access/htup_details.h" +#include "utils/builtins.h" #include "utils/pg_lsn.h" #include "utils/guc.h" +#include "utils/guc_tables.h" +#include "utils/wait_event.h" +#include "extension_server.h" #include "neon.h" #include "walproposer.h" #include "pagestore_client.h" #include "control_plane_connector.h" +#include "walsender_hooks.h" PG_MODULE_MAGIC; void _PG_init(void); +static int logical_replication_max_snap_files = 300; + +static int running_xacts_overflow_policy; + +enum RunningXactsOverflowPolicies { + OP_IGNORE, + OP_SKIP, + OP_WAIT +}; + +static const struct config_enum_entry running_xacts_overflow_policies[] = { + {"ignore", OP_IGNORE, false}, + {"skip", OP_SKIP, false}, + {"wait", OP_WAIT, false}, + {NULL, 0, false} +}; + +static void +InitLogicalReplicationMonitor(void) +{ + BackgroundWorker bgw; + + DefineCustomIntVariable( + "neon.logical_replication_max_snap_files", + "Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.", + NULL, + &logical_replication_max_snap_files, + 300, -1, INT_MAX, + PGC_SIGHUP, + 0, + NULL, NULL, NULL); + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "Logical replication monitor"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "Logical replication monitor"); + bgw.bgw_restart_time = 5; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); +} + +static int +LsnDescComparator(const void *a, const void *b) +{ + XLogRecPtr lsn1 = *((const XLogRecPtr *) a); + XLogRecPtr lsn2 = *((const XLogRecPtr *) b); + + if (lsn1 < lsn2) + return 1; + else if (lsn1 == lsn2) + return 0; + else + return -1; +} + +/* + * Look at .snap files and calculate minimum allowed restart_lsn of slot so that + * next gc would leave not more than logical_replication_max_snap_files; all + * slots having lower restart_lsn should be dropped. 
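Before the implementation, a worked example of the cutoff rule with made-up numbers: given a limit of 3 and five snapshot LSNs, sorting newest-first and indexing at limit - 1 yields the oldest LSN whose snapshot we still keep, and every slot with a smaller restart_lsn is dropped. A standalone illustration:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* sort descending, mirroring LsnDescComparator() above */
static int
desc_cmp(const void *a, const void *b)
{
	uint64_t	x = *(const uint64_t *) a;
	uint64_t	y = *(const uint64_t *) b;

	return (x < y) - (x > y);
}

int
main(void)
{
	uint64_t	lsns[] = {0x300, 0x100, 0x500, 0x200, 0x400};
	int			lsns_num = 5;
	int			limit = 3;		/* stand-in for logical_replication_max_snap_files */
	uint64_t	cutoff;

	qsort(lsns, lsns_num, sizeof(uint64_t), desc_cmp);
	cutoff = (limit > lsns_num) ? 0 : lsns[limit - 1];
	printf("cutoff = 0x%llX\n", (unsigned long long) cutoff);	/* prints 0x300 */
	return 0;
}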
+ */
+static XLogRecPtr
+get_num_snap_files_lsn_threshold(void)
+{
+	DIR		   *dirdesc;
+	struct dirent *de;
+	char	   *snap_path = "pg_logical/snapshots/";
+	int			lsns_allocated = 1024;
+	int			lsns_num = 0;
+	XLogRecPtr *lsns;
+	XLogRecPtr	cutoff;
+
+	if (logical_replication_max_snap_files < 0)
+		return 0;
+
+	lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated);
+
+	/* find all .snap files and get their lsns */
+	dirdesc = AllocateDir(snap_path);
+	while ((de = ReadDir(dirdesc, snap_path)) != NULL)
+	{
+		XLogRecPtr	lsn;
+		uint32		hi;
+		uint32		lo;
+
+		if (strcmp(de->d_name, ".") == 0 ||
+			strcmp(de->d_name, "..") == 0)
+			continue;
+
+		if (sscanf(de->d_name, "%X-%X.snap", &hi, &lo) != 2)
+		{
+			ereport(LOG,
+					(errmsg("could not parse file name as .snap file \"%s\"", de->d_name)));
+			continue;
+		}
+
+		lsn = ((uint64) hi) << 32 | lo;
+		elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn));
+		if (lsns_allocated == lsns_num)
+		{
+			lsns_allocated *= 2;
+			lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated);
+		}
+		lsns[lsns_num++] = lsn;
+	}
+	/* sort by lsn desc */
+	qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator);
+	/* and take cutoff at logical_replication_max_snap_files */
+	if (logical_replication_max_snap_files > lsns_num)
+		cutoff = 0;
+	/* fewer files than the limit */
+	else
+	{
+		cutoff = lsns[logical_replication_max_snap_files - 1];
+		elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower than %X/%X, found %d .snap files, limit is %d",
+			 LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files);
+	}
+	pfree(lsns);
+	FreeDir(dirdesc);
+	return cutoff;
+}
+
+#define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */
+
+/*
+ * Unused logical replication slots pin WAL and prevent deletion of snapshots.
+ * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which
+ * need too many .snap files.
+ */
+PGDLLEXPORT void
+LogicalSlotsMonitorMain(Datum main_arg)
+{
+	/* Establish signal handlers. */
+	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
+	pqsignal(SIGHUP, SignalHandlerForConfigReload);
+	pqsignal(SIGTERM, die);
+
+	BackgroundWorkerUnblockSignals();
+
+	for (;;)
+	{
+		XLogRecPtr	cutoff_lsn;
+
+		/* In case of a SIGHUP, just reload the configuration. */
+		if (ConfigReloadPending)
+		{
+			ConfigReloadPending = false;
+			ProcessConfigFile(PGC_SIGHUP);
+		}
+
+		/*
+		 * If there are too many .snap files, just drop all logical slots to
+		 * prevent aux file bloat.
+		 */
+		cutoff_lsn = get_num_snap_files_lsn_threshold();
+		if (cutoff_lsn > 0)
+		{
+			for (int i = 0; i < max_replication_slots; i++)
+			{
+				char		slot_name[NAMEDATALEN];
+				ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
+				XLogRecPtr	restart_lsn;
+
+				/* find the name */
+				LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
+				/* Consider only logical replication slots */
+				if (!s->in_use || !SlotIsLogical(s))
+				{
+					LWLockRelease(ReplicationSlotControlLock);
+					continue;
+				}
+
+				/* do we need to drop it? 
*/
+				SpinLockAcquire(&s->mutex);
+				restart_lsn = s->data.restart_lsn;
+				SpinLockRelease(&s->mutex);
+				if (restart_lsn >= cutoff_lsn)
+				{
+					LWLockRelease(ReplicationSlotControlLock);
+					continue;
+				}
+
+				strlcpy(slot_name, s->data.name.data, NAMEDATALEN);
+				elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X",
+					 slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn));
+				LWLockRelease(ReplicationSlotControlLock);
+
+				/* now try to drop it, killing owner before if any */
+				for (;;)
+				{
+					pid_t		active_pid;
+
+					SpinLockAcquire(&s->mutex);
+					active_pid = s->active_pid;
+					SpinLockRelease(&s->mutex);
+
+					if (active_pid == 0)
+					{
+						/*
+						 * Slot is released, try to drop it. Though of course
+						 * it could have been reacquired, so drop can ERROR
+						 * out. Similarly it could have been dropped in the
+						 * meantime.
+						 *
+						 * In principle we could remove pg_try/pg_catch; that
+						 * would restart the whole bgworker.
+						 */
+						ConditionVariableCancelSleep();
+						PG_TRY();
+						{
+							ReplicationSlotDrop(slot_name, true);
+							elog(LOG, "ls_monitor: slot %s dropped", slot_name);
+						}
+						PG_CATCH();
+						{
+							/* log ERROR and reset elog stack */
+							EmitErrorReport();
+							FlushErrorState();
+							elog(LOG, "ls_monitor: failed to drop slot %s", slot_name);
+						}
+						PG_END_TRY();
+						break;
+					}
+					else
+					{
+						/* kill the owner and wait for release */
+						elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid);
+						(void) kill(active_pid, SIGTERM);
+						/* We shouldn't get stuck, but to be safe add timeout. */
+						ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP);
+					}
+				}
+			}
+		}
+
+		(void) WaitLatch(MyLatch,
+						 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
+						 LS_MONITOR_CHECK_INTERVAL,
+						 PG_WAIT_EXTENSION);
+		ResetLatch(MyLatch);
+		CHECK_FOR_INTERRUPTS();
+	}
+}
+
+/*
+ * XXX: These are private to procarray.c, but we need them here.
+ */
+#define PROCARRAY_MAXPROCS (MaxBackends + max_prepared_xacts)
+#define TOTAL_MAX_CACHED_SUBXIDS \
+	((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS)
+
+/*
+ * Restore running-xact information by scanning the CLOG at startup.
+ *
+ * In PostgreSQL, a standby always has to wait for a running-xacts WAL record
+ * to arrive before it can start accepting queries. Furthermore, if there are
+ * transactions with too many subxids (> 64) open to fit in the in-memory
+ * subxids cache, the running-xacts record will be marked as "suboverflowed",
+ * and the standby will need to also wait for the currently in-progress
+ * transactions to finish.
+ *
+ * That's not great in PostgreSQL, because a hot standby does not necessarily
+ * open up for queries immediately as you might expect. But it's worse in
+ * Neon: A standby in Neon doesn't need to start WAL replay from a checkpoint
+ * record; it can start at any LSN. Postgres arranges things so that there is
+ * a running-xacts record soon after every checkpoint record, but when you
+ * start from an arbitrary LSN, that doesn't help. If the primary is idle, or
+ * not running at all, it might never write a new running-xacts record,
+ * leaving the replica in limbo where it can never start accepting queries.
+ *
+ * To mitigate that, we have an additional mechanism to find the running-xacts
+ * information: we scan the CLOG, making note of any XIDs not marked as
+ * committed or aborted. They are added to the Postgres known-assigned XIDs
+ * array by calling ProcArrayApplyRecoveryInfo() in the caller of this
+ * function. 
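+ *
+ * In rough pseudo-code (a simplified sketch of the function below, not the
+ * exact code):
+ *
+ *   for (xid = oldestActiveXid; xid != nextXid; xid++)
+ *       if (CLOG status of xid is neither COMMITTED nor ABORTED)
+ *           remember xid as in-progress;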
+ *
+ * There is one big limitation with that mechanism: The size of the
+ * known-assigned XIDs array is limited, so if there are a lot of in-progress
+ * XIDs, we have to give up. Furthermore, we don't know how many of the
+ * in-progress XIDs are subtransactions, and if we use up all the space in the
+ * known-assigned XIDs array for subtransactions, we might run out of space in
+ * the array later during WAL replay, causing the replica to shut down with
+ * "ERROR: too many KnownAssignedXids". The safe # of XIDs that we can add to
+ * the known-assigned array without risking that error later is very low,
+ * merely PGPROC_MAX_CACHED_SUBXIDS == 64, so we take our chances and use up
+ * to half of the known-assigned XIDs array for the subtransactions, even
+ * though that risks getting the error later.
+ *
+ * Note: It's OK if the recovered list of XIDs includes some transactions that
+ * have crashed in the primary, and hence will never commit. They will be seen
+ * as in-progress, until we see the next running-xacts record with an
+ * oldestActiveXid that invalidates them. That's how the known-assigned XIDs
+ * array always works.
+ *
+ * If scraping the CLOG doesn't succeed for some reason, like the subxid
+ * overflow, Postgres will fall back to waiting for a running-xacts record
+ * like usual.
+ *
+ * Returns true if a complete list of in-progress XIDs was scraped.
+ */
+static bool
+RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *nxids)
+{
+	TransactionId from;
+	TransactionId till;
+	int			max_xcnt;
+	TransactionId *prepared_xids = NULL;
+	int			n_prepared_xids;
+	TransactionId *restored_xids = NULL;
+	int			n_restored_xids;
+	int			next_prepared_idx;
+
+	Assert(*xids == NULL);
+
+	/*
+	 * If the checkpoint doesn't have a valid oldestActiveXid, bail out. We
+	 * don't know where to start the scan.
+	 *
+	 * This shouldn't happen, because the pageserver always maintains a valid
+	 * oldestActiveXid nowadays. Except when starting at an old point in time
+	 * that was ingested before the pageserver was taught to do that.
+	 */
+	if (!TransactionIdIsValid(checkpoint->oldestActiveXid))
+	{
+		elog(LOG, "cannot restore running-xacts from CLOG because oldestActiveXid is not set");
+		goto fail;
+	}
+
+	/*
+	 * We will scan the CLOG starting from the oldest active XID.
+	 *
+	 * In some corner cases, the oldestActiveXid from the last checkpoint
+	 * might already have been truncated from the CLOG. That is,
+	 * oldestActiveXid might be older than oldestXid. That's possible because
+	 * oldestActiveXid is only updated at checkpoints. After the last
+	 * checkpoint, the oldest transaction might have committed, and the CLOG
+	 * might also have been already truncated. So if oldestActiveXid is older
+	 * than oldestXid, start at oldestXid instead. (Otherwise we'd try to
+	 * access CLOG segments that have already been truncated away.)
+	 */
+	from = TransactionIdPrecedes(checkpoint->oldestXid, checkpoint->oldestActiveXid)
+		? checkpoint->oldestActiveXid : checkpoint->oldestXid;
+	till = XidFromFullTransactionId(checkpoint->nextXid);
+
+	/*
+	 * To avoid a "too many KnownAssignedXids" error later during replay, we
+	 * limit the number of collected transactions. This is a tradeoff: if we
+	 * are willing to consume more of the KnownAssignedXids space for the XIDs
+	 * now, that allows us to start up, but we might run out of space later.
+	 *
+	 * The size of the KnownAssignedXids array is TOTAL_MAX_CACHED_SUBXIDS,
+	 * which is (PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS. 
In
+	 * PostgreSQL, that's always enough because the primary will always write
+	 * an XLOG_XACT_ASSIGNMENT record if a transaction has more than
+	 * PGPROC_MAX_CACHED_SUBXIDS subtransactions. Seeing that record allows
+	 * the standby to mark the XIDs in pg_subtrans and remove them from the
+	 * KnownAssignedXids array.
+	 *
+	 * Here, we don't know which XIDs belong to subtransactions that have
+	 * already been WAL-logged with an XLOG_XACT_ASSIGNMENT record. If we
+	 * wanted to be totally safe and avoid the possibility of getting a "too
+	 * many KnownAssignedXids" error later, we would have to limit ourselves
+	 * to PGPROC_MAX_CACHED_SUBXIDS, which is not much. And that includes top
+	 * transaction IDs too, because we cannot distinguish between top
+	 * transaction IDs and subtransactions here.
+	 *
+	 * Somewhat arbitrarily, we use up to half of KnownAssignedXids. That
+	 * strikes a sensible balance between being useful, and risking a "too
+	 * many KnownAssignedXids" error later.
+	 */
+	max_xcnt = TOTAL_MAX_CACHED_SUBXIDS / 2;
+
+	/*
+	 * Collect XIDs of prepared transactions in an array. This includes only
+	 * their top-level XIDs. We assume that StandbyRecoverPreparedTransactions
+	 * has already been called, so we can find all the sub-transactions in
+	 * pg_subtrans.
+	 */
+	PrescanPreparedTransactions(&prepared_xids, &n_prepared_xids);
+	qsort(prepared_xids, n_prepared_xids, sizeof(TransactionId), xidLogicalComparator);
+
+	/*
+	 * Scan the CLOG, collecting in-progress XIDs into 'restored_xids'.
+	 */
+	elog(DEBUG1, "scanning CLOG between %u and %u for in-progress XIDs", from, till);
+	restored_xids = (TransactionId *) palloc(max_xcnt * sizeof(TransactionId));
+	n_restored_xids = 0;
+	next_prepared_idx = 0;
+
+	for (TransactionId xid = from; xid != till;)
+	{
+		XLogRecPtr	xidlsn;
+		XidStatus	xidstatus;
+
+		xidstatus = TransactionIdGetStatus(xid, &xidlsn);
+
+		/*
+		 * "Merge" the prepared transactions into the restored_xids array as
+		 * we go.  The prepared transactions array is sorted. This is mostly
+		 * a sanity check to ensure that all the prepared transactions are
+		 * seen as in-progress. (There is a check after the loop that we didn't
+		 * miss any.)
+		 */
+		if (next_prepared_idx < n_prepared_xids && xid == prepared_xids[next_prepared_idx])
+		{
+			/*
+			 * This is a top-level transaction ID of a prepared transaction.
+			 * Include it in the array.
+			 */
+
+			/* sanity check */
+			if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS)
+			{
+				elog(LOG, "prepared transaction %u has unexpected status %X, cannot restore running-xacts from CLOG",
+					 xid, xidstatus);
+				Assert(false);
+				goto fail;
+			}
+
+			elog(DEBUG1, "XID %u: was next prepared xact (%d / %d)", xid, next_prepared_idx, n_prepared_xids);
+			next_prepared_idx++;
+		}
+		else if (xidstatus == TRANSACTION_STATUS_COMMITTED)
+		{
+			elog(DEBUG1, "XID %u: was committed", xid);
+			goto skip;
+		}
+		else if (xidstatus == TRANSACTION_STATUS_ABORTED)
+		{
+			elog(DEBUG1, "XID %u: was aborted", xid);
+			goto skip;
+		}
+		else if (xidstatus == TRANSACTION_STATUS_IN_PROGRESS)
+		{
+			/*
+			 * In-progress transactions are included in the array.
+			 *
+			 * Except subtransactions of the prepared transactions. They are
+			 * already set in pg_subtrans, and hence don't need to be tracked
+			 * in the known-assigned XIDs array.
+			 */
+			if (n_prepared_xids > 0)
+			{
+				TransactionId parent = SubTransGetParent(xid);
+
+				if (TransactionIdIsValid(parent))
+				{
+					/*
+					 * This is a subtransaction belonging to a prepared
+					 * transaction.
+					 *
+					 * Sanity check that it is in the prepared XIDs array. 
It
+					 * should be, because StandbyRecoverPreparedTransactions
+					 * populated pg_subtrans, and no other XID should be set
+					 * in it yet. (This also relies on the fact that
+					 * StandbyRecoverPreparedTransactions sets the parent of
+					 * each subxid to point directly to the top-level XID,
+					 * rather than restoring the original subtransaction
+					 * hierarchy.)
+					 */
+					if (bsearch(&parent, prepared_xids, next_prepared_idx,
+								sizeof(TransactionId), xidLogicalComparator) == NULL)
+					{
+						elog(LOG, "sub-XID %u has unexpected parent %u, cannot restore running-xacts from CLOG",
+							 xid, parent);
+						Assert(false);
+						goto fail;
+					}
+					elog(DEBUG1, "XID %u: was a subtransaction of prepared xid %u", xid, parent);
+					goto skip;
+				}
+			}
+
+			/* include it in the array */
+			elog(DEBUG1, "XID %u: is in progress", xid);
+		}
+		else
+		{
+			/*
+			 * SUB_COMMITTED is a transient state used at commit. We don't
+			 * expect to see that here.
+			 */
+			elog(LOG, "XID %u has unexpected status %X in pg_xact, cannot restore running-xacts from CLOG",
+				 xid, xidstatus);
+			Assert(false);
+			goto fail;
+		}
+
+		if (n_restored_xids >= max_xcnt)
+		{
+			/*
+			 * Overflowed. We won't be able to install the RunningTransactions
+			 * snapshot.
+			 */
+			elog(LOG, "too many running xacts to restore from the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u",
+				 checkpoint->oldestXid, checkpoint->oldestActiveXid,
+				 XidFromFullTransactionId(checkpoint->nextXid));
+
+			switch (running_xacts_overflow_policy)
+			{
+				case OP_WAIT:
+					goto fail;
+				case OP_IGNORE:
+					goto success;
+				case OP_SKIP:
+					n_restored_xids = 0;
+					goto success;
+			}
+		}
+
+		restored_xids[n_restored_xids++] = xid;
+
+	skip:
+		TransactionIdAdvance(xid);
+	}
+
+	/* sanity check */
+	if (next_prepared_idx != n_prepared_xids)
+	{
+		elog(LOG, "prepared transaction ID %u was not visited in the CLOG scan, cannot restore running-xacts from CLOG",
+			 prepared_xids[next_prepared_idx]);
+		Assert(false);
+		goto fail;
+	}
+ success:
+	elog(LOG, "restored %d running xacts by scanning the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u",
+		 n_restored_xids, checkpoint->oldestXid, checkpoint->oldestActiveXid, XidFromFullTransactionId(checkpoint->nextXid));
+	*nxids = n_restored_xids;
+	*xids = restored_xids;
+	if (prepared_xids)
+		pfree(prepared_xids);
+	return true;
+
+ fail:
+	*nxids = 0;
+	*xids = NULL;
+	if (restored_xids)
+		pfree(restored_xids);
+	if (prepared_xids)
+		pfree(prepared_xids);
+	return false;
+}
+
+
+/*
+ * pgbouncer is able to track GUCs reported by Postgres, but only the
+ * parameters that Postgres reports to the client can be tracked this way.
+ * Unfortunately `search_path` is not reported by Postgres:
+ * https://www.postgresql.org/message-id/flat/CAGECzQQ6xFcgrg%2Be0p9mCumtK362TiA6vTiiZKoYbS8OXggwuQ%40mail.gmail.com#be4bfd7a9cf1f0633bdb2d1790a0a1be
+ * This code sets the GUC_REPORT flag for `search_path`, making it possible to include it in
+ * pgbouncer's `track_extra_parameters` list. 
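+ *
+ * With the flag set, pgbouncer can then be told to track the parameter,
+ * e.g. with this (illustrative) pgbouncer.ini line:
+ *
+ *   track_extra_parameters = search_path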
+ *
+ * This code is inspired by how the Citus extension does this, see
+ * https://github.com/citusdata/citus/blob/2a263fe69a707d16ef24378f7650742386b0968f/src/backend/distributed/shared_library_init.c#L2694
+ */
+static void
+ReportSearchPath(void)
+{
+#if PG_VERSION_NUM >= 160000
+	int			nGucs = 0;
+	struct config_generic **gucs = get_guc_variables(&nGucs);
+#else
+	struct config_generic **gucs = get_guc_variables();
+	int			nGucs = GetNumConfigOptions();
+#endif
+
+	for (int i = 0; i < nGucs; i++)
+	{
+		struct config_generic *guc = (struct config_generic *) gucs[i];
+
+		if (strcmp(guc->name, "search_path") == 0)
+		{
+			guc->flags |= GUC_REPORT;
+		}
+	}
+}
+
 void _PG_init(void) { @@ -43,17 +639,38 @@ _PG_init(void) pg_init_libpagestore(); pg_init_walproposer(); + WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; + LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; + SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; + + InitLogicalReplicationMonitor(); InitControlPlaneConnector(); pg_init_extension_server(); + restore_running_xacts_callback = RestoreRunningXactsFromClog; + + + DefineCustomEnumVariable( + "neon.running_xacts_overflow_policy", + "Action performed on snapshot overflow when restoring running xacts from CLOG", + NULL, + &running_xacts_overflow_policy, + OP_IGNORE, + running_xacts_overflow_policies, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the * extension was loaded will be removed. */ EmitWarningsOnPlaceholders("neon"); + + ReportSearchPath(); } PG_FUNCTION_INFO_V1(pg_cluster_size); @@ -65,7 +682,7 @@ pg_cluster_size(PG_FUNCTION_ARGS) { int64 size; - size = GetZenithCurrentClusterSize(); + size = GetNeonCurrentClusterSize(); if (size == 0) PG_RETURN_NULL(); diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control index 4e4cb9f372..03bdb9a0b4 100644 --- a/pgxn/neon/neon.control +++ b/pgxn/neon/neon.control @@ -1,5 +1,6 @@ # neon extension comment = 'cloud storage for PostgreSQL' -default_version = '1.1' +default_version = '1.4' module_pathname = '$libdir/neon' relocatable = true +trusted = true diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index c3afecc679..5c653fc6c6 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -25,12 +25,13 @@ extern int wal_acceptor_connection_timeout; extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); -extern void pg_init_extension_server(void); - extern uint64 BackpressureThrottlingTime(void); +extern void SetNeonCurrentClusterSize(uint64 size); +extern uint64 GetNeonCurrentClusterSize(void); extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]); extern void PGDLLEXPORT WalProposerMain(Datum main_arg); +PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg); #endif /* NEON_H */ diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index f19732cbbb..addb6ccce6 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -54,6 +54,10 @@ #define BufTagGetNRelFileInfo(tag) tag.rnode +#define BufTagGetRelNumber(tagp) ((tagp)->rnode.relNode) + +#define InvalidRelFileNumber InvalidOid + #define SMgrRelGetRelInfo(reln) \ (reln->smgr_rnode.node) diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c index 9135847aaf..1fb4ed9522 100644 --- 
a/pgxn/neon/neon_utils.c +++ b/pgxn/neon/neon_utils.c @@ -1,8 +1,12 @@ - #include +#ifndef WALPROPOSER_LIB +#include <curl/curl.h> +#endif + #include "postgres.h" +#include "neon_utils.h" #include "lib/stringinfo.h" #include "libpq/pqformat.h" @@ -11,7 +15,7 @@ * * Returns -1 if the character is not a hexadecimal digit. */ -int +static int HexDecodeChar(char c) { if (c >= '0' && c <= '9') @@ -114,3 +118,48 @@ disable_core_dump() fprintf(stderr, "WARNING: disable cores setrlimit failed: %s", strerror(save_errno)); } }
+
+#ifndef WALPROPOSER_LIB
+
+/*
+ * On macOS with a libcurl that has IPv6 support, curl_global_init() calls
+ * SCDynamicStoreCopyProxies(), which makes the program multithreaded. An ideal
+ * place to call curl_global_init() would be _PG_init(), but Neon has to be
+ * added to shared_preload_libraries, which are loaded in the Postmaster
+ * process. The Postmaster is not supposed to become multithreaded at any point
+ * in its lifecycle. Postgres doesn't have any good hook that I know of to
+ * initialize per-backend structures, so we have to check this on any
+ * allocation of a CURL handle.
+ *
+ * Free the allocated CURL handle with curl_easy_cleanup(3).
+ *
+ * https://developer.apple.com/documentation/systemconfiguration/1517088-scdynamicstorecopyproxies
+ */
+CURL *
+alloc_curl_handle(void)
+{
+	static bool curl_initialized = false;
+
+	CURL	   *handle;
+
+	if (unlikely(!curl_initialized))
+	{
+		/* Protected by mutex internally */
+		if (curl_global_init(CURL_GLOBAL_DEFAULT))
+		{
+			elog(ERROR, "Failed to initialize curl");
+		}
+
+		curl_initialized = true;
+	}
+
+	handle = curl_easy_init();
+	if (handle == NULL)
+	{
+		elog(ERROR, "Failed to initialize curl handle");
+	}
+
+	return handle;
+}
+
+#endif diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h index a86f1e061c..89683714f1 100644 --- a/pgxn/neon/neon_utils.h +++ b/pgxn/neon/neon_utils.h @@ -1,11 +1,23 @@ #ifndef __NEON_UTILS_H__ #define __NEON_UTILS_H__ +#include "lib/stringinfo.h" + +#ifndef WALPROPOSER_LIB +#include <curl/curl.h> +#endif + bool HexDecodeString(uint8 *result, char *input, int nbytes); uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); void pq_sendint32_le(StringInfo buf, uint32 i); void pq_sendint64_le(StringInfo buf, uint64 i); -extern void disable_core_dump(); +void disable_core_dump(void); + +#ifndef WALPROPOSER_LIB + +CURL * alloc_curl_handle(void); + +#endif #endif /* __NEON_UTILS_H__ */ diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index f7ec9e5bfa..b575712dbe 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -36,10 +36,7 @@ static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state); -static void NeonWALReaderResetRemote(NeonWALReader *state); static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); -static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p); -static void neon_wal_segment_close(NeonWALReader *state); static bool is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli); @@ -82,8 +79,9 @@ struct NeonWALReader XLogRecPtr req_lsn; Size req_len; Size req_progress; - WalProposer *wp; /* we learn donor through walproposer */ + char donor_conninfo[MAXCONNINFO]; char donor_name[64]; /* saved donor safekeeper name for logging */ + XLogRecPtr donor_lsn; /* state of connection to safekeeper 
*/ NeonWALReaderRemoteState rem_state; WalProposerConn *wp_conn; @@ -107,15 +105,16 @@ struct NeonWALReader /* palloc and initialize NeonWALReader */ NeonWALReader * -NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix) +NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix) { NeonWALReader *reader; + /* + * Note: we allocate in TopMemoryContext, reusing the reader for all reads + * in this process. + */ reader = (NeonWALReader *) - palloc_extended(sizeof(NeonWALReader), - MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO); - if (!reader) - return NULL; + MemoryContextAllocZero(TopMemoryContext, sizeof(NeonWALReader)); reader->available_lsn = available_lsn; reader->seg.ws_file = -1; @@ -123,8 +122,6 @@ NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalPropose reader->seg.ws_tli = 0; reader->segcxt.ws_segsize = wal_segment_size; - reader->wp = wp; - reader->rem_state = RS_NONE; if (log_prefix) @@ -188,8 +185,8 @@ NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, Ti } else if (state->wre_errno == ENOENT) { - nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote", - LSN_FORMAT_ARGS(startptr)); + nwr_log(LOG, "local read at %X/%X len %zu failed as segment file doesn't exist, attempting remote", + LSN_FORMAT_ARGS(startptr), count); return NeonWALReadRemote(state, buf, startptr, count, tli); } else @@ -204,21 +201,16 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou { if (state->rem_state == RS_NONE) { - XLogRecPtr donor_lsn; - - /* no connection yet; start one */ - Safekeeper *donor = GetDonor(state->wp, &donor_lsn); - - if (donor == NULL) + if (!NeonWALReaderUpdateDonor(state)) { snprintf(state->err_msg, sizeof(state->err_msg), "failed to establish remote connection to fetch WAL: no donor available"); return NEON_WALREAD_ERROR; + } - snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port); - nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL", - state->donor_name, LSN_FORMAT_ARGS(donor_lsn)); - state->wp_conn = libpqwp_connect_start(donor->conninfo); + /* no connection yet; start one */ + nwr_log(LOG, "establishing connection to %s, lsn=%X/%X to fetch WAL", state->donor_name, LSN_FORMAT_ARGS(state->donor_lsn)); + state->wp_conn = libpqwp_connect_start(state->donor_conninfo); if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD) { snprintf(state->err_msg, sizeof(state->err_msg), @@ -228,7 +220,8 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou return NEON_WALREAD_ERROR; } /* we'll poll immediately */ - state->rem_state = RS_CONNECTING_READ; + state->rem_state = RS_CONNECTING_WRITE; + return NEON_WALREAD_WOULDBLOCK; } if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE) @@ -251,10 +244,22 @@ { /* connection successfully established */ char start_repl_query[128]; + term_t term = pg_atomic_read_u64(&GetWalpropShmemState()->mineLastElectedTerm); + /* + * Set elected walproposer's term to pull only data from + * its history. Note: for logical walsender it means we + * might stream WAL not yet committed by safekeepers. It + * would be cleaner to fix this. + * + * mineLastElectedTerm shouldn't be 0 at this point + * because we checked above that a donor exists, and it + * appears only after a successful election. 
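+ *
+ * The query built below ends up looking like this (illustrative
+ * values): START_REPLICATION PHYSICAL 0/1A2B3C4D (term='42')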
+ */ + Assert(term > 0); snprintf(start_repl_query, sizeof(start_repl_query), "START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')", - LSN_FORMAT_ARGS(startptr), state->wp->propTerm); + LSN_FORMAT_ARGS(startptr), term); nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s", state->donor_name, start_repl_query); if (!libpqwp_send_query(state->wp_conn, start_repl_query)) @@ -404,6 +409,10 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou state->req_lsn = InvalidXLogRecPtr; state->req_len = 0; state->req_progress = 0; + + /* Update the current segment info. */ + state->seg.ws_tli = tli; + return NEON_WALREAD_SUCCESS; } } @@ -526,7 +535,7 @@ err: } /* reset remote connection and request in progress */ -static void +void NeonWALReaderResetRemote(NeonWALReader *state) { state->req_lsn = InvalidXLogRecPtr; @@ -607,6 +616,7 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun uint32 startoff; int segbytes; int readbytes; + XLogSegNo lastRemovedSegNo; startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); @@ -682,6 +692,23 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun return false; } + /* + * Recheck that the segment hasn't been removed while we were reading + * it. + */ + lastRemovedSegNo = XLogGetLastRemovedSegno(); + if (state->seg.ws_segno <= lastRemovedSegNo) + { + char fname[MAXFNAMELEN]; + + state->wre_errno = ENOENT; + + XLogFileName(fname, tli, state->seg.ws_segno, state->segcxt.ws_segsize); + snprintf(state->err_msg, sizeof(state->err_msg), "WAL segment %s has been removed during the read, lastRemovedSegNo " UINT64_FORMAT, + fname, lastRemovedSegNo); + return false; + } + /* Update state for read */ recptr += readbytes; nbytes -= readbytes; @@ -691,13 +718,25 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun return true; } +XLogRecPtr +NeonWALReaderGetRemLsn(NeonWALReader *state) +{ + return state->rem_lsn; +} + +const WALOpenSegment * +NeonWALReaderGetSegment(NeonWALReader *state) +{ + return &state->seg; +} + /* * Copy of vanilla wal_segment_open, but returns false in case of error instead * of ERROR, with errno set. 
* * XLogReaderRoutine->segment_open callback for local pg_wal files */ -static bool +bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p) { @@ -724,7 +763,7 @@ is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli) } /* copy of vanilla wal_segment_close with NeonWALReader */ -static void +void neon_wal_segment_close(NeonWALReader *state) { if (state->seg.ws_file >= 0) @@ -740,3 +779,19 @@ NeonWALReaderErrMsg(NeonWALReader *state) { return state->err_msg; } + +/* + * Returns true if there is a donor, and false otherwise + */ +bool +NeonWALReaderUpdateDonor(NeonWALReader *state) +{ + WalproposerShmemState *wps = GetWalpropShmemState(); + + SpinLockAcquire(&wps->mutex); + memcpy(state->donor_name, wps->donor_name, sizeof(state->donor_name)); + memcpy(state->donor_conninfo, wps->donor_conninfo, sizeof(state->donor_conninfo)); + state->donor_lsn = wps->donor_lsn; + SpinLockRelease(&wps->mutex); + return state->donor_name[0] != '\0'; +} diff --git a/pgxn/neon/neon_walreader.h b/pgxn/neon/neon_walreader.h index 6be9f149aa..3e41825069 100644 --- a/pgxn/neon/neon_walreader.h +++ b/pgxn/neon/neon_walreader.h @@ -19,12 +19,19 @@ typedef enum NEON_WALREAD_ERROR, } NeonWALReadResult; -extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix); +extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix); extern void NeonWALReaderFree(NeonWALReader *state); +extern void NeonWALReaderResetRemote(NeonWALReader *state); extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); extern pgsocket NeonWALReaderSocket(NeonWALReader *state); extern uint32 NeonWALReaderEvents(NeonWALReader *state); extern bool NeonWALReaderIsRemConnEstablished(NeonWALReader *state); extern char *NeonWALReaderErrMsg(NeonWALReader *state); +extern XLogRecPtr NeonWALReaderGetRemLsn(NeonWALReader *state); +extern const WALOpenSegment *NeonWALReaderGetSegment(NeonWALReader *state); +extern bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p); +extern void neon_wal_segment_close(NeonWALReader *state); +extern bool NeonWALReaderUpdateDonor(NeonWALReader *state); + #endif /* __NEON_WALREADER_H__ */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 8c02f357bc..1f196d016c 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -15,6 +15,7 @@ #include "neon_pgversioncompat.h" +#include "access/slru.h" #include "access/xlogdefs.h" #include RELFILEINFO_HDR #include "lib/stringinfo.h" @@ -34,6 +35,7 @@ typedef enum T_NeonNblocksRequest, T_NeonGetPageRequest, T_NeonDbSizeRequest, + T_NeonGetSlruSegmentRequest, /* pagestore -> pagestore_client */ T_NeonExistsResponse = 100, @@ -41,6 +43,7 @@ typedef enum T_NeonGetPageResponse, T_NeonErrorResponse, T_NeonDbSizeResponse, + T_NeonGetSlruSegmentResponse, } NeonMessageTag; /* base struct for c-style inheritance */ @@ -59,18 +62,39 @@ typedef struct (errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \ errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0))) -/* - * supertype of all the Neon*Request structs below +/* SLRUs downloadable from page server */ +typedef enum { + SLRU_CLOG, + SLRU_MULTIXACT_MEMBERS, + SLRU_MULTIXACT_OFFSETS +} SlruKind; + +/*-- + * supertype of all the Neon*Request structs below. 
* - * If 'latest' is true, we are requesting the latest page version, and 'lsn' - * is just a hint to the server that we know there are no versions of the page - * (or relation size, for exists/nblocks requests) later than the 'lsn'. + * All requests contain two LSNs: + * + * lsn: request page (or relation size, etc) at this LSN + * not_modified_since: Hint that the page hasn't been modified between + * this LSN and the request LSN (`lsn`). + * + * To request the latest version of a page, you can use MAX_LSN as the request + * LSN. + * + * If you don't know any better, you can always set 'not_modified_since' equal + * to 'lsn', but providing a lower value can speed up processing the request + * in the pageserver, as it doesn't need to wait for the WAL to arrive, and it + * can skip traversing through recent layers which are known not to contain + * any versions of the requested page. + * + * These structs describe V2 of these requests. (The old now-defunct V1 + * protocol contained just one LSN and a boolean 'latest' flag.) */ typedef struct { NeonMessageTag tag; - bool latest; /* if true, request latest page version */ - XLogRecPtr lsn; /* request page version @ this LSN */ + XLogRecPtr lsn; + XLogRecPtr not_modified_since; } NeonRequest; typedef struct @@ -101,6 +125,13 @@ BlockNumber blkno; } NeonGetPageRequest; +typedef struct +{ + NeonRequest req; + SlruKind kind; + int segno; +} NeonGetSlruSegmentRequest; + /* supertype of all the Neon*Response structs below */ typedef struct { @@ -140,6 +171,14 @@ * message */ } NeonErrorResponse; +typedef struct +{ + NeonMessageTag tag; + int n_blocks; + char data[BLCKSZ * SLRU_PAGES_PER_SEGMENT]; +} NeonGetSlruSegmentResponse; + + extern StringInfoData nm_pack_request(NeonRequest *msg); extern NeonResponse *nm_unpack_response(StringInfo s); extern char *nm_to_string(NeonMessage *msg); @@ -155,6 +194,7 @@ typedef struct bool (*send) (shardno_t shard_no, NeonRequest * request); NeonResponse *(*receive) (shardno_t shard_no); bool (*flush) (shardno_t shard_no); + void (*disconnect) (shardno_t shard_no); } page_server_api; extern void prefetch_on_ps_disconnect(void); @@ -167,6 +207,7 @@ extern int readahead_buffer_size; extern char *neon_timeline; extern char *neon_tenant; extern int32 max_cluster_size; +extern int neon_protocol_version; extern shardno_t get_shard_number(BufferTag* tag); @@ -195,18 +236,50 @@ extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum, extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +/* + * LSN values associated with each request to the pageserver + */ +typedef struct +{ + /* + * 'request_lsn' is the main value that determines which page version to + * fetch. + */ + XLogRecPtr request_lsn; + + /* + * A hint to the pageserver that the requested page hasn't been modified + * between this LSN and 'request_lsn'. That allows the pageserver to + * return the page faster, without waiting for 'request_lsn' to arrive in + * the pageserver, as long as 'not_modified_since' has arrived. + */ + XLogRecPtr not_modified_since; + + /* + * 'effective_request_lsn' is not included in the request that's sent to + * the pageserver, but is used to keep track of what the latest LSN was + * when the request was made. 
In a standby server, this is always the same as the + * 'request_lsn', but in the primary we use UINT64_MAX as the + * 'request_lsn' to request the latest page version, so we need this + * separate field to remember what the latest LSN was when the request was + * made. It's needed to manage prefetch requests, to verify whether the + * response to a prefetched request is still valid. + */ + XLogRecPtr effective_request_lsn; +} neon_request_lsns; + #if PG_MAJORVERSION_NUM < 16 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); + neon_request_lsns request_lsns, char *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); #else extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer); + neon_request_lsns request_lsns, void *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync); #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 1fa802e6f4..7f39c7d026 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -45,6 +45,7 @@ */ #include "postgres.h" +#include "access/parallel.h" #include "access/xact.h" #include "access/xlog.h" #include "access/xlogdefs.h" @@ -93,6 +94,10 @@ static char *hexdump_page(char *page); const int SmgrTrace = DEBUG5; +#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) 
\ + neon_shard_log(shard_no, elvl, "Broken connection state: " message, \ + ##__VA_ARGS__) + page_server_api *page_server; /* unlogged relation build states */ @@ -168,8 +173,7 @@ typedef enum PrefetchStatus typedef struct PrefetchRequest { BufferTag buftag; /* must be first entry in the struct */ - XLogRecPtr effective_request_lsn; - XLogRecPtr actual_request_lsn; + neon_request_lsns request_lsns; NeonResponse *response; /* may be null */ PrefetchStatus status; shardno_t shard_no; @@ -269,19 +273,18 @@ static PrefetchState *MyPState; ) \ ) -static XLogRecPtr prefetch_lsn = 0; - static bool compact_prefetch_buffers(void); static void consume_prefetch_responses(void); -static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn); +static uint64 prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns); static bool prefetch_read(PrefetchRequest *slot); -static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn); +static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns); static bool prefetch_wait_for(uint64 ring_index); static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); -static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, - ForkNumber forknum, BlockNumber blkno); +static neon_request_lsns neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno); +static bool neon_prefetch_response_usable(neon_request_lsns request_lsns, + PrefetchRequest *slot); static bool compact_prefetch_buffers(void) @@ -338,8 +341,7 @@ compact_prefetch_buffers(void) target_slot->shard_no = source_slot->shard_no; target_slot->status = source_slot->status; target_slot->response = source_slot->response; - target_slot->effective_request_lsn = source_slot->effective_request_lsn; - target_slot->actual_request_lsn = source_slot->actual_request_lsn; + target_slot->request_lsns = source_slot->request_lsns; target_slot->my_ring_index = empty_ring_index; prfh_delete(MyPState->prf_hash, source_slot); @@ -358,7 +360,9 @@ compact_prefetch_buffers(void) }; source_slot->response = NULL; source_slot->my_ring_index = 0; - source_slot->effective_request_lsn = 0; + source_slot->request_lsns = (neon_request_lsns) { + InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr + }; /* update bookkeeping */ n_moved++; @@ -526,6 +530,8 @@ prefetch_flush_requests(void) * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. + * NOTE: callers should make sure they can handle query cancellations in this + * function's call path. */ static bool prefetch_wait_for(uint64 ring_index) @@ -561,6 +567,8 @@ prefetch_wait_for(uint64 ring_index) * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. + * + * NOTE: this does IO, and can get canceled out-of-line. 
*/ static bool prefetch_read(PrefetchRequest *slot) @@ -572,6 +580,14 @@ prefetch_read(PrefetchRequest *slot) Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_receive); + if (slot->status != PRFS_REQUESTED || + slot->response != NULL || + slot->my_ring_index != MyPState->ring_receive) + neon_shard_log(slot->shard_no, ERROR, + "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu", + slot->status, slot->response, + (long)slot->my_ring_index, (long)MyPState->ring_receive); + old = MemoryContextSwitchTo(MyPState->errctx); response = (NeonResponse *) page_server->receive(slot->shard_no); MemoryContextSwitchTo(old); @@ -589,6 +605,11 @@ } else { + neon_shard_log(slot->shard_no, LOG, + "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", + (long)slot->my_ring_index, + RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)), + slot->buftag.forkNum, slot->buftag.blockNum); return false; } } @@ -603,6 +624,7 @@ void prefetch_on_ps_disconnect(void) { MyPState->ring_flush = MyPState->ring_unused; + while (MyPState->ring_receive < MyPState->ring_unused) { PrefetchRequest *slot; @@ -613,10 +635,19 @@ prefetch_on_ps_disconnect(void) Assert(slot->status == PRFS_REQUESTED); Assert(slot->my_ring_index == ring_index); + /* + * Drop the connection to all shards that have prefetch requests. + * It is not a problem to call disconnect multiple times on the same connection + * because the disconnect implementation in libpagestore.c checks whether the + * connection is alive and does nothing if it was already dropped. + */ + page_server->disconnect(slot->shard_no); + /* clean up the request */ slot->status = PRFS_TAG_REMAINS; MyPState->n_requests_inflight -= 1; MyPState->ring_receive += 1; + prefetch_set_unused(ring_index); } } @@ -633,13 +664,12 @@ static inline void prefetch_set_unused(uint64 ring_index) { - PrefetchRequest *slot = GetPrfSlot(ring_index); + PrefetchRequest *slot; if (ring_index < MyPState->ring_last) return; /* Should already be unused */ - Assert(MyPState->ring_unused > ring_index); - + slot = GetPrfSlot(ring_index); if (slot->status == PRFS_UNUSED) return; @@ -676,61 +706,43 @@ compact_prefetch_buffers(); } +/* + * Send one prefetch request to the pageserver. To wait for the response, call + * prefetch_wait_for(). 
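+ *
+ * A minimal caller sketch (this is roughly what prefetch_register_buffer
+ * does for a new slot, simplified):
+ *
+ *   slot->my_ring_index = MyPState->ring_unused;
+ *   prefetch_do_request(slot, NULL);         /* send the request */
+ *   prefetch_wait_for(slot->my_ring_index);  /* wait for the response */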
+ */ static void -prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn) +prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns) { bool found; + uint64 mySlotNo = slot->my_ring_index; + NeonGetPageRequest request = { .req.tag = T_NeonGetPageRequest, - .req.latest = false, - .req.lsn = 0, + /* lsn and not_modified_since are filled in below */ .rinfo = BufTagGetNRelFileInfo(slot->buftag), .forknum = slot->buftag.forkNum, .blkno = slot->buftag.blockNum, }; - if (force_lsn && force_latest) - { - request.req.lsn = *force_lsn; - request.req.latest = *force_latest; - slot->actual_request_lsn = slot->effective_request_lsn = *force_lsn; - } - else - { - XLogRecPtr lsn = neon_get_request_lsn( - &request.req.latest, - BufTagGetNRelFileInfo(slot->buftag), - slot->buftag.forkNum, - slot->buftag.blockNum - ); + Assert(mySlotNo == MyPState->ring_unused); - /* - * Note: effective_request_lsn is potentially higher than the - * requested LSN, but still correct: - * - * We know there are no changes between the actual requested LSN and - * the value of effective_request_lsn: If there were, the page would - * have been in cache and evicted between those LSN values, which then - * would have had to result in a larger request LSN for this page. - * - * It is possible that a concurrent backend loads the page, modifies - * it and then evicts it again, but the LSN of that eviction cannot be - * smaller than the current WAL insert/redo pointer, which is already - * larger than this prefetch_lsn. So in any case, that would - * invalidate this cache. - * - * The best LSN to use for effective_request_lsn would be - * XLogCtl->Insert.RedoRecPtr, but that's expensive to access. - */ - slot->actual_request_lsn = request.req.lsn = lsn; - prefetch_lsn = Max(prefetch_lsn, lsn); - slot->effective_request_lsn = prefetch_lsn; - } + if (force_request_lsns) + slot->request_lsns = *force_request_lsns; + else + slot->request_lsns = neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), + slot->buftag.forkNum, + slot->buftag.blockNum); + request.req.lsn = slot->request_lsns.request_lsn; + request.req.not_modified_since = slot->request_lsns.not_modified_since; Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_unused); - while (!page_server->send(slot->shard_no, (NeonRequest *) &request)); + while (!page_server->send(slot->shard_no, (NeonRequest *) &request)) + { + Assert(mySlotNo == MyPState->ring_unused); + /* loop */ + } /* update prefetch state */ MyPState->n_requests_inflight += 1; @@ -741,8 +753,6 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force /* update slot state */ slot->status = PRFS_REQUESTED; - - prfh_insert(MyPState->prf_hash, slot, &found); Assert(!found); } @@ -752,16 +762,16 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force * * Register that we may want the contents of BufferTag in the near future. * - * If force_latest and force_lsn are not NULL, those values are sent to the - * pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure - * to fill in these values manually. + * If force_request_lsns is not NULL, those values are sent to the + * pageserver. If NULL, we utilize the lastWrittenLsn infrastructure + * to calculate the LSNs to send. * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. 
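+ *
+ * For example, to prefetch a page as of one exact LSN, a caller could pass
+ * (hypothetical value 'lsn'):
+ *
+ *   neon_request_lsns lsns = { lsn, lsn, lsn };
+ *   (void) prefetch_register_buffer(tag, &lsns);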
*/ static uint64 -prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn) +prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns) { uint64 ring_index; PrefetchRequest req; @@ -785,38 +795,18 @@ Retry: Assert(BUFFERTAGS_EQUAL(slot->buftag, tag)); /* - * If we want a specific lsn, we do not accept requests that were made - * with a potentially different LSN. + * If the caller specified a request LSN to use, only accept prefetch + * responses that satisfy that request. */ - if (force_latest && force_lsn) + if (force_request_lsns) { - /* - * if we want the latest version, any effective_request_lsn < - * request lsn is OK - */ - if (*force_latest) + if (!neon_prefetch_response_usable(*force_request_lsns, slot)) { - if (*force_lsn > slot->effective_request_lsn) - { - prefetch_wait_for(ring_index); - prefetch_set_unused(ring_index); - entry = NULL; - } - - } - - /* - * if we don't want the latest version, only accept requests with - * the exact same LSN - */ - else - { - if (*force_lsn != slot->effective_request_lsn) - { - prefetch_wait_for(ring_index); - prefetch_set_unused(ring_index); - entry = NULL; - } + /* Wait for the old request to finish and discard it */ + if (!prefetch_wait_for(ring_index)) + goto Retry; + prefetch_set_unused(ring_index); + entry = NULL; } } @@ -879,7 +869,8 @@ { case PRFS_REQUESTED: Assert(MyPState->ring_receive == cleanup_index); - prefetch_wait_for(cleanup_index); + if (!prefetch_wait_for(cleanup_index)) + goto Retry; prefetch_set_unused(cleanup_index); break; case PRFS_RECEIVED: @@ -911,7 +902,7 @@ slot->shard_no = get_shard_number(&tag); slot->my_ring_index = ring_index; - prefetch_do_request(slot, force_latest, force_lsn); + prefetch_do_request(slot, force_request_lsns); Assert(slot->status == PRFS_REQUESTED); Assert(MyPState->ring_last <= ring_index && ring_index < MyPState->ring_unused); @@ -933,6 +924,10 @@ Retry: return ring_index; } +/* + * Note: this function can get canceled and use a long jump to the next catch + * context. Take care. + */ static NeonResponse * page_server_request(void const *req) { NeonResponse *resp; BufferTag tag = {0}; shardno_t shard_no; - switch (((NeonRequest *) req)->tag) + switch (messageTag(req)) { case T_NeonExistsRequest: CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo); break; @@ tag.blockNum = ((NeonGetPageRequest *) req)->blkno; break; default: - neon_log(ERROR, "Unexpected request tag: %d", ((NeonRequest *) req)->tag); + neon_log(ERROR, "Unexpected request tag: %d", messageTag(req)); } shard_no = get_shard_number(&tag); - /* * The current sharding model assumes that all metadata is present only on shard 0. * We still need to call get_shard_number() to check if the shard map is up-to-date. 
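+ *
+ * Concretely: only a GetPage request for a MAIN_FORKNUM block keeps the
+ * shard computed by get_shard_number(); every other request handled here
+ * is forced to shard 0 below.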
*/ - if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM) + if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || + ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM) { shard_no = 0; } do { - while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no)); - consume_prefetch_responses(); - resp = page_server->receive(shard_no); - } while (resp == NULL); - return resp; + PG_TRY(); + { + while (!page_server->send(shard_no, (NeonRequest *) req) + || !page_server->flush(shard_no)) + { + /* do nothing */ + } + consume_prefetch_responses(); + resp = page_server->receive(shard_no); + } + PG_CATCH(); + { + /* + * Cancellation in this code needs to be handled better at some + * point, but this currently seems fine for now. + */ + page_server->disconnect(shard_no); + PG_RE_THROW(); + } + PG_END_TRY(); + } while (resp == NULL); + + return resp; } @@ -987,7 +1000,10 @@ nm_pack_request(NeonRequest *msg) StringInfoData s; initStringInfo(&s); + pq_sendbyte(&s, msg->tag); + pq_sendint64(&s, msg->lsn); + pq_sendint64(&s, msg->not_modified_since); switch (messageTag(msg)) { @@ -996,8 +1012,6 @@ nm_pack_request(NeonRequest *msg) { NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -1009,8 +1023,6 @@ nm_pack_request(NeonRequest *msg) { NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -1022,8 +1034,6 @@ nm_pack_request(NeonRequest *msg) { NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, msg_req->dbNode); break; @@ -1032,8 +1042,6 @@ nm_pack_request(NeonRequest *msg) { NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -1043,12 +1051,23 @@ nm_pack_request(NeonRequest *msg) break; } + case T_NeonGetSlruSegmentRequest: + { + NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; + + pq_sendbyte(&s, msg_req->kind); + pq_sendint32(&s, msg_req->segno); + + break; + } + /* pagestore -> pagestore_client. We never need to create these. 
*/ case T_NeonExistsResponse: case T_NeonNblocksResponse: case T_NeonGetPageResponse: case T_NeonErrorResponse: case T_NeonDbSizeResponse: + case T_NeonGetSlruSegmentResponse: default: neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag); break; @@ -1135,6 +1154,20 @@ nm_unpack_response(StringInfo s) break; } + case T_NeonGetSlruSegmentResponse: + { + NeonGetSlruSegmentResponse *msg_resp; + int n_blocks = pq_getmsgint(s, 4); + msg_resp = palloc(sizeof(NeonGetSlruSegmentResponse)); + msg_resp->tag = tag; + msg_resp->n_blocks = n_blocks; + memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + /* * pagestore_client -> pagestore * @@ -1144,6 +1177,7 @@ nm_unpack_response(StringInfo s) case T_NeonNblocksRequest: case T_NeonGetPageRequest: case T_NeonDbSizeRequest: + case T_NeonGetSlruSegmentRequest: default: neon_log(ERROR, "unexpected neon message tag 0x%02x", tag); break; @@ -1171,7 +1205,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1184,7 +1218,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1198,7 +1232,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1209,11 +1243,22 @@ nm_to_string(NeonMessage *msg) appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } + case T_NeonGetSlruSegmentRequest: + { + NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; + appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\""); + appendStringInfo(&s, ", \"kind\": %u", msg_req->kind); + appendStringInfo(&s, ", \"segno\": %u", msg_req->segno); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); + appendStringInfoChar(&s, '}'); + break; + } /* pagestore -> pagestore_client */ case T_NeonExistsResponse: { @@ 
-1267,6 +1312,17 @@ nm_to_string(NeonMessage *msg) msg_resp->db_size); appendStringInfoChar(&s, '}'); + break; + } + case T_NeonGetSlruSegmentResponse: + { + NeonGetSlruSegmentResponse *msg_resp = (NeonGetSlruSegmentResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u", + msg_resp->n_blocks); appendStringInfoChar(&s, '}'); + + break; + } @@ -1305,6 +1361,10 @@ PageIsEmptyHeapPage(char *buffer) return memcmp(buffer, empty_page.data, BLCKSZ) == 0; } +/* + * A page is being evicted from the shared buffer cache. Update the + * last-written LSN of the page, and WAL-log it if needed. + */ static void #if PG_MAJORVERSION_NUM < 16 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force) @@ -1313,12 +1373,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co #endif { XLogRecPtr lsn = PageGetLSN((Page) buffer); - - if (ShutdownRequestPending) - return; - /* Don't log any pages if we're not allowed to do so. */ - if (!XLogInsertAllowed()) - return; + bool log_page; /* * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM @@ -1327,9 +1382,21 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co * correctness, the non-logged updates are not critical. But we want to * have a reasonably up-to-date VM and FSM in the page server. */ - if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress()) + log_page = false; + if (force) + { + Assert(XLogInsertAllowed()); + log_page = true; + } + else if (XLogInsertAllowed() && + !ShutdownRequestPending && + (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM)) + { + log_page = true; + } + + if (log_page) { - /* FSM is never WAL-logged and we don't care. */ XLogRecPtr recptr; recptr = log_newpage_copy(&InfoFromSMgrRel(reln), forknum, blocknum, @@ -1342,7 +1409,8 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, LSN_FORMAT_ARGS(lsn)))); } - else if (lsn == InvalidXLogRecPtr) + + if (lsn == InvalidXLogRecPtr) { /* * When PostgreSQL extends a relation, it calls smgrextend() with an @@ -1378,19 +1446,31 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); } - else + else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM) { - ereport(PANIC, + /* + * It's a bad sign if there is a page with zero LSN in the buffer + * cache in a standby, too. However, PANICing seems like a cure + * worse than the disease, as the damage has likely already been + * done in the primary. So in a standby, make this an assertion, + * and in a release build just LOG the error and soldier on. We + * update the last-written LSN of the page with a conservative + * value in that case, which is the last replayed LSN. + */ + ereport(RecoveryInProgress() ? 
LOG : PANIC, (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN", blocknum, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); + Assert(false); + + lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */ } } else { ereport(SmgrTrace, - (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X", + (errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X", blocknum, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, LSN_FORMAT_ARGS(lsn)))); @@ -1471,44 +1551,123 @@ nm_adjust_lsn(XLogRecPtr lsn) /* * Return LSN for requesting pages and number of blocks from page server */ -static XLogRecPtr -neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) +static neon_request_lsns +neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) { - XLogRecPtr lsn; + XLogRecPtr last_written_lsn; + neon_request_lsns result; + + last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno); + last_written_lsn = nm_adjust_lsn(last_written_lsn); + Assert(last_written_lsn != InvalidXLogRecPtr); if (RecoveryInProgress()) { - /* - * We don't know if WAL has been generated but not yet replayed, so - * we're conservative in our estimates about latest pages. + /*--- * In broad strokes, a replica always requests the page at the current + * replay LSN. But looking closer, what exactly is the replay LSN? Is + * it the last replayed record, or the record being replayed? And does + * the startup process performing the replay need to do something + * differently from backends running queries? Let's take a closer look + * at the different scenarios: + * + * 1. Startup process reads a page, last_written_lsn is old. + * + * Read the old version of the page. We will apply the WAL record on + * it to bring it up-to-date. + * + * We could read the new version, with the changes from this WAL + * record already applied, to offload the work of replaying the record + * to the pageserver. The pageserver might not have received the WAL + * record yet, though, so a read of the old page version and applying + * the record ourselves is likely faster. Also, the redo function + * might be surprised if the changes have already been applied. That's + * normal during crash recovery, but not in hot standby. + * + * 2. Startup process reads a page, last_written_lsn == record we're + * replaying. + * + * Can this happen? There are a few theoretical cases when it might: + * + * A) The redo function reads the same page twice. We had already read + * and applied the changes once, and now we're reading it for the + * second time. That would be a rather silly thing for a redo + * function to do, and I'm not aware of any that would do it. + * + * B) The redo function modifies multiple pages, and it already + * applied the changes to one of the pages, released the lock on + * it, and is now reading a second page. Furthermore, the first + * page was already evicted from the buffer cache, and also from + * the last-written LSN cache, so that the per-relation or global + * last-written LSN was already updated. All the WAL redo functions + * hold the locks on pages that they modify, until all the changes + * have been applied (?), which would make that impossible. + * However, we skip the locking if the page isn't currently in the + * page cache (see neon_redo_read_buffer_filter below). 
+ * + * Even if one of the above cases were possible in theory, they + * would also require the pages being modified by the redo function to + * be immediately evicted from the page cache. + * + * So this probably does not happen in practice. But if it does, we + * request the new version, including the changes from the record + * being replayed. That seems like the correct behavior in any case. + * + * 3. Backend process reads a page with old last-written LSN + * + * Nothing special here. Read the old version. + * + * 4. Backend process reads a page with last_written_lsn == record being replayed + * + * This can happen if the redo function has started to run, and saw + * that the page isn't present in the page cache (see + * neon_redo_read_buffer_filter below). In a normal + * Postgres server, the redo function would hold a lock on the page, + * so we would get blocked waiting for the redo function to release the + * lock. To emulate that, wait for the WAL replay of the record to + * finish. */ - *latest = false; + /* Request the page at the end of the last fully replayed LSN. */ + XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL); - /* - * Get the last written LSN of this page. - */ - lsn = GetLastWrittenLSN(rinfo, forknum, blkno); - lsn = nm_adjust_lsn(lsn); + if (last_written_lsn > replay_lsn) + { + /* GetCurrentReplayRecPtr was introduced in v15 */ +#if PG_VERSION_NUM >= 150000 + Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL)); +#endif - neon_log(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); + /* + * Cases 2 and 4. If this is a backend (case 4), the + * neon_read_at_lsn() call later will wait for the WAL record to be + * fully replayed. + */ + result.request_lsn = last_written_lsn; + } + else + { + /* cases 1 and 3 */ + result.request_lsn = replay_lsn; + } + result.not_modified_since = last_written_lsn; + result.effective_request_lsn = result.request_lsn; + Assert(last_written_lsn <= result.request_lsn); + + neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X", + LSN_FORMAT_ARGS(result.request_lsn), LSN_FORMAT_ARGS(result.not_modified_since)); } else { XLogRecPtr flushlsn; /* - * Use the latest LSN that was evicted from the buffer cache. Any - * pages modified by later WAL records must still in the buffer cache, - * so our request cannot concern those. + * Use the latest LSN that was evicted from the buffer cache as the + * 'not_modified_since' hint. Any pages modified by later WAL records + * must still be in the buffer cache, so our request cannot concern + * those. 
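+ * + * Illustrative example (made-up LSNs): if this page was last evicted and WAL-logged at 0/1500, any later modification would first have to pull the page back into shared buffers, so a request carrying not_modified_since = 0/1500 lets the pageserver serve its image of the page at that LSN without waiting for newer WAL to arrive. 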
*/ - *latest = true; - lsn = GetLastWrittenLSN(rinfo, forknum, blkno); - Assert(lsn != InvalidXLogRecPtr); - neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); - - lsn = nm_adjust_lsn(lsn); + neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X", + LSN_FORMAT_ARGS(last_written_lsn)); /* * Is it possible that the last-written LSN is ahead of last flush @@ -1523,16 +1682,144 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block #else flushlsn = GetFlushRecPtr(); #endif - if (lsn > flushlsn) + if (last_written_lsn > flushlsn) { neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", - (uint32) (lsn >> 32), (uint32) lsn, - (uint32) (flushlsn >> 32), (uint32) flushlsn); - XLogFlush(lsn); + LSN_FORMAT_ARGS(last_written_lsn), + LSN_FORMAT_ARGS(flushlsn)); + XLogFlush(last_written_lsn); + flushlsn = last_written_lsn; } + + /* + * Request the very latest version of the page. In principle we + * want to read the page at the current insert LSN, and we could + * use that value in the request. However, there's a corner case + * with pageserver's garbage collection. If the GC horizon is + * set to a very small value, it's possible that by the time + * that the pageserver processes our request, the GC horizon has + * already moved past the LSN we calculate here. Standby servers + * always have that problem as they can always lag behind the + * primary, but for the primary we can avoid it by always + * requesting the latest page, setting the request LSN to + * UINT64_MAX. + * + * Remember the current LSN, however, so that we can later + * correctly determine if the response to the request is still + * valid. The most up-to-date LSN we could use for that purpose + * would be the current insert LSN, but to avoid the overhead of + * looking it up, use 'flushlsn' instead. This relies on the + * assumption that if the page was modified since the last WAL + * flush, it should still be in the buffer cache, and we + * wouldn't be requesting it. + */ + result.request_lsn = UINT64_MAX; + result.not_modified_since = last_written_lsn; + result.effective_request_lsn = flushlsn; } - return lsn; + return result; } + +/* + * neon_prefetch_response_usable -- Can a new request be satisfied by old one? + * + * This is used to check if the response to a prefetch request can be used to + * satisfy a page read now. + */ +static bool +neon_prefetch_response_usable(neon_request_lsns request_lsns, + PrefetchRequest *slot) +{ + /* sanity check the LSNs on the old and the new request */ + Assert(request_lsns.request_lsn >= request_lsns.not_modified_since); + Assert(request_lsns.effective_request_lsn >= request_lsns.not_modified_since); + Assert(request_lsns.effective_request_lsn <= request_lsns.request_lsn); + Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since); + Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); + Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn); + Assert(slot->status != PRFS_UNUSED); + + /* + * The new request's LSN should never be older than the old one. This + * could be an Assert, except that for testing purposes, we do provide an + * interface in neon_test_utils to fetch pages at arbitrary LSNs, which + * violates this. + * + * Similarly, the not_modified_since value calculated for a page should + * never move backwards. 
This assumption is a bit fragile; if we updated + * the last-written cache when we read in a page, for example, then it + * might. But as the code stands, it should not. + * + * (If two backends issue a request at the same time, they might race and + * calculate LSNs "out of order" with each other, but the prefetch queue + * is backend-private at the moment.) + */ + if (request_lsns.effective_request_lsn < slot->request_lsns.effective_request_lsn || + request_lsns.not_modified_since < slot->request_lsns.not_modified_since) + { + ereport(LOG, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "request with unexpected LSN after prefetch"), + errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X", + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), + LSN_FORMAT_ARGS(request_lsns.not_modified_since), + LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn), + LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); + return false; + } + + /*--- * Each request to the pageserver has three LSN values associated with it: + * `not_modified_since`, `request_lsn`, and `effective_request_lsn`. + * `not_modified_since` and `request_lsn` are sent to the pageserver, but + * in the primary node, we always use UINT64_MAX as the `request_lsn`, so + * we remember `effective_request_lsn` separately. In a primary, + * `effective_request_lsn` is the last flushed WAL position at the time the + * request was sent to the pageserver. That's logically the LSN that we are + * requesting the page at, but we send UINT64_MAX to the pageserver so + * that if the GC horizon advances past that position, we still get a + * valid response instead of an error. + * + * To determine whether a response to a GetPage request issued earlier is + * still valid to satisfy a new page read, we look at the + * (not_modified_since, effective_request_lsn] range of the request. It is + * effectively a claim that the page has not been modified between those + * LSNs. If the range of the old request in the queue overlaps with the + * new request's range, we know that the page hasn't been modified in the + * union of the ranges. We can use the response to the old request to + * satisfy the new request in that case. For example: + * + * 100 500 + * Old request: +--------+ + * + * 400 800 + * New request: +--------+ + * + * The old request claims that the page was not modified between LSNs 100 + * and 500, and the new request claims that it was not modified between 400 + * and 800. Together they mean that the page was not modified between 100 and + * 800. Therefore the response to the old request is also valid for the + * new request. + * + * This logic also holds in the boundary case where the old request's LSN + * matches the new request's not_modified_since LSN exactly: + * + * 100 500 + * Old request: +--------+ + * + * 500 900 + * New request: +--------+ + * + * The response to the old request is the page as it was at LSN 500, and + * the page hasn't been changed in the range (500, 900], therefore the + * response is also valid for the new request. 
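+ * + * In short, given the monotonicity checks above, the old response is reusable iff new.not_modified_since <= old.effective_request_lsn, which is exactly the check below. 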
+ */ + + /* this follows from the checks above */ + Assert(request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); + + return request_lsns.not_modified_since <= slot->request_lsns.effective_request_lsn; } /* @@ -1544,8 +1831,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) bool exists; NeonResponse *resp; BlockNumber n_blocks; - bool latest; - XLogRecPtr request_lsn; + neon_request_lsns request_lsns; switch (reln->smgr_relpersistence) { @@ -1600,14 +1886,15 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO); + request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO); { NeonExistsRequest request = { .req.tag = T_NeonExistsRequest, - .req.latest = latest, - .req.lsn = request_lsn, + .req.lsn = request_lsns.request_lsn, + .req.not_modified_since = request_lsns.not_modified_since, .rinfo = InfoFromSMgrRel(reln), - .forknum = forkNum}; + .forknum = forkNum + }; resp = page_server_request(&request); } @@ -1624,13 +1911,15 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x", + T_NeonExistsResponse, T_NeonErrorResponse, resp->tag); } pfree(resp); return exists; @@ -1781,7 +2070,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && !IsAutoVacuumWorkerProcess()) { - uint64 current_size = GetZenithCurrentClusterSize(); + uint64 current_size = GetNeonCurrentClusterSize(); if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) ereport(ERROR, @@ -1838,7 +2127,6 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, int nblocks, bool skipFsync) { const PGAlignedBlock buffer = {0}; - BlockNumber curblocknum = blocknum; int remblocks = nblocks; XLogRecPtr lsn = 0; @@ -1863,7 +2151,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && !IsAutoVacuumWorkerProcess()) { - uint64 current_size = GetZenithCurrentClusterSize(); + uint64 current_size = GetNeonCurrentClusterSize(); if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) ereport(ERROR, @@ -1990,7 +2278,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln)); - ring_index = prefetch_register_buffer(tag, NULL, NULL); + ring_index = prefetch_register_buffer(tag, NULL); Assert(ring_index < MyPState->ring_unused && MyPState->ring_last <= ring_index); @@ -2043,10 +2331,10 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, void #if PG_MAJORVERSION_NUM < 16 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer) + neon_request_lsns request_lsns, char *buffer) #else neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber 
blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer) + neon_request_lsns request_lsns, void *buffer) #endif { NeonResponse *resp; @@ -2078,25 +2366,27 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * value of the LwLsn cache when the entry is not found. */ if (RecoveryInProgress() && !(MyBackendType == B_STARTUP)) - XLogWaitForReplayOf(request_lsn); + XLogWaitForReplayOf(request_lsns.request_lsn); /* * Try to find prefetched page in the list of received pages. */ +Retry: entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); if (entry != NULL) { slot = entry->slot; - if (slot->effective_request_lsn >= request_lsn) + if (neon_prefetch_response_usable(request_lsns, slot)) { ring_index = slot->my_ring_index; pgBufferUsage.prefetch.hits += 1; } - else /* the current prefetch LSN is not large - * enough, so drop the prefetch */ + else { /* + * Cannot use this prefetch, discard it + * * We can't drop cache for not-yet-received requested items. It is * unlikely this happens, but it can happen if prefetch distance * is large enough and a backend didn't consume all prefetch @@ -2104,7 +2394,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, */ if (slot->status == PRFS_REQUESTED) { - prefetch_wait_for(slot->my_ring_index); + if (!prefetch_wait_for(slot->my_ring_index)) + goto Retry; } /* drop caches */ prefetch_set_unused(slot->my_ring_index); @@ -2120,8 +2411,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { pgBufferUsage.prefetch.misses += 1; - ring_index = prefetch_register_buffer(buftag, &request_latest, - &request_lsn); + ring_index = prefetch_register_buffer(buftag, &request_lsns); slot = GetPrfSlot(ring_index); } else @@ -2162,12 +2452,14 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, slot->shard_no, blkno, RelFileInfoFmt(rinfo), forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag); } /* buffer was used, clean up for later reuse */ @@ -2185,8 +2477,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) #endif { - bool latest; - XLogRecPtr request_lsn; + neon_request_lsns request_lsns; switch (reln->smgr_relpersistence) { @@ -2211,8 +2502,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, blkno); - neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, latest, buffer); + request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno); + neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -2381,8 +2672,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) { NeonResponse *resp; BlockNumber n_blocks; - bool latest; - XLogRecPtr request_lsn; + neon_request_lsns request_lsns; switch (reln->smgr_relpersistence) { @@ -2409,12 
+2699,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO); + request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO); { NeonNblocksRequest request = { .req.tag = T_NeonNblocksRequest, - .req.latest = latest, - .req.lsn = request_lsn, + .req.lsn = request_lsns.request_lsn, + .req.not_modified_since = request_lsns.not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forknum, }; @@ -2434,21 +2724,23 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x", + T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag); } update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - n_blocks); + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), + n_blocks); pfree(resp); return n_blocks; @@ -2462,16 +2754,15 @@ neon_dbsize(Oid dbNode) { NeonResponse *resp; int64 db_size; - XLogRecPtr request_lsn; - bool latest; + neon_request_lsns request_lsns; NRelFileInfo dummy_node = {0}; - request_lsn = neon_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); + request_lsns = neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); { NeonDbSizeRequest request = { .req.tag = T_NeonDbSizeRequest, - .req.latest = latest, - .req.lsn = request_lsn, + .req.lsn = request_lsns.request_lsn, + .req.not_modified_since = request_lsns.not_modified_since, .dbNode = dbNode, }; @@ -2488,20 +2779,19 @@ neon_dbsize(Oid dbNode) ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X", - dbNode, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x", + T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); } neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", - dbNode, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - db_size); + dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size); pfree(resp); return db_size; @@ -2544,7 +2834,6 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * the most recently inserted WAL record's LSN. 
*/ lsn = GetXLogInsertRecPtr(); - lsn = nm_adjust_lsn(lsn); /* @@ -2662,10 +2951,14 @@ neon_start_unlogged_build(SMgrRelation reln) reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; /* + * Create the local file. In a parallel build, the leader is expected to + * call this first and create it. + * * FIXME: should we pass isRedo true to create the tablespace dir if it * doesn't exist? Is it needed? */ - mdcreate(reln, MAIN_FORKNUM, false); + if (!IsParallelWorker()) + mdcreate(reln, MAIN_FORKNUM, false); } /* @@ -2689,7 +2982,17 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); - unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; + /* + * In a parallel build, (only) the leader process performs the 2nd + * phase. + */ + if (IsParallelWorker()) + { + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + } + else + unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; } /* @@ -2739,6 +3042,99 @@ neon_end_unlogged_build(SMgrRelation reln) unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; } +#define STRPREFIX(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0) + +static int +neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer) +{ + XLogRecPtr request_lsn, + not_modified_since; + + /* + * Compute a request LSN to use, similar to neon_get_request_lsns() but the + * logic is a bit simpler. + */ + if (RecoveryInProgress()) + { + request_lsn = GetXLogReplayRecPtr(NULL); + if (request_lsn == InvalidXLogRecPtr) + { + /* + * This happens during neon startup: we start up without replaying + * any records. + */ + request_lsn = GetRedoStartLsn(); + } + request_lsn = nm_adjust_lsn(request_lsn); + } + else + request_lsn = UINT64_MAX; + + /* + * GetRedoStartLsn() returns the LSN of the basebackup. We know that the SLRU + * segment has not changed since the basebackup, because in order to + * modify it, we would have had to download it already. And once + * downloaded, we never evict SLRU segments from local disk. 
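+ * + * (Illustrative example with a made-up LSN: if the basebackup was taken at 0/3000, every SLRU segment present locally is exactly its image as of 0/3000, so not_modified_since = 0/3000 is a safe hint.) 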
+ */ + not_modified_since = nm_adjust_lsn(GetRedoStartLsn()); + + SlruKind kind; + + if (STRPREFIX(path, "pg_xact")) + kind = SLRU_CLOG; + else if (STRPREFIX(path, "pg_multixact/members")) + kind = SLRU_MULTIXACT_MEMBERS; + else if (STRPREFIX(path, "pg_multixact/offsets")) + kind = SLRU_MULTIXACT_OFFSETS; + else + return -1; + + NeonResponse *resp; + NeonGetSlruSegmentRequest request = { + .req.tag = T_NeonGetSlruSegmentRequest, + .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, + + .kind = kind, + .segno = segno + }; + int n_blocks; + shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ + do + { + while (!page_server->send(shard_no, &request.req) || !page_server->flush(shard_no)); + consume_prefetch_responses(); + resp = page_server->receive(shard_no); + } while (resp == NULL); + + switch (resp->tag) + { + case T_NeonGetSlruSegmentResponse: + n_blocks = ((NeonGetSlruSegmentResponse *) resp)->n_blocks; + memcpy(buffer, ((NeonGetSlruSegmentResponse *) resp)->data, n_blocks*BLCKSZ); + break; + + case T_NeonErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "could not read SLRU %d segment %d at lsn %X/%08X", + kind, + segno, + LSN_FORMAT_ARGS(request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x", + T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag); + } + pfree(resp); + + return n_blocks; +} + static void AtEOXact_neon(XactEvent event, void *arg) { @@ -2797,6 +3193,8 @@ static const struct f_smgr neon_smgr = .smgr_start_unlogged_build = neon_start_unlogged_build, .smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1, .smgr_end_unlogged_build = neon_end_unlogged_build, + + .smgr_read_slru_segment = neon_read_slru_segment, }; const f_smgr * @@ -2825,6 +3223,9 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, { BlockNumber relsize; + /* This is only used in WAL replay */ + Assert(RecoveryInProgress()); + /* Extend the relation if we know its size */ if (get_cached_relsize(rinfo, forknum, &relsize)) { @@ -2843,14 +3244,13 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * This length is later reused when we open the smgr to read the * block, which is fine and expected. */ - NeonResponse *response; NeonNblocksResponse *nbresponse; NeonNblocksRequest request = { .req = (NeonRequest) { - .lsn = end_recptr, - .latest = false, .tag = T_NeonNblocksRequest, + .lsn = end_recptr, + .not_modified_since = end_recptr, }, .rinfo = rinfo, .forknum = forknum, @@ -2946,7 +3346,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) BufferTag tag; uint32 hash; LWLock *partitionLock; - Buffer buffer; + int buf_id; bool no_redo_needed; if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id)) @@ -2959,14 +3359,6 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno); #endif - /* - * Out of an abundance of caution, we always run redo on shared catalogs, - * regardless of whether the block is stored in shared buffers. See also - * this function's top comment. 
- */ - if (!OidIsValid(NInfoGetDbOid(rinfo))) - return false; - CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forknum; tag.blockNum = blkno; @@ -2980,21 +3372,32 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) */ LWLockAcquire(partitionLock, LW_SHARED); - /* Try to find the relevant buffer */ - buffer = BufTableLookup(&tag, hash); + /* + * Out of an abundance of caution, we always run redo on shared catalogs, + * regardless of whether the block is stored in shared buffers. See also + * this function's top comment. + */ + if (!OidIsValid(NInfoGetDbOid(rinfo))) + { + no_redo_needed = false; + } + else + { + /* Try to find the relevant buffer */ + buf_id = BufTableLookup(&tag, hash); - no_redo_needed = buffer < 0; - - /* In both cases st lwlsn past this WAL record */ - SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); + no_redo_needed = buf_id < 0; + } /* * we don't have the buffer in memory, update lwLsn past this record, also - * evict page fro file cache + * evict page from file cache */ if (no_redo_needed) + { + SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); lfc_evict(rinfo, forknum, blkno); - + } LWLockRelease(partitionLock); diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c index cc7ac2c394..2a4c2dc799 100644 --- a/pgxn/neon/relsize_cache.c +++ b/pgxn/neon/relsize_cache.c @@ -110,7 +110,8 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size) tag.rinfo = rinfo; tag.forknum = forknum; - LWLockAcquire(relsize_lock, LW_SHARED); + /* We need exclusive lock here because of LRU list manipulation */ + LWLockAcquire(relsize_lock, LW_EXCLUSIVE); entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL); if (entry != NULL) { diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 171af7d2aa..c53257923a 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -70,7 +70,7 @@ static bool SendAppendRequests(Safekeeper *sk); static bool RecvAppendResponses(Safekeeper *sk); static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp); -static void HandleSafekeeperResponse(WalProposer *wp); +static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk); static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state); @@ -80,7 +80,7 @@ static int CompareLsn(const void *a, const void *b); static char *FormatSafekeeperState(Safekeeper *sk); static void AssertEventsOkForState(uint32 events, Safekeeper *sk); static char *FormatEvents(WalProposer *wp, uint32 events); - +static void UpdateDonorShmem(WalProposer *wp); WalProposer * WalProposerCreate(WalProposerConfig *config, walproposer_api api) @@ -688,7 +688,7 @@ RecvAcceptorGreeting(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) return; - wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port); + wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, term=" INT64_FORMAT, sk->host, sk->port, sk->greetResponse.term); /* Protocol is all good, move to voting. 
*/ sk->state = SS_VOTING; @@ -922,6 +924,8 @@ static void DetermineEpochStartLsn(WalProposer *wp) { TermHistory *dth; + int n_ready = 0; + WalproposerShmemState *walprop_shared; wp->propEpochStartLsn = InvalidXLogRecPtr; wp->donorEpoch = 0; @@ -932,6 +934,8 @@ DetermineEpochStartLsn(WalProposer *wp) { if (wp->safekeeper[i].state == SS_IDLE) { + n_ready++; + if (GetEpoch(&wp->safekeeper[i]) > wp->donorEpoch || (GetEpoch(&wp->safekeeper[i]) == wp->donorEpoch && wp->safekeeper[i].voteResponse.flushLsn > wp->propEpochStartLsn)) @@ -958,9 +962,21 @@ DetermineEpochStartLsn(WalProposer *wp) } } + if (n_ready < wp->quorum) + { + /* + * This is a rare case that can be triggered if a safekeeper has voted + * and then disconnected. In this case, its state will not be SS_IDLE and + * its vote cannot be used, because we clean up `voteResponse` in + * `ShutdownConnection`. + */ + wp_log(FATAL, "missing majority of votes, collected %d, expected %d, got %d", wp->n_votes, wp->quorum, n_ready); + } + /* - * If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are bootstrapping - * and nothing was committed yet. Start streaming then from the basebackup LSN. + * If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are + * bootstrapping and nothing was committed yet. In that case, start + * streaming from the basebackup LSN. */ if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers) { @@ -971,11 +987,12 @@ DetermineEpochStartLsn(WalProposer *wp) } wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); } + pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn); /* - * Safekeepers are setting truncateLsn after timelineStartLsn is known, so it - * should never be zero at this point, if we know timelineStartLsn. - * + * Safekeepers set truncateLsn after timelineStartLsn is known, so it + * should never be zero at this point if we know timelineStartLsn. + * * timelineStartLsn can be zero only on the first syncSafekeepers run. */ Assert((wp->truncateLsn != InvalidXLogRecPtr) || @@ -1009,10 +1026,9 @@ DetermineEpochStartLsn(WalProposer *wp) * since which we are going to write according to the consensus. If not, * we must bail out, as clog and other non rel data is inconsistent. */ + walprop_shared = wp->api.get_shmem_state(wp); if (!wp->config->syncSafekeepers) { - WalproposerShmemState *walprop_shared = wp->api.get_shmem_state(wp); - /* * Basebackup LSN always points to the beginning of the record (not * the page), as StartupXLOG most probably wants it this way. @@ -1027,7 +1043,7 @@ DetermineEpochStartLsn(WalProposer *wp) * compute (who could generate WAL) is ok. */ if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == - walprop_shared->mineLastElectedTerm))) + pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm)))) { /* * Panic to restart PG as we need to retake basebackup. 
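Background sketch, not part of the patch: the commitLsn bookkeeping introduced further below caches the result of GetAcknowledgedByQuorumWALPosition(). A quorum-acknowledged position of this kind is typically computed as the quorum-th largest flushLsn reported by the safekeepers. The helper names here and the n_safekeepers field are illustrative assumptions; the other fields mirror ones used elsewhere in this patch.

    /*
     * Sketch: the LSN acknowledged by a quorum is the quorum-th largest
     * flushLsn, so sort the reported LSNs in descending order and take
     * element [quorum - 1].
     */
    static int
    lsn_cmp_desc(const void *a, const void *b)
    {
        XLogRecPtr  la = *(const XLogRecPtr *) a;
        XLogRecPtr  lb = *(const XLogRecPtr *) b;

        if (la > lb)
            return -1;
        if (la < lb)
            return 1;
        return 0;
    }

    static XLogRecPtr
    quorum_ack_lsn_sketch(WalProposer *wp)
    {
        XLogRecPtr  lsns[MAX_SAFEKEEPERS];

        for (int i = 0; i < wp->n_safekeepers; i++)
            lsns[i] = wp->safekeeper[i].appendResponse.flushLsn;    /* 0 until first ack */
        qsort(lsns, wp->n_safekeepers, sizeof(XLogRecPtr), lsn_cmp_desc);
        return lsns[wp->quorum - 1];    /* highest LSN flushed by a quorum */
    }

For example, with five safekeepers and quorum = 3, reported flush positions {900, 700, 500, 0, 0} yield 500: the highest LSN that at least three safekeepers have durably flushed.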
@@ -1041,8 +1057,8 @@ DetermineEpochStartLsn(WalProposer *wp) LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp))); } } - walprop_shared->mineLastElectedTerm = wp->propTerm; } + pg_atomic_write_u64(&walprop_shared->mineLastElectedTerm, wp->propTerm); } /* @@ -1092,9 +1108,13 @@ SendProposerElected(Safekeeper *sk) { /* safekeeper is empty or no common point, start from the beginning */ sk->startStreamingAt = wp->propTermHistory.entries[0].lsn; - wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" , - sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); - /* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */ + wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u", + sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); + + /* + * wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline + * is created manually (test_s3_wal_replay) + */ Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr); } else @@ -1164,6 +1184,12 @@ StartStreaming(Safekeeper *sk) sk->active_state = SS_ACTIVE_SEND; sk->streamingAt = sk->startStreamingAt; + /* + * Donors can only be in SS_ACTIVE state, so we potentially update the + * donor when we switch one to SS_ACTIVE. + */ + UpdateDonorShmem(sk->wp); + /* event set will be updated inside SendMessageToNode */ SendMessageToNode(sk); } @@ -1207,7 +1233,7 @@ PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr begin req->epochStartLsn = wp->propEpochStartLsn; req->beginLsn = beginLsn; req->endLsn = endLsn; - req->commitLsn = GetAcknowledgedByQuorumWALPosition(wp); + req->commitLsn = wp->commitLsn; req->truncateLsn = wp->truncateLsn; req->proposerId = wp->greetRequest.proposerId; } @@ -1392,7 +1418,6 @@ static bool RecvAppendResponses(Safekeeper *sk) { WalProposer *wp = sk->wp; - XLogRecPtr minQuorumLsn; bool readAnything = false; while (true) @@ -1412,6 +1437,8 @@ RecvAppendResponses(Safekeeper *sk) LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), sk->host, sk->port); + readAnything = true; + if (sk->appendResponse.term > wp->propTerm) { /* @@ -1420,39 +1447,33 @@ RecvAppendResponses(Safekeeper *sk) * core as this is kinda expected scenario. */ disable_core_dump(); - wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", + wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us", sk->host, sk->port, sk->appendResponse.term, wp->propTerm); } - readAnything = true; + HandleSafekeeperResponse(wp, sk); } if (!readAnything) return sk->state == SS_ACTIVE; - HandleSafekeeperResponse(wp); - - /* - * Also send the new commit lsn to all the safekeepers. - */ - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp); - if (minQuorumLsn > wp->lastSentCommitLsn) - { - BroadcastAppendRequest(wp); - wp->lastSentCommitLsn = minQuorumLsn; - } - return sk->state == SS_ACTIVE; } +#define psfeedback_log(fmt, key, ...) 
\ + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: %s " fmt, key, __VA_ARGS__) + /* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */ -void -ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *rf) +static void +ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *ps_feedback) { uint8 nkeys; int i; - int32 len; + + /* initialize the struct before parsing */ + memset(ps_feedback, 0, sizeof(PageserverFeedback)); + ps_feedback->present = true; /* get number of custom keys */ nkeys = pq_getmsgbyte(reply_message); @@ -1460,66 +1481,52 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese for (i = 0; i < nkeys; i++) { const char *key = pq_getmsgstring(reply_message); + unsigned int value_len = pq_getmsgint(reply_message, sizeof(int32)); if (strcmp(key, "current_timeline_size") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->currentClusterSize = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", - rf->currentClusterSize); + Assert(value_len == sizeof(int64)); + ps_feedback->currentClusterSize = pq_getmsgint64(reply_message); + psfeedback_log(UINT64_FORMAT, key, ps_feedback->currentClusterSize); } else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->last_received_lsn = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X", - LSN_FORMAT_ARGS(rf->last_received_lsn)); + Assert(value_len == sizeof(int64)); + ps_feedback->last_received_lsn = pq_getmsgint64(reply_message); + psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->last_received_lsn)); } else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->disk_consistent_lsn = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); + Assert(value_len == sizeof(int64)); + ps_feedback->disk_consistent_lsn = pq_getmsgint64(reply_message); + psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->disk_consistent_lsn)); } else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->remote_consistent_lsn = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); + Assert(value_len == sizeof(int64)); + ps_feedback->remote_consistent_lsn = pq_getmsgint64(reply_message); + psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->remote_consistent_lsn)); } else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->replytime = pq_getmsgint64(reply_message); - { - char *replyTimeStr; - - /* Copy because timestamptz_to_str returns a static buffer */ - replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime)); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", - rf->replytime, replyTimeStr); - - pfree(replyTimeStr); - } + Assert(value_len == sizeof(int64)); + ps_feedback->replytime = 
pq_getmsgint64(reply_message); + psfeedback_log("%s", key, timestamptz_to_str(ps_feedback->replytime)); + } + else if (strcmp(key, "shard_number") == 0) + { + Assert(value_len == sizeof(uint32)); + ps_feedback->shard_number = pq_getmsgint(reply_message, sizeof(uint32)); + psfeedback_log("%u", key, ps_feedback->shard_number); } else { - len = pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - /* * Skip unknown keys to support backward compatible protocol * changes */ - wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); - pq_getmsgbytes(reply_message, len); + wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, value_len); + pq_getmsgbytes(reply_message, value_len); }; } } @@ -1574,17 +1581,17 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp) * none if it doesn't exist. donor_lsn is set to end position of the donor to * the best of our knowledge. */ -Safekeeper * -GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) +static void +UpdateDonorShmem(WalProposer *wp) { - *donor_lsn = InvalidXLogRecPtr; Safekeeper *donor = NULL; int i; + XLogRecPtr donor_lsn = InvalidXLogRecPtr; if (wp->n_votes < wp->quorum) { - wp_log(WARNING, "GetDonor called before elections are won"); - return NULL; + wp_log(WARNING, "UpdateDonorShmem called before elections are won"); + return; } /* @@ -1595,7 +1602,7 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) if (wp->safekeeper[wp->donor].state >= SS_IDLE) { donor = &wp->safekeeper[wp->donor]; - *donor_lsn = wp->propEpochStartLsn; + donor_lsn = wp->propEpochStartLsn; } /* @@ -1607,23 +1614,45 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) { Safekeeper *sk = &wp->safekeeper[i]; - if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > *donor_lsn) + if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > donor_lsn) { donor = sk; - *donor_lsn = sk->appendResponse.flushLsn; + donor_lsn = sk->appendResponse.flushLsn; } } - return donor; + + if (donor == NULL) + { + wp_log(WARNING, "UpdateDonorShmem didn't find a suitable donor, skipping"); + return; + } + wp->api.update_donor(wp, donor, donor_lsn); } +/* + * Process an AppendResponse message from a safekeeper. + */ static void -HandleSafekeeperResponse(WalProposer *wp) +HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk) { - XLogRecPtr minQuorumLsn; XLogRecPtr candidateTruncateLsn; + XLogRecPtr newCommitLsn; - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp); - wp->api.process_safekeeper_feedback(wp, minQuorumLsn); + newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp); + if (newCommitLsn > wp->commitLsn) + { + wp->commitLsn = newCommitLsn; + /* Send new value to all safekeepers. */ + BroadcastAppendRequest(wp); + } + + /* + * Unlock syncrep waiters, update ps_feedback, CheckGracefulShutdown(). + * The last one will terminate the process if the shutdown is requested + * and WAL is committed by the quorum. BroadcastAppendRequest() should be + * called to notify safekeepers about the new commitLsn. 
+ */ + wp->api.process_safekeeper_feedback(wp, sk); /* * Try to advance truncateLsn -- the last record flushed to all @@ -1636,7 +1665,7 @@ HandleSafekeeperResponse(WalProposer *wp) * can't commit entries from previous term' in Raft); 2) */ candidateTruncateLsn = CalculateMinFlushLsn(wp); - candidateTruncateLsn = Min(candidateTruncateLsn, minQuorumLsn); + candidateTruncateLsn = Min(candidateTruncateLsn, wp->commitLsn); if (candidateTruncateLsn > wp->truncateLsn) { wp->truncateLsn = candidateTruncateLsn; @@ -1799,8 +1828,10 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) msg->hs.ts = pq_getmsgint64_le(&s); msg->hs.xmin.value = pq_getmsgint64_le(&s); msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); - if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) - ParsePageserverFeedbackMessage(wp, &s, &msg->rf); + if (s.len > s.cursor) + ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); + else + msg->ps_feedback.present = false; pq_getmsgend(&s); return true; } diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 688d8e6e52..41daeb87b9 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -10,6 +10,7 @@ #include "libpqwalproposer.h" #include "neon_walreader.h" +#include "pagestore_client.h" #define SK_MAGIC 0xCafeCeefu #define SK_PROTOCOL_VERSION 2 @@ -269,6 +270,8 @@ typedef struct HotStandbyFeedback typedef struct PageserverFeedback { + /* true if AppendResponse contains this feedback */ + bool present; /* current size of the timeline on pageserver */ uint64 currentClusterSize; /* standby_status_update fields that safekeeper received from pageserver */ @@ -276,14 +279,27 @@ typedef struct PageserverFeedback XLogRecPtr disk_consistent_lsn; XLogRecPtr remote_consistent_lsn; TimestampTz replytime; + uint32 shard_number; } PageserverFeedback; typedef struct WalproposerShmemState { + pg_atomic_uint64 propEpochStartLsn; + char donor_name[64]; + char donor_conninfo[MAXCONNINFO]; + XLogRecPtr donor_lsn; + slock_t mutex; - PageserverFeedback feedback; - term_t mineLastElectedTerm; + pg_atomic_uint64 mineLastElectedTerm; pg_atomic_uint64 backpressureThrottlingTime; + pg_atomic_uint64 currentClusterSize; + + /* last feedback from each shard */ + PageserverFeedback shard_ps_feedback[MAX_SHARDS]; + int num_shards; + + /* aggregated feedback with min LSNs across shards */ + PageserverFeedback min_ps_feedback; } WalproposerShmemState; /* @@ -307,12 +323,12 @@ typedef struct AppendResponse /* Feedback received from pageserver includes standby_status_update fields */ /* and custom neon feedback. */ /* This part of the message is extensible. */ - PageserverFeedback rf; + PageserverFeedback ps_feedback; } AppendResponse; /* PageserverFeedback is extensible part of the message that is parsed separately */ /* Other fields are fixed part */ -#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) +#define APPENDRESPONSE_FIXEDPART_SIZE 56 struct WalProposer; typedef struct WalProposer WalProposer; @@ -454,6 +470,9 @@ typedef struct walproposer_api /* Get pointer to the latest available WAL. */ XLogRecPtr (*get_flush_rec_ptr) (WalProposer *wp); + /* Update current donor info in WalProposer Shmem */ + void (*update_donor) (WalProposer *wp, Safekeeper *donor, XLogRecPtr donor_lsn); + /* Get current time. */ TimestampTz (*get_current_timestamp) (WalProposer *wp); @@ -486,6 +505,8 @@ typedef struct walproposer_api * * On success, the data is placed in *buf. It is valid until the next call * to this function. 
+ * + * Returns PG_ASYNC_READ_FAIL on closed connection. */ PGAsyncReadResult (*conn_async_read) (Safekeeper *sk, char **buf, int *amount); @@ -532,6 +553,14 @@ typedef struct walproposer_api * Returns 0 if timeout is reached, 1 if some event happened. Updates * events mask to indicate events and sets sk to the safekeeper which has * an event. + * + * On timeout, events is set to WL_NO_EVENTS. On socket event, events is + * set to WL_SOCKET_READABLE and/or WL_SOCKET_WRITEABLE. When the socket is + * closed, events is set to WL_SOCKET_READABLE. + * + * WL_SOCKET_WRITEABLE is usually set only when we need to flush the + * buffer. It can be returned only if the caller asked for this event in the + * last *_event_set call. */ int (*wait_event_set) (WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events); @@ -551,11 +580,11 @@ typedef struct walproposer_api void (*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn); /* - * Called after every new message from the safekeeper. Used to propagate - * backpressure feedback and to confirm WAL persistence (has been commited - * on the quorum of safekeepers). + * Called after every AppendResponse from the safekeeper. Used to + * propagate backpressure feedback and to confirm WAL persistence (has + * been committed on the quorum of safekeepers). */ - void (*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn); + void (*process_safekeeper_feedback) (WalProposer *wp, Safekeeper *sk); /* * Write a log message to the internal log processor. This is used only @@ -637,8 +666,8 @@ typedef struct WalProposer /* WAL has been generated up to this point */ XLogRecPtr availableLsn; - /* last commitLsn broadcasted to safekeepers */ - XLogRecPtr lastSentCommitLsn; + /* cached GetAcknowledgedByQuorumWALPosition result */ + XLogRecPtr commitLsn; ProposerGreeting greetRequest; @@ -696,12 +725,14 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt extern void WalProposerPoll(WalProposer *wp); extern void WalProposerFree(WalProposer *wp); +extern WalproposerShmemState *GetWalpropShmemState(); + /* * WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to * recreate set from scratch, hence the export. */ extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events); -extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn); +extern TimeLineID walprop_pg_get_timeline_id(void); #define WPEVENT 1337 /* special log level for walproposer internal
*/ +static bool am_walproposer; static WalproposerShmemState *walprop_shared; static WalProposerConfig walprop_config; static XLogRecPtr sentPtr = InvalidXLogRecPtr; static const walproposer_api walprop_pg; +static volatile sig_atomic_t got_SIGUSR2 = false; +static bool reported_sigusr2 = false; + +static XLogRecPtr standby_flush_lsn = InvalidXLogRecPtr; +static XLogRecPtr standby_apply_lsn = InvalidXLogRecPtr; +static HotStandbyFeedback agg_hs_feedback; static void nwp_shmem_startup_hook(void); static void nwp_register_gucs(void); +static void assign_neon_safekeepers(const char *newval, void *extra); static void nwp_prepare_shmem(void); static uint64 backpressure_lag_impl(void); static bool backpressure_throttling_impl(void); @@ -80,7 +88,6 @@ static void walprop_pg_init_standalone_sync_safekeepers(void); static void walprop_pg_init_walsender(void); static void walprop_pg_init_bgworker(void); static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp); -static TimeLineID walprop_pg_get_timeline_id(void); static void walprop_pg_load_libpqwalreceiver(void); static process_interrupts_callback_t PrevProcessInterruptsCallback; @@ -89,26 +96,26 @@ static shmem_startup_hook_type prev_shmem_startup_hook_type; static shmem_request_hook_type prev_shmem_request_hook = NULL; static void walproposer_shmem_request(void); #endif +static void WalproposerShmemInit_SyncSafekeeper(void); + static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd); static void WalSndLoop(WalProposer *wp); static void XLogBroadcastWalProposer(WalProposer *wp); -static void XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr); -static void XLogWalPropClose(XLogRecPtr recptr); - static void add_nwr_event_set(Safekeeper *sk, uint32 events); static void update_nwr_event_set(Safekeeper *sk, uint32 events); static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk); -static XLogRecPtr GetLogRepRestartLSN(WalProposer *wp); +static void CheckGracefulShutdown(WalProposer *wp); static void init_walprop_config(bool syncSafekeepers) { walprop_config.neon_tenant = neon_tenant; walprop_config.neon_timeline = neon_timeline; - walprop_config.safekeepers_list = wal_acceptors_list; + /* WalProposerCreate scribbles directly on it, so pstrdup */ + walprop_config.safekeepers_list = pstrdup(wal_acceptors_list); walprop_config.safekeeper_reconnect_timeout = wal_acceptor_reconnect_timeout; walprop_config.safekeeper_connection_timeout = wal_acceptor_connection_timeout; walprop_config.wal_segment_size = wal_segment_size; @@ -129,6 +136,7 @@ WalProposerSync(int argc, char *argv[]) WalProposer *wp; init_walprop_config(true); + WalproposerShmemInit_SyncSafekeeper(); walprop_pg_init_standalone_sync_safekeepers(); walprop_pg_load_libpqwalreceiver(); @@ -147,6 +155,7 @@ WalProposerMain(Datum main_arg) init_walprop_config(false); walprop_pg_init_bgworker(); + am_walproposer = true; walprop_pg_load_libpqwalreceiver(); wp = WalProposerCreate(&walprop_config, walprop_pg); @@ -185,10 +194,10 @@ nwp_register_gucs(void) NULL, /* long_desc */ &wal_acceptors_list, /* valueAddr */ "", /* bootValue */ - PGC_POSTMASTER, + PGC_SIGHUP, GUC_LIST_INPUT, /* extensions can't use* * GUC_LIST_QUOTE */ - NULL, NULL, NULL); + NULL, assign_neon_safekeepers, NULL); DefineCustomIntVariable( "neon.safekeeper_reconnect_timeout", @@ -211,7 +220,99 @@ nwp_register_gucs(void) NULL, NULL, NULL); } -/* Check if we need to suspend inserts because of lagging replication. 
*/ + +static int +split_safekeepers_list(char *safekeepers_list, char *safekeepers[]) +{ + int n_safekeepers = 0; + char *curr_sk = safekeepers_list; + + for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma) + { + if (++n_safekeepers >= MAX_SAFEKEEPERS) { + wpg_log(FATAL, "too many safekeepers"); + } + + coma = strchr(coma, ','); + safekeepers[n_safekeepers-1] = curr_sk; + + if (coma != NULL) { + *coma++ = '\0'; + } + } + + return n_safekeepers; +} + +/* + * Accept two comma-separated strings listing safekeeper host:port addresses. + * Split them into arrays and return false if the two sets do not match, ignoring order. + */ +static bool +safekeepers_cmp(char *old, char *new) +{ + char *safekeepers_old[MAX_SAFEKEEPERS]; + char *safekeepers_new[MAX_SAFEKEEPERS]; + int len_old = 0; + int len_new = 0; + + len_old = split_safekeepers_list(old, safekeepers_old); + len_new = split_safekeepers_list(new, safekeepers_new); + + if (len_old != len_new) + { + return false; + } + + qsort(&safekeepers_old, len_old, sizeof(char *), pg_qsort_strcmp); + qsort(&safekeepers_new, len_new, sizeof(char *), pg_qsort_strcmp); + + for (int i = 0; i < len_new; i++) + { + if (strcmp(safekeepers_old[i], safekeepers_new[i]) != 0) + { + return false; + } + } + + return true; +} + +/* + * GUC assign_hook for neon.safekeepers. Restarts walproposer through FATAL if + * the list changed. + */ +static void +assign_neon_safekeepers(const char *newval, void *extra) +{ + if (!am_walproposer) + return; + + if (!newval) { + /* should never happen */ + wpg_log(FATAL, "neon.safekeepers is empty"); + } + + /* Copy values because we will modify them in split_safekeepers_list() */ + char *newval_copy = pstrdup(newval); + char *oldval = pstrdup(wal_acceptors_list); + + /* + * TODO: restarting through FATAL is stupid and introduces 1s delay before + * next bgw start. We should refactor walproposer to allow graceful exit and + * thus remove this delay. + * XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder. + */ + if (!safekeepers_cmp(oldval, newval_copy)) + { + wpg_log(FATAL, "restarting walproposer to change safekeeper list from %s to %s", + wal_acceptors_list, newval); + } + pfree(newval_copy); + pfree(oldval); +} + +/* Check if we need to suspend inserts because of lagging replication. 
*/ static uint64 backpressure_lag_impl(void) { @@ -274,13 +375,27 @@ WalproposerShmemInit(void) { memset(walprop_shared, 0, WalproposerShmemSize()); SpinLockInit(&walprop_shared->mutex); + pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0); + pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0); pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); + pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0); } LWLockRelease(AddinShmemInitLock); return found; } +static void +WalproposerShmemInit_SyncSafekeeper(void) +{ + walprop_shared = palloc(WalproposerShmemSize()); + memset(walprop_shared, 0, WalproposerShmemSize()); + SpinLockInit(&walprop_shared->mutex); + pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0); + pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0); + pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); +} + #define BACK_PRESSURE_DELAY 10000L // 0.01 sec static bool @@ -345,7 +460,7 @@ walprop_register_bgworker(void) snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain"); snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer"); snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer"); - bgw.bgw_restart_time = 5; + bgw.bgw_restart_time = 1; bgw.bgw_notify_pid = 0; bgw.bgw_main_arg = (Datum) 0; @@ -391,6 +506,13 @@ nwp_shmem_startup_hook(void) WalproposerShmemInit(); } +WalproposerShmemState * +GetWalpropShmemState() +{ + Assert(walprop_shared != NULL); + return walprop_shared; +} + static WalproposerShmemState * walprop_pg_get_shmem_state(WalProposer *wp) { @@ -398,26 +520,64 @@ walprop_pg_get_shmem_state(WalProposer *wp) return walprop_shared; } -void -replication_feedback_set(PageserverFeedback *rf) +/* + * Record new ps_feedback in the array with shards and update min_feedback. 
+ */
+static PageserverFeedback
+record_pageserver_feedback(PageserverFeedback *ps_feedback)
 {
+	PageserverFeedback min_feedback;
+
+	Assert(ps_feedback->present);
+	Assert(ps_feedback->shard_number < MAX_SHARDS);
+
 	SpinLockAcquire(&walprop_shared->mutex);
-	memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback));
+
+	/* Update the number of shards */
+	if (ps_feedback->shard_number + 1 > walprop_shared->num_shards)
+		walprop_shared->num_shards = ps_feedback->shard_number + 1;
+
+	/* Update the feedback */
+	memcpy(&walprop_shared->shard_ps_feedback[ps_feedback->shard_number], ps_feedback, sizeof(PageserverFeedback));
+
+	/* Calculate min LSNs */
+	memcpy(&min_feedback, ps_feedback, sizeof(PageserverFeedback));
+	for (int i = 0; i < walprop_shared->num_shards; i++)
+	{
+		PageserverFeedback *feedback = &walprop_shared->shard_ps_feedback[i];
+
+		if (feedback->present)
+		{
+			if (min_feedback.last_received_lsn == InvalidXLogRecPtr || feedback->last_received_lsn < min_feedback.last_received_lsn)
+				min_feedback.last_received_lsn = feedback->last_received_lsn;
+
+			if (min_feedback.disk_consistent_lsn == InvalidXLogRecPtr || feedback->disk_consistent_lsn < min_feedback.disk_consistent_lsn)
+				min_feedback.disk_consistent_lsn = feedback->disk_consistent_lsn;
+
+			if (min_feedback.remote_consistent_lsn == InvalidXLogRecPtr || feedback->remote_consistent_lsn < min_feedback.remote_consistent_lsn)
+				min_feedback.remote_consistent_lsn = feedback->remote_consistent_lsn;
+		}
+	}
+	/* Copy min_feedback back to shmem */
+	memcpy(&walprop_shared->min_ps_feedback, &min_feedback, sizeof(PageserverFeedback));
+
 	SpinLockRelease(&walprop_shared->mutex);
+
+	return min_feedback;
 }
 
 void
 replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn)
 {
 	SpinLockAcquire(&walprop_shared->mutex);
-	*writeLsn = walprop_shared->feedback.last_received_lsn;
-	*flushLsn = walprop_shared->feedback.disk_consistent_lsn;
-	*applyLsn = walprop_shared->feedback.remote_consistent_lsn;
+	*writeLsn = walprop_shared->min_ps_feedback.last_received_lsn;
+	*flushLsn = walprop_shared->min_ps_feedback.disk_consistent_lsn;
+	*applyLsn = walprop_shared->min_ps_feedback.remote_consistent_lsn;
 	SpinLockRelease(&walprop_shared->mutex);
 }
 
 /*
- * Start walsender streaming replication
+ * Start walproposer streaming replication
  */
 static void
 walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos)
@@ -492,6 +652,26 @@ walprop_pg_init_standalone_sync_safekeepers(void)
 	BackgroundWorkerUnblockSignals();
 }
 
+/*
+ * We pretend to be a walsender process, and the lifecycle of a walsender is
+ * slightly different from other processes. At shutdown, walsender processes
+ * stay alive until the very end, after the checkpointer has written the
+ * shutdown checkpoint. When the checkpointer exits, the postmaster sends all
+ * remaining walsender processes SIGUSR2. On receiving SIGUSR2, we try to send
+ * the remaining WAL, and then exit. This ensures that the checkpoint record
+ * reaches durable storage (in safekeepers), before the server shuts down
+ * completely.
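The min-fold above is the key invariant for sharded tenants: an LSN may only be acknowledged upstream once every shard that has reported feedback has reached it, so the slowest shard caps what the compute reports. A standalone sketch of the same fold, with a plain uint64 standing in for XLogRecPtr and hypothetical names:

#include <stdint.h>
#include <stdio.h>

#define NSHARDS 4
typedef uint64_t lsn_t;
#define INVALID_LSN ((lsn_t) 0)		/* like InvalidXLogRecPtr */

typedef struct
{
	int			present;
	lsn_t		last_received_lsn;
} shard_fb;

/*
 * Minimum last_received_lsn over the shards that have reported; shards that
 * have never sent feedback are skipped rather than treated as zero, exactly
 * like the feedback->present check above.
 */
static lsn_t
min_last_received(const shard_fb fb[], int nshards)
{
	lsn_t		min = INVALID_LSN;

	for (int i = 0; i < nshards; i++)
	{
		if (!fb[i].present)
			continue;
		if (min == INVALID_LSN || fb[i].last_received_lsn < min)
			min = fb[i].last_received_lsn;
	}
	return min;
}

int
main(void)
{
	shard_fb	fb[NSHARDS] = {
		{1, 0x2000}, {1, 0x1800}, {1, 0x3000}, {0, 0},
	};

	/* prints 0x1800: the slowest reporting shard caps the acknowledgement */
	printf("min = %#llx\n", (unsigned long long) min_last_received(fb, NSHARDS));
	return 0;
}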
+ */ +static void +walprop_sigusr2(SIGNAL_ARGS) +{ + int save_errno = errno; + + got_SIGUSR2 = true; + SetLatch(MyLatch); + errno = save_errno; +} + static void walprop_pg_init_bgworker(void) { @@ -503,6 +683,7 @@ walprop_pg_init_bgworker(void) pqsignal(SIGUSR1, procsignal_sigusr1_handler); pqsignal(SIGHUP, SignalHandlerForConfigReload); pqsignal(SIGTERM, die); + pqsignal(SIGUSR2, walprop_sigusr2); BackgroundWorkerUnblockSignals(); @@ -533,7 +714,7 @@ walprop_pg_get_current_timestamp(WalProposer *wp) return GetCurrentTimestamp(); } -static TimeLineID +TimeLineID walprop_pg_get_timeline_id(void) { #if PG_VERSION_NUM >= 150000 @@ -552,6 +733,20 @@ walprop_pg_load_libpqwalreceiver(void) wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly"); } +static void +walprop_pg_update_donor(WalProposer *wp, Safekeeper *donor, XLogRecPtr donor_lsn) +{ + WalproposerShmemState *wps = wp->api.get_shmem_state(wp); + char donor_name[64]; + + pg_snprintf(donor_name, sizeof(donor_name), "%s:%s", donor->host, donor->port); + SpinLockAcquire(&wps->mutex); + memcpy(wps->donor_name, donor_name, sizeof(donor_name)); + memcpy(wps->donor_conninfo, donor->conninfo, sizeof(donor->conninfo)); + wps->donor_lsn = donor_lsn; + SpinLockRelease(&wps->mutex); +} + /* Helper function */ static bool ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking) @@ -652,7 +847,6 @@ walprop_connect_start(Safekeeper *sk) { Assert(sk->conn == NULL); sk->conn = libpqwp_connect_start(sk->conninfo); - } static WalProposerConnectPollStatusType @@ -1026,7 +1220,7 @@ static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd) { XLogRecPtr FlushPtr; - TimeLineID currTLI; + __attribute__((unused)) TimeLineID currTLI; #if PG_VERSION_NUM < 150000 if (ThisTimeLineID == 0) @@ -1075,14 +1269,26 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd) #endif /* - * When we first start replication the standby will be behind the primary. - * For some applications, for example synchronous replication, it is - * important to have a clear state for this initial catchup mode, so we - * can trigger actions when we change streaming state later. We may stay - * in this state for a long time, which is exactly why we want to be able - * to monitor whether or not we are still here. + * XXX: Move straight to STOPPING state, skipping the STREAMING state. + * + * This is a bit weird. Normal walsenders stay in STREAMING state, until + * the checkpointer signals them that it is about to start writing the + * shutdown checkpoint. The walsenders acknowledge that they have received + * that signal by switching to STOPPING state. That tells the walsenders + * that they must not write any new WAL. + * + * However, we cannot easily intercept that signal from the checkpointer. + * It's sent by WalSndInitStopping(), using + * SendProcSignal(PROCSIGNAL_WALSND_INIT_STOPPING). It's received by + * HandleWalSndInitStopping, which sets a process-local got_STOPPING flag. + * However, that's all private to walsender.c. + * + * We don't need to do anything special upon receiving the signal, the + * walproposer doesn't write any WAL anyway, so we skip the STREAMING + * state and go directly to STOPPING mode. That way, the checkpointer + * won't wait for us. 
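The walprop_sigusr2 handler above has the textbook async-signal-safe shape: set a volatile sig_atomic_t flag, preserve errno, wake the main loop, and defer all real work. A standalone sketch of the same pattern, with pause() standing in for the latch wait and a hypothetical on_sigusr2/shutdown_requested pair in place of the extension's names:

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t shutdown_requested = 0;

/* Only async-signal-safe work here; real work happens in the main loop. */
static void
on_sigusr2(int signo)
{
	int			save_errno = errno;	/* don't clobber errno mid-syscall */

	shutdown_requested = 1;
	/* in the extension this is SetLatch(MyLatch); the demo has nothing to wake */
	errno = save_errno;
}

int
main(void)
{
	struct sigaction sa = {0};

	sigemptyset(&sa.sa_mask);
	sa.sa_handler = on_sigusr2;
	sigaction(SIGUSR2, &sa, NULL);

	while (!shutdown_requested)
		pause();					/* the real loop waits on a latch/event set */

	puts("flushing remaining WAL, then exiting");
	return 0;
}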
*/ - WalSndSetState(WALSNDSTATE_CATCHUP); + WalSndSetState(WALSNDSTATE_STOPPING); /* * Don't allow a request to stream from a future point in WAL that hasn't @@ -1130,9 +1336,6 @@ WalSndLoop(WalProposer *wp) CHECK_FOR_INTERRUPTS(); XLogBroadcastWalProposer(wp); - - if (MyWalSnd->state == WALSNDSTATE_CATCHUP) - WalSndSetState(WALSNDSTATE_STREAMING); WalProposerPoll(wp); } } @@ -1219,250 +1422,17 @@ XLogBroadcastWalProposer(WalProposer *wp) } } -/* Download WAL before basebackup for logical walsenders from sk, if needed */ +/* + Used to download WAL before basebackup for walproposer/logical walsenders. No + longer used, replaced by neon_walreader; but callback still exists because + simulation tests use it. + */ static bool WalProposerRecovery(WalProposer *wp, Safekeeper *sk) { - char *err; - WalReceiverConn *wrconn; - WalRcvStreamOptions options; - char conninfo[MAXCONNINFO]; - TimeLineID timeline; - XLogRecPtr startpos; - XLogRecPtr endpos; - uint64 download_range_mb; - - startpos = GetLogRepRestartLSN(wp); - if (startpos == InvalidXLogRecPtr) - return true; /* recovery not needed */ - endpos = wp->propEpochStartLsn; - - timeline = wp->greetRequest.timeline; - - if (!neon_auth_token) - { - memcpy(conninfo, sk->conninfo, MAXCONNINFO); - } - else - { - int written = 0; - - written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo); - if (written > MAXCONNINFO || written < 0) - wpg_log(FATAL, "could not append password to the safekeeper connection string"); - } - -#if PG_MAJORVERSION_NUM < 16 - wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); -#else - wrconn = walrcv_connect(conninfo, false, false, "wal_proposer_recovery", &err); -#endif - - if (!wrconn) - { - ereport(WARNING, - (errmsg("could not connect to WAL acceptor %s:%s: %s", - sk->host, sk->port, - err))); - return false; - } - wpg_log(LOG, - "start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline " - "%d", - sk->host, sk->port, (uint32) (startpos >> 32), - (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); - - options.logical = false; - options.startpoint = startpos; - options.slotname = NULL; - options.proto.physical.startpointTLI = timeline; - - if (walrcv_startstreaming(wrconn, &options)) - { - XLogRecPtr rec_start_lsn; - XLogRecPtr rec_end_lsn = 0; - int len; - char *buf; - pgsocket wait_fd = PGINVALID_SOCKET; - - while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) - { - if (len == 0) - { - (void) WaitLatchOrSocket( - MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, - -1, WAIT_EVENT_WAL_RECEIVER_MAIN); - } - else - { - Assert(buf[0] == 'w' || buf[0] == 'k'); - if (buf[0] == 'k') - continue; /* keepalive */ - memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], - sizeof rec_start_lsn); - rec_start_lsn = pg_ntoh64(rec_start_lsn); - rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; - - /* write WAL to disk */ - XLogWalPropWrite(sk->wp, &buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn); - - ereport(DEBUG1, - (errmsg("Recover message %X/%X length %d", - LSN_FORMAT_ARGS(rec_start_lsn), len))); - if (rec_end_lsn >= endpos) - break; - } - } - ereport(LOG, - (errmsg("end of replication stream at %X/%X: %m", - LSN_FORMAT_ARGS(rec_end_lsn)))); - walrcv_disconnect(wrconn); - - /* failed to receive all WAL till endpos */ - if (rec_end_lsn < endpos) - return false; - } - else - { - ereport(LOG, - (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", - timeline, 
(uint32) (startpos >> 32), (uint32) startpos))); - return false; - } - return true; } -/* - * These variables are used similarly to openLogFile/SegNo, - * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID - * corresponding the filename of walpropFile. - */ -static int walpropFile = -1; -static TimeLineID walpropFileTLI = 0; -static XLogSegNo walpropSegNo = 0; - -/* - * Write XLOG data to disk. - */ -static void -XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr) -{ - int startoff; - int byteswritten; - - /* - * Apart from walproposer, basebackup LSN page is also written out by - * postgres itself which writes WAL only in pages, and in basebackup it is - * inherently dummy (only safekeepers have historic WAL). Update WAL - * buffers here to avoid dummy page overwriting correct one we download - * here. Ugly, but alternatives are about the same ugly. We won't need - * that if we switch to on-demand WAL download from safekeepers, without - * writing to disk. - * - * https://github.com/neondatabase/neon/issues/5749 - */ - if (!wp->config->syncSafekeepers) - XLogUpdateWalBuffers(buf, recptr, nbytes); - - while (nbytes > 0) - { - int segbytes; - - /* Close the current segment if it's completed */ - if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) - XLogWalPropClose(recptr); - - if (walpropFile < 0) - { -#if PG_VERSION_NUM >= 150000 - /* FIXME Is it ok to use hardcoded value here? */ - TimeLineID tli = 1; -#else - bool use_existent = true; -#endif - /* Create/use new log file */ - XLByteToSeg(recptr, walpropSegNo, wal_segment_size); -#if PG_VERSION_NUM >= 150000 - walpropFile = XLogFileInit(walpropSegNo, tli); - walpropFileTLI = tli; -#else - walpropFile = XLogFileInit(walpropSegNo, &use_existent, false); - walpropFileTLI = ThisTimeLineID; -#endif - } - - /* Calculate the start offset of the received logs */ - startoff = XLogSegmentOffset(recptr, wal_segment_size); - - if (startoff + nbytes > wal_segment_size) - segbytes = wal_segment_size - startoff; - else - segbytes = nbytes; - - /* OK to write the logs */ - errno = 0; - - byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff); - if (byteswritten <= 0) - { - char xlogfname[MAXFNAMELEN]; - int save_errno; - - /* if write didn't set errno, assume no disk space */ - if (errno == 0) - errno = ENOSPC; - - save_errno = errno; - XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); - errno = save_errno; - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not write to log segment %s " - "at offset %u, length %lu: %m", - xlogfname, startoff, (unsigned long) segbytes))); - } - - /* Update state for write */ - recptr += byteswritten; - - nbytes -= byteswritten; - buf += byteswritten; - } - - /* - * Close the current segment if it's fully written up in the last cycle of - * the loop. - */ - if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) - { - XLogWalPropClose(recptr); - } -} - -/* - * Close the current segment. 
- */ -static void -XLogWalPropClose(XLogRecPtr recptr) -{ - Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)); - - if (close(walpropFile) != 0) - { - char xlogfname[MAXFNAMELEN]; - - XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); - - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not close log segment %s: %m", - xlogfname))); - } - - walpropFile = -1; -} - static void walprop_pg_wal_reader_allocate(Safekeeper *sk) { @@ -1470,7 +1440,7 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk) snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port); Assert(!sk->xlogreader); - sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix); + sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, log_prefix); if (sk->xlogreader == NULL) wpg_log(FATAL, "failed to allocate xlog reader"); } @@ -1745,6 +1715,9 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 { ConditionVariableCancelSleep(); ResetLatch(MyLatch); + + CheckGracefulShutdown(wp); + *events = WL_LATCH_SET; return 1; } @@ -1763,6 +1736,18 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 late_cv_trigger = ConditionVariableCancelSleep(); #endif + /* + * Process config if requested. This restarts walproposer if safekeepers + * list changed. Don't do that for sync-safekeepers because quite probably + * it (re-reading config) won't work without some effort, and + * sync-safekeepers should be quick to finish anyway. + */ + if (!wp->config->syncSafekeepers && ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + /* * If wait is terminated by latch set (walsenders' latch is set on each * wal flush). (no need for pm death check due to WL_EXIT_ON_PM_DEATH) @@ -1799,36 +1784,38 @@ walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn) } /* - * Choose most advanced PageserverFeedback and set it to *rf. + * Like vanilla walsender, on sigusr2 send all remaining WAL and exit. + * + * Note that unlike sync-safekeepers waiting here is not reliable: we + * don't check that majority of safekeepers received and persisted + * commit_lsn -- only that walproposer reached it (which immediately + * broadcasts new value). Doing that without incurring redundant control + * file syncing would need wp -> sk protocol change. OTOH unlike + * sync-safekeepers which must bump commit_lsn or basebackup will fail, + * this catchup is important only for tests where safekeepers/network + * don't crash on their own. 
*/ static void -GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) +CheckGracefulShutdown(WalProposer *wp) { - int latest_safekeeper = 0; - XLogRecPtr last_received_lsn = InvalidXLogRecPtr; - - for (int i = 0; i < wp->n_safekeepers; i++) + if (got_SIGUSR2) { - if (wp->safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn) + if (!reported_sigusr2) { - latest_safekeeper = i; - last_received_lsn = wp->safekeeper[i].appendResponse.rf.last_received_lsn; + XLogRecPtr flushPtr = walprop_pg_get_flush_rec_ptr(wp); + + wpg_log(LOG, "walproposer will send and wait for remaining WAL between %X/%X and %X/%X", + LSN_FORMAT_ARGS(wp->commitLsn), LSN_FORMAT_ARGS(flushPtr)); + reported_sigusr2 = true; + } + + if (wp->commitLsn >= walprop_pg_get_flush_rec_ptr(wp)) + { + wpg_log(LOG, "walproposer sent all WAL up to %X/%X, exiting", + LSN_FORMAT_ARGS(wp->commitLsn)); + proc_exit(0); } } - - rf->currentClusterSize = wp->safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; - rf->last_received_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn; - rf->disk_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn; - rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn; - rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime; - - wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," - " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", - rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->last_received_lsn), - LSN_FORMAT_ARGS(rf->disk_consistent_lsn), - LSN_FORMAT_ARGS(rf->remote_consistent_lsn), - rf->replytime); } /* @@ -1838,34 +1825,30 @@ static void CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) { hs->ts = 0; - hs->xmin.value = ~0; /* largest unsigned value */ - hs->catalog_xmin.value = ~0; /* largest unsigned value */ + hs->xmin = InvalidFullTransactionId; + hs->catalog_xmin = InvalidFullTransactionId; for (int i = 0; i < wp->n_safekeepers; i++) { - if (wp->safekeeper[i].appendResponse.hs.ts != 0) + + if (wp->safekeeper[i].state == SS_ACTIVE) { HotStandbyFeedback *skhs = &wp->safekeeper[i].appendResponse.hs; if (FullTransactionIdIsNormal(skhs->xmin) - && FullTransactionIdPrecedes(skhs->xmin, hs->xmin)) + && (!FullTransactionIdIsValid(hs->xmin) || FullTransactionIdPrecedes(skhs->xmin, hs->xmin))) { hs->xmin = skhs->xmin; hs->ts = skhs->ts; } if (FullTransactionIdIsNormal(skhs->catalog_xmin) - && FullTransactionIdPrecedes(skhs->catalog_xmin, hs->xmin)) + && (!FullTransactionIdIsValid(hs->catalog_xmin) || FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin))) { hs->catalog_xmin = skhs->catalog_xmin; hs->ts = skhs->ts; } } } - - if (hs->xmin.value == ~0) - hs->xmin = InvalidFullTransactionId; - if (hs->catalog_xmin.value == ~0) - hs->catalog_xmin = InvalidFullTransactionId; } /* @@ -1878,26 +1861,38 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) * None of that is functional in sync-safekeepers. 
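CombineHotStanbyFeedbacks now seeds the fold with InvalidFullTransactionId instead of the ~0 sentinel, and the rewrite also fixes the old bug of comparing skhs->catalog_xmin against hs->xmin rather than hs->catalog_xmin. A standalone sketch of the validity-sentinel min-fold, with uint64 standing in for FullTransactionId, 0 for the invalid value, and hypothetical names:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t fxid_t;
#define INVALID_FXID ((fxid_t) 0)

/*
 * Minimum of the valid entries; INVALID_FXID if none are valid. Starting from
 * "invalid" and special-casing the first valid value avoids seeding the
 * accumulator with ~0, which could otherwise leak out as a bogus largest xid.
 */
static fxid_t
min_valid(const fxid_t v[], int n)
{
	fxid_t		min = INVALID_FXID;

	for (int i = 0; i < n; i++)
	{
		if (v[i] == INVALID_FXID)
			continue;
		if (min == INVALID_FXID || v[i] < min)
			min = v[i];
	}
	return min;
}

int
main(void)
{
	fxid_t		xmins[] = {0, 742, 0, 731};

	printf("%llu\n", (unsigned long long) min_valid(xmins, 4));	/* 731 */
	return 0;
}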
*/ static void -walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn) +walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) { HotStandbyFeedback hsFeedback; - XLogRecPtr oldDiskConsistentLsn; + bool needToAdvanceSlot = false; if (wp->config->syncSafekeepers) return; - oldDiskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; - - /* Get PageserverFeedback fields from the most advanced safekeeper */ - GetLatestNeonFeedback(&quorumFeedback.rf, wp); - replication_feedback_set(&quorumFeedback.rf); - SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); - - if (commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) + /* handle fresh ps_feedback */ + if (sk->appendResponse.ps_feedback.present) { - if (commitLsn > quorumFeedback.flushLsn) - quorumFeedback.flushLsn = commitLsn; + PageserverFeedback min_feedback = record_pageserver_feedback(&sk->appendResponse.ps_feedback); + /* Only one main shard sends non-zero currentClusterSize */ + if (sk->appendResponse.ps_feedback.currentClusterSize > 0) + SetNeonCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize); + + if (min_feedback.disk_consistent_lsn != standby_apply_lsn) + { + standby_apply_lsn = min_feedback.disk_consistent_lsn; + needToAdvanceSlot = true; + } + } + + if (wp->commitLsn > standby_flush_lsn) + { + standby_flush_lsn = wp->commitLsn; + needToAdvanceSlot = true; + } + + if (needToAdvanceSlot) + { /* * Advance the replication slot to commitLsn. WAL before it is * hardened and will be fetched from one of safekeepers by @@ -1906,29 +1901,45 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn) * Also wakes up syncrep waiters. */ ProcessStandbyReply( - /* write_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, - /* flush_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, + /* write_lsn - This is what durably stored in safekeepers quorum. */ + standby_flush_lsn, + /* flush_lsn - This is what durably stored in safekeepers quorum. */ + standby_flush_lsn, /* * apply_lsn - This is what processed and durably saved at* * pageserver. */ - quorumFeedback.rf.disk_consistent_lsn, + standby_apply_lsn, walprop_pg_get_current_timestamp(wp), false); } CombineHotStanbyFeedbacks(&hsFeedback, wp); - if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) + if (memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0) { - quorumFeedback.hs = hsFeedback; + FullTransactionId xmin = hsFeedback.xmin; + FullTransactionId catalog_xmin = hsFeedback.catalog_xmin; + FullTransactionId next_xid = ReadNextFullTransactionId(); + /* + * Page server is updating nextXid in checkpoint each 1024 transactions, + * so feedback xmin can be actually larger then nextXid and + * function TransactionIdInRecentPast return false in this case, + * preventing update of slot's xmin. 
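The clamp applied just below, before calling ProcessStandbyHSFeedback, keeps the slot's xmin updatable when pageserver feedback runs ahead of the locally known nextXid. Distilled, with uint64 stand-ins and hypothetical names:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t fxid_t;

/*
 * Feedback xmin may exceed our nextXid because the pageserver only persists
 * nextXid in its checkpoint every 1024 xids; clamping keeps the value inside
 * the range that TransactionIdInRecentPast() accepts.
 */
static fxid_t
clamp_to_next_xid(fxid_t feedback_xmin, fxid_t next_xid)
{
	return feedback_xmin > next_xid ? next_xid : feedback_xmin;
}

int
main(void)
{
	printf("%llu\n", (unsigned long long) clamp_to_next_xid(2048, 1500));	/* 1500 */
	printf("%llu\n", (unsigned long long) clamp_to_next_xid(1400, 1500));	/* 1400 */
	return 0;
}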
+ */ + if (FullTransactionIdPrecedes(next_xid, xmin)) + xmin = next_xid; + if (FullTransactionIdPrecedes(next_xid, catalog_xmin)) + catalog_xmin = next_xid; + agg_hs_feedback = hsFeedback; + elog(DEBUG2, "ProcessStandbyHSFeedback(xmin=%d, catalog_xmin=%d", XidFromFullTransactionId(hsFeedback.xmin), XidFromFullTransactionId(hsFeedback.catalog_xmin)); ProcessStandbyHSFeedback(hsFeedback.ts, - XidFromFullTransactionId(hsFeedback.xmin), - EpochFromFullTransactionId(hsFeedback.xmin), - XidFromFullTransactionId(hsFeedback.catalog_xmin), - EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + XidFromFullTransactionId(xmin), + EpochFromFullTransactionId(xmin), + XidFromFullTransactionId(catalog_xmin), + EpochFromFullTransactionId(catalog_xmin)); } + + CheckGracefulShutdown(wp); } static XLogRecPtr @@ -1949,62 +1960,25 @@ walprop_pg_log_internal(WalProposer *wp, int level, const char *line) elog(FATAL, "unexpected log_internal message at level %d: %s", level, line); } -static XLogRecPtr -GetLogRepRestartLSN(WalProposer *wp) +void +SetNeonCurrentClusterSize(uint64 size) { - FILE *f; - XLogRecPtr lrRestartLsn = InvalidXLogRecPtr; - - /* We don't need to do anything in syncSafekeepers mode. */ - if (wp->config->syncSafekeepers) - return InvalidXLogRecPtr; - - /* - * If there are active logical replication subscription we need to provide - * enough WAL for their WAL senders based on th position of their - * replication slots. - */ - f = fopen("restart.lsn", "rb"); - if (f != NULL) - { - size_t rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f); - - fclose(f); - if (rc == 1 && lrRestartLsn != InvalidXLogRecPtr) - { - uint64 download_range_mb; - - wpg_log(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); - - /* - * If we need to download more than a max_slot_wal_keep_size, - * don't do it to avoid risk of exploding pg_wal. Logical - * replication won't work until recreated, but at least compute - * would start; this also follows max_slot_wal_keep_size - * semantics. - */ - download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB; - if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb) - { - wpg_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB", - LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb); - return InvalidXLogRecPtr; - } - - /* - * start from the beginning of the segment to fetch page headers - * verifed by XLogReader - */ - lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size); - } - } - return lrRestartLsn; + pg_atomic_write_u64(&walprop_shared->currentClusterSize, size); } +uint64 +GetNeonCurrentClusterSize(void) +{ + return pg_atomic_read_u64(&walprop_shared->currentClusterSize); +} +uint64 GetNeonCurrentClusterSize(void); + + static const walproposer_api walprop_pg = { .get_shmem_state = walprop_pg_get_shmem_state, .start_streaming = walprop_pg_start_streaming, .get_flush_rec_ptr = walprop_pg_get_flush_rec_ptr, + .update_donor = walprop_pg_update_donor, .get_current_timestamp = walprop_pg_get_current_timestamp, .conn_error_message = walprop_error_message, .conn_status = walprop_status, diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c new file mode 100644 index 0000000000..bd3856e9d9 --- /dev/null +++ b/pgxn/neon/walsender_hooks.c @@ -0,0 +1,205 @@ +/*------------------------------------------------------------------------- + * + * walsender_hooks.c + * + * Implements XLogReaderRoutine in terms of NeonWALReader. 
Allows for + * fetching WAL from safekeepers, which normal xlogreader can't do. + * + *------------------------------------------------------------------------- + */ +#include "walsender_hooks.h" +#include "postgres.h" +#include "fmgr.h" +#include "access/xlogdefs.h" +#include "replication/walsender.h" +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogreader.h" +#include "miscadmin.h" +#include "utils/wait_event.h" +#include "utils/guc.h" +#include "postmaster/interrupt.h" + +#include "neon.h" +#include "neon_walreader.h" +#include "walproposer.h" + +static NeonWALReader *wal_reader = NULL; + +struct WalSnd; +extern struct WalSnd *MyWalSnd; +extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); +extern bool GetDonorShmem(XLogRecPtr *donor_lsn); +extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); + +static XLogRecPtr +NeonWALReadWaitForWAL(XLogRecPtr loc) +{ + while (!NeonWALReaderUpdateDonor(wal_reader)) + { + pg_usleep(1000); + CHECK_FOR_INTERRUPTS(); + } + + // Walsender sends keepalives and stuff, so better use its normal wait + if (MyWalSnd != NULL) + return WalSndWaitForWal(loc); + + for (;;) + { + XLogRecPtr flush_ptr; + if (!RecoveryInProgress()) +#if PG_VERSION_NUM >= 150000 + flush_ptr = GetFlushRecPtr(NULL); +#else + flush_ptr = GetFlushRecPtr(); +#endif + else + flush_ptr = GetXLogReplayRecPtr(NULL); + + if (loc <= flush_ptr) + return flush_ptr; + + CHECK_FOR_INTERRUPTS(); + pg_usleep(1000); + } +} + +static int +NeonWALPageRead( + XLogReaderState *xlogreader, + XLogRecPtr targetPagePtr, + int reqLen, + XLogRecPtr targetRecPtr, + char *readBuf) +{ + XLogRecPtr rem_lsn; + + /* Wait for flush pointer to advance past our request */ + XLogRecPtr flushptr = NeonWALReadWaitForWAL(targetPagePtr + reqLen); + int count; + + if (flushptr < targetPagePtr + reqLen) + return -1; + + /* Read at most XLOG_BLCKSZ bytes */ + if (targetPagePtr + XLOG_BLCKSZ <= flushptr) + count = XLOG_BLCKSZ; + else + count = flushptr - targetPagePtr; + + /* + * Sometimes walsender requests non-monotonic sequences of WAL. If that's + * the case, we have to reset streaming from remote at the correct + * position. For example, walsender may try to verify the segment header + * when trying to read in the middle of it. + */ + rem_lsn = NeonWALReaderGetRemLsn(wal_reader); + if (rem_lsn != InvalidXLogRecPtr && targetPagePtr != rem_lsn) + { + NeonWALReaderResetRemote(wal_reader); + } + + for (;;) + { + NeonWALReadResult res = NeonWALRead( + wal_reader, + readBuf, + targetPagePtr, + count, + walprop_pg_get_timeline_id()); + + if (res == NEON_WALREAD_SUCCESS) + { + /* + * Setting ws_tli is required by the XLogReaderRoutine, it is used + * for segment name generation in error reports. + * + * ReadPageInternal updates ws_segno after calling cb on its own + * and XLogReaderRoutine description doesn't require it, but + * WALRead sets, let's follow it. + */ + xlogreader->seg.ws_tli = NeonWALReaderGetSegment(wal_reader)->ws_tli; + xlogreader->seg.ws_segno = NeonWALReaderGetSegment(wal_reader)->ws_segno; + + /* + * ws_file doesn't exist in case of remote read, and isn't used by + * xlogreader except by WALRead on which we don't rely anyway. 
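For context, NeonOnDemandXLogReaderRoutines only swaps the three callbacks; a caller still allocates the reader itself. Roughly how a call site could wire it up on the PG 13+ xlogreader API, sketched with the stock local-disk callbacks from xlogutils.h as the defaults (make_on_demand_reader is a hypothetical helper, not a function in this patch):

#include "postgres.h"
#include "access/xlogreader.h"
#include "access/xlogutils.h"
#include "walsender_hooks.h"

/* Sketch: build a reader whose pages may come from safekeepers, not pg_wal. */
static XLogReaderState *
make_on_demand_reader(int segsize)
{
	XLogReaderRoutine routine = {
		.page_read = &read_local_xlog_page,		/* stock local-disk callbacks */
		.segment_open = &wal_segment_open,
		.segment_close = &wal_segment_close,
	};

	/* Overrides all three callbacks iff neon.safekeepers is configured. */
	NeonOnDemandXLogReaderRoutines(&routine);

	/* XLogReaderAllocate() copies 'routine', so the local variable is fine. */
	return XLogReaderAllocate(segsize, NULL, &routine, NULL);
}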
+ */ + return count; + } + if (res == NEON_WALREAD_ERROR) + { + elog(ERROR, "[walsender] Failed to read WAL (req_lsn=%X/%X, len=%d): %s", + LSN_FORMAT_ARGS(targetPagePtr), + reqLen, + NeonWALReaderErrMsg(wal_reader)); + return -1; + } + + /* + * Res is WOULDBLOCK, so we wait on the socket, recreating event set + * if necessary + */ + { + + pgsocket sock = NeonWALReaderSocket(wal_reader); + uint32_t reader_events = NeonWALReaderEvents(wal_reader); + long timeout_ms = 1000; + + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + + WaitLatchOrSocket( + MyLatch, + WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | reader_events, + sock, + timeout_ms, + WAIT_EVENT_WAL_SENDER_MAIN); + } + } +} + +static void +NeonWALReadSegmentOpen(XLogReaderState *xlogreader, XLogSegNo nextSegNo, TimeLineID *tli_p) +{ + neon_wal_segment_open(wal_reader, nextSegNo, tli_p); + xlogreader->seg.ws_file = NeonWALReaderGetSegment(wal_reader)->ws_file; +} + +static void +NeonWALReadSegmentClose(XLogReaderState *xlogreader) +{ + neon_wal_segment_close(wal_reader); + xlogreader->seg.ws_file = NeonWALReaderGetSegment(wal_reader)->ws_file; +} + +void +NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr) +{ + /* + * If safekeepers are not configured, assume we don't need neon_walreader, + * i.e. running neon fork locally. + */ + if (wal_acceptors_list[0] == '\0') + return; + + if (!wal_reader) + { + XLogRecPtr epochStartLsn = pg_atomic_read_u64(&GetWalpropShmemState()->propEpochStartLsn); + + if (epochStartLsn == 0) + { + elog(ERROR, "Unable to start walsender when propEpochStartLsn is 0!"); + } + wal_reader = NeonWALReaderAllocate(wal_segment_size, epochStartLsn, "[walsender] "); + } + xlr->page_read = NeonWALPageRead; + xlr->segment_open = NeonWALReadSegmentOpen; + xlr->segment_close = NeonWALReadSegmentClose; +} diff --git a/pgxn/neon/walsender_hooks.h b/pgxn/neon/walsender_hooks.h new file mode 100644 index 0000000000..2e3ce180f9 --- /dev/null +++ b/pgxn/neon/walsender_hooks.h @@ -0,0 +1,7 @@ +#ifndef __WALSENDER_HOOKS_H__ +#define __WALSENDER_HOOKS_H__ + +struct XLogReaderRoutine; +void NeonOnDemandXLogReaderRoutines(struct XLogReaderRoutine *xlr); + +#endif diff --git a/pgxn/neon_rmgr/neon_rmgr.c b/pgxn/neon_rmgr/neon_rmgr.c index 496ca08c08..c3f726db84 100644 --- a/pgxn/neon_rmgr/neon_rmgr.c +++ b/pgxn/neon_rmgr/neon_rmgr.c @@ -186,7 +186,7 @@ static void fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) { *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | - HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK); + HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK | HEAP_COMBOCID); *infomask2 &= ~HEAP_KEYS_UPDATED; if (infobits & XLHL_XMAX_IS_MULTI) @@ -195,6 +195,8 @@ fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) *infomask |= HEAP_XMAX_LOCK_ONLY; if (infobits & XLHL_XMAX_EXCL_LOCK) *infomask |= HEAP_XMAX_EXCL_LOCK; + if (infobits & XLHL_COMBOCID) + *infomask |= HEAP_COMBOCID; /* note HEAP_XMAX_SHR_LOCK isn't considered here */ if (infobits & XLHL_XMAX_KEYSHR_LOCK) *infomask |= HEAP_XMAX_KEYSHR_LOCK; @@ -284,7 +286,7 @@ redo_neon_heap_insert(XLogReaderState *record) htup->t_infomask = xlhdr.t_infomask; htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); + htup->t_choice.t_heap.t_field3.t_cid = xlhdr.t_cid; htup->t_ctid = target_tid; if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum, @@ -373,7 +375,7 
@@ redo_neon_heap_delete(XLogReaderState *record) HeapTupleHeaderSetXmax(htup, xlrec->xmax); else HeapTupleHeaderSetXmin(htup, InvalidTransactionId); - HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; /* Mark the page as a candidate for pruning */ PageSetPrunable(page, XLogRecGetXid(record)); @@ -490,7 +492,7 @@ redo_neon_heap_update(XLogReaderState *record, bool hot_update) fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask, &htup->t_infomask2); HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); - HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; /* Set forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -623,7 +625,7 @@ redo_neon_heap_update(XLogReaderState *record, bool hot_update) htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); + htup->t_choice.t_heap.t_field3.t_cid = xlhdr.t_cid; HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -728,7 +730,7 @@ redo_neon_heap_lock(XLogReaderState *record) offnum); } HeapTupleHeaderSetXmax(htup, xlrec->xmax); - HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } @@ -840,7 +842,7 @@ redo_neon_heap_multi_insert(XLogReaderState *record) htup->t_infomask = xlhdr->t_infomask; htup->t_hoff = xlhdr->t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, xlrec->t_cid); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; ItemPointerSetBlockNumber(&htup->t_ctid, blkno); ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile index 9c774ec185..252810b5b0 100644 --- a/pgxn/neon_test_utils/Makefile +++ b/pgxn/neon_test_utils/Makefile @@ -7,7 +7,7 @@ OBJS = \ neontest.o EXTENSION = neon_test_utils -DATA = neon_test_utils--1.0.sql +DATA = neon_test_utils--1.3.sql PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" PG_CONFIG = pg_config diff --git a/pgxn/neon_test_utils/neon_test_utils--1.0.sql b/pgxn/neon_test_utils/neon_test_utils--1.0.sql deleted file mode 100644 index 402981a9a6..0000000000 --- a/pgxn/neon_test_utils/neon_test_utils--1.0.sql +++ /dev/null @@ -1,29 +0,0 @@ --- complain if script is sourced in psql, rather than via CREATE EXTENSION -\echo Use "CREATE EXTENSION neon_test_utils" to load this file. 
\quit - -CREATE FUNCTION test_consume_xids(nxids int) -RETURNS VOID -AS 'MODULE_PATHNAME', 'test_consume_xids' -LANGUAGE C STRICT -PARALLEL UNSAFE; - -CREATE FUNCTION clear_buffer_cache() -RETURNS VOID -AS 'MODULE_PATHNAME', 'clear_buffer_cache' -LANGUAGE C STRICT -PARALLEL UNSAFE; - -CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn) -RETURNS bytea -AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn' -LANGUAGE C PARALLEL UNSAFE; - -CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn) -RETURNS bytea -AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' -LANGUAGE C PARALLEL UNSAFE; - -CREATE FUNCTION neon_xlogflush(lsn pg_lsn) -RETURNS VOID -AS 'MODULE_PATHNAME', 'neon_xlogflush' -LANGUAGE C PARALLEL UNSAFE; diff --git a/pgxn/neon_test_utils/neon_test_utils--1.3.sql b/pgxn/neon_test_utils/neon_test_utils--1.3.sql new file mode 100644 index 0000000000..9a9b41c3a3 --- /dev/null +++ b/pgxn/neon_test_utils/neon_test_utils--1.3.sql @@ -0,0 +1,71 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION neon_test_utils" to load this file. \quit + +CREATE FUNCTION test_consume_xids(nxids int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_xids' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION test_consume_oids(oid int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_oids' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION test_consume_cpu(seconds int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_cpu' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION test_consume_memory(megabytes int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_memory' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION test_release_memory(megabytes int DEFAULT NULL) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_release_memory' +LANGUAGE C +PARALLEL UNSAFE; + +CREATE FUNCTION clear_buffer_cache() +RETURNS VOID +AS 'MODULE_PATHNAME', 'clear_buffer_cache' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION neon_xlogflush(lsn pg_lsn DEFAULT NULL) +RETURNS VOID +AS 'MODULE_PATHNAME', 'neon_xlogflush' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION trigger_panic() +RETURNS VOID +AS 'MODULE_PATHNAME', 'trigger_panic' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION trigger_segfault() +RETURNS VOID +AS 'MODULE_PATHNAME', 'trigger_segfault' +LANGUAGE C PARALLEL UNSAFE; + +-- Alias for `trigger_segfault`, just because `SELECT 💣()` looks fun +CREATE OR REPLACE FUNCTION 💣() RETURNS void +LANGUAGE plpgsql AS $$ +BEGIN + PERFORM trigger_segfault(); +END; +$$; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index 94e6720503..f22afd70c4 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -1,5 +1,6 @@ # neon_test_utils extension comment = 'helpers for neon testing and debugging' -default_version = '1.0' +default_version = '1.3' module_pathname = '$libdir/neon_test_utils' relocatable = true +trusted = 
true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index aa644efd40..0b5499ca53 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -15,16 +15,19 @@ #include "access/relation.h" #include "access/xact.h" #include "access/xlog.h" +#include "access/xlog_internal.h" #include "catalog/namespace.h" #include "fmgr.h" #include "funcapi.h" #include "miscadmin.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "storage/fd.h" #include "utils/builtins.h" #include "utils/pg_lsn.h" #include "utils/rel.h" #include "utils/varlena.h" +#include "utils/wait_event.h" #include "../neon/pagestore_client.h" PG_MODULE_MAGIC; @@ -32,10 +35,16 @@ PG_MODULE_MAGIC; extern void _PG_init(void); PG_FUNCTION_INFO_V1(test_consume_xids); +PG_FUNCTION_INFO_V1(test_consume_oids); +PG_FUNCTION_INFO_V1(test_consume_cpu); +PG_FUNCTION_INFO_V1(test_consume_memory); +PG_FUNCTION_INFO_V1(test_release_memory); PG_FUNCTION_INFO_V1(clear_buffer_cache); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); PG_FUNCTION_INFO_V1(neon_xlogflush); +PG_FUNCTION_INFO_V1(trigger_panic); +PG_FUNCTION_INFO_V1(trigger_segfault); /* * Linkage to functions in neon module. @@ -43,10 +52,10 @@ PG_FUNCTION_INFO_V1(neon_xlogflush); */ #if PG_MAJORVERSION_NUM < 16 typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); + neon_request_lsns request_lsns, char *buffer); #else typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer); + neon_request_lsns request_lsns, void *buffer); #endif static neon_read_at_lsn_type neon_read_at_lsn_ptr; @@ -66,6 +75,21 @@ _PG_init(void) #define neon_read_at_lsn neon_read_at_lsn_ptr +/* + * test_consume_oids(int4), for rapidly consuming OIDs, to test wraparound. + * Unlike test_consume_xids which is passed number of xids to be consumed, + * this function is given the target Oid. + */ +Datum +test_consume_oids(PG_FUNCTION_ARGS) +{ + int32 oid = PG_GETARG_INT32(0); + + while (oid != GetNewObjectId()); + + PG_RETURN_VOID(); +} + /* * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. */ @@ -97,6 +121,117 @@ test_consume_xids(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* + * test_consume_cpu(seconds int). Keeps one CPU busy for the given number of seconds. + */ +Datum +test_consume_cpu(PG_FUNCTION_ARGS) +{ + int32 seconds = PG_GETARG_INT32(0); + TimestampTz start; + uint64 total_iterations = 0; + + start = GetCurrentTimestamp(); + + for (;;) + { + TimestampTz elapsed; + + elapsed = GetCurrentTimestamp() - start; + if (elapsed > (TimestampTz) seconds * USECS_PER_SEC) + break; + + /* keep spinning */ + for (int i = 0; i < 1000000; i++) + total_iterations++; + elog(DEBUG2, "test_consume_cpu(): %lu iterations in total", total_iterations); + + CHECK_FOR_INTERRUPTS(); + } + + PG_RETURN_VOID(); +} + +static MemoryContext consume_cxt = NULL; +static slist_head consumed_memory_chunks; +static int64 num_memory_chunks; + +/* + * test_consume_memory(megabytes int). + * + * Consume given amount of memory. The allocation is made in TopMemoryContext, + * so it outlives the function, until you call test_release_memory to + * explicitly release it, or close the session. 
+ */ +Datum +test_consume_memory(PG_FUNCTION_ARGS) +{ + int32 megabytes = PG_GETARG_INT32(0); + + /* + * Consume the memory in a new memory context, so that it's convenient to + * release and to display it separately in a possible memory context dump. + */ + if (consume_cxt == NULL) + consume_cxt = AllocSetContextCreate(TopMemoryContext, + "test_consume_memory", + ALLOCSET_DEFAULT_SIZES); + + for (int32 i = 0; i < megabytes; i++) + { + char *p; + + p = MemoryContextAllocZero(consume_cxt, 1024 * 1024); + + /* touch the memory, so that it's really allocated by the kernel */ + for (int j = 0; j < 1024 * 1024; j += 1024) + p[j] = j % 0xFF; + + slist_push_head(&consumed_memory_chunks, (slist_node *) p); + num_memory_chunks++; + } + + PG_RETURN_VOID(); +} + +/* + * test_release_memory(megabytes int). NULL releases all + */ +Datum +test_release_memory(PG_FUNCTION_ARGS) +{ + if (PG_ARGISNULL(0)) + { + if (consume_cxt) + { + MemoryContextDelete(consume_cxt); + consume_cxt = NULL; + num_memory_chunks = 0; + } + } + else + { + int32 chunks_to_release = PG_GETARG_INT32(0); + + if (chunks_to_release > num_memory_chunks) + { + elog(WARNING, "only %lu MB is consumed, releasing it all", num_memory_chunks); + chunks_to_release = num_memory_chunks; + } + + for (int32 i = 0; i < chunks_to_release; i++) + { + slist_node *chunk = slist_pop_head_node(&consumed_memory_chunks); + + pfree(chunk); + num_memory_chunks--; + } + } + + PG_RETURN_VOID(); +} + /* * Flush the buffer cache, evicting all pages that are not currently pinned. */ @@ -182,9 +317,10 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) text *relname; text *forkname; uint32 blkno; + neon_request_lsns request_lsns; - bool request_latest = PG_ARGISNULL(3); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3); + if (PG_NARGS() != 5) + elog(ERROR, "unexpected number of arguments in SQL function signature"); if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) PG_RETURN_NULL(); @@ -193,6 +329,16 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) forkname = PG_GETARG_TEXT_PP(1); blkno = PG_GETARG_UINT32(2); + request_lsns.request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3); + request_lsns.not_modified_since = PG_ARGISNULL(4) ? request_lsns.request_lsn : PG_GETARG_LSN(4); + /* + * For the time being, use the same LSN for request and + * effective request LSN. If any test needed to use UINT64_MAX + * as the request LSN, we'd need to add effective_request_lsn + * as a new argument. + */ + request_lsns.effective_request_lsn = request_lsns.request_lsn; + if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), @@ -245,7 +391,8 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsns, + raw_page_data); relation_close(rel, AccessShareLock); @@ -264,6 +411,9 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) { char *raw_page_data; + if (PG_NARGS() != 7) + elog(ERROR, "unexpected number of arguments in SQL function signature"); + if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), @@ -287,30 +437,94 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) }; ForkNumber forknum = PG_GETARG_UINT32(3); - uint32 blkno = PG_GETARG_UINT32(4); - bool request_latest = PG_ARGISNULL(5); - uint64 read_lsn = request_latest ? 
GetXLogInsertRecPtr() : PG_GETARG_INT64(5); + neon_request_lsns request_lsns; /* Initialize buffer to copy to */ bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + request_lsns.request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5); + request_lsns.not_modified_since = PG_ARGISNULL(6) ? request_lsns.request_lsn : PG_GETARG_LSN(6); + /* + * For the time being, use the same LSN for request + * and effective request LSN. If any test needed to + * use UINT64_MAX as the request LSN, we'd need to add + * effective_request_lsn as a new argument. + */ + request_lsns.effective_request_lsn = request_lsns.request_lsn; + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(rinfo, forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(rinfo, forknum, blkno, request_lsns, raw_page_data); PG_RETURN_BYTEA_P(raw_page); } } /* * Directly calls XLogFlush(lsn) to flush WAL buffers. + * + * If 'lsn' is not specified (is NULL), flush all generated WAL. */ Datum neon_xlogflush(PG_FUNCTION_ARGS) { - XLogRecPtr lsn = PG_GETARG_LSN(0); + XLogRecPtr lsn; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("cannot flush WAL during recovery."))); + + if (!PG_ARGISNULL(0)) + lsn = PG_GETARG_LSN(0); + else + { + lsn = GetXLogInsertRecPtr(); + + /*--- + * The LSN returned by GetXLogInsertRecPtr() is the position where the + * next inserted record would begin. If the last record ended just at + * the page boundary, the next record will begin after the page header + * on the next page, but the next page's page header has not been + * written yet. If we tried to flush it, XLogFlush() would throw an + * error: + * + * ERROR : xlog flush request %X/%X is not satisfied --- flushed only to %X/%X + * + * To avoid that, if the insert position points to just after the page + * header, back off to page boundary. + */ + if (lsn % XLOG_BLCKSZ == SizeOfXLogShortPHD && + XLogSegmentOffset(lsn, wal_segment_size) > XLOG_BLCKSZ) + lsn -= SizeOfXLogShortPHD; + else if (lsn % XLOG_BLCKSZ == SizeOfXLogLongPHD && + XLogSegmentOffset(lsn, wal_segment_size) < XLOG_BLCKSZ) + lsn -= SizeOfXLogLongPHD; + } XLogFlush(lsn); PG_RETURN_VOID(); } + +/* + * Function to trigger panic. + */ +Datum +trigger_panic(PG_FUNCTION_ARGS) +{ + elog(PANIC, "neon_test_utils: panic"); + PG_RETURN_VOID(); +} + +/* + * Function to trigger a segfault. + */ +Datum +trigger_segfault(PG_FUNCTION_ARGS) +{ + int *ptr = NULL; + *ptr = 42; + PG_RETURN_VOID(); +} diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index bdc50b0aa9..cc545393f5 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -140,9 +140,44 @@ static XLogReaderState *reader_state; #define TRACE DEBUG5 #ifdef HAVE_LIBSECCOMP + + +/* + * https://man7.org/linux/man-pages/man2/close_range.2.html + * + * The `close_range` syscall is available as of Linux 5.9. + * + * The `close_range` libc wrapper is only available in glibc >= 2.34. + * Debian Bullseye ships a libc package based on glibc 2.31. + * => write the wrapper ourselves, using the syscall number from the kernel headers. + * + * If the Linux uAPI headers don't define the system call number, + * fail the build deliberately rather than ifdef'ing it to ENOSYS. + * We prefer a compile time over a runtime error for walredo. 
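The raw-syscall wrapper below can be exercised in isolation; a small hypothetical demo (Linux >= 5.9, kernel headers that define __NR_close_range) showing that descriptors from 3 upward are gone after the call while stdin/stdout/stderr survive:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static int
demo_close_range(unsigned int start_fd, unsigned int count, unsigned int flags)
{
	return syscall(__NR_close_range, start_fd, count, flags);
}

int
main(void)
{
	int			fd = open("/dev/null", O_RDONLY);	/* a "leaked" fd, numbered >= 3 */

	if (demo_close_range(3, ~0U, 0) != 0)
	{
		perror("close_range");
		return 1;
	}
	/* fd is now invalid: fcntl() fails with EBADF and returns -1 */
	printf("fcntl(fd, F_GETFD) = %d\n", fcntl(fd, F_GETFD));
	return 0;
}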
+ */ +#include +#include +#include + +static int +close_range_syscall(unsigned int start_fd, unsigned int count, unsigned int flags) +{ + return syscall(__NR_close_range, start_fd, count, flags); +} + static void enter_seccomp_mode(void) { + /* + * The pageserver process relies on us to close all the file descriptors + * it potentially leaked to us, _before_ we start processing potentially dangerous + * wal records. See the comment in the Rust code that launches this process. + */ + if (close_range_syscall(3, ~0U, 0) != 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not close files >= fd 3"))); + PgSeccompRule syscalls[] = { /* Hard requirements */ @@ -184,6 +219,9 @@ enter_seccomp_mode(void) } #endif /* HAVE_LIBSECCOMP */ +PGDLLEXPORT void +WalRedoMain(int argc, char *argv[]); + /* * Entry point for the WAL redo process. * @@ -771,6 +809,9 @@ ApplyRecord(StringInfo input_message) ErrorContextCallback errcallback; #if PG_VERSION_NUM >= 150000 DecodedXLogRecord *decoded; +#define STATIC_DECODEBUF_SIZE (64 * 1024) + static char *static_decodebuf = NULL; + size_t required_space; #endif /* @@ -800,7 +841,19 @@ ApplyRecord(StringInfo input_message) XLogBeginRead(reader_state, lsn); #if PG_VERSION_NUM >= 150000 - decoded = (DecodedXLogRecord *) XLogReadRecordAlloc(reader_state, record->xl_tot_len, true); + /* + * For reasonably small records, reuse a fixed size buffer to reduce + * palloc overhead. + */ + required_space = DecodeXLogRecordRequiredSpace(record->xl_tot_len); + if (required_space <= STATIC_DECODEBUF_SIZE) + { + if (static_decodebuf == NULL) + static_decodebuf = MemoryContextAlloc(TopMemoryContext, STATIC_DECODEBUF_SIZE); + decoded = (DecodedXLogRecord *) static_decodebuf; + } + else + decoded = palloc(required_space); if (!DecodeXLogRecord(reader_state, decoded, record, lsn, &errormsg)) elog(ERROR, "failed to decode WAL record: %s", errormsg); @@ -809,37 +862,15 @@ ApplyRecord(StringInfo input_message) /* Record the location of the next record. */ decoded->next_lsn = reader_state->NextRecPtr; - /* - * If it's in the decode buffer, mark the decode buffer space as - * occupied. - */ - if (!decoded->oversized) - { - /* The new decode buffer head must be MAXALIGNed. */ - Assert(decoded->size == MAXALIGN(decoded->size)); - if ((char *) decoded == reader_state->decode_buffer) - reader_state->decode_buffer_tail = reader_state->decode_buffer + decoded->size; - else - reader_state->decode_buffer_tail += decoded->size; - } - - /* Insert it into the queue of decoded records. */ - Assert(reader_state->decode_queue_tail != decoded); - if (reader_state->decode_queue_tail) - reader_state->decode_queue_tail->next = decoded; - reader_state->decode_queue_tail = decoded; - if (!reader_state->decode_queue_head) - reader_state->decode_queue_head = decoded; - /* * Update the pointers to the beginning and one-past-the-end of this * record, again for the benefit of historical code that expected the * decoder to track this rather than accessing these fields of the record * itself. 
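The static decode buffer above is a generic reuse pattern: keep one long-lived scratch allocation for the common small case and fall back to a transient allocation for oversized records, freeing only the latter. A standalone sketch with malloc/free standing in for MemoryContextAlloc/pfree and hypothetical names:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SCRATCH_SIZE (64 * 1024)

static char *scratch;			/* allocated once, reused across calls */

/*
 * Return a buffer of at least 'need' bytes; *transient is set when the
 * caller must free() it (the oversized case).
 */
static char *
get_decode_buf(size_t need, int *transient)
{
	if (need <= SCRATCH_SIZE)
	{
		if (scratch == NULL)
			scratch = malloc(SCRATCH_SIZE);
		*transient = 0;
		return scratch;
	}
	*transient = 1;
	return malloc(need);
}

int
main(void)
{
	int			transient;
	char	   *buf = get_decode_buf(1024, &transient);

	memset(buf, 0, 1024);
	if (transient)				/* mirrors: if (decoded != static_decodebuf) pfree */
		free(buf);
	printf("reused scratch: %s\n", transient ? "no" : "yes");
	return 0;
}

The cleanup condition mirrors the one in ApplyRecord: pointer identity against the scratch buffer decides whether a free is needed, so the small-record path pays no allocation or free per record.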
*/ - reader_state->record = reader_state->decode_queue_head; - reader_state->ReadRecPtr = reader_state->record->lsn; - reader_state->EndRecPtr = reader_state->record->next_lsn; + reader_state->record = decoded; + reader_state->ReadRecPtr = decoded->lsn; + reader_state->EndRecPtr = decoded->next_lsn; } #else /* @@ -879,8 +910,9 @@ ApplyRecord(StringInfo input_message) elog(TRACE, "applied WAL record with LSN %X/%X", (uint32) (lsn >> 32), (uint32) lsn); + #if PG_VERSION_NUM >= 150000 - if (decoded && decoded->oversized) + if ((char *) decoded != static_decodebuf) pfree(decoded); #endif } diff --git a/poetry.lock b/poetry.lock index 1644b2b299..48943a73e9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,91 +1,103 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. + +[[package]] +name = "aiohappyeyeballs" +version = "2.3.5" +description = "Happy Eyeballs for asyncio" +optional = false +python-versions = ">=3.8" +files = [ + {file = "aiohappyeyeballs-2.3.5-py3-none-any.whl", hash = "sha256:4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03"}, + {file = "aiohappyeyeballs-2.3.5.tar.gz", hash = "sha256:6fa48b9f1317254f122a07a131a86b71ca6946ca989ce6326fff54a99a920105"}, +] [[package]] name = "aiohttp" -version = "3.9.0" +version = "3.10.2" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" files = [ - {file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6896b8416be9ada4d22cd359d7cb98955576ce863eadad5596b7cdfbf3e17c6c"}, - {file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1736d87dad8ef46a8ec9cddd349fa9f7bd3a064c47dd6469c0d6763d3d49a4fc"}, - {file = "aiohttp-3.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8c9e5f4d7208cda1a2bb600e29069eecf857e6980d0ccc922ccf9d1372c16f4b"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8488519aa05e636c5997719fe543c8daf19f538f4fa044f3ce94bee608817cff"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ab16c254e2312efeb799bc3c06897f65a133b38b69682bf75d1f1ee1a9c43a9"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7a94bde005a8f926d0fa38b88092a03dea4b4875a61fbcd9ac6f4351df1b57cd"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b777c9286b6c6a94f50ddb3a6e730deec327e9e2256cb08b5530db0f7d40fd8"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:571760ad7736b34d05597a1fd38cbc7d47f7b65deb722cb8e86fd827404d1f6b"}, - {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:deac0a32aec29608eb25d730f4bc5a261a65b6c48ded1ed861d2a1852577c932"}, - {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:4ee1b4152bc3190cc40ddd6a14715e3004944263ea208229ab4c297712aa3075"}, - {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:3607375053df58ed6f23903aa10cf3112b1240e8c799d243bbad0f7be0666986"}, - {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:65b0a70a25456d329a5e1426702dde67be0fb7a4ead718005ba2ca582d023a94"}, - {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:5a2eb5311a37fe105aa35f62f75a078537e1a9e4e1d78c86ec9893a3c97d7a30"}, - {file = "aiohttp-3.9.0-cp310-cp310-win32.whl", hash = "sha256:2cbc14a13fb6b42d344e4f27746a4b03a2cb0c1c3c5b932b0d6ad8881aa390e3"}, - {file = "aiohttp-3.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:ac9669990e2016d644ba8ae4758688534aabde8dbbc81f9af129c3f5f01ca9cd"}, - {file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f8e05f5163528962ce1d1806fce763ab893b1c5b7ace0a3538cd81a90622f844"}, - {file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4afa8f71dba3a5a2e1e1282a51cba7341ae76585345c43d8f0e624882b622218"}, - {file = "aiohttp-3.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f929f4c9b9a00f3e6cc0587abb95ab9c05681f8b14e0fe1daecfa83ea90f8318"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28185e36a78d247c55e9fbea2332d16aefa14c5276a582ce7a896231c6b1c208"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a486ddf57ab98b6d19ad36458b9f09e6022de0381674fe00228ca7b741aacb2f"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70e851f596c00f40a2f00a46126c95c2e04e146015af05a9da3e4867cfc55911"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5b7bf8fe4d39886adc34311a233a2e01bc10eb4e842220235ed1de57541a896"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c67a51ea415192c2e53e4e048c78bab82d21955b4281d297f517707dc836bf3d"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:694df243f394629bcae2d8ed94c589a181e8ba8604159e6e45e7b22e58291113"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3dd8119752dd30dd7bca7d4bc2a92a59be6a003e4e5c2cf7e248b89751b8f4b7"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:eb6dfd52063186ac97b4caa25764cdbcdb4b10d97f5c5f66b0fa95052e744eb7"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:d97c3e286d0ac9af6223bc132dc4bad6540b37c8d6c0a15fe1e70fb34f9ec411"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:816f4db40555026e4cdda604a1088577c1fb957d02f3f1292e0221353403f192"}, - {file = "aiohttp-3.9.0-cp311-cp311-win32.whl", hash = "sha256:3abf0551874fecf95f93b58f25ef4fc9a250669a2257753f38f8f592db85ddea"}, - {file = "aiohttp-3.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:e18d92c3e9e22553a73e33784fcb0ed484c9874e9a3e96c16a8d6a1e74a0217b"}, - {file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:99ae01fb13a618b9942376df77a1f50c20a281390dad3c56a6ec2942e266220d"}, - {file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:05857848da443c8c12110d99285d499b4e84d59918a21132e45c3f0804876994"}, - {file = "aiohttp-3.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:317719d7f824eba55857fe0729363af58e27c066c731bc62cd97bc9c3d9c7ea4"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1e3b3c107ccb0e537f309f719994a55621acd2c8fdf6d5ce5152aed788fb940"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:45820ddbb276113ead8d4907a7802adb77548087ff5465d5c554f9aa3928ae7d"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:05a183f1978802588711aed0dea31e697d760ce9055292db9dc1604daa9a8ded"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51a4cd44788ea0b5e6bb8fa704597af3a30be75503a7ed1098bc5b8ffdf6c982"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:673343fbc0c1ac44d0d2640addc56e97a052504beacd7ade0dc5e76d3a4c16e8"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7e8a3b79b6d186a9c99761fd4a5e8dd575a48d96021f220ac5b5fa856e5dd029"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6777a390e41e78e7c45dab43a4a0196c55c3b8c30eebe017b152939372a83253"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7ae5f99a32c53731c93ac3075abd3e1e5cfbe72fc3eaac4c27c9dd64ba3b19fe"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f1e4f254e9c35d8965d377e065c4a8a55d396fe87c8e7e8429bcfdeeb229bfb3"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:11ca808f9a6b63485059f5f6e164ef7ec826483c1212a44f268b3653c91237d8"}, - {file = "aiohttp-3.9.0-cp312-cp312-win32.whl", hash = "sha256:de3cc86f4ea8b4c34a6e43a7306c40c1275e52bfa9748d869c6b7d54aa6dad80"}, - {file = "aiohttp-3.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:ca4fddf84ac7d8a7d0866664936f93318ff01ee33e32381a115b19fb5a4d1202"}, - {file = "aiohttp-3.9.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f09960b5bb1017d16c0f9e9f7fc42160a5a49fa1e87a175fd4a2b1a1833ea0af"}, - {file = "aiohttp-3.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8303531e2c17b1a494ffaeba48f2da655fe932c4e9a2626c8718403c83e5dd2b"}, - {file = "aiohttp-3.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4790e44f46a4aa07b64504089def5744d3b6780468c4ec3a1a36eb7f2cae9814"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1d7edf74a36de0e5ca50787e83a77cf352f5504eb0ffa3f07000a911ba353fb"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:94697c7293199c2a2551e3e3e18438b4cba293e79c6bc2319f5fd652fccb7456"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1b66dbb8a7d5f50e9e2ea3804b01e766308331d0cac76eb30c563ac89c95985"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9623cfd9e85b76b83ef88519d98326d4731f8d71869867e47a0b979ffec61c73"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f32c86dc967ab8c719fd229ce71917caad13cc1e8356ee997bf02c5b368799bf"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f50b4663c3e0262c3a361faf440761fbef60ccdde5fe8545689a4b3a3c149fb4"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dcf71c55ec853826cd70eadb2b6ac62ec577416442ca1e0a97ad875a1b3a0305"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:42fe4fd9f0dfcc7be4248c162d8056f1d51a04c60e53366b0098d1267c4c9da8"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:76a86a9989ebf82ee61e06e2bab408aec4ea367dc6da35145c3352b60a112d11"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f9e09a1c83521d770d170b3801eea19b89f41ccaa61d53026ed111cb6f088887"}, - {file = "aiohttp-3.9.0-cp38-cp38-win32.whl", hash = 
"sha256:a00ce44c21612d185c5275c5cba4bab8d7c1590f248638b667ed8a782fa8cd6f"}, - {file = "aiohttp-3.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:d5b9345ab92ebe6003ae11d8092ce822a0242146e6fa270889b9ba965457ca40"}, - {file = "aiohttp-3.9.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:98d21092bf2637c5fa724a428a69e8f5955f2182bff61f8036827cf6ce1157bf"}, - {file = "aiohttp-3.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:35a68cd63ca6aaef5707888f17a70c36efe62b099a4e853d33dc2e9872125be8"}, - {file = "aiohttp-3.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3d7f6235c7475658acfc1769d968e07ab585c79f6ca438ddfecaa9a08006aee2"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db04d1de548f7a62d1dd7e7cdf7c22893ee168e22701895067a28a8ed51b3735"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:536b01513d67d10baf6f71c72decdf492fb7433c5f2f133e9a9087379d4b6f31"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c8b0a6487e8109427ccf638580865b54e2e3db4a6e0e11c02639231b41fc0f"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7276fe0017664414fdc3618fca411630405f1aaf0cc3be69def650eb50441787"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23170247ef89ffa842a02bbfdc425028574d9e010611659abeb24d890bc53bb8"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b1a2ea8252cacc7fd51df5a56d7a2bb1986ed39be9397b51a08015727dfb69bd"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2d71abc15ff7047412ef26bf812dfc8d0d1020d664617f4913df2df469f26b76"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:2d820162c8c2bdbe97d328cd4f417c955ca370027dce593345e437b2e9ffdc4d"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:2779f5e7c70f7b421915fd47db332c81de365678180a9f3ab404088f87ba5ff9"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:366bc870d7ac61726f32a489fbe3d1d8876e87506870be66b01aeb84389e967e"}, - {file = "aiohttp-3.9.0-cp39-cp39-win32.whl", hash = "sha256:1df43596b826022b14998f0460926ce261544fedefe0d2f653e1b20f49e96454"}, - {file = "aiohttp-3.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:9c196b30f1b1aa3363a69dd69079ae9bec96c2965c4707eaa6914ba099fb7d4f"}, - {file = "aiohttp-3.9.0.tar.gz", hash = "sha256:09f23292d29135025e19e8ff4f0a68df078fe4ee013bca0105b2e803989de92d"}, + {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:95213b3d79c7e387144e9cb7b9d2809092d6ff2c044cb59033aedc612f38fb6d"}, + {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1aa005f060aff7124cfadaa2493f00a4e28ed41b232add5869e129a2e395935a"}, + {file = "aiohttp-3.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eabe6bf4c199687592f5de4ccd383945f485779c7ffb62a9b9f1f8a3f9756df8"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e010736fc16d21125c7e2dc5c350cd43c528b85085c04bf73a77be328fe944"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99f81f9c1529fd8e03be4a7bd7df32d14b4f856e90ef6e9cbad3415dbfa9166c"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:d611d1a01c25277bcdea06879afbc11472e33ce842322496b211319aa95441bb"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00191d38156e09e8c81ef3d75c0d70d4f209b8381e71622165f22ef7da6f101"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74c091a5ded6cb81785de2d7a8ab703731f26de910dbe0f3934eabef4ae417cc"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:18186a80ec5a701816adbf1d779926e1069392cf18504528d6e52e14b5920525"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5a7ceb2a0d2280f23a02c64cd0afdc922079bb950400c3dd13a1ab2988428aac"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8bd7be6ff6c162a60cb8fce65ee879a684fbb63d5466aba3fa5b9288eb04aefa"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:fae962b62944eaebff4f4fddcf1a69de919e7b967136a318533d82d93c3c6bd1"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a0fde16d284efcacbe15fb0c1013f0967b6c3e379649239d783868230bf1db42"}, + {file = "aiohttp-3.10.2-cp310-cp310-win32.whl", hash = "sha256:f81cd85a0e76ec7b8e2b6636fe02952d35befda4196b8c88f3cec5b4fb512839"}, + {file = "aiohttp-3.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:54ba10eb5a3481c28282eb6afb5f709aedf53cf9c3a31875ffbdc9fc719ffd67"}, + {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87fab7f948e407444c2f57088286e00e2ed0003ceaf3d8f8cc0f60544ba61d91"}, + {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ec6ad66ed660d46503243cbec7b2b3d8ddfa020f984209b3b8ef7d98ce69c3f2"}, + {file = "aiohttp-3.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a4be88807283bd96ae7b8e401abde4ca0bab597ba73b5e9a2d98f36d451e9aac"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01c98041f90927c2cbd72c22a164bb816fa3010a047d264969cf82e1d4bcf8d1"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54e36c67e1a9273ecafab18d6693da0fb5ac48fd48417e4548ac24a918c20998"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7de3ddb6f424af54535424082a1b5d1ae8caf8256ebd445be68c31c662354720"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7dd9c7db94b4692b827ce51dcee597d61a0e4f4661162424faf65106775b40e7"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e57e21e1167705f8482ca29cc5d02702208d8bf4aff58f766d94bcd6ead838cd"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a1a50e59b720060c29e2951fd9f13c01e1ea9492e5a527b92cfe04dd64453c16"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:686c87782481fda5ee6ba572d912a5c26d9f98cc5c243ebd03f95222af3f1b0f"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:dafb4abb257c0ed56dc36f4e928a7341b34b1379bd87e5a15ce5d883c2c90574"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:494a6f77560e02bd7d1ab579fdf8192390567fc96a603f21370f6e63690b7f3d"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6fe8503b1b917508cc68bf44dae28823ac05e9f091021e0c41f806ebbb23f92f"}, + {file = 
"aiohttp-3.10.2-cp311-cp311-win32.whl", hash = "sha256:4ddb43d06ce786221c0dfd3c91b4892c318eaa36b903f7c4278e7e2fa0dd5102"}, + {file = "aiohttp-3.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:ca2f5abcb0a9a47e56bac173c01e9f6c6e7f27534d91451c5f22e6a35a5a2093"}, + {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:14eb6b17f6246959fb0b035d4f4ae52caa870c4edfb6170aad14c0de5bfbf478"}, + {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:465e445ec348d4e4bd349edd8b22db75f025da9d7b6dc1369c48e7935b85581e"}, + {file = "aiohttp-3.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:341f8ece0276a828d95b70cd265d20e257f5132b46bf77d759d7f4e0443f2906"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c01fbb87b5426381cd9418b3ddcf4fc107e296fa2d3446c18ce6c76642f340a3"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c474af073e1a6763e1c5522bbb2d85ff8318197e4c6c919b8d7886e16213345"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d9076810a5621236e29b2204e67a68e1fe317c8727ee4c9abbfbb1083b442c38"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8f515d6859e673940e08de3922b9c4a2249653b0ac181169313bd6e4b1978ac"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:655e583afc639bef06f3b2446972c1726007a21003cd0ef57116a123e44601bc"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8da9449a575133828cc99985536552ea2dcd690e848f9d41b48d8853a149a959"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:19073d57d0feb1865d12361e2a1f5a49cb764bf81a4024a3b608ab521568093a"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c8e98e1845805f184d91fda6f9ab93d7c7b0dddf1c07e0255924bfdb151a8d05"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:377220a5efde6f9497c5b74649b8c261d3cce8a84cb661be2ed8099a2196400a"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92f7f4a4dc9cdb5980973a74d43cdbb16286dacf8d1896b6c3023b8ba8436f8e"}, + {file = "aiohttp-3.10.2-cp312-cp312-win32.whl", hash = "sha256:9bb2834a6f11d65374ce97d366d6311a9155ef92c4f0cee543b2155d06dc921f"}, + {file = "aiohttp-3.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:518dc3cb37365255708283d1c1c54485bbacccd84f0a0fb87ed8917ba45eda5b"}, + {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:7f98e70bbbf693086efe4b86d381efad8edac040b8ad02821453083d15ec315f"}, + {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9f6f0b252a009e98fe84028a4ec48396a948e7a65b8be06ccfc6ef68cf1f614d"}, + {file = "aiohttp-3.10.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9360e3ffc7b23565600e729e8c639c3c50d5520e05fdf94aa2bd859eef12c407"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3988044d1635c7821dd44f0edfbe47e9875427464e59d548aece447f8c22800a"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:30a9d59da1543a6f1478c3436fd49ec59be3868bca561a33778b4391005e499d"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9f49bdb94809ac56e09a310a62f33e5f22973d6fd351aac72a39cd551e98194"}, + {file = 
"aiohttp-3.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfd2dca3f11c365d6857a07e7d12985afc59798458a2fdb2ffa4a0332a3fd43"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c1508ec97b2cd3e120bfe309a4ff8e852e8a7460f1ef1de00c2c0ed01e33c"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:49904f38667c44c041a0b44c474b3ae36948d16a0398a8f8cd84e2bb3c42a069"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:352f3a4e5f11f3241a49b6a48bc5b935fabc35d1165fa0d87f3ca99c1fcca98b"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:fc61f39b534c5d5903490478a0dd349df397d2284a939aa3cbaa2fb7a19b8397"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:ad2274e707be37420d0b6c3d26a8115295fe9d8e6e530fa6a42487a8ca3ad052"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c836bf3c7512100219fe1123743fd8dd9a2b50dd7cfb0c3bb10d041309acab4b"}, + {file = "aiohttp-3.10.2-cp38-cp38-win32.whl", hash = "sha256:53e8898adda402be03ff164b0878abe2d884e3ea03a4701e6ad55399d84b92dc"}, + {file = "aiohttp-3.10.2-cp38-cp38-win_amd64.whl", hash = "sha256:7cc8f65f5b22304693de05a245b6736b14cb5bc9c8a03da6e2ae9ef15f8b458f"}, + {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9dfc906d656e14004c5bc672399c1cccc10db38df2b62a13fb2b6e165a81c316"}, + {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:91b10208b222ddf655c3a3d5b727879d7163db12b634492df41a9182a76edaae"}, + {file = "aiohttp-3.10.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fd16b5e1a7bdd14668cd6bde60a2a29b49147a535c74f50d8177d11b38433a7"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2bfdda4971bd79201f59adbad24ec2728875237e1c83bba5221284dbbf57bda"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69d73f869cf29e8a373127fc378014e2b17bcfbe8d89134bc6fb06a2f67f3cb3"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df59f8486507c421c0620a2c3dce81fbf1d54018dc20ff4fecdb2c106d6e6abc"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df930015db36b460aa9badbf35eccbc383f00d52d4b6f3de2ccb57d064a6ade"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:562b1153ab7f766ee6b8b357ec777a302770ad017cf18505d34f1c088fccc448"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d984db6d855de58e0fde1ef908d48fe9a634cadb3cf715962722b4da1c40619d"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:14dc3fcb0d877911d775d511eb617a486a8c48afca0a887276e63db04d3ee920"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b52a27a5c97275e254704e1049f4b96a81e67d6205f52fa37a4777d55b0e98ef"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:cd33d9de8cfd006a0d0fe85f49b4183c57e91d18ffb7e9004ce855e81928f704"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1238fc979160bc03a92fff9ad021375ff1c8799c6aacb0d8ea1b357ea40932bb"}, + {file = "aiohttp-3.10.2-cp39-cp39-win32.whl", hash = "sha256:e2f43d238eae4f0b04f58d4c0df4615697d4ca3e9f9b1963d49555a94f0f5a04"}, + {file = 
"aiohttp-3.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:947847f07a8f81d7b39b2d0202fd73e61962ebe17ac2d8566f260679e467da7b"}, + {file = "aiohttp-3.10.2.tar.gz", hash = "sha256:4d1f694b5d6e459352e5e925a42e05bac66655bfde44d81c59992463d2897014"}, ] [package.dependencies] +aiohappyeyeballs = ">=2.3.0" aiosignal = ">=1.1.2" async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""} attrs = ">=17.3.0" @@ -94,7 +106,7 @@ multidict = ">=4.5,<7.0" yarl = ">=1.0,<2.0" [package.extras] -speedups = ["Brotli", "aiodns", "brotlicffi"] +speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] [[package]] name = "aiopg" @@ -158,6 +170,50 @@ files = [ attrs = ">=16.0.0" pluggy = ">=0.4.0" +[[package]] +name = "annotated-types" +version = "0.6.0" +description = "Reusable constraint types to use with typing.Annotated" +optional = false +python-versions = ">=3.8" +files = [ + {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"}, + {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"}, +] + +[[package]] +name = "antlr4-python3-runtime" +version = "4.13.1" +description = "ANTLR 4.13.1 runtime for Python 3" +optional = false +python-versions = "*" +files = [ + {file = "antlr4-python3-runtime-4.13.1.tar.gz", hash = "sha256:3cd282f5ea7cfb841537fe01f143350fdb1c0b1ce7981443a2fa8513fddb6d1a"}, + {file = "antlr4_python3_runtime-4.13.1-py3-none-any.whl", hash = "sha256:78ec57aad12c97ac039ca27403ad61cb98aaec8a3f9bb8144f889aa0fa28b943"}, +] + +[[package]] +name = "anyio" +version = "4.3.0" +description = "High level compatibility layer for multiple asynchronous event loop implementations" +optional = false +python-versions = ">=3.8" +files = [ + {file = "anyio-4.3.0-py3-none-any.whl", hash = "sha256:048e05d0f6caeed70d731f3db756d35dcc1f35747c8c403364a8332c630441b8"}, + {file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"}, +] + +[package.dependencies] +exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} +idna = ">=2.8" +sniffio = ">=1.1" +typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""} + +[package.extras] +doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] +trio = ["trio (>=0.23)"] + [[package]] name = "async-timeout" version = "4.0.3" @@ -245,22 +301,23 @@ tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy" [[package]] name = "aws-sam-translator" -version = "1.48.0" +version = "1.88.0" description = "AWS SAM Translator is a library that transform SAM templates into AWS CloudFormation templates" optional = false -python-versions = ">=3.7, <=4.0, !=4.0" +python-versions = "!=4.0,<=4.0,>=3.8" files = [ - {file = "aws-sam-translator-1.48.0.tar.gz", hash = "sha256:7171037323dfa30f8f73e9bccb9210e4c384a585e087219a9518a5204f0a2c44"}, - {file = "aws_sam_translator-1.48.0-py2-none-any.whl", hash = "sha256:be18dfa3dfe7ab291d281667c5f73ac62dbe6bfe86df7d122e4258b906b736f0"}, - {file = "aws_sam_translator-1.48.0-py3-none-any.whl", hash = "sha256:ca4f8f9910d7713aeaba59346775bfb3198f6acb47c6704572f9bd3fc0fb5bf0"}, + {file = "aws_sam_translator-1.88.0-py3-none-any.whl", hash = 
"sha256:aa93d498d8de3fb3d485c316155b1628144b823bbc176099a20de06df666fcac"}, + {file = "aws_sam_translator-1.88.0.tar.gz", hash = "sha256:e77c65f3488566122277accd44a0f1ec018e37403e0d5fe25120d96e537e91a7"}, ] [package.dependencies] boto3 = ">=1.19.5,<2.dev0" -jsonschema = ">=3.2,<4.0" +jsonschema = ">=3.2,<5" +pydantic = ">=1.8,<3" +typing-extensions = ">=4.4" [package.extras] -dev = ["black (==20.8b1)", "boto3 (>=1.23,<2)", "click (>=7.1,<8.0)", "coverage (>=5.3,<6.0)", "dateparser (>=0.7,<1.0)", "docopt (>=0.6.2,<0.7.0)", "flake8 (>=3.8.4,<3.9.0)", "parameterized (>=0.7.4,<0.8.0)", "pylint (>=2.9.0,<2.10.0)", "pytest (>=6.2.5,<6.3.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-env (>=0.6.2,<0.7.0)", "pytest-xdist (>=2.5,<3.0)", "pyyaml (>=5.4,<6.0)", "requests (>=2.24.0,<2.25.0)", "tenacity (>=7.0.0,<7.1.0)", "tox (>=3.24,<4.0)"] +dev = ["black (==24.3.0)", "boto3 (>=1.23,<2)", "boto3-stubs[appconfig,serverlessrepo] (>=1.19.5,<2.dev0)", "coverage (>=5.3,<8)", "dateparser (>=1.1,<2.0)", "mypy (>=1.3.0,<1.4.0)", "parameterized (>=0.7,<1.0)", "pytest (>=6.2,<8)", "pytest-cov (>=2.10,<5)", "pytest-env (>=0.6,<1)", "pytest-rerunfailures (>=9.1,<12)", "pytest-xdist (>=2.5,<4)", "pyyaml (>=6.0,<7.0)", "requests (>=2.28,<3.0)", "ruamel.yaml (==0.17.21)", "ruff (>=0.1.0,<0.2.0)", "tenacity (>=8.0,<9.0)", "types-PyYAML (>=6.0,<7.0)", "types-jsonschema (>=3.2,<4.0)"] [[package]] name = "aws-xray-sdk" @@ -689,13 +746,13 @@ typing-extensions = ">=4.1.0" [[package]] name = "certifi" -version = "2023.7.22" +version = "2024.7.4" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"}, - {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"}, + {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, + {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, ] [[package]] @@ -776,24 +833,26 @@ pycparser = "*" [[package]] name = "cfn-lint" -version = "0.61.3" +version = "0.87.1" description = "Checks CloudFormation templates for practices and behaviour that could potentially be improved" optional = false -python-versions = ">=3.6, <=4.0, !=4.0" +python-versions = "!=4.0,<=4.0,>=3.8" files = [ - {file = "cfn-lint-0.61.3.tar.gz", hash = "sha256:3806e010d77901f5e935496df690c10e39676434a738fce1a1161cf9c7bd36a2"}, - {file = "cfn_lint-0.61.3-py3-none-any.whl", hash = "sha256:8e9522fad0c7c98b31ecbdd4724f8d8a5787457cc0f71e62ae0d11104d6e52ab"}, + {file = "cfn_lint-0.87.1-py3-none-any.whl", hash = "sha256:d450f450635fc223b6f66880ccac52a5fd1a52966fa1705f1ba52b88dfed3071"}, + {file = "cfn_lint-0.87.1.tar.gz", hash = "sha256:b3ce9d3e5e0eadcea5d584c8ccaa00bf2a990a36a64d7ffd8683bc60b7e4f06f"}, ] [package.dependencies] -aws-sam-translator = ">=1.47.0" +aws-sam-translator = ">=1.87.0" jschema-to-python = ">=1.2.3,<1.3.0" jsonpatch = "*" -jsonschema = ">=3.0,<4.0" +jsonschema = ">=3.0,<5" junit-xml = ">=1.9,<2.0" -networkx = ">=2.4,<3.0" +networkx = ">=2.4,<4" pyyaml = ">5.4" +regex = ">=2021.7.1" sarif-om = ">=1.0.4,<1.1.0" +sympy = ">=1.0.0" [[package]] name = "charset-normalizer" @@ -823,6 +882,96 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} +[[package]] +name = 
"clickhouse-connect" +version = "0.7.17" +description = "ClickHouse Database Core Driver for Python, Pandas, and Superset" +optional = false +python-versions = "~=3.8" +files = [ + {file = "clickhouse-connect-0.7.17.tar.gz", hash = "sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66209e4634f457604c263bea176336079d26c284e251e68a8435b0b80c1a25ff"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4d86c5a561a2a99321c8b4af22257461b8e67142f34cfea6e70f39b45b1f406"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d200c9afa2725a96f9f3718221f641276b80c11bf504d8a2fbaafb5a05b2f0d3"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:004d867b1005445a46e6742db1054bf2a717a451372663b46e09b5e9e90a31e3"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4ef94a4a8e008882259151833c3c47cfbb9c8f08de0f100aaf3b95c366dcfb24"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ee732c3df50c8b07d16b5836ff85e6b84569922455c03837c3add5cf1388fe1f"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d9dbe1235465bb946e24b90b0ca5b8800b5d645acb2d7d6ee819448c3e2fd959"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-win32.whl", hash = "sha256:e5db0d68dfb63db0297d44dc91406bcfd7d333708d7cd55086c8550fbf870b78"}, + {file = "clickhouse_connect-0.7.17-cp310-cp310-win_amd64.whl", hash = "sha256:800750f568c097ea312887785025006d6098bffd8ed2dd6a57048fb3ced6d778"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4eb390623b3d15dc9cda78f5c68f83ef9ad11743797e70af8fabc384b015a73c"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:35f172ca950f218f63072024c81d5b4ff6e5399620c255506c321ccc7b17c9a5"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae7918f060f7576fc931c692e0122b1b07576fabd81444af22e1f8582300d200"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff2881b93c7a1afb9c99fb59ad5fd666850421325d0931e2b77f3f4ba872303d"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a4d9b4f97271addf66aadbaf7f154f19a0ad6c22026d575a995c55ebd8576db"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e431469b1ff2d5c3e4c406d55c6afdf7102f5d2524c2ceb5481b94ac24412aa3"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:2b6f80115176559f181a6b3ecad11aa3d70ef6014c3d2905b90fcef3f27d25c2"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d8ac694f40dfafc8a3cc877116b4bc73e8877ebf66d4d96ee092484ee4c0b481"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-win32.whl", hash = "sha256:78b7a3f6b0fad4eaf8afb5f9a2e855bde53e82ea5804960e9cf779538f4606a1"}, + {file = "clickhouse_connect-0.7.17-cp311-cp311-win_amd64.whl", hash = 
"sha256:efd390cc045334ecc3f2a9c18cc07c041d0288b145967805fdcab65abeefa75f"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9228334a17dc0a7842222f54ba5b89fc563532424aad4f66be799df70ab37e9f"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e432a42bb788bda77e88eda2774392a60fbbb5ee2a79cb2881d182d26c45fe49"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c85152ed2879965ee1fa2bd5e31fb27d281fd5f50d6e86a401efd95cd85b29ef"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29a126104aa5e11df570cbd89fca4988784084602ba77d17b2396b334c54fd75"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:882d8f9570549258e6eb6a97915fbf64ed29fe395d5e360866ea8d42c8283a35"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:06ebf99111171442f462fb8b357364c3e276da3e8f8557b2e8fee9eb55ab37d1"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e0cf6f99b2777b0d164bf8b65ec39104cdc0789a56bcb52d98289bbd6f5cc70e"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ee46c508fddfff3b7ac52326788e0c6dd8dfb416b6d7e02e5d30e8110749dac2"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-win32.whl", hash = "sha256:eb708b590a37d56b069a6088254ffa55d73b8cb65527339df81ef03fe67ffdec"}, + {file = "clickhouse_connect-0.7.17-cp312-cp312-win_amd64.whl", hash = "sha256:17f00dccddaeaf43733faa1fa21f7d24641454a73669fda862545ba7c88627f5"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ab5d4b37a6dcc39e94c63beac0f22d9dda914f5eb865d166c64cf04dfadb7d16"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32aa90387f45f34cbc5a984789ed4c12760a3c0056c190ab0123ceafc36b1002"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21277b6bdd6c8ff14170bfcd52125c5c39f442ec4bafbb643ad7d0ca915f0029"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca68d8b7dee3fb4e7229e06152f5b0faaccafb4c87d9c2d48fa5bd117a3cc1c0"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:841c56282102b2fba1e0b332bb1c7a0c50992fbc321746af8d3e0e6ca2450e8b"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8d7ffde5a4b95d8fe9ed38e08e504e497310e3d7a17691bd40bf65734648fdfc"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:055960086b6b92b6e44f5ba04c81c40c10b038588e4b3908b033c99f66125332"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:36491fec63ceb8503b6344c23477647030139f346b749dc5ee672c505939dbbe"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-win32.whl", hash = "sha256:8779a907e026db32e6bc0bc0c8d5de0e2e3afd166afc2d4adcc0603399af5539"}, + {file = "clickhouse_connect-0.7.17-cp38-cp38-win_amd64.whl", hash = "sha256:309854fa197885c6278438ddd032ab52e6fec56f162074e343c3635ca7266078"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8009f94550178dc971aeb4f8787ba7a5b473c22647490428b7229f540a51d2b"}, + {file = 
"clickhouse_connect-0.7.17-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:70f8422f407b13a404b3670fd097855abd5adaf890c710d6678d2b46ab61ac48"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:082783eb1e8baf7b3465dd045132dc5cb5a91432c899dc4e19891c5f782d8d23"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c30aad2a9c7584c4ee19e646a087b3bbd2d4daab3d88a2afeeae1a7f6febf9"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc8e245a9f4f0dce39f155e626405f60f1d3cf4d1e52dd2c793ea6b603ca111b"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:802372cb8a69c9ffdf4260e9f01616c8601ba531825ed6f08834827e0b880cd1"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:193a60271a3b105cdbde96fb20b40eab8a50fca3bb1f397546f7a18b53d9aa9c"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:59d58932916792fdbd09cb961a245a0c2d87b07b8296f9138915b998f4522941"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-win32.whl", hash = "sha256:3cfd0edabb589f640636a97ffc38d1b3d760faef208d44e50829cc1ad3f0d3e5"}, + {file = "clickhouse_connect-0.7.17-cp39-cp39-win_amd64.whl", hash = "sha256:5661b4629aac228481219abf2e149119af1a71d897f191665e182d9d192d7033"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7429d309109e7e4a70fd867d69fcfea9ddcb1a1e910caa6b0e2c3776b71f4613"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5ae619151006da84a0b1585a9bcc81be32459d8061aeb2e116bad5bbaa7d108"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec0c84a0880621cb2389656a89886ef3133f0b3f8dc016eee6f25bbb49ff6f70"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705464c23f821666b76f8f619cf2870225156276562756b3933aaa24708e0ff8"}, + {file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1822016f4b769e89264fe26cefe0bc5e50e4c3ca0747d89bb52d57dc4f1e5ffb"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6c92b0c342c1fbfa666010e8175e05026dc570a7ef91d8fa81ce503180f318aa"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2e106536540e906c3c866f8615fcf870a9a77c1bfab9ef4b042febfd2fdb953"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bac9a32e62384b4341ba51a451084eb3b00c6e59aaac1499145dd8b897cb585c"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0feed93b9912b7862a8c41be1febcd44b68a824a5c1059b19d5c567afdaa6273"}, + {file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2e2dd6db52e799f065fd565143fde5a872cfe903de1bee7775bc3a349856a790"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed13add5d579a5960155f3000420544368501c9703d2fb94f103b4a6126081f6"}, + {file = 
"clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c257a23ed3bf1858593fb03927d9d073fbbdfa24dc2afee537c3314bd66b4e24"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d47866f64cbdc2d5cc4f8a7a8c49e3ee90c9e487091b9eda7c3a3576418e1cbe"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b850e2f17e0a0b5a37d996d3fb728050227489d64d271d678d166abea94f26e"}, + {file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:349682288987dc84ac7695f7cd6b510be8d0ec0eee7c1b72dbf2146b4e9efdb8"}, +] + +[package.dependencies] +certifi = "*" +lz4 = "*" +pytz = "*" +urllib3 = ">=1.26" +zstandard = "*" + +[package.extras] +arrow = ["pyarrow"] +numpy = ["numpy"] +orjson = ["orjson"] +pandas = ["pandas"] +sqlalchemy = ["sqlalchemy (>1.3.21,<2.0)"] +tzlocal = ["tzlocal (>=4.0)"] + [[package]] name = "colorama" version = "0.4.5" @@ -836,47 +985,51 @@ files = [ [[package]] name = "cryptography" -version = "41.0.6" +version = "43.0.1" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." optional = false python-versions = ">=3.7" files = [ - {file = "cryptography-41.0.6-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:0f27acb55a4e77b9be8d550d762b0513ef3fc658cd3eb15110ebbcbd626db12c"}, - {file = "cryptography-41.0.6-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:ae236bb8760c1e55b7a39b6d4d32d2279bc6c7c8500b7d5a13b6fb9fc97be35b"}, - {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afda76d84b053923c27ede5edc1ed7d53e3c9f475ebaf63c68e69f1403c405a8"}, - {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da46e2b5df770070412c46f87bac0849b8d685c5f2679771de277a422c7d0b86"}, - {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:ff369dd19e8fe0528b02e8df9f2aeb2479f89b1270d90f96a63500afe9af5cae"}, - {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:b648fe2a45e426aaee684ddca2632f62ec4613ef362f4d681a9a6283d10e079d"}, - {file = "cryptography-41.0.6-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5daeb18e7886a358064a68dbcaf441c036cbdb7da52ae744e7b9207b04d3908c"}, - {file = "cryptography-41.0.6-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:068bc551698c234742c40049e46840843f3d98ad7ce265fd2bd4ec0d11306596"}, - {file = "cryptography-41.0.6-cp37-abi3-win32.whl", hash = "sha256:2132d5865eea673fe6712c2ed5fb4fa49dba10768bb4cc798345748380ee3660"}, - {file = "cryptography-41.0.6-cp37-abi3-win_amd64.whl", hash = "sha256:48783b7e2bef51224020efb61b42704207dde583d7e371ef8fc2a5fb6c0aabc7"}, - {file = "cryptography-41.0.6-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:8efb2af8d4ba9dbc9c9dd8f04d19a7abb5b49eab1f3694e7b5a16a5fc2856f5c"}, - {file = "cryptography-41.0.6-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5a550dc7a3b50b116323e3d376241829fd326ac47bc195e04eb33a8170902a9"}, - {file = "cryptography-41.0.6-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:85abd057699b98fce40b41737afb234fef05c67e116f6f3650782c10862c43da"}, - {file = "cryptography-41.0.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f39812f70fc5c71a15aa3c97b2bbe213c3f2a460b79bd21c40d033bb34a9bf36"}, - {file = 
"cryptography-41.0.6-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:742ae5e9a2310e9dade7932f9576606836ed174da3c7d26bc3d3ab4bd49b9f65"}, - {file = "cryptography-41.0.6-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:35f3f288e83c3f6f10752467c48919a7a94b7d88cc00b0668372a0d2ad4f8ead"}, - {file = "cryptography-41.0.6-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4d03186af98b1c01a4eda396b137f29e4e3fb0173e30f885e27acec8823c1b09"}, - {file = "cryptography-41.0.6-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b27a7fd4229abef715e064269d98a7e2909ebf92eb6912a9603c7e14c181928c"}, - {file = "cryptography-41.0.6-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:398ae1fc711b5eb78e977daa3cbf47cec20f2c08c5da129b7a296055fbb22aed"}, - {file = "cryptography-41.0.6-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7e00fb556bda398b99b0da289ce7053639d33b572847181d6483ad89835115f6"}, - {file = "cryptography-41.0.6-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:60e746b11b937911dc70d164060d28d273e31853bb359e2b2033c9e93e6f3c43"}, - {file = "cryptography-41.0.6-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3288acccef021e3c3c10d58933f44e8602cf04dba96d9796d70d537bb2f4bbc4"}, - {file = "cryptography-41.0.6.tar.gz", hash = "sha256:422e3e31d63743855e43e5a6fcc8b4acab860f560f9321b0ee6269cc7ed70cc3"}, + {file = "cryptography-43.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d"}, + {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062"}, + {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68aaecc4178e90719e95298515979814bda0cbada1256a4485414860bd7ab962"}, + {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:de41fd81a41e53267cb020bb3a7212861da53a7d39f863585d13ea11049cf277"}, + {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f98bf604c82c416bc829e490c700ca1553eafdf2912a91e23a79d97d9801372a"}, + {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:61ec41068b7b74268fa86e3e9e12b9f0c21fcf65434571dbb13d954bceb08042"}, + {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:014f58110f53237ace6a408b5beb6c427b64e084eb451ef25a28308270086494"}, + {file = "cryptography-43.0.1-cp37-abi3-win32.whl", hash = "sha256:2bd51274dcd59f09dd952afb696bf9c61a7a49dfc764c04dd33ef7a6b502a1e2"}, + {file = "cryptography-43.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:666ae11966643886c2987b3b721899d250855718d6d9ce41b521252a17985f4d"}, + {file = "cryptography-43.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:ac119bb76b9faa00f48128b7f5679e1d8d437365c5d26f1c2c3f0da4ce1b553d"}, + {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bbcce1a551e262dfbafb6e6252f1ae36a248e615ca44ba302df077a846a8806"}, + {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58d4e9129985185a06d849aa6df265bdd5a74ca6e1b736a77959b498e0505b85"}, + {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d03a475165f3134f773d1388aeb19c2d25ba88b6a9733c5c590b9ff7bbfa2e0c"}, + {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:511f4273808ab590912a93ddb4e3914dfd8a388fed883361b02dea3791f292e1"}, + 
{file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:80eda8b3e173f0f247f711eef62be51b599b5d425c429b5d4ca6a05e9e856baa"}, + {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38926c50cff6f533f8a2dae3d7f19541432610d114a70808f0926d5aaa7121e4"}, + {file = "cryptography-43.0.1-cp39-abi3-win32.whl", hash = "sha256:a575913fb06e05e6b4b814d7f7468c2c660e8bb16d8d5a1faf9b33ccc569dd47"}, + {file = "cryptography-43.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:d75601ad10b059ec832e78823b348bfa1a59f6b8d545db3a24fd44362a1564cb"}, + {file = "cryptography-43.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ea25acb556320250756e53f9e20a4177515f012c9eaea17eb7587a8c4d8ae034"}, + {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c1332724be35d23a854994ff0b66530119500b6053d0bd3363265f7e5e77288d"}, + {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fba1007b3ef89946dbbb515aeeb41e30203b004f0b4b00e5e16078b518563289"}, + {file = "cryptography-43.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5b43d1ea6b378b54a1dc99dd8a2b5be47658fe9a7ce0a58ff0b55f4b43ef2b84"}, + {file = "cryptography-43.0.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:88cce104c36870d70c49c7c8fd22885875d950d9ee6ab54df2745f83ba0dc365"}, + {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:9d3cdb25fa98afdd3d0892d132b8d7139e2c087da1712041f6b762e4f807cc96"}, + {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e710bf40870f4db63c3d7d929aa9e09e4e7ee219e703f949ec4073b4294f6172"}, + {file = "cryptography-43.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7c05650fe8023c5ed0d46793d4b7d7e6cd9c04e68eabe5b0aeea836e37bdcec2"}, + {file = "cryptography-43.0.1.tar.gz", hash = "sha256:203e92a75716d8cfb491dc47c79e17d0d9207ccffcbcb35f598fbe463ae3444d"}, ] [package.dependencies] -cffi = ">=1.12" +cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""} [package.extras] docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] -docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] +docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"] nox = ["nox"] -pep8test = ["black", "check-sdist", "mypy", "ruff"] +pep8test = ["check-sdist", "click", "mypy", "ruff"] sdist = ["build"] ssh = ["bcrypt (>=3.1.5)"] -test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] +test = ["certifi", "cryptography-vectors (==43.0.1)", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] test-randomorder = ["pytest-randomly"] [[package]] @@ -900,24 +1053,6 @@ websocket-client = ">=0.32.0" ssh = ["paramiko (>=2.4.2)"] tls = ["cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=17.5.0)"] -[[package]] -name = "ecdsa" -version = "0.18.0" -description = "ECDSA cryptographic signature library (pure python)" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" -files = [ - {file = "ecdsa-0.18.0-py2.py3-none-any.whl", hash = "sha256:80600258e7ed2f16b9aa1d7c295bd70194109ad5a30fdee0eaeefef1d4c559dd"}, - {file = "ecdsa-0.18.0.tar.gz", hash = "sha256:190348041559e21b22a1d65cee485282ca11a6f81d503fddb84d5017e9ed1e49"}, -] - -[package.dependencies] -six = ">=1.9.0" - -[package.extras] -gmpy = ["gmpy"] -gmpy2 = ["gmpy2"] - [[package]] name = "exceptiongroup" version = "1.1.1" @@ -970,18 
+1105,17 @@ dotenv = ["python-dotenv"] [[package]] name = "flask-cors" -version = "3.0.10" +version = "5.0.0" description = "A Flask extension adding a decorator for CORS support" optional = false python-versions = "*" files = [ - {file = "Flask-Cors-3.0.10.tar.gz", hash = "sha256:b60839393f3b84a0f3746f6cdca56c1ad7426aa738b70d6c61375857823181de"}, - {file = "Flask_Cors-3.0.10-py2.py3-none-any.whl", hash = "sha256:74efc975af1194fc7891ff5cd85b0f7478be4f7f59fe158102e91abb72bb4438"}, + {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"}, + {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"}, ] [package.dependencies] Flask = ">=0.9" -Six = "*" [[package]] name = "frozenlist" @@ -1064,15 +1198,109 @@ files = [ {file = "graphql_core-3.2.1-py3-none-any.whl", hash = "sha256:f83c658e4968998eed1923a2e3e3eddd347e005ac0315fbb7ca4d70ea9156323"}, ] +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = false +python-versions = ">=3.7" +files = [ + {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, + {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, +] + +[[package]] +name = "h2" +version = "4.1.0" +description = "HTTP/2 State-Machine based protocol implementation" +optional = false +python-versions = ">=3.6.1" +files = [ + {file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"}, + {file = "h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"}, +] + +[package.dependencies] +hpack = ">=4.0,<5" +hyperframe = ">=6.0,<7" + +[[package]] +name = "hpack" +version = "4.0.0" +description = "Pure-Python HPACK header compression" +optional = false +python-versions = ">=3.6.1" +files = [ + {file = "hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c"}, + {file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"}, +] + +[[package]] +name = "httpcore" +version = "1.0.3" +description = "A minimal low-level HTTP client." +optional = false +python-versions = ">=3.8" +files = [ + {file = "httpcore-1.0.3-py3-none-any.whl", hash = "sha256:9a6a501c3099307d9fd76ac244e08503427679b1e81ceb1d922485e2f2462ad2"}, + {file = "httpcore-1.0.3.tar.gz", hash = "sha256:5c0f9546ad17dac4d0772b0808856eb616eb8b48ce94f49ed819fd6982a8a544"}, +] + +[package.dependencies] +certifi = "*" +h11 = ">=0.13,<0.15" + +[package.extras] +asyncio = ["anyio (>=4.0,<5.0)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +trio = ["trio (>=0.22.0,<0.24.0)"] + +[[package]] +name = "httpx" +version = "0.26.0" +description = "The next generation HTTP client." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "httpx-0.26.0-py3-none-any.whl", hash = "sha256:8915f5a3627c4d47b73e8202457cb28f1266982d1159bd5779d86a80c0eab1cd"}, + {file = "httpx-0.26.0.tar.gz", hash = "sha256:451b55c30d5185ea6b23c2c793abf9bb237d2a7dfb901ced6ff69ad37ec1dfaf"}, +] + +[package.dependencies] +anyio = "*" +certifi = "*" +h2 = {version = ">=3,<5", optional = true, markers = "extra == \"http2\""} +httpcore = "==1.*" +idna = "*" +sniffio = "*" + +[package.extras] +brotli = ["brotli", "brotlicffi"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] + +[[package]] +name = "hyperframe" +version = "6.0.1" +description = "HTTP/2 framing layer for Python" +optional = false +python-versions = ">=3.6.1" +files = [ + {file = "hyperframe-6.0.1-py3-none-any.whl", hash = "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15"}, + {file = "hyperframe-6.0.1.tar.gz", hash = "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"}, +] + [[package]] name = "idna" -version = "3.3" +version = "3.7" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" files = [ - {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, - {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, + {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, + {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, ] [[package]] @@ -1118,13 +1346,13 @@ files = [ [[package]] name = "jinja2" -version = "3.1.3" +version = "3.1.4" description = "A very fast and expressive template engine." optional = false python-versions = ">=3.7" files = [ - {file = "Jinja2-3.1.3-py3-none-any.whl", hash = "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa"}, - {file = "Jinja2-3.1.3.tar.gz", hash = "sha256:ac8bd6544d4bb2c9792bf3a159e80bba8fda7f07e81bc3aed565432d5925ba90"}, + {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, + {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, ] [package.dependencies] @@ -1144,6 +1372,23 @@ files = [ {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] +[[package]] +name = "joserfc" +version = "0.9.0" +description = "The ultimate Python library for JOSE RFCs, including JWS, JWE, JWK, JWA, JWT" +optional = false +python-versions = ">=3.8" +files = [ + {file = "joserfc-0.9.0-py3-none-any.whl", hash = "sha256:4026bdbe2c196cd40574e916fa1e28874d99649412edaab0e373dec3077153fb"}, + {file = "joserfc-0.9.0.tar.gz", hash = "sha256:eebca7f587b1761ce43a98ffd5327f2b600b9aa5bb0a77b947687f503ad43bc0"}, +] + +[package.dependencies] +cryptography = "*" + +[package.extras] +drafts = ["pycryptodome"] + [[package]] name = "jschema-to-python" version = "1.2.3" @@ -1185,6 +1430,20 @@ files = [ [package.dependencies] jsonpointer = ">=1.9" +[[package]] +name = "jsonpath-ng" +version = "1.6.1" +description = "A final implementation of JSONPath for Python that aims to be standard compliant, including arithmetic and binary comparison operators and providing clear AST for metaprogramming." 
+optional = false +python-versions = "*" +files = [ + {file = "jsonpath-ng-1.6.1.tar.gz", hash = "sha256:086c37ba4917304850bd837aeab806670224d3f038fe2833ff593a672ef0a5fa"}, + {file = "jsonpath_ng-1.6.1-py3-none-any.whl", hash = "sha256:8f22cd8273d7772eea9aaa84d922e0841aa36fdb8a2c6b7f6c3791a16a9bc0be"}, +] + +[package.dependencies] +ply = "*" + [[package]] name = "jsonpickle" version = "2.2.0" @@ -1214,24 +1473,39 @@ files = [ [[package]] name = "jsonschema" -version = "3.2.0" +version = "4.17.3" description = "An implementation of JSON Schema validation for Python" optional = false -python-versions = "*" +python-versions = ">=3.7" files = [ - {file = "jsonschema-3.2.0-py2.py3-none-any.whl", hash = "sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163"}, - {file = "jsonschema-3.2.0.tar.gz", hash = "sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a"}, + {file = "jsonschema-4.17.3-py3-none-any.whl", hash = "sha256:a870ad254da1a8ca84b6a2905cac29d265f805acc57af304784962a2aa6508f6"}, + {file = "jsonschema-4.17.3.tar.gz", hash = "sha256:0f864437ab8b6076ba6707453ef8f98a6a0d512a80e93f8abdb676f737ecb60d"}, ] [package.dependencies] attrs = ">=17.4.0" -pyrsistent = ">=0.14.0" -setuptools = "*" -six = ">=1.11.0" +pyrsistent = ">=0.14.0,<0.17.0 || >0.17.0,<0.17.1 || >0.17.1,<0.17.2 || >0.17.2" [package.extras] -format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"] -format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] +format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"] + +[[package]] +name = "jsonschema-spec" +version = "0.1.6" +description = "JSONSchema Spec with object-oriented paths" +optional = false +python-versions = ">=3.7.0,<4.0.0" +files = [ + {file = "jsonschema_spec-0.1.6-py3-none-any.whl", hash = "sha256:f2206d18c89d1824c1f775ba14ed039743b41a9167bd2c5bdb774b66b3ca0bbf"}, + {file = "jsonschema_spec-0.1.6.tar.gz", hash = "sha256:90215863b56e212086641956b20127ccbf6d8a3a38343dad01d6a74d19482f76"}, +] + +[package.dependencies] +jsonschema = ">=4.0.0,<4.18.0" +pathable = ">=0.4.1,<0.5.0" +PyYAML = ">=5.1" +requests = ">=2.31.0,<3.0.0" [[package]] name = "junit-xml" @@ -1247,6 +1521,116 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "kafka-python" +version = "2.0.2" +description = "Pure Python client for Apache Kafka" +optional = false +python-versions = "*" +files = [ + {file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"}, + {file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"}, +] + +[package.extras] +crc32c = ["crc32c"] + +[[package]] +name = "lazy-object-proxy" +version = "1.10.0" +description = "A fast and thorough lazy object proxy." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "lazy-object-proxy-1.10.0.tar.gz", hash = "sha256:78247b6d45f43a52ef35c25b5581459e85117225408a4128a3daf8bf9648ac69"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:855e068b0358ab916454464a884779c7ffa312b8925c6f7401e952dcf3b89977"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab7004cf2e59f7c2e4345604a3e6ea0d92ac44e1c2375527d56492014e690c3"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc0d2fc424e54c70c4bc06787e4072c4f3b1aa2f897dfdc34ce1013cf3ceef05"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e2adb09778797da09d2b5ebdbceebf7dd32e2c96f79da9052b2e87b6ea495895"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b1f711e2c6dcd4edd372cf5dec5c5a30d23bba06ee012093267b3376c079ec83"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-win32.whl", hash = "sha256:76a095cfe6045c7d0ca77db9934e8f7b71b14645f0094ffcd842349ada5c5fb9"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:b4f87d4ed9064b2628da63830986c3d2dca7501e6018347798313fcf028e2fd4"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fec03caabbc6b59ea4a638bee5fce7117be8e99a4103d9d5ad77f15d6f81020c"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02c83f957782cbbe8136bee26416686a6ae998c7b6191711a04da776dc9e47d4"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:009e6bb1f1935a62889ddc8541514b6a9e1fcf302667dcb049a0be5c8f613e56"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75fc59fc450050b1b3c203c35020bc41bd2695ed692a392924c6ce180c6f1dc9"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:782e2c9b2aab1708ffb07d4bf377d12901d7a1d99e5e410d648d892f8967ab1f"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-win32.whl", hash = "sha256:edb45bb8278574710e68a6b021599a10ce730d156e5b254941754a9cc0b17d03"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:e271058822765ad5e3bca7f05f2ace0de58a3f4e62045a8c90a0dfd2f8ad8cc6"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e98c8af98d5707dcdecc9ab0863c0ea6e88545d42ca7c3feffb6b4d1e370c7ba"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:952c81d415b9b80ea261d2372d2a4a2332a3890c2b83e0535f263ddfe43f0d43"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80b39d3a151309efc8cc48675918891b865bdf742a8616a337cb0090791a0de9"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e221060b701e2aa2ea991542900dd13907a5c90fa80e199dbf5a03359019e7a3"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:92f09ff65ecff3108e56526f9e2481b8116c0b9e1425325e13245abfd79bdb1b"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-win32.whl", hash = "sha256:3ad54b9ddbe20ae9f7c1b29e52f123120772b06dbb18ec6be9101369d63a4074"}, + {file = 
"lazy_object_proxy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:127a789c75151db6af398b8972178afe6bda7d6f68730c057fbbc2e96b08d282"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4ed0518a14dd26092614412936920ad081a424bdcb54cc13349a8e2c6d106a"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ad9e6ed739285919aa9661a5bbed0aaf410aa60231373c5579c6b4801bd883c"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fc0a92c02fa1ca1e84fc60fa258458e5bf89d90a1ddaeb8ed9cc3147f417255"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0aefc7591920bbd360d57ea03c995cebc204b424524a5bd78406f6e1b8b2a5d8"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5faf03a7d8942bb4476e3b62fd0f4cf94eaf4618e304a19865abf89a35c0bbee"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-win32.whl", hash = "sha256:e333e2324307a7b5d86adfa835bb500ee70bfcd1447384a822e96495796b0ca4"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:cb73507defd385b7705c599a94474b1d5222a508e502553ef94114a143ec6696"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:366c32fe5355ef5fc8a232c5436f4cc66e9d3e8967c01fb2e6302fd6627e3d94"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2297f08f08a2bb0d32a4265e98a006643cd7233fb7983032bd61ac7a02956b3b"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18dd842b49456aaa9a7cf535b04ca4571a302ff72ed8740d06b5adcd41fe0757"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:217138197c170a2a74ca0e05bddcd5f1796c735c37d0eee33e43259b192aa424"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9a3a87cf1e133e5b1994144c12ca4aa3d9698517fe1e2ca82977781b16955658"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-win32.whl", hash = "sha256:30b339b2a743c5288405aa79a69e706a06e02958eab31859f7f3c04980853b70"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:a899b10e17743683b293a729d3a11f2f399e8a90c73b089e29f5d0fe3509f0dd"}, + {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"}, +] + +[[package]] +name = "lz4" +version = "4.3.3" +description = "LZ4 Bindings for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"}, + {file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"}, + {file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7"}, + {file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05"}, + {file = "lz4-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc"}, + {file = "lz4-4.3.3-cp310-cp310-win32.whl", hash = "sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6"}, + {file = "lz4-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2"}, + {file = "lz4-4.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6"}, + {file = "lz4-4.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61"}, + {file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7"}, + {file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563"}, + {file = "lz4-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21"}, + {file = "lz4-4.3.3-cp311-cp311-win32.whl", hash = "sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d"}, + {file = "lz4-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c"}, + {file = "lz4-4.3.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d"}, + {file = "lz4-4.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2"}, + {file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809"}, + {file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf"}, + {file = "lz4-4.3.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e"}, + {file = "lz4-4.3.3-cp312-cp312-win32.whl", hash = "sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1"}, + {file = "lz4-4.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f"}, + {file = "lz4-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394"}, + {file = "lz4-4.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0"}, + {file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd"}, + {file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775"}, + {file = "lz4-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604"}, + {file = "lz4-4.3.3-cp38-cp38-win32.whl", hash = "sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa"}, + {file = "lz4-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24"}, + {file = 
"lz4-4.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba"}, + {file = "lz4-4.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205"}, + {file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d"}, + {file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071"}, + {file = "lz4-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0"}, + {file = "lz4-4.3.3-cp39-cp39-win32.whl", hash = "sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2"}, + {file = "lz4-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807"}, + {file = "lz4-4.3.3.tar.gz", hash = "sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e"}, +] + +[package.extras] +docs = ["sphinx (>=1.6.0)", "sphinx-bootstrap-theme"] +flake8 = ["flake8"] +tests = ["psutil", "pytest (!=3.3.0)", "pytest-cov"] + [[package]] name = "markupsafe" version = "2.1.1" @@ -1298,64 +1682,80 @@ files = [ [[package]] name = "moto" -version = "4.1.2" +version = "5.0.6" description = "" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "moto-4.1.2-py2.py3-none-any.whl", hash = "sha256:1b361ece638c74a657325378a259276f368aafce2f8be84f8143e69fa93ce8ec"}, - {file = "moto-4.1.2.tar.gz", hash = "sha256:63431733d2a02c7bd652ad71ec1da442a0e0d580cbac5eeb50d440a2ce066eac"}, + {file = "moto-5.0.6-py2.py3-none-any.whl", hash = "sha256:ca1e22831a741733b581ff2ef4d6ae2e1c6db1eab97af1b78b86ca2c6e88c609"}, + {file = "moto-5.0.6.tar.gz", hash = "sha256:ad8b23f2b555ad694da8b2432a42b6d96beaaf67a4e7d932196a72193a2eee2c"}, ] [package.dependencies] +antlr4-python3-runtime = {version = "*", optional = true, markers = "extra == \"server\""} aws-xray-sdk = {version = ">=0.93,<0.96 || >0.96", optional = true, markers = "extra == \"server\""} boto3 = ">=1.9.201" -botocore = ">=1.12.201" +botocore = ">=1.14.0" cfn-lint = {version = ">=0.40.0", optional = true, markers = "extra == \"server\""} cryptography = ">=3.3.1" -docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""} -ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""} +docker = {version = ">=3.0.0", optional = true, markers = "extra == \"server\""} flask = {version = "<2.2.0 || >2.2.0,<2.2.1 || >2.2.1", optional = true, markers = "extra == \"server\""} flask-cors = {version = "*", optional = true, markers = "extra == \"server\""} graphql-core = {version = "*", optional = true, markers = "extra == \"server\""} Jinja2 = ">=2.10.1" +joserfc = {version = ">=0.9.0", optional = true, markers = "extra == \"server\""} jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""} -openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""} +jsonpath-ng = {version = "*", optional = true, markers = "extra == \"server\""} +openapi-spec-validator = {version = ">=0.5.0", optional = true, markers = "extra == \"server\""} +py-partiql-parser = {version = "0.5.4", optional = true, markers = "extra == \"server\""} pyparsing = {version = ">=3.0.7", optional = true, 
markers = "extra == \"server\""} python-dateutil = ">=2.1,<3.0.0" -python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""} PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} requests = ">=2.5" -responses = ">=0.13.0" +responses = ">=0.15.0" setuptools = {version = "*", optional = true, markers = "extra == \"server\""} -sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""} werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] -apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] -apigatewayv2 = ["PyYAML (>=5.1)"] +all = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +apigateway = ["PyYAML (>=5.1)", "joserfc (>=0.9.0)", "openapi-spec-validator (>=0.5.0)"] +apigatewayv2 = ["PyYAML (>=5.1)", "openapi-spec-validator (>=0.5.0)"] appsync = ["graphql-core"] -awslambda = ["docker (>=2.5.1)"] -batch = ["docker (>=2.5.1)"] -cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] -cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] -ds = ["sshpubkeys (>=3.1.0)"] -dynamodb = ["docker (>=2.5.1)"] -dynamodbstreams = ["docker (>=2.5.1)"] -ebs = ["sshpubkeys (>=3.1.0)"] -ec2 = ["sshpubkeys (>=3.1.0)"] -efs = ["sshpubkeys (>=3.1.0)"] -eks = ["sshpubkeys (>=3.1.0)"] +awslambda = ["docker (>=3.0.0)"] +batch = ["docker (>=3.0.0)"] +cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +cognitoidp = ["joserfc (>=0.9.0)"] +dynamodb = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"] +dynamodbstreams = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"] glue = ["pyparsing (>=3.0.7)"] iotdata = ["jsondiff (>=1.1.2)"] -route53resolver = ["sshpubkeys (>=3.1.0)"] -s3 = ["PyYAML (>=5.1)"] -server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +proxy = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] 
+resourcegroupstaggingapi = ["PyYAML (>=5.1)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)"] +s3 = ["PyYAML (>=5.1)", "py-partiql-parser (==0.5.4)"] +s3crc32c = ["PyYAML (>=5.1)", "crc32c", "py-partiql-parser (==0.5.4)"] +server = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] ssm = ["PyYAML (>=5.1)"] +stepfunctions = ["antlr4-python3-runtime", "jsonpath-ng"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] +[[package]] +name = "mpmath" +version = "1.3.0" +description = "Python library for arbitrary-precision floating-point arithmetic" +optional = false +python-versions = "*" +files = [ + {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, + {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, +] + +[package.extras] +develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] +docs = ["sphinx"] +gmpy = ["gmpy2 (>=2.1.0a4)"] +tests = ["pytest (>=4.6)"] + [[package]] name = "multidict" version = "6.0.4" @@ -1530,42 +1930,38 @@ test = ["codecov (>=2.1)", "pytest (>=7.1)", "pytest-cov (>=3.0)"] [[package]] name = "openapi-schema-validator" -version = "0.2.3" +version = "0.4.4" description = "OpenAPI schema validation for Python" optional = false python-versions = ">=3.7.0,<4.0.0" files = [ - {file = "openapi-schema-validator-0.2.3.tar.gz", hash = "sha256:2c64907728c3ef78e23711c8840a423f0b241588c9ed929855e4b2d1bb0cf5f2"}, - {file = "openapi_schema_validator-0.2.3-py3-none-any.whl", hash = "sha256:9bae709212a19222892cabcc60cafd903cbf4b220223f48583afa3c0e3cc6fc4"}, + {file = "openapi_schema_validator-0.4.4-py3-none-any.whl", hash = "sha256:79f37f38ef9fd5206b924ed7a6f382cea7b649b3b56383c47f1906082b7b9015"}, + {file = "openapi_schema_validator-0.4.4.tar.gz", hash = "sha256:c573e2be2c783abae56c5a1486ab716ca96e09d1c3eab56020d1dc680aa57bf8"}, ] [package.dependencies] -jsonschema = ">=3.0.0,<5.0.0" +jsonschema = ">=4.0.0,<4.18.0" +rfc3339-validator = "*" [package.extras] -isodate = ["isodate"] -rfc3339-validator = ["rfc3339-validator"] -strict-rfc3339 = ["strict-rfc3339"] +docs = ["sphinx (>=5.3.0,<6.0.0)", "sphinx-immaterial (>=0.11.0,<0.12.0)"] [[package]] name = "openapi-spec-validator" -version = "0.4.0" -description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3.0 spec validator" +version = "0.5.7" +description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3 spec validator" optional = false python-versions = ">=3.7.0,<4.0.0" files = [ - {file = "openapi-spec-validator-0.4.0.tar.gz", hash = "sha256:97f258850afc97b048f7c2653855e0f88fa66ac103c2be5077c7960aca2ad49a"}, - {file = "openapi_spec_validator-0.4.0-py3-none-any.whl", hash = "sha256:06900ac4d546a1df3642a779da0055be58869c598e3042a2fef067cfd99d04d0"}, + {file = "openapi_spec_validator-0.5.7-py3-none-any.whl", hash = "sha256:8712d2879db7692974ef89c47a3ebfc79436442921ec3a826ac0ce80cde8c549"}, + {file = "openapi_spec_validator-0.5.7.tar.gz", hash = "sha256:6c2d42180045a80fd6314de848b94310bdb0fa4949f4b099578b69f79d9fa5ac"}, ] [package.dependencies] -jsonschema = 
">=3.2.0,<5.0.0" -openapi-schema-validator = ">=0.2.0,<0.3.0" -PyYAML = ">=5.1" -setuptools = "*" - -[package.extras] -requests = ["requests"] +jsonschema = ">=4.0.0,<4.18.0" +jsonschema-spec = ">=0.1.1,<0.2.0" +lazy-object-proxy = ">=1.7.1,<2.0.0" +openapi-schema-validator = ">=0.4.2,<0.5.0" [[package]] name = "packaging" @@ -1578,6 +1974,17 @@ files = [ {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, ] +[[package]] +name = "pathable" +version = "0.4.3" +description = "Object-oriented paths" +optional = false +python-versions = ">=3.7.0,<4.0.0" +files = [ + {file = "pathable-0.4.3-py3-none-any.whl", hash = "sha256:cdd7b1f9d7d5c8b8d3315dbf5a86b2596053ae845f056f57d97c0eefff84da14"}, + {file = "pathable-0.4.3.tar.gz", hash = "sha256:5c869d315be50776cc8a993f3af43e0c60dc01506b399643f919034ebf4cdcab"}, +] + [[package]] name = "pbr" version = "5.9.0" @@ -1604,6 +2011,17 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "ply" +version = "3.11" +description = "Python Lex & Yacc" +optional = false +python-versions = "*" +files = [ + {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, + {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, +] + [[package]] name = "prometheus-client" version = "0.14.1" @@ -1716,16 +2134,19 @@ files = [ ] [[package]] -name = "pyasn1" -version = "0.4.8" -description = "ASN.1 types and codecs" +name = "py-partiql-parser" +version = "0.5.4" +description = "Pure Python PartiQL Parser" optional = false python-versions = "*" files = [ - {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, - {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, + {file = "py_partiql_parser-0.5.4-py2.py3-none-any.whl", hash = "sha256:3dc4295a47da9587681a96b35c6e151886fdbd0a4acbe0d97c4c68e5f689d315"}, + {file = "py_partiql_parser-0.5.4.tar.gz", hash = "sha256:72e043919538fa63edae72fb59afc7e3fd93adbde656718a7d2b4666f23dd114"}, ] +[package.extras] +dev = ["black (==22.6.0)", "flake8", "mypy", "pytest"] + [[package]] name = "pycparser" version = "2.21" @@ -1737,6 +2158,116 @@ files = [ {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] +[[package]] +name = "pydantic" +version = "2.7.1" +description = "Data validation using Python type hints" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydantic-2.7.1-py3-none-any.whl", hash = "sha256:e029badca45266732a9a79898a15ae2e8b14840b1eabbb25844be28f0b33f3d5"}, + {file = "pydantic-2.7.1.tar.gz", hash = "sha256:e9dbb5eada8abe4d9ae5f46b9939aead650cd2b68f249bb3a8139dbe125803cc"}, +] + +[package.dependencies] +annotated-types = ">=0.4.0" +pydantic-core = "2.18.2" +typing-extensions = ">=4.6.1" + +[package.extras] +email = ["email-validator (>=2.0.0)"] + +[[package]] +name = "pydantic-core" +version = "2.18.2" +description = "Core functionality for Pydantic validation and serialization" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydantic_core-2.18.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:9e08e867b306f525802df7cd16c44ff5ebbe747ff0ca6cf3fde7f36c05a59a81"}, + {file = "pydantic_core-2.18.2-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:f0a21cbaa69900cbe1a2e7cad2aa74ac3cf21b10c3efb0fa0b80305274c0e8a2"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0680b1f1f11fda801397de52c36ce38ef1c1dc841a0927a94f226dea29c3ae3d"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:95b9d5e72481d3780ba3442eac863eae92ae43a5f3adb5b4d0a1de89d42bb250"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4fcf5cd9c4b655ad666ca332b9a081112cd7a58a8b5a6ca7a3104bc950f2038"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b5155ff768083cb1d62f3e143b49a8a3432e6789a3abee8acd005c3c7af1c74"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:553ef617b6836fc7e4df130bb851e32fe357ce36336d897fd6646d6058d980af"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b89ed9eb7d616ef5714e5590e6cf7f23b02d0d539767d33561e3675d6f9e3857"}, + {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:75f7e9488238e920ab6204399ded280dc4c307d034f3924cd7f90a38b1829563"}, + {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ef26c9e94a8c04a1b2924149a9cb081836913818e55681722d7f29af88fe7b38"}, + {file = "pydantic_core-2.18.2-cp310-none-win32.whl", hash = "sha256:182245ff6b0039e82b6bb585ed55a64d7c81c560715d1bad0cbad6dfa07b4027"}, + {file = "pydantic_core-2.18.2-cp310-none-win_amd64.whl", hash = "sha256:e23ec367a948b6d812301afc1b13f8094ab7b2c280af66ef450efc357d2ae543"}, + {file = "pydantic_core-2.18.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:219da3f096d50a157f33645a1cf31c0ad1fe829a92181dd1311022f986e5fbe3"}, + {file = "pydantic_core-2.18.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cc1cfd88a64e012b74e94cd00bbe0f9c6df57049c97f02bb07d39e9c852e19a4"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05b7133a6e6aeb8df37d6f413f7705a37ab4031597f64ab56384c94d98fa0e90"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:224c421235f6102e8737032483f43c1a8cfb1d2f45740c44166219599358c2cd"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b14d82cdb934e99dda6d9d60dc84a24379820176cc4a0d123f88df319ae9c150"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2728b01246a3bba6de144f9e3115b532ee44bd6cf39795194fb75491824a1413"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:470b94480bb5ee929f5acba6995251ada5e059a5ef3e0dfc63cca287283ebfa6"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:997abc4df705d1295a42f95b4eec4950a37ad8ae46d913caeee117b6b198811c"}, + {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75250dbc5290e3f1a0f4618db35e51a165186f9034eff158f3d490b3fed9f8a0"}, + {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4456f2dca97c425231d7315737d45239b2b51a50dc2b6f0c2bb181fce6207664"}, + {file = "pydantic_core-2.18.2-cp311-none-win32.whl", hash = "sha256:269322dcc3d8bdb69f054681edff86276b2ff972447863cf34c8b860f5188e2e"}, + {file = 
"pydantic_core-2.18.2-cp311-none-win_amd64.whl", hash = "sha256:800d60565aec896f25bc3cfa56d2277d52d5182af08162f7954f938c06dc4ee3"}, + {file = "pydantic_core-2.18.2-cp311-none-win_arm64.whl", hash = "sha256:1404c69d6a676245199767ba4f633cce5f4ad4181f9d0ccb0577e1f66cf4c46d"}, + {file = "pydantic_core-2.18.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:fb2bd7be70c0fe4dfd32c951bc813d9fe6ebcbfdd15a07527796c8204bd36242"}, + {file = "pydantic_core-2.18.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6132dd3bd52838acddca05a72aafb6eab6536aa145e923bb50f45e78b7251043"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d904828195733c183d20a54230c0df0eb46ec746ea1a666730787353e87182"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c9bd70772c720142be1020eac55f8143a34ec9f82d75a8e7a07852023e46617f"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b8ed04b3582771764538f7ee7001b02e1170223cf9b75dff0bc698fadb00cf3"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e6dac87ddb34aaec85f873d737e9d06a3555a1cc1a8e0c44b7f8d5daeb89d86f"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ca4ae5a27ad7a4ee5170aebce1574b375de390bc01284f87b18d43a3984df72"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:886eec03591b7cf058467a70a87733b35f44707bd86cf64a615584fd72488b7c"}, + {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ca7b0c1f1c983e064caa85f3792dd2fe3526b3505378874afa84baf662e12241"}, + {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b4356d3538c3649337df4074e81b85f0616b79731fe22dd11b99499b2ebbdf3"}, + {file = "pydantic_core-2.18.2-cp312-none-win32.whl", hash = "sha256:8b172601454f2d7701121bbec3425dd71efcb787a027edf49724c9cefc14c038"}, + {file = "pydantic_core-2.18.2-cp312-none-win_amd64.whl", hash = "sha256:b1bd7e47b1558ea872bd16c8502c414f9e90dcf12f1395129d7bb42a09a95438"}, + {file = "pydantic_core-2.18.2-cp312-none-win_arm64.whl", hash = "sha256:98758d627ff397e752bc339272c14c98199c613f922d4a384ddc07526c86a2ec"}, + {file = "pydantic_core-2.18.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:9fdad8e35f278b2c3eb77cbdc5c0a49dada440657bf738d6905ce106dc1de439"}, + {file = "pydantic_core-2.18.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1d90c3265ae107f91a4f279f4d6f6f1d4907ac76c6868b27dc7fb33688cfb347"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:390193c770399861d8df9670fb0d1874f330c79caaca4642332df7c682bf6b91"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:82d5d4d78e4448683cb467897fe24e2b74bb7b973a541ea1dcfec1d3cbce39fb"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4774f3184d2ef3e14e8693194f661dea5a4d6ca4e3dc8e39786d33a94865cefd"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4d938ec0adf5167cb335acb25a4ee69a8107e4984f8fbd2e897021d9e4ca21b"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0e8b1be28239fc64a88a8189d1df7fad8be8c1ae47fcc33e43d4be15f99cc70"}, + {file = 
"pydantic_core-2.18.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:868649da93e5a3d5eacc2b5b3b9235c98ccdbfd443832f31e075f54419e1b96b"}, + {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:78363590ef93d5d226ba21a90a03ea89a20738ee5b7da83d771d283fd8a56761"}, + {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:852e966fbd035a6468fc0a3496589b45e2208ec7ca95c26470a54daed82a0788"}, + {file = "pydantic_core-2.18.2-cp38-none-win32.whl", hash = "sha256:6a46e22a707e7ad4484ac9ee9f290f9d501df45954184e23fc29408dfad61350"}, + {file = "pydantic_core-2.18.2-cp38-none-win_amd64.whl", hash = "sha256:d91cb5ea8b11607cc757675051f61b3d93f15eca3cefb3e6c704a5d6e8440f4e"}, + {file = "pydantic_core-2.18.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:ae0a8a797a5e56c053610fa7be147993fe50960fa43609ff2a9552b0e07013e8"}, + {file = "pydantic_core-2.18.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:042473b6280246b1dbf530559246f6842b56119c2926d1e52b631bdc46075f2a"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a388a77e629b9ec814c1b1e6b3b595fe521d2cdc625fcca26fbc2d44c816804"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25add29b8f3b233ae90ccef2d902d0ae0432eb0d45370fe315d1a5cf231004b"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f459a5ce8434614dfd39bbebf1041952ae01da6bed9855008cb33b875cb024c0"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eff2de745698eb46eeb51193a9f41d67d834d50e424aef27df2fcdee1b153845"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8309f67285bdfe65c372ea3722b7a5642680f3dba538566340a9d36e920b5f0"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f93a8a2e3938ff656a7c1bc57193b1319960ac015b6e87d76c76bf14fe0244b4"}, + {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:22057013c8c1e272eb8d0eebc796701167d8377441ec894a8fed1af64a0bf399"}, + {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cfeecd1ac6cc1fb2692c3d5110781c965aabd4ec5d32799773ca7b1456ac636b"}, + {file = "pydantic_core-2.18.2-cp39-none-win32.whl", hash = "sha256:0d69b4c2f6bb3e130dba60d34c0845ba31b69babdd3f78f7c0c8fae5021a253e"}, + {file = "pydantic_core-2.18.2-cp39-none-win_amd64.whl", hash = "sha256:d9319e499827271b09b4e411905b24a426b8fb69464dfa1696258f53a3334641"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a1874c6dd4113308bd0eb568418e6114b252afe44319ead2b4081e9b9521fe75"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:ccdd111c03bfd3666bd2472b674c6899550e09e9f298954cfc896ab92b5b0e6d"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e18609ceaa6eed63753037fc06ebb16041d17d28199ae5aba0052c51449650a9"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e5c584d357c4e2baf0ff7baf44f4994be121e16a2c88918a5817331fc7599d7"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43f0f463cf89ace478de71a318b1b4f05ebc456a9b9300d027b4b57c1a2064fb"}, + {file = 
"pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e1b395e58b10b73b07b7cf740d728dd4ff9365ac46c18751bf8b3d8cca8f625a"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0098300eebb1c837271d3d1a2cd2911e7c11b396eac9661655ee524a7f10587b"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:36789b70d613fbac0a25bb07ab3d9dba4d2e38af609c020cf4d888d165ee0bf3"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3f9a801e7c8f1ef8718da265bba008fa121243dfe37c1cea17840b0944dfd72c"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:3a6515ebc6e69d85502b4951d89131ca4e036078ea35533bb76327f8424531ce"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20aca1e2298c56ececfd8ed159ae4dde2df0781988c97ef77d5c16ff4bd5b400"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:223ee893d77a310a0391dca6df00f70bbc2f36a71a895cecd9a0e762dc37b349"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2334ce8c673ee93a1d6a65bd90327588387ba073c17e61bf19b4fd97d688d63c"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cbca948f2d14b09d20268cda7b0367723d79063f26c4ffc523af9042cad95592"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b3ef08e20ec49e02d5c6717a91bb5af9b20f1805583cb0adfe9ba2c6b505b5ae"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c6fdc8627910eed0c01aed6a390a252fe3ea6d472ee70fdde56273f198938374"}, + {file = "pydantic_core-2.18.2.tar.gz", hash = "sha256:2e29d20810dfc3043ee13ac7d9e25105799817683348823f305ab3f349b9386e"}, +] + +[package.dependencies] +typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" + [[package]] name = "pyjwt" version = "2.4.0" @@ -1900,6 +2431,20 @@ pytest = [ {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, ] +[[package]] +name = "pytest-repeat" +version = "0.9.3" +description = "pytest plugin for repeating tests" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest_repeat-0.9.3-py3-none-any.whl", hash = "sha256:26ab2df18226af9d5ce441c858f273121e92ff55f5bb311d25755b8d7abdd8ed"}, + {file = "pytest_repeat-0.9.3.tar.gz", hash = "sha256:ffd3836dfcd67bb270bec648b330e20be37d2966448c4148c4092d1e8aba8185"}, +] + +[package.dependencies] +pytest = "*" + [[package]] name = "pytest-rerunfailures" version = "13.0" @@ -1978,27 +2523,16 @@ files = [ six = ">=1.5" [[package]] -name = "python-jose" -version = "3.3.0" -description = "JOSE implementation in Python" +name = "pytz" +version = "2024.1" +description = "World timezone definitions, modern and historical" optional = false python-versions = "*" files = [ - {file = "python-jose-3.3.0.tar.gz", hash = "sha256:55779b5e6ad599c6336191246e95eb2293a9ddebd555f796a65f838f07e5d78a"}, - {file = "python_jose-3.3.0-py2.py3-none-any.whl", hash = "sha256:9b1376b023f8b298536eedd47ae1089bcdb848f1535ab30555cd92002d78923a"}, + {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, + {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, ] -[package.dependencies] -cryptography = {version = ">=3.4.0", optional = true, markers = "extra == 
\"cryptography\""} -ecdsa = "!=0.15" -pyasn1 = "*" -rsa = "*" - -[package.extras] -cryptography = ["cryptography (>=3.4.0)"] -pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"] -pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"] - [[package]] name = "pywin32" version = "301" @@ -2043,6 +2577,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2077,15 +2612,103 @@ files = [ {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, ] +[[package]] +name = "regex" +version = "2024.4.28" +description = "Alternative regular expression module, to replace re." +optional = false +python-versions = ">=3.8" +files = [ + {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd196d056b40af073d95a2879678585f0b74ad35190fac04ca67954c582c6b61"}, + {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8bb381f777351bd534462f63e1c6afb10a7caa9fa2a421ae22c26e796fe31b1f"}, + {file = "regex-2024.4.28-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:47af45b6153522733aa6e92543938e97a70ce0900649ba626cf5aad290b737b6"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99d6a550425cc51c656331af0e2b1651e90eaaa23fb4acde577cf15068e2e20f"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf29304a8011feb58913c382902fde3395957a47645bf848eea695839aa101b7"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:92da587eee39a52c91aebea8b850e4e4f095fe5928d415cb7ed656b3460ae79a"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6277d426e2f31bdbacb377d17a7475e32b2d7d1f02faaecc48d8e370c6a3ff31"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:28e1f28d07220c0f3da0e8fcd5a115bbb53f8b55cecf9bec0c946eb9a059a94c"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:aaa179975a64790c1f2701ac562b5eeb733946eeb036b5bcca05c8d928a62f10"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6f435946b7bf7a1b438b4e6b149b947c837cb23c704e780c19ba3e6855dbbdd3"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:19d6c11bf35a6ad077eb23852827f91c804eeb71ecb85db4ee1386825b9dc4db"}, + {file = 
"regex-2024.4.28-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:fdae0120cddc839eb8e3c15faa8ad541cc6d906d3eb24d82fb041cfe2807bc1e"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e672cf9caaf669053121f1766d659a8813bd547edef6e009205378faf45c67b8"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f57515750d07e14743db55d59759893fdb21d2668f39e549a7d6cad5d70f9fea"}, + {file = "regex-2024.4.28-cp310-cp310-win32.whl", hash = "sha256:a1409c4eccb6981c7baabc8888d3550df518add6e06fe74fa1d9312c1838652d"}, + {file = "regex-2024.4.28-cp310-cp310-win_amd64.whl", hash = "sha256:1f687a28640f763f23f8a9801fe9e1b37338bb1ca5d564ddd41619458f1f22d1"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:84077821c85f222362b72fdc44f7a3a13587a013a45cf14534df1cbbdc9a6796"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b45d4503de8f4f3dc02f1d28a9b039e5504a02cc18906cfe744c11def942e9eb"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:457c2cd5a646dd4ed536c92b535d73548fb8e216ebee602aa9f48e068fc393f3"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b51739ddfd013c6f657b55a508de8b9ea78b56d22b236052c3a85a675102dc6"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:459226445c7d7454981c4c0ce0ad1a72e1e751c3e417f305722bbcee6697e06a"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:670fa596984b08a4a769491cbdf22350431970d0112e03d7e4eeaecaafcd0fec"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe00f4fe11c8a521b173e6324d862ee7ee3412bf7107570c9b564fe1119b56fb"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:36f392dc7763fe7924575475736bddf9ab9f7a66b920932d0ea50c2ded2f5636"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:23a412b7b1a7063f81a742463f38821097b6a37ce1e5b89dd8e871d14dbfd86b"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f1d6e4b7b2ae3a6a9df53efbf199e4bfcff0959dbdb5fd9ced34d4407348e39a"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:499334ad139557de97cbc4347ee921c0e2b5e9c0f009859e74f3f77918339257"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:0940038bec2fe9e26b203d636c44d31dd8766abc1fe66262da6484bd82461ccf"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:66372c2a01782c5fe8e04bff4a2a0121a9897e19223d9eab30c54c50b2ebeb7f"}, + {file = "regex-2024.4.28-cp311-cp311-win32.whl", hash = "sha256:c77d10ec3c1cf328b2f501ca32583625987ea0f23a0c2a49b37a39ee5c4c4630"}, + {file = "regex-2024.4.28-cp311-cp311-win_amd64.whl", hash = "sha256:fc0916c4295c64d6890a46e02d4482bb5ccf33bf1a824c0eaa9e83b148291f90"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:08a1749f04fee2811c7617fdd46d2e46d09106fa8f475c884b65c01326eb15c5"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b8eb28995771c087a73338f695a08c9abfdf723d185e57b97f6175c5051ff1ae"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dd7ef715ccb8040954d44cfeff17e6b8e9f79c8019daae2fd30a8806ef5435c0"}, + {file = 
"regex-2024.4.28-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb0315a2b26fde4005a7c401707c5352df274460f2f85b209cf6024271373013"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f2fc053228a6bd3a17a9b0a3f15c3ab3cf95727b00557e92e1cfe094b88cc662"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7fe9739a686dc44733d52d6e4f7b9c77b285e49edf8570754b322bca6b85b4cc"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a74fcf77d979364f9b69fcf8200849ca29a374973dc193a7317698aa37d8b01c"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:965fd0cf4694d76f6564896b422724ec7b959ef927a7cb187fc6b3f4e4f59833"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2fef0b38c34ae675fcbb1b5db760d40c3fc3612cfa186e9e50df5782cac02bcd"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bc365ce25f6c7c5ed70e4bc674f9137f52b7dd6a125037f9132a7be52b8a252f"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:ac69b394764bb857429b031d29d9604842bc4cbfd964d764b1af1868eeebc4f0"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:144a1fc54765f5c5c36d6d4b073299832aa1ec6a746a6452c3ee7b46b3d3b11d"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2630ca4e152c221072fd4a56d4622b5ada876f668ecd24d5ab62544ae6793ed6"}, + {file = "regex-2024.4.28-cp312-cp312-win32.whl", hash = "sha256:7f3502f03b4da52bbe8ba962621daa846f38489cae5c4a7b5d738f15f6443d17"}, + {file = "regex-2024.4.28-cp312-cp312-win_amd64.whl", hash = "sha256:0dd3f69098511e71880fb00f5815db9ed0ef62c05775395968299cb400aeab82"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:374f690e1dd0dbdcddea4a5c9bdd97632cf656c69113f7cd6a361f2a67221cb6"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:25f87ae6b96374db20f180eab083aafe419b194e96e4f282c40191e71980c666"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5dbc1bcc7413eebe5f18196e22804a3be1bfdfc7e2afd415e12c068624d48247"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f85151ec5a232335f1be022b09fbbe459042ea1951d8a48fef251223fc67eee1"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:57ba112e5530530fd175ed550373eb263db4ca98b5f00694d73b18b9a02e7185"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:224803b74aab56aa7be313f92a8d9911dcade37e5f167db62a738d0c85fdac4b"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a54a047b607fd2d2d52a05e6ad294602f1e0dec2291152b745870afc47c1397"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a2a512d623f1f2d01d881513af9fc6a7c46e5cfffb7dc50c38ce959f9246c94"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c06bf3f38f0707592898428636cbb75d0a846651b053a1cf748763e3063a6925"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_aarch64.whl", hash = 
"sha256:1031a5e7b048ee371ab3653aad3030ecfad6ee9ecdc85f0242c57751a05b0ac4"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d7a353ebfa7154c871a35caca7bfd8f9e18666829a1dc187115b80e35a29393e"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7e76b9cfbf5ced1aca15a0e5b6f229344d9b3123439ffce552b11faab0114a02"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:5ce479ecc068bc2a74cb98dd8dba99e070d1b2f4a8371a7dfe631f85db70fe6e"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7d77b6f63f806578c604dca209280e4c54f0fa9a8128bb8d2cc5fb6f99da4150"}, + {file = "regex-2024.4.28-cp38-cp38-win32.whl", hash = "sha256:d84308f097d7a513359757c69707ad339da799e53b7393819ec2ea36bc4beb58"}, + {file = "regex-2024.4.28-cp38-cp38-win_amd64.whl", hash = "sha256:2cc1b87bba1dd1a898e664a31012725e48af826bf3971e786c53e32e02adae6c"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7413167c507a768eafb5424413c5b2f515c606be5bb4ef8c5dee43925aa5718b"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:108e2dcf0b53a7c4ab8986842a8edcb8ab2e59919a74ff51c296772e8e74d0ae"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f1c5742c31ba7d72f2dedf7968998730664b45e38827637e0f04a2ac7de2f5f1"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecc6148228c9ae25ce403eade13a0961de1cb016bdb35c6eafd8e7b87ad028b1"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7d893c8cf0e2429b823ef1a1d360a25950ed11f0e2a9df2b5198821832e1947"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4290035b169578ffbbfa50d904d26bec16a94526071ebec3dadbebf67a26b25e"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44a22ae1cfd82e4ffa2066eb3390777dc79468f866f0625261a93e44cdf6482b"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd24fd140b69f0b0bcc9165c397e9b2e89ecbeda83303abf2a072609f60239e2"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:39fb166d2196413bead229cd64a2ffd6ec78ebab83fff7d2701103cf9f4dfd26"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9301cc6db4d83d2c0719f7fcda37229691745168bf6ae849bea2e85fc769175d"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7c3d389e8d76a49923683123730c33e9553063d9041658f23897f0b396b2386f"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:99ef6289b62042500d581170d06e17f5353b111a15aa6b25b05b91c6886df8fc"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:b91d529b47798c016d4b4c1d06cc826ac40d196da54f0de3c519f5a297c5076a"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:43548ad74ea50456e1c68d3c67fff3de64c6edb85bcd511d1136f9b5376fc9d1"}, + {file = "regex-2024.4.28-cp39-cp39-win32.whl", hash = "sha256:05d9b6578a22db7dedb4df81451f360395828b04f4513980b6bd7a1412c679cc"}, + {file = "regex-2024.4.28-cp39-cp39-win_amd64.whl", hash = "sha256:3986217ec830c2109875be740531feb8ddafe0dfa49767cdcd072ed7e8927962"}, + {file = "regex-2024.4.28.tar.gz", hash = 
"sha256:83ab366777ea45d58f72593adf35d36ca911ea8bd838483c1823b883a121b0e4"}, +] + [[package]] name = "requests" -version = "2.31.0" +version = "2.32.0" description = "Python HTTP for Humans." optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, - {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, + {file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"}, + {file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"}, ] [package.dependencies] @@ -2117,43 +2740,43 @@ urllib3 = ">=1.25.10" tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-localserver", "types-mock", "types-requests"] [[package]] -name = "rsa" -version = "4.9" -description = "Pure-Python RSA implementation" +name = "rfc3339-validator" +version = "0.1.4" +description = "A pure python RFC3339 validator" optional = false -python-versions = ">=3.6,<4" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ - {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"}, - {file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"}, + {file = "rfc3339_validator-0.1.4-py2.py3-none-any.whl", hash = "sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa"}, + {file = "rfc3339_validator-0.1.4.tar.gz", hash = "sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b"}, ] [package.dependencies] -pyasn1 = ">=0.1.3" +six = "*" [[package]] name = "ruff" -version = "0.1.11" +version = "0.2.2" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a7f772696b4cdc0a3b2e527fc3c7ccc41cdcb98f5c80fdd4f2b8c50eb1458196"}, - {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:934832f6ed9b34a7d5feea58972635c2039c7a3b434fe5ba2ce015064cb6e955"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea0d3e950e394c4b332bcdd112aa566010a9f9c95814844a7468325290aabfd9"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9bd4025b9c5b429a48280785a2b71d479798a69f5c2919e7d274c5f4b32c3607"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1ad00662305dcb1e987f5ec214d31f7d6a062cae3e74c1cbccef15afd96611d"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4b077ce83f47dd6bea1991af08b140e8b8339f0ba8cb9b7a484c30ebab18a23f"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4a88efecec23c37b11076fe676e15c6cdb1271a38f2b415e381e87fe4517f18"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b25093dad3b055667730a9b491129c42d45e11cdb7043b702e97125bcec48a1"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:231d8fb11b2cc7c0366a326a66dafc6ad449d7fcdbc268497ee47e1334f66f77"}, - {file = "ruff-0.1.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:09c415716884950080921dd6237767e52e227e397e2008e2bed410117679975b"}, - {file = "ruff-0.1.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0f58948c6d212a6b8d41cd59e349751018797ce1727f961c2fa755ad6208ba45"}, - {file = "ruff-0.1.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:190a566c8f766c37074d99640cd9ca3da11d8deae2deae7c9505e68a4a30f740"}, - {file = "ruff-0.1.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6464289bd67b2344d2a5d9158d5eb81025258f169e69a46b741b396ffb0cda95"}, - {file = "ruff-0.1.11-py3-none-win32.whl", hash = "sha256:9b8f397902f92bc2e70fb6bebfa2139008dc72ae5177e66c383fa5426cb0bf2c"}, - {file = "ruff-0.1.11-py3-none-win_amd64.whl", hash = "sha256:eb85ee287b11f901037a6683b2374bb0ec82928c5cbc984f575d0437979c521a"}, - {file = "ruff-0.1.11-py3-none-win_arm64.whl", hash = "sha256:97ce4d752f964ba559c7023a86e5f8e97f026d511e48013987623915431c7ea9"}, - {file = "ruff-0.1.11.tar.gz", hash = "sha256:f9d4d88cb6eeb4dfe20f9f0519bd2eaba8119bde87c3d5065c541dbae2b5a2cb"}, + {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0a9efb032855ffb3c21f6405751d5e147b0c6b631e3ca3f6b20f917572b97eb6"}, + {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d450b7fbff85913f866a5384d8912710936e2b96da74541c82c1b458472ddb39"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecd46e3106850a5c26aee114e562c329f9a1fbe9e4821b008c4404f64ff9ce73"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e22676a5b875bd72acd3d11d5fa9075d3a5f53b877fe7b4793e4673499318ba"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1695700d1e25a99d28f7a1636d85bafcc5030bba9d0578c0781ba1790dbcf51c"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = 
"sha256:b0c232af3d0bd8f521806223723456ffebf8e323bd1e4e82b0befb20ba18388e"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f63d96494eeec2fc70d909393bcd76c69f35334cdbd9e20d089fb3f0640216ca"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a61ea0ff048e06de273b2e45bd72629f470f5da8f71daf09fe481278b175001"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e1439c8f407e4f356470e54cdecdca1bd5439a0673792dbe34a2b0a551a2fe3"}, + {file = "ruff-0.2.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:940de32dc8853eba0f67f7198b3e79bc6ba95c2edbfdfac2144c8235114d6726"}, + {file = "ruff-0.2.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0c126da55c38dd917621552ab430213bdb3273bb10ddb67bc4b761989210eb6e"}, + {file = "ruff-0.2.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:3b65494f7e4bed2e74110dac1f0d17dc8e1f42faaa784e7c58a98e335ec83d7e"}, + {file = "ruff-0.2.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1ec49be4fe6ddac0503833f3ed8930528e26d1e60ad35c2446da372d16651ce9"}, + {file = "ruff-0.2.2-py3-none-win32.whl", hash = "sha256:d920499b576f6c68295bc04e7b17b6544d9d05f196bb3aac4358792ef6f34325"}, + {file = "ruff-0.2.2-py3-none-win_amd64.whl", hash = "sha256:cc9a91ae137d687f43a44c900e5d95e9617cb37d4c989e462980ba27039d239d"}, + {file = "ruff-0.2.2-py3-none-win_arm64.whl", hash = "sha256:c9d15fc41e6054bfc7200478720570078f0b41c9ae4f010bcc16bd6f4d1aacdd"}, + {file = "ruff-0.2.2.tar.gz", hash = "sha256:e62ed7f36b3068a30ba39193a14274cd706bc486fad521276458022f7bccb31d"}, ] [[package]] @@ -2190,19 +2813,18 @@ pbr = "*" [[package]] name = "setuptools" -version = "65.5.1" +version = "70.0.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "setuptools-65.5.1-py3-none-any.whl", hash = "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31"}, - {file = "setuptools-65.5.1.tar.gz", hash = "sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f"}, + {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"}, + {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] -testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx 
(>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "six" @@ -2216,22 +2838,29 @@ files = [ ] [[package]] -name = "sshpubkeys" -version = "3.3.1" -description = "SSH public key parser" +name = "sniffio" +version = "1.3.0" +description = "Sniff out which async library your code is running under" optional = false -python-versions = ">=3" +python-versions = ">=3.7" files = [ - {file = "sshpubkeys-3.3.1-py2.py3-none-any.whl", hash = "sha256:946f76b8fe86704b0e7c56a00d80294e39bc2305999844f079a217885060b1ac"}, - {file = "sshpubkeys-3.3.1.tar.gz", hash = "sha256:3020ed4f8c846849299370fbe98ff4157b0ccc1accec105e07cfa9ae4bb55064"}, + {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, + {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, +] + +[[package]] +name = "sympy" +version = "1.12" +description = "Computer algebra system (CAS) in Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"}, + {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"}, ] [package.dependencies] -cryptography = ">=2.1.4" -ecdsa = ">=0.13" - -[package.extras] -dev = ["twine", "wheel", "yapf"] +mpmath = ">=0.19" [[package]] name = "toml" @@ -2348,13 +2977,13 @@ files = [ [[package]] name = "urllib3" -version = "1.26.18" +version = "1.26.19" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ - {file = "urllib3-1.26.18-py2.py3-none-any.whl", hash = "sha256:34b97092d7e0a3a8cf7cd10e386f401b3737364026c45e622aa02903dffe0f07"}, - {file = "urllib3-1.26.18.tar.gz", hash = "sha256:f8ecc1bba5667413457c529ab955bf8c67b45db799d159066261719e328580a0"}, + {file = "urllib3-1.26.19-py2.py3-none-any.whl", hash = "sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3"}, + {file = "urllib3-1.26.19.tar.gz", hash = "sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429"}, ] [package.extras] @@ -2378,15 +3007,96 @@ docs = ["Sphinx (>=3.4)", "sphinx-rtd-theme (>=0.5)"] optional = ["python-socks", "wsaccel"] test = ["websockets"] +[[package]] +name = "websockets" +version = "12.0" +description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" +optional = false +python-versions = ">=3.8" +files = [ + {file = "websockets-12.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d554236b2a2006e0ce16315c16eaa0d628dab009c33b63ea03f41c6107958374"}, + {file = "websockets-12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2d225bb6886591b1746b17c0573e29804619c8f755b5598d875bb4235ea639be"}, + {file = "websockets-12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eb809e816916a3b210bed3c82fb88eaf16e8afcf9c115ebb2bacede1797d2547"}, + {file = "websockets-12.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c588f6abc13f78a67044c6b1273a99e1cf31038ad51815b3b016ce699f0d75c2"}, + {file = "websockets-12.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5aa9348186d79a5f232115ed3fa9020eab66d6c3437d72f9d2c8ac0c6858c558"}, + {file = "websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6350b14a40c95ddd53e775dbdbbbc59b124a5c8ecd6fbb09c2e52029f7a9f480"}, + {file = "websockets-12.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:70ec754cc2a769bcd218ed8d7209055667b30860ffecb8633a834dde27d6307c"}, + {file = "websockets-12.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6e96f5ed1b83a8ddb07909b45bd94833b0710f738115751cdaa9da1fb0cb66e8"}, + {file = "websockets-12.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4d87be612cbef86f994178d5186add3d94e9f31cc3cb499a0482b866ec477603"}, + {file = "websockets-12.0-cp310-cp310-win32.whl", hash = "sha256:befe90632d66caaf72e8b2ed4d7f02b348913813c8b0a32fae1cc5fe3730902f"}, + {file = "websockets-12.0-cp310-cp310-win_amd64.whl", hash = "sha256:363f57ca8bc8576195d0540c648aa58ac18cf85b76ad5202b9f976918f4219cf"}, + {file = "websockets-12.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5d873c7de42dea355d73f170be0f23788cf3fa9f7bed718fd2830eefedce01b4"}, + {file = "websockets-12.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3f61726cae9f65b872502ff3c1496abc93ffbe31b278455c418492016e2afc8f"}, + {file = "websockets-12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed2fcf7a07334c77fc8a230755c2209223a7cc44fc27597729b8ef5425aa61a3"}, + {file = "websockets-12.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e332c210b14b57904869ca9f9bf4ca32f5427a03eeb625da9b616c85a3a506c"}, + {file = "websockets-12.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:5693ef74233122f8ebab026817b1b37fe25c411ecfca084b29bc7d6efc548f45"}, + {file = "websockets-12.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e9e7db18b4539a29cc5ad8c8b252738a30e2b13f033c2d6e9d0549b45841c04"}, + {file = "websockets-12.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6e2df67b8014767d0f785baa98393725739287684b9f8d8a1001eb2839031447"}, + {file = "websockets-12.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bea88d71630c5900690fcb03161ab18f8f244805c59e2e0dc4ffadae0a7ee0ca"}, + {file = "websockets-12.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dff6cdf35e31d1315790149fee351f9e52978130cef6c87c4b6c9b3baf78bc53"}, + {file = "websockets-12.0-cp311-cp311-win32.whl", hash = "sha256:3e3aa8c468af01d70332a382350ee95f6986db479ce7af14d5e81ec52aa2b402"}, + {file = "websockets-12.0-cp311-cp311-win_amd64.whl", hash = "sha256:25eb766c8ad27da0f79420b2af4b85d29914ba0edf69f547cc4f06ca6f1d403b"}, + {file = "websockets-12.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0e6e2711d5a8e6e482cacb927a49a3d432345dfe7dea8ace7b5790df5932e4df"}, + {file = "websockets-12.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:dbcf72a37f0b3316e993e13ecf32f10c0e1259c28ffd0a85cee26e8549595fbc"}, + {file = "websockets-12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:12743ab88ab2af1d17dd4acb4645677cb7063ef4db93abffbf164218a5d54c6b"}, + {file = "websockets-12.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b645f491f3c48d3f8a00d1fce07445fab7347fec54a3e65f0725d730d5b99cb"}, + {file = "websockets-12.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9893d1aa45a7f8b3bc4510f6ccf8db8c3b62120917af15e3de247f0780294b92"}, + {file = "websockets-12.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f38a7b376117ef7aff996e737583172bdf535932c9ca021746573bce40165ed"}, + {file = "websockets-12.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:f764ba54e33daf20e167915edc443b6f88956f37fb606449b4a5b10ba42235a5"}, + {file = "websockets-12.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:1e4b3f8ea6a9cfa8be8484c9221ec0257508e3a1ec43c36acdefb2a9c3b00aa2"}, + {file = "websockets-12.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9fdf06fd06c32205a07e47328ab49c40fc1407cdec801d698a7c41167ea45113"}, + {file = "websockets-12.0-cp312-cp312-win32.whl", hash = "sha256:baa386875b70cbd81798fa9f71be689c1bf484f65fd6fb08d051a0ee4e79924d"}, + {file = "websockets-12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ae0a5da8f35a5be197f328d4727dbcfafa53d1824fac3d96cdd3a642fe09394f"}, + {file = "websockets-12.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5f6ffe2c6598f7f7207eef9a1228b6f5c818f9f4d53ee920aacd35cec8110438"}, + {file = "websockets-12.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9edf3fc590cc2ec20dc9d7a45108b5bbaf21c0d89f9fd3fd1685e223771dc0b2"}, + {file = "websockets-12.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8572132c7be52632201a35f5e08348137f658e5ffd21f51f94572ca6c05ea81d"}, + {file = "websockets-12.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:604428d1b87edbf02b233e2c207d7d528460fa978f9e391bd8aaf9c8311de137"}, + {file = "websockets-12.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:1a9d160fd080c6285e202327aba140fc9a0d910b09e423afff4ae5cbbf1c7205"}, + {file = "websockets-12.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87b4aafed34653e465eb77b7c93ef058516cb5acf3eb21e42f33928616172def"}, + {file = "websockets-12.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b2ee7288b85959797970114deae81ab41b731f19ebcd3bd499ae9ca0e3f1d2c8"}, + {file = "websockets-12.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:7fa3d25e81bfe6a89718e9791128398a50dec6d57faf23770787ff441d851967"}, + {file = "websockets-12.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a571f035a47212288e3b3519944f6bf4ac7bc7553243e41eac50dd48552b6df7"}, + {file = "websockets-12.0-cp38-cp38-win32.whl", hash = "sha256:3c6cc1360c10c17463aadd29dd3af332d4a1adaa8796f6b0e9f9df1fdb0bad62"}, + {file = "websockets-12.0-cp38-cp38-win_amd64.whl", hash = "sha256:1bf386089178ea69d720f8db6199a0504a406209a0fc23e603b27b300fdd6892"}, + {file = "websockets-12.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ab3d732ad50a4fbd04a4490ef08acd0517b6ae6b77eb967251f4c263011a990d"}, + {file = "websockets-12.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a1d9697f3337a89691e3bd8dc56dea45a6f6d975f92e7d5f773bc715c15dde28"}, + {file = "websockets-12.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1df2fbd2c8a98d38a66f5238484405b8d1d16f929bb7a33ed73e4801222a6f53"}, + {file = "websockets-12.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23509452b3bc38e3a057382c2e941d5ac2e01e251acce7adc74011d7d8de434c"}, + {file = "websockets-12.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2e5fc14ec6ea568200ea4ef46545073da81900a2b67b3e666f04adf53ad452ec"}, + {file = "websockets-12.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46e71dbbd12850224243f5d2aeec90f0aaa0f2dde5aeeb8fc8df21e04d99eff9"}, + {file = "websockets-12.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b81f90dcc6c85a9b7f29873beb56c94c85d6f0dac2ea8b60d995bd18bf3e2aae"}, + {file = "websockets-12.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:a02413bc474feda2849c59ed2dfb2cddb4cd3d2f03a2fedec51d6e959d9b608b"}, + {file = "websockets-12.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:bbe6013f9f791944ed31ca08b077e26249309639313fff132bfbf3ba105673b9"}, + {file = "websockets-12.0-cp39-cp39-win32.whl", hash = "sha256:cbe83a6bbdf207ff0541de01e11904827540aa069293696dd528a6640bd6a5f6"}, + {file = "websockets-12.0-cp39-cp39-win_amd64.whl", hash = "sha256:fc4e7fa5414512b481a2483775a8e8be7803a35b30ca805afa4998a84f9fd9e8"}, + {file = "websockets-12.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:248d8e2446e13c1d4326e0a6a4e9629cb13a11195051a73acf414812700badbd"}, + {file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f44069528d45a933997a6fef143030d8ca8042f0dfaad753e2906398290e2870"}, + {file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c4e37d36f0d19f0a4413d3e18c0d03d0c268ada2061868c1e6f5ab1a6d575077"}, + {file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d829f975fc2e527a3ef2f9c8f25e553eb7bc779c6665e8e1d52aa22800bb38b"}, + {file = "websockets-12.0-pp310-pypy310_pp73-win_amd64.whl", hash = 
"sha256:2c71bd45a777433dd9113847af751aae36e448bc6b8c361a566cb043eda6ec30"}, + {file = "websockets-12.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0bee75f400895aef54157b36ed6d3b308fcab62e5260703add87f44cee9c82a6"}, + {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:423fc1ed29f7512fceb727e2d2aecb952c46aa34895e9ed96071821309951123"}, + {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27a5e9964ef509016759f2ef3f2c1e13f403725a5e6a1775555994966a66e931"}, + {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3181df4583c4d3994d31fb235dc681d2aaad744fbdbf94c4802485ececdecf2"}, + {file = "websockets-12.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b067cb952ce8bf40115f6c19f478dc71c5e719b7fbaa511359795dfd9d1a6468"}, + {file = "websockets-12.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:00700340c6c7ab788f176d118775202aadea7602c5cc6be6ae127761c16d6b0b"}, + {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e469d01137942849cff40517c97a30a93ae79917752b34029f0ec72df6b46399"}, + {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffefa1374cd508d633646d51a8e9277763a9b78ae71324183693959cf94635a7"}, + {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba0cab91b3956dfa9f512147860783a1829a8d905ee218a9837c18f683239611"}, + {file = "websockets-12.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2cb388a5bfb56df4d9a406783b7f9dbefb888c09b71629351cc6b036e9259370"}, + {file = "websockets-12.0-py3-none-any.whl", hash = "sha256:dc284bbc8d7c78a6c69e0c7325ab46ee5e40bb4d50e494d8131a07ef47500e9e"}, + {file = "websockets-12.0.tar.gz", hash = "sha256:81df9cbcbb6c260de1e007e58c011bfebe2dafc8435107b0537f393dd38c8b1b"}, +] + [[package]] name = "werkzeug" -version = "3.0.1" +version = "3.0.3" description = "The comprehensive WSGI web application library." 
optional = false python-versions = ">=3.8" files = [ - {file = "werkzeug-3.0.1-py3-none-any.whl", hash = "sha256:90a285dc0e42ad56b34e696398b8122ee4c681833fb35b8334a095d82c56da10"}, - {file = "werkzeug-3.0.1.tar.gz", hash = "sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc"}, + {file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"}, + {file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"}, ] [package.dependencies] @@ -2594,18 +3304,18 @@ multidict = ">=4.0" [[package]] name = "zipp" -version = "3.8.1" +version = "3.19.1" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"}, - {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"}, + {file = "zipp-3.19.1-py3-none-any.whl", hash = "sha256:2828e64edb5386ea6a52e7ba7cdb17bb30a73a858f5eb6eb93d8d36f5ea26091"}, + {file = "zipp-3.19.1.tar.gz", hash = "sha256:35427f6d5594f4acf82d25541438348c26736fa9b3afa2754bcd63cdb99d8e8f"}, ] [package.extras] -docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx"] -testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [[package]] name = "zstandard" @@ -2668,4 +3378,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "9cf2734cafd5b6963165d398f1b24621193d5284d0bc7cc26a720a014f523860" +content-hash = "c09bcb333ab550958b33dbf4fec968c500d8e701fd4c96402cddbd9bb8048055" diff --git a/pre-commit.py b/pre-commit.py index c5ed63ac44..ae432e8225 100755 --- a/pre-commit.py +++ b/pre-commit.py @@ -2,6 +2,7 @@ import argparse import enum +import os import subprocess import sys from typing import List @@ -93,7 +94,7 @@ if __name__ == "__main__": "--no-color", action="store_true", help="disable colored output", - default=not sys.stdout.isatty(), + default=not sys.stdout.isatty() or os.getenv("TERM") == "dumb", ) args = parser.parse_args() diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index f075c718a7..21d92abb20 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -9,8 +9,16 @@ default = [] testing = [] [dependencies] +ahash.workspace = true anyhow.workspace = true +arc-swap.workspace = true +async-compression.workspace = true async-trait.workspace = true +atomic-take.workspace = true +aws-config.workspace = true +aws-sdk-iam.workspace = true +aws-sigv4.workspace = true +aws-types.workspace = true base64.workspace = true bstr.workspace = true bytes = { workspace = true, features = ["serde"] } @@ -18,7 +26,10 @@ camino.workspace = true chrono.workspace = true clap.workspace = true consumption_metrics.workspace = true +crossbeam-deque.workspace = true dashmap.workspace = true +env_logger.workspace = true 
+framed-websockets.workspace = true futures.workspace = true git-version.workspace = true hashbrown.workspace = true @@ -26,19 +37,25 @@ hashlink.workspace = true hex.workspace = true hmac.workspace = true hostname.workspace = true +http.workspace = true humantime.workspace = true -hyper-tungstenite.workspace = true +humantime-serde.workspace = true hyper.workspace = true +hyper1 = { package = "hyper", version = "1.2", features = ["server"] } +hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } +http-body-util = { version = "0.1" } +indexmap.workspace = true ipnet.workspace = true itertools.workspace = true +lasso = { workspace = true, features = ["multi-threaded"] } md5.workspace = true +measured = { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true opentelemetry.workspace = true parking_lot.workspace = true parquet.workspace = true parquet_derive.workspace = true -pbkdf2 = { workspace = true, features = ["simple", "std"] } pin-project-lite.workspace = true postgres_backend.workspace = true pq_proto.workspace = true @@ -46,8 +63,8 @@ prometheus.workspace = true rand.workspace = true regex.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } -reqwest = { workspace = true, features = ["json"] } -reqwest-middleware.workspace = true +reqwest.workspace = true +reqwest-middleware = { workspace = true, features = ["json"] } reqwest-retry.workspace = true reqwest-tracing.workspace = true routerify.workspace = true @@ -57,36 +74,53 @@ rustls.workspace = true scopeguard.workspace = true serde.workspace = true serde_json.workspace = true -sha2.workspace = true +sha2 = { workspace = true, features = ["asm", "oid"] } +smol_str.workspace = true +smallvec.workspace = true socket2.workspace = true -sync_wrapper.workspace = true +subtle.workspace = true task-local-extensions.workspace = true thiserror.workspace = true -tls-listener.workspace = true +tikv-jemallocator.workspace = true +tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } tokio-postgres.workspace = true +tokio-postgres-rustls.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } +tower-service.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true +try-lock.workspace = true +typed-json.workspace = true url.workspace = true +urlencoding.workspace = true utils.workspace = true uuid.workspace = true -webpki-roots.workspace = true +rustls-native-certs.workspace = true x509-parser.workspace = true -native-tls.workspace = true -postgres-native-tls.workspace = true postgres-protocol.workspace = true redis.workspace = true -smol_str.workspace = true + +# jwt stuff +jose-jwa = "0.1.2" +jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] } +signature = "2" +ecdsa = "0.16" +p256 = "0.13" +rsa = "0.9" workspace_hack.workspace = true [dev-dependencies] camino-tempfile.workspace = true +fallible-iterator.workspace = true +tokio-tungstenite.workspace = true +pbkdf2 = { workspace = true, features = ["simple", "std"] } rcgen.workspace = true rstest.workspace = true tokio-postgres-rustls.workspace = true walkdir.workspace = true +rand_distr = "0.4" diff --git a/proxy/README.md b/proxy/README.md index d1f2e3f27b..8d850737be 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -6,7 +6,7 @@ Proxy binary accepts `--auth-backend` CLI option, which determines auth 
scheme a new SCRAM-based console API; uses SNI info to select the destination project (endpoint soon) * postgres uses postgres to select auth secrets of existing roles. Useful for local testing -* link +* web (or link) sends login link for all usernames Also proxy can expose following services to the external world: @@ -36,7 +36,7 @@ To play with it locally one may start proxy over a local postgres installation ``` If both postgres and proxy are running you may send a SQL query: -```json +```console curl -k -X POST 'https://proxy.localtest.me:4444/sql' \ -H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \ -H 'Content-Type: application/json' \ @@ -44,7 +44,8 @@ curl -k -X POST 'https://proxy.localtest.me:4444/sql' \ "query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num", "params":[ "{{1,2},{\"3\",4}}", {"key":"val", "ikey":4242}] }' | jq - +``` +```json { "command": "SELECT", "fields": [ diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 8d1b861a66..7c408f817c 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,39 +1,41 @@ //! Client authentication mechanisms. pub mod backend; -pub use backend::BackendType; +pub use backend::Backend; mod credentials; -pub use credentials::{ - check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, IpPattern, +pub(crate) use credentials::{ + check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, + ComputeUserInfoParseError, IpPattern, }; mod password_hack; -pub use password_hack::parse_endpoint_param; +pub(crate) use password_hack::parse_endpoint_param; use password_hack::PasswordHackPayload; mod flow; -pub use flow::*; +pub(crate) use flow::*; +use tokio::time::error::Elapsed; -use crate::{console, error::UserFacingError}; -use std::io; +use crate::{ + console, + error::{ReportableError, UserFacingError}, +}; +use std::{io, net::IpAddr}; use thiserror::Error; /// Convenience wrapper for the authentication error. -pub type Result = std::result::Result; +pub(crate) type Result = std::result::Result; /// Common authentication error. #[derive(Debug, Error)] -pub enum AuthErrorImpl { +pub(crate) enum AuthErrorImpl { #[error(transparent)] - Link(#[from] backend::LinkAuthError), + Web(#[from] backend::WebAuthError), #[error(transparent)] GetAuthInfo(#[from] console::errors::GetAuthInfoError), - #[error(transparent)] - WakeCompute(#[from] console::errors::WakeComputeError), - /// SASL protocol errors (includes [SCRAM](crate::scram)). #[error(transparent)] Sasl(#[from] crate::sasl::Error), @@ -60,39 +62,47 @@ pub enum AuthErrorImpl { Io(#[from] io::Error), #[error( - "This IP address is not allowed to connect to this endpoint. \ - Please add it to the allowed list in the Neon console." + "This IP address {0} is not allowed to connect to this endpoint. \ + Please add it to the allowed list in the Neon console. \ + Make sure to check for IPv4 or IPv6 addresses." )] - IpAddressNotAllowed, + IpAddressNotAllowed(IpAddr), #[error("Too many connections to this endpoint. 
Please try again later.")] TooManyConnections, + + #[error("Authentication timed out")] + UserTimeout(Elapsed), } #[derive(Debug, Error)] #[error(transparent)] -pub struct AuthError(Box); +pub(crate) struct AuthError(Box); impl AuthError { - pub fn bad_auth_method(name: impl Into>) -> Self { + pub(crate) fn bad_auth_method(name: impl Into>) -> Self { AuthErrorImpl::BadAuthMethod(name.into()).into() } - pub fn auth_failed(user: impl Into>) -> Self { + pub(crate) fn auth_failed(user: impl Into>) -> Self { AuthErrorImpl::AuthFailed(user.into()).into() } - pub fn ip_address_not_allowed() -> Self { - AuthErrorImpl::IpAddressNotAllowed.into() + pub(crate) fn ip_address_not_allowed(ip: IpAddr) -> Self { + AuthErrorImpl::IpAddressNotAllowed(ip).into() } - pub fn too_many_connections() -> Self { + pub(crate) fn too_many_connections() -> Self { AuthErrorImpl::TooManyConnections.into() } - pub fn is_auth_failed(&self) -> bool { + pub(crate) fn is_auth_failed(&self) -> bool { matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_)) } + + pub(crate) fn user_timeout(elapsed: Elapsed) -> Self { + AuthErrorImpl::UserTimeout(elapsed).into() + } } impl> From for AuthError { @@ -103,19 +113,36 @@ impl> From for AuthError { impl UserFacingError for AuthError { fn to_string_client(&self) -> String { - use AuthErrorImpl::*; match self.0.as_ref() { - Link(e) => e.to_string_client(), - GetAuthInfo(e) => e.to_string_client(), - WakeCompute(e) => e.to_string_client(), - Sasl(e) => e.to_string_client(), - AuthFailed(_) => self.to_string(), - BadAuthMethod(_) => self.to_string(), - MalformedPassword(_) => self.to_string(), - MissingEndpointName => self.to_string(), - Io(_) => "Internal error".to_string(), - IpAddressNotAllowed => self.to_string(), - TooManyConnections => self.to_string(), + AuthErrorImpl::Web(e) => e.to_string_client(), + AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(), + AuthErrorImpl::Sasl(e) => e.to_string_client(), + AuthErrorImpl::AuthFailed(_) => self.to_string(), + AuthErrorImpl::BadAuthMethod(_) => self.to_string(), + AuthErrorImpl::MalformedPassword(_) => self.to_string(), + AuthErrorImpl::MissingEndpointName => self.to_string(), + AuthErrorImpl::Io(_) => "Internal error".to_string(), + AuthErrorImpl::IpAddressNotAllowed(_) => self.to_string(), + AuthErrorImpl::TooManyConnections => self.to_string(), + AuthErrorImpl::UserTimeout(_) => self.to_string(), + } + } +} + +impl ReportableError for AuthError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self.0.as_ref() { + AuthErrorImpl::Web(e) => e.get_error_kind(), + AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(), + AuthErrorImpl::Sasl(e) => e.get_error_kind(), + AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User, + AuthErrorImpl::BadAuthMethod(_) => crate::error::ErrorKind::User, + AuthErrorImpl::MalformedPassword(_) => crate::error::ErrorKind::User, + AuthErrorImpl::MissingEndpointName => crate::error::ErrorKind::User, + AuthErrorImpl::Io(_) => crate::error::ErrorKind::ClientDisconnect, + AuthErrorImpl::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, + AuthErrorImpl::TooManyConnections => crate::error::ErrorKind::RateLimit, + AuthErrorImpl::UserTimeout(_) => crate::error::ErrorKind::User, } } } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index b1634906c9..1d28c6df31 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -1,20 +1,32 @@ mod classic; mod hacks; -mod link; +pub mod jwt; +pub mod local; +mod web; -pub use link::LinkAuthError; +use 
std::net::IpAddr; +use std::sync::Arc; +use std::time::Duration; + +use ipnet::{Ipv4Net, Ipv6Net}; +use local::LocalBackend; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio_postgres::config::AuthKeys; +use tracing::{info, warn}; +pub(crate) use web::WebAuthError; use crate::auth::credentials::check_peer_addr_is_in_list; -use crate::auth::validate_password_and_exchange; +use crate::auth::{validate_password_and_exchange, AuthError}; use crate::cache::Cached; use crate::console::errors::GetAuthInfoError; -use crate::console::provider::ConsoleBackend; -use crate::console::AuthSecret; +use crate::console::provider::{CachedRoleSecret, ConsoleBackend}; +use crate::console::{AuthSecret, NodeInfo}; use crate::context::RequestMonitoring; -use crate::proxy::connect_compute::handle_try_wake; -use crate::proxy::retry::retry_after; +use crate::intern::EndpointIdInt; +use crate::metrics::Metrics; +use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; +use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo}; use crate::stream::Stream; use crate::{ auth::{self, ComputeUserInfoMaybeEndpoint}, @@ -27,14 +39,23 @@ use crate::{ stream, url, }; use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; -use futures::TryFutureExt; -use std::borrow::Cow; -use std::ops::ControlFlow; -use std::sync::Arc; -use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{error, info, warn}; -use super::IpPattern; +/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality +pub enum MaybeOwned<'a, T> { + Owned(T), + Borrowed(&'a T), +} + +impl std::ops::Deref for MaybeOwned<'_, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + match self { + MaybeOwned::Owned(t) => t, + MaybeOwned::Borrowed(t) => t, + } + } +} /// This type serves two purposes: /// @@ -44,112 +65,107 @@ use super::IpPattern; /// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. -pub enum BackendType<'a, T> { +pub enum Backend<'a, T, D> { /// Cloud API (V2). - Console(Cow<'a, ConsoleBackend>, T), + Console(MaybeOwned<'a, ConsoleBackend>, T), /// Authentication via a web browser. - Link(Cow<'a, url::ApiUrl>), - #[cfg(test)] - /// Test backend. 
- Test(&'a dyn TestBackend), + Web(MaybeOwned<'a, url::ApiUrl>, D), + /// Local proxy uses configured auth credentials and does not wake compute + Local(MaybeOwned<'a, LocalBackend>), } -pub trait TestBackend: Send + Sync + 'static { +#[cfg(test)] +pub(crate) trait TestBackend: Send + Sync + 'static { fn wake_compute(&self) -> Result; - fn get_allowed_ips(&self) -> Result, console::errors::GetAuthInfoError>; + fn get_allowed_ips_and_secret( + &self, + ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError>; } -impl std::fmt::Display for BackendType<'_, ()> { +impl std::fmt::Display for Backend<'_, (), ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - use BackendType::*; match self { - Console(api, _) => match &**api { + Self::Console(api, ()) => match &**api { ConsoleBackend::Console(endpoint) => { fmt.debug_tuple("Console").field(&endpoint.url()).finish() } - #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] ConsoleBackend::Postgres(endpoint) => { fmt.debug_tuple("Postgres").field(&endpoint.url()).finish() } + #[cfg(test)] + ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(), }, - Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), - #[cfg(test)] - Test(_) => fmt.debug_tuple("Test").finish(), + Self::Web(url, ()) => fmt.debug_tuple("Web").field(&url.as_str()).finish(), + Self::Local(_) => fmt.debug_tuple("Local").finish(), } } } -impl BackendType<'_, T> { +impl Backend<'_, T, D> { /// Very similar to [`std::option::Option::as_ref`]. /// This helps us pass structured config to async tasks. - pub fn as_ref(&self) -> BackendType<'_, &T> { - use BackendType::*; + pub(crate) fn as_ref(&self) -> Backend<'_, &T, &D> { match self { - Console(c, x) => Console(Cow::Borrowed(c), x), - Link(c) => Link(Cow::Borrowed(c)), - #[cfg(test)] - Test(x) => Test(*x), + Self::Console(c, x) => Backend::Console(MaybeOwned::Borrowed(c), x), + Self::Web(c, x) => Backend::Web(MaybeOwned::Borrowed(c), x), + Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)), } } } -impl<'a, T> BackendType<'a, T> { +impl<'a, T, D> Backend<'a, T, D> { /// Very similar to [`std::option::Option::map`]. - /// Maps [`BackendType`] to [`BackendType`] by applying + /// Maps [`Backend`] to [`Backend`] by applying /// a function to a contained value. - pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R> { - use BackendType::*; + pub(crate) fn map(self, f: impl FnOnce(T) -> R) -> Backend<'a, R, D> { match self { - Console(c, x) => Console(c, f(x)), - Link(c) => Link(c), - #[cfg(test)] - Test(x) => Test(x), + Self::Console(c, x) => Backend::Console(c, f(x)), + Self::Web(c, x) => Backend::Web(c, x), + Self::Local(l) => Backend::Local(l), } } } - -impl<'a, T, E> BackendType<'a, Result> { +impl<'a, T, D, E> Backend<'a, Result, D> { /// Very similar to [`std::option::Option::transpose`]. /// This is most useful for error handling. 
-    pub fn transpose(self) -> Result<BackendType<'a, T>, E> {
-        use BackendType::*;
+    pub(crate) fn transpose(self) -> Result<Backend<'a, T, D>, E> {
         match self {
-            Console(c, x) => x.map(|x| Console(c, x)),
-            Link(c) => Ok(Link(c)),
-            #[cfg(test)]
-            Test(x) => Ok(Test(x)),
+            Self::Console(c, x) => x.map(|x| Backend::Console(c, x)),
+            Self::Web(c, x) => Ok(Backend::Web(c, x)),
+            Self::Local(l) => Ok(Backend::Local(l)),
         }
     }
 }

-pub struct ComputeCredentials<T> {
-    pub info: ComputeUserInfo,
-    pub keys: T,
+pub(crate) struct ComputeCredentials {
+    pub(crate) info: ComputeUserInfo,
+    pub(crate) keys: ComputeCredentialKeys,
 }

 #[derive(Debug, Clone)]
-pub struct ComputeUserInfoNoEndpoint {
-    pub user: RoleName,
-    pub options: NeonOptions,
+pub(crate) struct ComputeUserInfoNoEndpoint {
+    pub(crate) user: RoleName,
+    pub(crate) options: NeonOptions,
 }

 #[derive(Debug, Clone)]
-pub struct ComputeUserInfo {
-    pub endpoint: EndpointId,
-    pub user: RoleName,
-    pub options: NeonOptions,
+pub(crate) struct ComputeUserInfo {
+    pub(crate) endpoint: EndpointId,
+    pub(crate) user: RoleName,
+    pub(crate) options: NeonOptions,
 }

 impl ComputeUserInfo {
-    pub fn endpoint_cache_key(&self) -> EndpointCacheKey {
+    pub(crate) fn endpoint_cache_key(&self) -> EndpointCacheKey {
         self.options.get_cache_key(&self.endpoint)
     }
 }

-pub enum ComputeCredentialKeys {
-    #[cfg(feature = "testing")]
+pub(crate) enum ComputeCredentialKeys {
     Password(Vec<u8>),
     AuthKeys(AuthKeys),
+    None,
 }

 impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
@@ -171,47 +187,159 @@ impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
     }
 }

+#[derive(PartialEq, PartialOrd, Hash, Eq, Ord, Debug, Copy, Clone)]
+pub struct MaskedIp(IpAddr);
+
+impl MaskedIp {
+    fn new(value: IpAddr, prefix: u8) -> Self {
+        match value {
+            IpAddr::V4(v4) => Self(IpAddr::V4(
+                Ipv4Net::new(v4, prefix).map_or(v4, |x| x.trunc().addr()),
+            )),
+            IpAddr::V6(v6) => Self(IpAddr::V6(
+                Ipv6Net::new(v6, prefix).map_or(v6, |x| x.trunc().addr()),
+            )),
+        }
+    }
+}
+
+// This can't be just per IP because that would limit some PaaS that share IP addresses
+pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, MaskedIp)>;
+
+impl RateBucketInfo {
+    /// All of these are per endpoint-maskedip pair.
+    /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus).
+    ///
+    /// First bucket: 1000mcpus total per endpoint-ip pair
+    /// * 4096000 requests per second with 1 hash round.
+    /// * 1000 requests per second with 4096 hash rounds.
+    /// * 6.8 requests per second with 600000 hash rounds.
+    pub const DEFAULT_AUTH_SET: [Self; 3] = [
+        Self::new(1000 * 4096, Duration::from_secs(1)),
+        Self::new(600 * 4096, Duration::from_secs(60)),
+        Self::new(300 * 4096, Duration::from_secs(600)),
+    ];
+}
+
+impl AuthenticationConfig {
+    pub(crate) fn check_rate_limit(
+        &self,
+        ctx: &RequestMonitoring,
+        config: &AuthenticationConfig,
+        secret: AuthSecret,
+        endpoint: &EndpointId,
+        is_cleartext: bool,
+    ) -> auth::Result<AuthSecret> {
+        // we have validated the endpoint exists, so let's intern it.
+        let endpoint_int = EndpointIdInt::from(endpoint.normalize());
+
+        // only count the full hash count if password hack or websocket flow.
+        // in other words, if proxy needs to run the hashing
+        let password_weight = if is_cleartext {
+            match &secret {
+                #[cfg(any(test, feature = "testing"))]
+                AuthSecret::Md5(_) => 1,
+                AuthSecret::Scram(s) => s.iterations + 1,
+            }
+        } else {
+            // validating scram takes just 1 hmac_sha_256 operation.
+            1
+        };
+
+        let limit_not_exceeded = self.rate_limiter.check(
+            (
+                endpoint_int,
+                MaskedIp::new(ctx.peer_addr(), config.rate_limit_ip_subnet),
+            ),
+            password_weight,
+        );
+
+        if !limit_not_exceeded {
+            warn!(
+                enabled = self.rate_limiter_enabled,
+                "rate limiting authentication"
+            );
+            Metrics::get().proxy.requests_auth_rate_limits_total.inc();
+            Metrics::get()
+                .proxy
+                .endpoints_auth_rate_limits
+                .get_metric()
+                .measure(endpoint);
+
+            if self.rate_limiter_enabled {
+                return Err(auth::AuthError::too_many_connections());
+            }
+        }
+
+        Ok(secret)
+    }
+}
+
 /// True to its name, this function encapsulates our current auth trade-offs.
 /// Here, we choose the appropriate auth flow based on circumstances.
 ///
 /// All authentication flows will emit an AuthenticationOk message if successful.
 async fn auth_quirks(
-    ctx: &mut RequestMonitoring,
+    ctx: &RequestMonitoring,
     api: &impl console::Api,
     user_info: ComputeUserInfoMaybeEndpoint,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     allow_cleartext: bool,
     config: &'static AuthenticationConfig,
-) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
+) -> auth::Result<ComputeCredentials> {
     // If there's no project so far, that entails that client doesn't
     // support SNI or other means of passing the endpoint (project) name.
     // We now expect to see a very specific payload in the place of password.
     let (info, unauthenticated_password) = match user_info.try_into() {
         Err(info) => {
-            let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer)
-                .await?;
-            ctx.set_endpoint_id(Some(res.info.endpoint.clone()));
-            (res.info, Some(res.keys))
+            let res = hacks::password_hack_no_authentication(ctx, info, client).await?;
+
+            ctx.set_endpoint_id(res.info.endpoint.clone());
+            let password = match res.keys {
+                ComputeCredentialKeys::Password(p) => p,
+                ComputeCredentialKeys::AuthKeys(_) | ComputeCredentialKeys::None => {
+                    unreachable!("password hack should return a password")
+                }
+            };
+            (res.info, Some(password))
         }
         Ok(info) => (info, None),
     };

     info!("fetching user's authentication info");
-    let allowed_ips = api.get_allowed_ips(ctx, &info).await?;
+    let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?;
     // check allowed list
-    if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
-        return Err(auth::AuthError::ip_address_not_allowed());
+    if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) {
+        return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
     }
-    let cached_secret = api.get_role_secret(ctx, &info).await?;
-    let secret = cached_secret.value.clone().unwrap_or_else(|| {
+    if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) {
+        return Err(AuthError::too_many_connections());
+    }
+    let cached_secret = match maybe_secret {
+        Some(secret) => secret,
+        None => api.get_role_secret(ctx, &info).await?,
+    };
+    let (cached_entry, secret) = cached_secret.take_value();
+
+    let secret = if let Some(secret) = secret {
+        config.check_rate_limit(
+            ctx,
+            config,
+            secret,
+            &info.endpoint,
+            unauthenticated_password.is_some() || allow_cleartext,
+        )?
+    } else {
         // If we don't have an authentication secret, we mock one to
         // prevent malicious probing (possible due to missing protocol steps).
         // This mocked secret will never lead to successful authentication.
info!("authentication info not found, mocking it"); - AuthSecret::Scram(scram::ServerSecret::mock(&info.user, rand::random())) - }); + AuthSecret::Scram(scram::ServerSecret::mock(rand::random())) + }; + match authenticate_with_secret( ctx, secret, @@ -227,7 +355,7 @@ async fn auth_quirks( Err(e) => { if e.is_auth_failed() { // The password could have been changed, so we invalidate the cache. - cached_secret.invalidate(); + cached_entry.invalidate(); } Err(e) } @@ -235,16 +363,19 @@ async fn auth_quirks( } async fn authenticate_with_secret( - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, secret: AuthSecret, info: ComputeUserInfo, client: &mut stream::PqStream>, unauthenticated_password: Option>, allow_cleartext: bool, config: &'static AuthenticationConfig, -) -> auth::Result> { +) -> auth::Result { if let Some(password) = unauthenticated_password { - let auth_outcome = validate_password_and_exchange(&password, secret)?; + let ep = EndpointIdInt::from(&info.endpoint); + + let auth_outcome = + validate_password_and_exchange(&config.thread_pool, ep, &password, secret).await?; let keys = match auth_outcome { crate::sasl::Outcome::Success(key) => key, crate::sasl::Outcome::Failure(reason) => { @@ -264,119 +395,64 @@ async fn authenticate_with_secret( // Perform cleartext auth if we're allowed to do that. // Currently, we use it for websocket connections (latency). if allow_cleartext { - return hacks::authenticate_cleartext(info, client, &mut ctx.latency_timer, secret).await; + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); + return hacks::authenticate_cleartext(ctx, info, client, secret, config).await; } // Finally, proceed with the main auth flow (SCRAM-based). - classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await + classic::authenticate(ctx, info, client, config, secret).await } -/// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache) -/// only if authentication was successfuly. -async fn auth_and_wake_compute( - ctx: &mut RequestMonitoring, - api: &impl console::Api, - user_info: ComputeUserInfoMaybeEndpoint, - client: &mut stream::PqStream>, - allow_cleartext: bool, - config: &'static AuthenticationConfig, -) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> { - let compute_credentials = - auth_quirks(ctx, api, user_info, client, allow_cleartext, config).await?; - - let mut num_retries = 0; - let mut node = loop { - let wake_res = api.wake_compute(ctx, &compute_credentials.info).await; - match handle_try_wake(wake_res, num_retries) { - Err(e) => { - error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); - return Err(e.into()); - } - Ok(ControlFlow::Continue(e)) => { - warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); - } - Ok(ControlFlow::Break(n)) => break n, - } - - let wait_duration = retry_after(num_retries); - num_retries += 1; - tokio::time::sleep(wait_duration).await; - }; - - ctx.set_project(node.aux.clone()); - - match compute_credentials.keys { - #[cfg(feature = "testing")] - ComputeCredentialKeys::Password(password) => node.config.password(password), - ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys), - }; - - Ok((node, compute_credentials.info)) -} - -impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { - /// Get compute endpoint name from the credentials. 
- pub fn get_endpoint(&self) -> Option { - use BackendType::*; - - match self { - Console(_, user_info) => user_info.endpoint_id.clone(), - Link(_) => Some("link".into()), - #[cfg(test)] - Test(_) => Some("test".into()), - } - } - +impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { /// Get username from the credentials. - pub fn get_user(&self) -> &str { - use BackendType::*; - + pub(crate) fn get_user(&self) -> &str { match self { - Console(_, user_info) => &user_info.user, - Link(_) => "link", - #[cfg(test)] - Test(_) => "test", + Self::Console(_, user_info) => &user_info.user, + Self::Web(_, ()) => "web", + Self::Local(_) => "local", } } /// Authenticate the client via the requested backend, possibly using credentials. #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] - pub async fn authenticate( + pub(crate) async fn authenticate( self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, - ) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> { - use BackendType::*; - + endpoint_rate_limiter: Arc, + ) -> auth::Result> { let res = match self { - Console(api, user_info) => { + Self::Console(api, user_info) => { info!( user = &*user_info.user, project = user_info.endpoint(), "performing authentication using the console" ); - let (cache_info, user_info) = - auth_and_wake_compute(ctx, &*api, user_info, client, allow_cleartext, config) - .await?; - (cache_info, BackendType::Console(api, user_info)) + let credentials = auth_quirks( + ctx, + &*api, + user_info, + client, + allow_cleartext, + config, + endpoint_rate_limiter, + ) + .await?; + Backend::Console(api, credentials) } // NOTE: this auth backend doesn't use client credentials. - Link(url) => { - info!("performing link authentication"); + Self::Web(url, ()) => { + info!("performing web authentication"); - let node_info = link::authenticate(&url, client).await?; + let info = web::authenticate(ctx, &url, client).await?; - ( - CachedNodeInfo::new_uncached(node_info), - BackendType::Link(url), - ) + Backend::Web(url, info) } - #[cfg(test)] - Test(_) => { - unreachable!("this function should never be called in the test backend") + Self::Local(_) => { + return Err(auth::AuthError::bad_auth_method("invalid for local proxy")) } }; @@ -385,33 +461,385 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { } } -impl BackendType<'_, ComputeUserInfo> { - pub async fn get_allowed_ips( +impl Backend<'_, ComputeUserInfo, &()> { + pub(crate) async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, - ) -> Result { - use BackendType::*; + ctx: &RequestMonitoring, + ) -> Result { match self { - Console(api, user_info) => api.get_allowed_ips(ctx, user_info).await, - Link(_) => Ok(Cached::new_uncached(Arc::new(vec![]))), - #[cfg(test)] - Test(x) => Ok(Cached::new_uncached(Arc::new(x.get_allowed_ips()?))), + Self::Console(api, user_info) => api.get_role_secret(ctx, user_info).await, + Self::Web(_, ()) => Ok(Cached::new_uncached(None)), + Self::Local(_) => Ok(Cached::new_uncached(None)), } } - /// When applicable, wake the compute node, gaining its connection info in the process. - /// The link auth flow doesn't support this, so we return [`None`] in that case. 
- pub async fn wake_compute( + pub(crate) async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, - ) -> Result, console::errors::WakeComputeError> { - use BackendType::*; - + ctx: &RequestMonitoring, + ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { match self { - Console(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await, - Link(_) => Ok(None), - #[cfg(test)] - Test(x) => x.wake_compute().map(Some), + Self::Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Self::Web(_, ()) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), + Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), } } } + +#[async_trait::async_trait] +impl ComputeConnectBackend for Backend<'_, ComputeCredentials, NodeInfo> { + async fn wake_compute( + &self, + ctx: &RequestMonitoring, + ) -> Result { + match self { + Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await, + Self::Web(_, info) => Ok(Cached::new_uncached(info.clone())), + Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), + } + } + + fn get_keys(&self) -> &ComputeCredentialKeys { + match self { + Self::Console(_, creds) => &creds.keys, + Self::Web(_, _) => &ComputeCredentialKeys::None, + Self::Local(_) => &ComputeCredentialKeys::None, + } + } +} + +#[async_trait::async_trait] +impl ComputeConnectBackend for Backend<'_, ComputeCredentials, &()> { + async fn wake_compute( + &self, + ctx: &RequestMonitoring, + ) -> Result { + match self { + Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await, + Self::Web(_, ()) => { + unreachable!("web auth flow doesn't support waking the compute") + } + Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), + } + } + + fn get_keys(&self) -> &ComputeCredentialKeys { + match self { + Self::Console(_, creds) => &creds.keys, + Self::Web(_, ()) => &ComputeCredentialKeys::None, + Self::Local(_) => &ComputeCredentialKeys::None, + } + } +} + +#[cfg(test)] +mod tests { + use std::{net::IpAddr, sync::Arc, time::Duration}; + + use bytes::BytesMut; + use fallible_iterator::FallibleIterator; + use once_cell::sync::Lazy; + use postgres_protocol::{ + authentication::sasl::{ChannelBinding, ScramSha256}, + message::{backend::Message as PgMessage, frontend}, + }; + use provider::AuthSecret; + use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; + + use crate::{ + auth::{backend::MaskedIp, ComputeUserInfoMaybeEndpoint, IpPattern}, + config::AuthenticationConfig, + console::{ + self, + provider::{self, CachedAllowedIps, CachedRoleSecret}, + CachedNodeInfo, + }, + context::RequestMonitoring, + proxy::NeonOptions, + rate_limiter::{EndpointRateLimiter, RateBucketInfo}, + scram::{threadpool::ThreadPool, ServerSecret}, + stream::{PqStream, Stream}, + }; + + use super::{auth_quirks, AuthRateLimiter}; + + struct Auth { + ips: Vec, + secret: AuthSecret, + } + + impl console::Api for Auth { + async fn get_role_secret( + &self, + _ctx: &RequestMonitoring, + _user_info: &super::ComputeUserInfo, + ) -> Result { + Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone()))) + } + + async fn get_allowed_ips_and_secret( + &self, + _ctx: &RequestMonitoring, + _user_info: &super::ComputeUserInfo, + ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError> + { + Ok(( + CachedAllowedIps::new_uncached(Arc::new(self.ips.clone())), + Some(CachedRoleSecret::new_uncached(Some(self.secret.clone()))), + )) + } + + async fn wake_compute( + &self, + _ctx: &RequestMonitoring, + 
_user_info: &super::ComputeUserInfo, + ) -> Result { + unimplemented!() + } + } + + static CONFIG: Lazy = Lazy::new(|| AuthenticationConfig { + thread_pool: ThreadPool::new(1), + scram_protocol_timeout: std::time::Duration::from_secs(5), + rate_limiter_enabled: true, + rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), + rate_limit_ip_subnet: 64, + }); + + async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { + loop { + r.read_buf(&mut *b).await.unwrap(); + if let Some(m) = PgMessage::parse(&mut *b).unwrap() { + break m; + } + } + } + + #[test] + fn masked_ip() { + let ip_a = IpAddr::V4([127, 0, 0, 1].into()); + let ip_b = IpAddr::V4([127, 0, 0, 2].into()); + let ip_c = IpAddr::V4([192, 168, 1, 101].into()); + let ip_d = IpAddr::V4([192, 168, 1, 102].into()); + let ip_e = IpAddr::V6("abcd:abcd:abcd:abcd:abcd:abcd:abcd:abcd".parse().unwrap()); + let ip_f = IpAddr::V6("abcd:abcd:abcd:abcd:1234:abcd:abcd:abcd".parse().unwrap()); + + assert_ne!(MaskedIp::new(ip_a, 64), MaskedIp::new(ip_b, 64)); + assert_ne!(MaskedIp::new(ip_a, 32), MaskedIp::new(ip_b, 32)); + assert_eq!(MaskedIp::new(ip_a, 30), MaskedIp::new(ip_b, 30)); + assert_eq!(MaskedIp::new(ip_c, 30), MaskedIp::new(ip_d, 30)); + + assert_ne!(MaskedIp::new(ip_e, 128), MaskedIp::new(ip_f, 128)); + assert_eq!(MaskedIp::new(ip_e, 64), MaskedIp::new(ip_f, 64)); + } + + #[test] + fn test_default_auth_rate_limit_set() { + // these values used to exceed u32::MAX + assert_eq!( + RateBucketInfo::DEFAULT_AUTH_SET, + [ + RateBucketInfo { + interval: Duration::from_secs(1), + max_rpi: 1000 * 4096, + }, + RateBucketInfo { + interval: Duration::from_secs(60), + max_rpi: 600 * 4096 * 60, + }, + RateBucketInfo { + interval: Duration::from_secs(600), + max_rpi: 300 * 4096 * 600, + } + ] + ); + + for x in RateBucketInfo::DEFAULT_AUTH_SET { + let y = x.to_string().parse().unwrap(); + assert_eq!(x, y); + } + } + + #[tokio::test] + async fn auth_quirks_scram() { + let (mut client, server) = tokio::io::duplex(1024); + let mut stream = PqStream::new(Stream::from_raw(server)); + + let ctx = RequestMonitoring::test(); + let api = Auth { + ips: vec![], + secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), + }; + + let user_info = ComputeUserInfoMaybeEndpoint { + user: "conrad".into(), + endpoint_id: Some("endpoint".into()), + options: NeonOptions::default(), + }; + + let handle = tokio::spawn(async move { + let mut scram = ScramSha256::new(b"my-secret-password", ChannelBinding::unsupported()); + + let mut read = BytesMut::new(); + + // server should offer scram + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationSasl(a) => { + let options: Vec<&str> = a.mechanisms().collect().unwrap(); + assert_eq!(options, ["SCRAM-SHA-256"]); + } + _ => panic!("wrong message"), + } + + // client sends client-first-message + let mut write = BytesMut::new(); + frontend::sasl_initial_response("SCRAM-SHA-256", scram.message(), &mut write).unwrap(); + client.write_all(&write).await.unwrap(); + + // server response with server-first-message + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationSaslContinue(a) => { + scram.update(a.data()).await.unwrap(); + } + _ => panic!("wrong message"), + } + + // client response with client-final-message + write.clear(); + frontend::sasl_response(scram.message(), &mut write).unwrap(); + client.write_all(&write).await.unwrap(); + + // server response with server-final-message + match read_message(&mut client, 
&mut read).await { + PgMessage::AuthenticationSaslFinal(a) => { + scram.finish(a.data()).unwrap(); + } + _ => panic!("wrong message"), + } + }); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + EndpointRateLimiter::DEFAULT, + 64, + )); + + let _creds = auth_quirks( + &ctx, + &api, + user_info, + &mut stream, + false, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); + + handle.await.unwrap(); + } + + #[tokio::test] + async fn auth_quirks_cleartext() { + let (mut client, server) = tokio::io::duplex(1024); + let mut stream = PqStream::new(Stream::from_raw(server)); + + let ctx = RequestMonitoring::test(); + let api = Auth { + ips: vec![], + secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), + }; + + let user_info = ComputeUserInfoMaybeEndpoint { + user: "conrad".into(), + endpoint_id: Some("endpoint".into()), + options: NeonOptions::default(), + }; + + let handle = tokio::spawn(async move { + let mut read = BytesMut::new(); + let mut write = BytesMut::new(); + + // server should offer cleartext + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationCleartextPassword => {} + _ => panic!("wrong message"), + } + + // client responds with password + write.clear(); + frontend::password_message(b"my-secret-password", &mut write).unwrap(); + client.write_all(&write).await.unwrap(); + }); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + EndpointRateLimiter::DEFAULT, + 64, + )); + + let _creds = auth_quirks( + &ctx, + &api, + user_info, + &mut stream, + true, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); + + handle.await.unwrap(); + } + + #[tokio::test] + async fn auth_quirks_password_hack() { + let (mut client, server) = tokio::io::duplex(1024); + let mut stream = PqStream::new(Stream::from_raw(server)); + + let ctx = RequestMonitoring::test(); + let api = Auth { + ips: vec![], + secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), + }; + + let user_info = ComputeUserInfoMaybeEndpoint { + user: "conrad".into(), + endpoint_id: None, + options: NeonOptions::default(), + }; + + let handle = tokio::spawn(async move { + let mut read = BytesMut::new(); + + // server should offer cleartext + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationCleartextPassword => {} + _ => panic!("wrong message"), + } + + // client responds with password + let mut write = BytesMut::new(); + frontend::password_message(b"endpoint=my-endpoint;my-secret-password", &mut write) + .unwrap(); + client.write_all(&write).await.unwrap(); + }); + + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + EndpointRateLimiter::DEFAULT, + 64, + )); + + let creds = auth_quirks( + &ctx, + &api, + user_info, + &mut stream, + true, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); + + assert_eq!(creds.info.endpoint, "my-endpoint"); + + handle.await.unwrap(); + } +} diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 358b335b88..285fa29428 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -4,7 +4,7 @@ use crate::{ compute, config::AuthenticationConfig, console::AuthSecret, - metrics::LatencyTimer, + context::RequestMonitoring, sasl, stream::{PqStream, Stream}, }; @@ -12,28 +12,26 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; pub(super) async fn authenticate( + ctx: &RequestMonitoring, creds: ComputeUserInfo, 
 client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     config: &'static AuthenticationConfig,
-    latency_timer: &mut LatencyTimer,
     secret: AuthSecret,
-) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
+) -> auth::Result<ComputeCredentials> {
     let flow = AuthFlow::new(client);
     let scram_keys = match secret {
-        #[cfg(feature = "testing")]
+        #[cfg(any(test, feature = "testing"))]
         AuthSecret::Md5(_) => {
             info!("auth endpoint chooses MD5");
             return Err(auth::AuthError::bad_auth_method("MD5"));
         }
         AuthSecret::Scram(secret) => {
             info!("auth endpoint chooses SCRAM");
-            let scram = auth::Scram(&secret);
+            let scram = auth::Scram(&secret, ctx);
 
             let auth_outcome = tokio::time::timeout(
                 config.scram_protocol_timeout,
                 async {
-                    // pause the timer while we communicate with the client
-                    let _paused = latency_timer.pause();
                     flow.begin(scram).await.map_err(|error| {
                         warn!(?error, "error sending scram acknowledgement");
@@ -45,9 +43,9 @@ pub(super) async fn authenticate(
                 }
             )
             .await
-            .map_err(|error| {
-                warn!("error processing scram messages error = authentication timed out, execution time exeeded {} seconds", config.scram_protocol_timeout.as_secs());
-                auth::io::Error::new(auth::io::ErrorKind::TimedOut, error)
+            .map_err(|e| {
+                warn!("error processing scram messages error = authentication timed out, execution time exceeded {} seconds", config.scram_protocol_timeout.as_secs());
+                auth::AuthError::user_timeout(e)
             })??;
 
             let client_key = match auth_outcome {
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
index b6c1a92d3c..e9019ce2cf 100644
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -3,8 +3,10 @@ use super::{
 };
 use crate::{
     auth::{self, AuthFlow},
+    config::AuthenticationConfig,
     console::AuthSecret,
-    metrics::LatencyTimer,
+    context::RequestMonitoring,
+    intern::EndpointIdInt,
     sasl,
     stream::{self, Stream},
 };
@@ -15,22 +17,33 @@ use tracing::{info, warn};
 /// one round trip and *expensive* computations (>= 4096 HMAC iterations).
 /// These properties are beneficial for serverless JS workers, so we
 /// use this mechanism for websocket connections.
-pub async fn authenticate_cleartext(
+pub(crate) async fn authenticate_cleartext(
+    ctx: &RequestMonitoring,
     info: ComputeUserInfo,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
-    latency_timer: &mut LatencyTimer,
     secret: AuthSecret,
-) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
+    config: &'static AuthenticationConfig,
+) -> auth::Result<ComputeCredentials> {
     warn!("cleartext auth flow override is enabled, proceeding");
+    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
 
     // pause the timer while we communicate with the client
-    let _paused = latency_timer.pause();
+    let paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
 
-    let auth_outcome = AuthFlow::new(client)
-        .begin(auth::CleartextPassword(secret))
-        .await?
-        .authenticate()
+    let ep = EndpointIdInt::from(&info.endpoint);
+
+    let auth_flow = AuthFlow::new(client)
+        .begin(auth::CleartextPassword {
+            secret,
+            endpoint: ep,
+            pool: config.thread_pool.clone(),
+        })
         .await?;
+    drop(paused);
+    // cleartext auth is only allowed to the ws/http protocol.
+    // If we're here, we already received the password in the first message.
+    // Scram protocol will be executed on the proxy side.
+    let auth_outcome = auth_flow.authenticate().await?;
 
     let keys = match auth_outcome {
         sasl::Outcome::Success(key) => key,
@@ -46,15 +59,16 @@ pub async fn authenticate_cleartext(
 /// Workaround for clients which don't provide an endpoint (project) name.
/// Similar to [`authenticate_cleartext`], but there's a specific password format, /// and passwords are not yet validated (we don't know how to validate them!) -pub async fn password_hack_no_authentication( +pub(crate) async fn password_hack_no_authentication( + ctx: &RequestMonitoring, info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, - latency_timer: &mut LatencyTimer, -) -> auth::Result>> { +) -> auth::Result { warn!("project not specified, resorting to the password hack auth flow"); + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let _paused = latency_timer.pause(); + let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let payload = AuthFlow::new(client) .begin(auth::PasswordHack) @@ -71,6 +85,6 @@ pub async fn password_hack_no_authentication( options: info.options, endpoint: payload.endpoint, }, - keys: payload.password, + keys: ComputeCredentialKeys::Password(payload.password), }) } diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs new file mode 100644 index 0000000000..1f44e4af5d --- /dev/null +++ b/proxy/src/auth/backend/jwt.rs @@ -0,0 +1,725 @@ +use std::{ + future::Future, + sync::Arc, + time::{Duration, SystemTime}, +}; + +use anyhow::{bail, ensure, Context}; +use arc_swap::ArcSwapOption; +use dashmap::DashMap; +use jose_jwk::crypto::KeyInfo; +use serde::{Deserialize, Deserializer}; +use signature::Verifier; +use tokio::time::Instant; + +use crate::{context::RequestMonitoring, http::parse_json_body_with_limit, EndpointId, RoleName}; + +// TODO(conrad): make these configurable. +const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30); +const MIN_RENEW: Duration = Duration::from_secs(30); +const AUTO_RENEW: Duration = Duration::from_secs(300); +const MAX_RENEW: Duration = Duration::from_secs(3600); +const MAX_JWK_BODY_SIZE: usize = 64 * 1024; + +/// How to get the JWT auth rules +pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static { + fn fetch_auth_rules( + &self, + role_name: RoleName, + ) -> impl Future>> + Send; +} + +pub(crate) struct AuthRule { + pub(crate) id: String, + pub(crate) jwks_url: url::Url, + pub(crate) audience: Option, +} + +#[derive(Default)] +pub(crate) struct JwkCache { + client: reqwest::Client, + + map: DashMap<(EndpointId, RoleName), Arc>, +} + +pub(crate) struct JwkCacheEntry { + /// Should refetch at least every hour to verify when old keys have been removed. + /// Should refetch when new key IDs are seen only every 5 minutes or so + last_retrieved: Instant, + + /// cplane will return multiple JWKs urls that we need to scrape. 
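    // To summarize the refetch cadence implemented below (a sketch of the
    // constants defined above, not new behaviour): an entry younger than
    // AUTO_RENEW (5 minutes) is served as-is; between AUTO_RENEW and
    // MAX_RENEW (1 hour) it is served while a background task refreshes it;
    // past MAX_RENEW the caller blocks on a refetch. MIN_RENEW (30 seconds)
    // only matters for unknown key ids, bounding how often a client can force
    // a synchronous JWKS fetch:
    //
    //   age < AUTO_RENEW              => serve cached keys
    //   AUTO_RENEW < age < MAX_RENEW  => serve cached keys, renew in background
    //   age > MAX_RENEW               => await renew_jwks() before validating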
+ key_sets: ahash::HashMap, +} + +impl JwkCacheEntry { + fn find_jwk_and_audience(&self, key_id: &str) -> Option<(&jose_jwk::Jwk, Option<&str>)> { + self.key_sets.values().find_map(|key_set| { + key_set + .find_key(key_id) + .map(|jwk| (jwk, key_set.audience.as_deref())) + }) + } +} + +struct KeySet { + jwks: jose_jwk::JwkSet, + audience: Option, +} + +impl KeySet { + fn find_key(&self, key_id: &str) -> Option<&jose_jwk::Jwk> { + self.jwks + .keys + .iter() + .find(|jwk| jwk.prm.kid.as_deref() == Some(key_id)) + } +} + +pub(crate) struct JwkCacheEntryLock { + cached: ArcSwapOption, + lookup: tokio::sync::Semaphore, +} + +impl Default for JwkCacheEntryLock { + fn default() -> Self { + JwkCacheEntryLock { + cached: ArcSwapOption::empty(), + lookup: tokio::sync::Semaphore::new(1), + } + } +} + +impl JwkCacheEntryLock { + async fn acquire_permit<'a>(self: &'a Arc) -> JwkRenewalPermit<'a> { + JwkRenewalPermit::acquire_permit(self).await + } + + fn try_acquire_permit<'a>(self: &'a Arc) -> Option> { + JwkRenewalPermit::try_acquire_permit(self) + } + + async fn renew_jwks( + &self, + _permit: JwkRenewalPermit<'_>, + client: &reqwest::Client, + role_name: RoleName, + auth_rules: &F, + ) -> anyhow::Result> { + // double check that no one beat us to updating the cache. + let now = Instant::now(); + let guard = self.cached.load_full(); + if let Some(cached) = guard { + let last_update = now.duration_since(cached.last_retrieved); + if last_update < Duration::from_secs(300) { + return Ok(cached); + } + } + + let rules = auth_rules.fetch_auth_rules(role_name).await?; + let mut key_sets = + ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new()); + // TODO(conrad): run concurrently + // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284) + for rule in rules { + let req = client.get(rule.jwks_url.clone()); + // TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`. + // TODO(conrad): We need to filter out URLs that point to local resources. Public internet only. + match req.send().await.and_then(|r| r.error_for_status()) { + // todo: should we re-insert JWKs if we want to keep this JWKs URL? + // I expect these failures would be quite sparse. + Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"), + Ok(r) => { + let resp: http::Response = r.into(); + match parse_json_body_with_limit::( + resp.into_body(), + MAX_JWK_BODY_SIZE, + ) + .await + { + Err(e) => { + tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs"); + } + Ok(jwks) => { + key_sets.insert( + rule.id, + KeySet { + jwks, + audience: rule.audience, + }, + ); + } + } + } + } + } + + let entry = Arc::new(JwkCacheEntry { + last_retrieved: now, + key_sets, + }); + self.cached.swap(Some(Arc::clone(&entry))); + + Ok(entry) + } + + async fn get_or_update_jwk_cache( + self: &Arc, + ctx: &RequestMonitoring, + client: &reqwest::Client, + role_name: RoleName, + fetch: &F, + ) -> Result, anyhow::Error> { + let now = Instant::now(); + let guard = self.cached.load_full(); + + // if we have no cached JWKs, try and get some + let Some(cached) = guard else { + let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); + let permit = self.acquire_permit().await; + return self.renew_jwks(permit, client, role_name, fetch).await; + }; + + let last_update = now.duration_since(cached.last_retrieved); + + // check if the cached JWKs need updating. 
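        // The renewal path follows a classic double-checked pattern:
        // `renew_jwks` re-reads the ArcSwap cache after winning the semaphore
        // permit, so losers of the race reuse the winner's freshly fetched
        // keys instead of refetching. A minimal sketch of that pattern in
        // isolation (names and the FRESH bound are illustrative):
        //
        //   let permit = lookup.acquire().await;          // at most one fetcher
        //   if let Some(c) = cached.load_full() {
        //       if c.last_retrieved.elapsed() < FRESH {
        //           return Ok(c);                         // someone beat us to it
        //       }
        //   }
        //   cached.swap(Some(Arc::new(fetch().await?)));  // fetch(): assumed helper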
+        if last_update > MAX_RENEW {
+            let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
+            let permit = self.acquire_permit().await;
+
+            // it's been too long since we checked the keys. wait for them to update.
+            return self.renew_jwks(permit, client, role_name, fetch).await;
+        }
+
+        // every 5 minutes we should spawn a job to eagerly update the token.
+        if last_update > AUTO_RENEW {
+            if let Some(permit) = self.try_acquire_permit() {
+                tracing::debug!("JWKs should be renewed. Renewal permit acquired");
+                let permit = permit.into_owned();
+                let entry = self.clone();
+                let client = client.clone();
+                let fetch = fetch.clone();
+                tokio::spawn(async move {
+                    if let Err(e) = entry.renew_jwks(permit, &client, role_name, &fetch).await {
+                        tracing::warn!(error=?e, "could not fetch JWKs in background job");
+                    }
+                });
+            } else {
+                tracing::debug!("JWKs should be renewed. Renewal permit already taken, skipping");
+            }
+        }
+
+        Ok(cached)
+    }
+
+    async fn check_jwt<F: FetchAuthRules>(
+        self: &Arc<Self>,
+        ctx: &RequestMonitoring,
+        jwt: &str,
+        client: &reqwest::Client,
+        role_name: RoleName,
+        fetch: &F,
+    ) -> Result<(), anyhow::Error> {
+        // JWT compact form is defined to be
+        // <B64(Header)> || . || <B64(Payload)> || . || <B64(Signature)>
+        // where Signature = alg(<B64(Header)> || . || <B64(Payload)>);
+
+        let (header_payload, signature) = jwt
+            .rsplit_once('.')
+            .context("Provided authentication token is not a valid JWT encoding")?;
+        let (header, payload) = header_payload
+            .split_once('.')
+            .context("Provided authentication token is not a valid JWT encoding")?;
+
+        let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
+            .context("Provided authentication token is not a valid JWT encoding")?;
+        let header = serde_json::from_slice::<JwtHeader<'_>>(&header)
+            .context("Provided authentication token is not a valid JWT encoding")?;
+
+        let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
+            .context("Provided authentication token is not a valid JWT encoding")?;
+
+        ensure!(header.typ == "JWT");
+        let kid = header.key_id.context("missing key id")?;
+
+        let mut guard = self
+            .get_or_update_jwk_cache(ctx, client, role_name.clone(), fetch)
+            .await?;
+
+        // get the key from the JWKs if possible. If not, wait for the keys to update.
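        // The lookup loop below retries at most once, in effect: a miss
        // against a cache younger than MIN_RENEW fails fast, while a miss
        // against an older cache forces one synchronous refetch to pick up
        // newly rotated keys. As a hedged paraphrase (not additional logic):
        //
        //   loop {
        //       if let Some(hit) = guard.find_jwk_and_audience(kid) { break hit; }
        //       if guard.last_retrieved.elapsed() <= MIN_RENEW { bail!("jwk not found"); }
        //       guard = /* renew_jwks(...).await? */;
        //   }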
+ let (jwk, expected_audience) = loop { + match guard.find_jwk_and_audience(kid) { + Some(jwk) => break jwk, + None if guard.last_retrieved.elapsed() > MIN_RENEW => { + let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); + + let permit = self.acquire_permit().await; + guard = self + .renew_jwks(permit, client, role_name.clone(), fetch) + .await?; + } + _ => { + bail!("jwk not found"); + } + } + }; + + ensure!( + jwk.is_supported(&header.algorithm), + "signature algorithm not supported" + ); + + match &jwk.key { + jose_jwk::Key::Ec(key) => { + verify_ec_signature(header_payload.as_bytes(), &sig, key)?; + } + jose_jwk::Key::Rsa(key) => { + verify_rsa_signature(header_payload.as_bytes(), &sig, key, &jwk.prm.alg)?; + } + key => bail!("unsupported key type {key:?}"), + }; + + let payload = base64::decode_config(payload, base64::URL_SAFE_NO_PAD) + .context("Provided authentication token is not a valid JWT encoding")?; + let payload = serde_json::from_slice::>(&payload) + .context("Provided authentication token is not a valid JWT encoding")?; + + tracing::debug!(?payload, "JWT signature valid with claims"); + + match (expected_audience, payload.audience) { + // check the audience matches + (Some(aud1), Some(aud2)) => ensure!(aud1 == aud2, "invalid JWT token audience"), + // the audience is expected but is missing + (Some(_), None) => bail!("invalid JWT token audience"), + // we don't care for the audience field + (None, _) => {} + } + + let now = SystemTime::now(); + + if let Some(exp) = payload.expiration { + ensure!(now < exp + CLOCK_SKEW_LEEWAY); + } + + if let Some(nbf) = payload.not_before { + ensure!(nbf < now + CLOCK_SKEW_LEEWAY); + } + + Ok(()) + } +} + +impl JwkCache { + pub(crate) async fn check_jwt( + &self, + ctx: &RequestMonitoring, + endpoint: EndpointId, + role_name: RoleName, + fetch: &F, + jwt: &str, + ) -> Result<(), anyhow::Error> { + // try with just a read lock first + let key = (endpoint, role_name.clone()); + let entry = self.map.get(&key).as_deref().map(Arc::clone); + let entry = entry.unwrap_or_else(|| { + // acquire a write lock after to insert. 
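        // `DashMap::get` only takes a shard read lock, while `entry` takes a
        // write lock, so the code below tries the cheap read path first and
        // falls back to the write path only on a miss. The same upgrade
        // pattern in isolation (assuming `map: DashMap<Key, Arc<Entry>>`; the
        // race is benign because `or_default` inserts one shared entry):
        //
        //   let entry = map.get(&key).as_deref().map(Arc::clone).unwrap_or_else(|| {
        //       Arc::clone(&*map.entry(key).or_default())
        //   });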
+            let entry = self.map.entry(key).or_default();
+            Arc::clone(&*entry)
+        });
+
+        entry
+            .check_jwt(ctx, jwt, &self.client, role_name, fetch)
+            .await
+    }
+}
+
+fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> anyhow::Result<()> {
+    use ecdsa::Signature;
+    use signature::Verifier;
+
+    match key.crv {
+        jose_jwk::EcCurves::P256 => {
+            let pk =
+                p256::PublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid P256 key"))?;
+            let key = p256::ecdsa::VerifyingKey::from(&pk);
+            let sig = Signature::from_slice(sig)?;
+            key.verify(data, &sig)?;
+        }
+        key => bail!("unsupported ec key type {key:?}"),
+    }
+
+    Ok(())
+}
+
+fn verify_rsa_signature(
+    data: &[u8],
+    sig: &[u8],
+    key: &jose_jwk::Rsa,
+    alg: &Option<jose_jwa::Algorithm>,
+) -> anyhow::Result<()> {
+    use jose_jwa::{Algorithm, Signing};
+    use rsa::{
+        pkcs1v15::{Signature, VerifyingKey},
+        RsaPublicKey,
+    };
+
+    let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?;
+
+    match alg {
+        Some(Algorithm::Signing(Signing::Rs256)) => {
+            let key = VerifyingKey::<sha2::Sha256>::new(key);
+            let sig = Signature::try_from(sig)?;
+            key.verify(data, &sig)?;
+        }
+        _ => bail!("invalid RSA signing algorithm"),
+    };
+
+    Ok(())
+}
+
+/// 
+#[derive(serde::Deserialize, serde::Serialize)]
+struct JwtHeader<'a> {
+    /// must be "JWT"
+    #[serde(rename = "typ")]
+    typ: &'a str,
+    /// must be a supported alg
+    #[serde(rename = "alg")]
+    algorithm: jose_jwa::Algorithm,
+    /// key id, must be provided for our usecase
+    #[serde(rename = "kid")]
+    key_id: Option<&'a str>,
+}
+
+/// 
+#[derive(serde::Deserialize, serde::Serialize, Debug)]
+struct JwtPayload<'a> {
+    /// Audience - Recipient for which the JWT is intended
+    #[serde(rename = "aud")]
+    audience: Option<&'a str>,
+    /// Expiration - Time after which the JWT expires
+    #[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)]
+    expiration: Option<SystemTime>,
+    /// Not before - Time before which the JWT must not be accepted
+    #[serde(deserialize_with = "numeric_date_opt", rename = "nbf", default)]
+    not_before: Option<SystemTime>,
+
+    // the following entries are only extracted for the sake of debug logging.
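    // For illustration, a token body that deserializes into this struct could
    // look like the following (field names per the serde renames above; all
    // values hypothetical):
    //
    //   {
    //     "aud": "my-audience",
    //     "exp": 1718000000,
    //     "nbf": 1717990000,
    //     "iss": "https://issuer.example.com",
    //     "sub": "user-1234",
    //     "jti": "4c9a2f60",
    //     "sid": "session-42"
    //   }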
+ /// Issuer of the JWT + #[serde(rename = "iss")] + issuer: Option<&'a str>, + /// Subject of the JWT (the user) + #[serde(rename = "sub")] + subject: Option<&'a str>, + /// Unique token identifier + #[serde(rename = "jti")] + jwt_id: Option<&'a str>, + /// Unique session identifier + #[serde(rename = "sid")] + session_id: Option<&'a str>, +} + +fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result, D::Error> { + let d = >::deserialize(d)?; + Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n))) +} + +struct JwkRenewalPermit<'a> { + inner: Option>, +} + +enum JwkRenewalPermitInner<'a> { + Owned(Arc), + Borrowed(&'a Arc), +} + +impl JwkRenewalPermit<'_> { + fn into_owned(mut self) -> JwkRenewalPermit<'static> { + JwkRenewalPermit { + inner: self.inner.take().map(JwkRenewalPermitInner::into_owned), + } + } + + async fn acquire_permit(from: &Arc) -> JwkRenewalPermit<'_> { + match from.lookup.acquire().await { + Ok(permit) => { + permit.forget(); + JwkRenewalPermit { + inner: Some(JwkRenewalPermitInner::Borrowed(from)), + } + } + Err(_) => panic!("semaphore should not be closed"), + } + } + + fn try_acquire_permit(from: &Arc) -> Option> { + match from.lookup.try_acquire() { + Ok(permit) => { + permit.forget(); + Some(JwkRenewalPermit { + inner: Some(JwkRenewalPermitInner::Borrowed(from)), + }) + } + Err(tokio::sync::TryAcquireError::NoPermits) => None, + Err(tokio::sync::TryAcquireError::Closed) => panic!("semaphore should not be closed"), + } + } +} + +impl JwkRenewalPermitInner<'_> { + fn into_owned(self) -> JwkRenewalPermitInner<'static> { + match self { + JwkRenewalPermitInner::Owned(p) => JwkRenewalPermitInner::Owned(p), + JwkRenewalPermitInner::Borrowed(p) => JwkRenewalPermitInner::Owned(Arc::clone(p)), + } + } +} + +impl Drop for JwkRenewalPermit<'_> { + fn drop(&mut self) { + let entry = match &self.inner { + None => return, + Some(JwkRenewalPermitInner::Owned(p)) => p, + Some(JwkRenewalPermitInner::Borrowed(p)) => *p, + }; + entry.lookup.add_permits(1); + } +} + +#[cfg(test)] +mod tests { + use crate::RoleName; + + use super::*; + + use std::{future::IntoFuture, net::SocketAddr, time::SystemTime}; + + use base64::URL_SAFE_NO_PAD; + use bytes::Bytes; + use http::Response; + use http_body_util::Full; + use hyper1::service::service_fn; + use hyper_util::rt::TokioIo; + use rand::rngs::OsRng; + use rsa::pkcs8::DecodePrivateKey; + use signature::Signer; + use tokio::net::TcpListener; + + fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) { + let sk = p256::SecretKey::random(&mut OsRng); + let pk = sk.public_key().into(); + let jwk = jose_jwk::Jwk { + key: jose_jwk::Key::Ec(pk), + prm: jose_jwk::Parameters { + kid: Some(kid), + alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)), + ..Default::default() + }, + }; + (sk, jwk) + } + + fn new_rsa_jwk(key: &str, kid: String) -> (rsa::RsaPrivateKey, jose_jwk::Jwk) { + let sk = rsa::RsaPrivateKey::from_pkcs8_pem(key).unwrap(); + let pk = sk.to_public_key().into(); + let jwk = jose_jwk::Jwk { + key: jose_jwk::Key::Rsa(pk), + prm: jose_jwk::Parameters { + kid: Some(kid), + alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)), + ..Default::default() + }, + }; + (sk, jwk) + } + + fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String { + let header = JwtHeader { + typ: "JWT", + algorithm: jose_jwa::Algorithm::Signing(sig), + key_id: Some(&kid), + }; + let body = typed_json::json! 
{{ + "exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600, + }}; + + let header = + base64::encode_config(serde_json::to_string(&header).unwrap(), URL_SAFE_NO_PAD); + let body = base64::encode_config(body.to_string(), URL_SAFE_NO_PAD); + + format!("{header}.{body}") + } + + fn new_ec_jwt(kid: String, key: p256::SecretKey) -> String { + use p256::ecdsa::{Signature, SigningKey}; + + let payload = build_jwt_payload(kid, jose_jwa::Signing::Es256); + let sig: Signature = SigningKey::from(key).sign(payload.as_bytes()); + let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD); + + format!("{payload}.{sig}") + } + + fn new_rsa_jwt(kid: String, key: rsa::RsaPrivateKey) -> String { + use rsa::pkcs1v15::SigningKey; + use rsa::signature::SignatureEncoding; + + let payload = build_jwt_payload(kid, jose_jwa::Signing::Rs256); + let sig = SigningKey::::new(key).sign(payload.as_bytes()); + let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD); + + format!("{payload}.{sig}") + } + + // RSA key gen is slow.... + const RS1: &str = "-----BEGIN PRIVATE KEY----- +MIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQDNuWBIWTlo+54Y +aifpGInIrpv6LlsbI/2/2CC81Arlx4RsABORklgA9XSGwaCbHTshHsfd1S916JwA +SpjyPQYWfqo6iAV8a4MhjIeJIkRr74prDCSzOGZvIc6VaGeCIb9clf3HSrPHm3hA +cfLMB8/p5MgoxERPDOIn3XYoS9SEEuP7l0LkmEZMerg6W6lDjQRDny0Lb50Jky9X +mDqnYXBhs99ranbwL5vjy0ba6OIeCWFJme5u+rv5C/P0BOYrJfGxIcEoKa8Ukw5s +PlM+qrz9ope1eOuXMNNdyFDReNBUyaM1AwBAayU5rz57crer7K/UIofaJ42T4cMM +nx/SWfBNAgMBAAECggEACqdpBxYn1PoC6/zDaFzu9celKEWyTiuE/qRwvZa1ocS9 +ZOJ0IPvVNud/S2NHsADJiSOQ8joSJScQvSsf1Ju4bv3MTw+wSQtAVUJz2nQ92uEi +5/xPAkEPfP3hNvebNLAOuvrBk8qYmOPCTIQaMNrOt6wzeXkAmJ9wLuRXNCsJLHW+ +KLpf2WdgTYxqK06ZiJERFgJ2r1MsC2IgTydzjOAdEIrtMarerTLqqCpwFrk/l0cz +1O2OAb17ZxmhuzMhjNMin81c8F2fZAGMeOjn92Jl5kUsYw/pG+0S8QKlbveR/fdP +We2tJsgXw2zD0q7OJpp8NXS2yddrZGyysYsof983wQKBgQD2McqNJqo+eWL5zony +UbL19loYw0M15EjhzIuzW1Jk0rPj65yQyzpJ6pqicRuWr34MvzCx+ZHM2b3jSiNu +GES2fnC7xLIKyeRxfqsXF71xz+6UStEGRQX27r1YWEtyQVuBhvlqB+AGWP3PYAC+ +HecZecnZ+vcihJ2K3+l5O3paVQKBgQDV6vKH5h2SY9vgO8obx0P7XSS+djHhmPuU +f8C/Fq6AuRbIA1g04pzuLU2WS9T26eIjgM173uVNg2TuqJveWzz+CAAp6nCR6l24 +DBg49lMGCWrMo4FqPG46QkUqvK8uSj42GkX/e5Rut1Gyu0209emeM6h2d2K15SvY +9563tYSmGQKBgQDwcH5WTi20KA7e07TroJi8GKWzS3gneNUpGQBS4VxdtV4UuXXF +/4TkzafJ/9cm2iurvUmMd6XKP9lw0mY5zp/E70WgTCBp4vUlVsU3H2tYbO+filYL +3ntNx6nKTykX4/a/UJfj0t8as+zli+gNxNx/h+734V9dKdFG4Rl+2fTLpQKBgQCE +qJkTEe+Q0wCOBEYICADupwqcWqwAXWDW7IrZdfVtulqYWwqecVIkmk+dPxWosc4d +ekjz4nyNH0i+gC15LVebqdaAJ/T7aD4KXuW+nXNLMRfcJCGjgipRUruWD0EMEdqW +rqBuGXMpXeH6VxGPgVkJVLvKC6tZZe9VM+pnvteuMQKBgQC8GaL+Lz+al4biyZBf +JE8ekWrIotq/gfUBLP7x70+PB9bNtXtlgmTvjgYg4jiu3KR/ZIYYQ8vfVgkb6tDI +rWGZw86Pzuoi1ppg/pYhKk9qrmCIT4HPEXbHl7ATahu2BOCIU3hybjTh2lB6LbX9 +8LMFlz1QPqSZYN/A/kOcLBfa3A== +-----END PRIVATE KEY----- +"; + const RS2: &str = "-----BEGIN PRIVATE KEY----- +MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDipm6FIKSRab3J +HwmK18t7hp+pohllxIDUSPi7S5mIhN/JG2Plq2Lp746E/fuT8dcBF2R4sJlG2L0J +zmxOvBU/i/sQF9s1i4CEfg05k2//gKENIEsF3pMMmrH+mcZi0TTD6rezHpdVxPHk +qWxSyOCtIJV29X+wxPwAB59kQFHzy2ooPB1isZcpE8tO0KthAM+oZ3KuCwE0++cO +IWLeq9aPwyKhtip/xjTMxd1kzdKh592mGSyzr9D0QSWOYFGvgJXANDdiPdhSSOLt +ECWPNPlm2FQvGGvYYBafUqz7VumKHE6x8J6lKdYa2J0ZdDzCIo2IHzlxe+RZNgwy +uAD2jhVxAgMBAAECggEAbsZHWBu3MzcKQiVARbLoygvnN0J5xUqAaMDtiKUPejDv +K1yOu67DXnDuKEP2VL2rhuYG/hHaKE1AP227c9PrUq6424m9YvM2sgrlrdFIuQkG +LeMtp8W7+zoUasp/ssZrUqICfLIj5xCl5UuFHQT/Ar7dLlIYwa3VOLKBDb9+Dnfe +QH5/So4uMXG6vw34JN9jf+eAc8Yt0PeIz62ycvRwdpTJQ0MxZN9ZKpCAQp+VTuXT 
+zlzNvDMilabEdqUvAyGyz8lBLNl0wdaVrqPqAEWM5U45QXsdFZknWammP7/tijeX +0z+Bi0J0uSEU5X502zm7GArj/NNIiWMcjmDjwUUhwQKBgQD9C2GoqxOxuVPYqwYR ++Jz7f2qMjlSP8adA5Lzuh8UKXDp8JCEQC8ryweLzaOKS9C5MAw+W4W2wd4nJoQI1 +P1dgGvBlfvEeRHMgqWtq7FuTsjSe7e0uSEkC4ngDb4sc0QOpv15cMuEz+4+aFLPL +x29EcHWAaBX+rkid3zpQHFU4eQKBgQDlTCEqRuXwwa3V+Sq+mNWzD9QIGtD87TH/ +FPO/Ij/cK2+GISgFDqhetiGTH4qrvPL0psPT+iH5zGFYcoFmTtwLdWQJdxhxz0bg +iX/AceyX5e1Bm+ThT36sU83NrxKPkrdk6jNmr2iUF1OTzTwUKOYdHOPZqdMPfF4M +4XAaWVT2uQKBgQD4nKcNdU+7LE9Rr+4d1/o8Klp/0BMK/ayK2HE7lc8kt6qKb2DA +iCWUTqPw7Fq3cQrPia5WWhNP7pJEtFkcAaiR9sW7onW5fBz0uR+dhK0QtmR2xWJj +N4fsOp8ZGQ0/eae0rh1CTobucLkM9EwV6VLLlgYL67e4anlUCo8bSEr+WQKBgQCB +uf6RgqcY/RqyklPCnYlZ0zyskS9nyXKd1GbK3j+u+swP4LZZlh9f5j88k33LCA2U +qLzmMwAB6cWxWqcnELqhqPq9+ClWSmTZKDGk2U936NfAZMirSGRsbsVi9wfTPriP +WYlXMSpDjqb0WgsBhNob4npubQxCGKTFOM5Jufy90QKBgB0Lte1jX144uaXx6dtB +rjXNuWNir0Jy31wHnQuCA+XnfUgPcrKmRLm8taMbXgZwxkNvgFkpUWU8aPEK08Ne +X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL +5JiconnI5aLek0QVPoFaVXFa +-----END PRIVATE KEY----- +"; + + #[tokio::test] + async fn renew() { + let (rs1, jwk1) = new_rsa_jwk(RS1, "1".into()); + let (rs2, jwk2) = new_rsa_jwk(RS2, "2".into()); + let (ec1, jwk3) = new_ec_jwk("3".into()); + let (ec2, jwk4) = new_ec_jwk("4".into()); + + let jwt1 = new_rsa_jwt("1".into(), rs1); + let jwt2 = new_rsa_jwt("2".into(), rs2); + let jwt3 = new_ec_jwt("3".into(), ec1); + let jwt4 = new_ec_jwt("4".into(), ec2); + + let foo_jwks = jose_jwk::JwkSet { + keys: vec![jwk1, jwk3], + }; + let bar_jwks = jose_jwk::JwkSet { + keys: vec![jwk2, jwk4], + }; + + let service = service_fn(move |req| { + let foo_jwks = foo_jwks.clone(); + let bar_jwks = bar_jwks.clone(); + async move { + let jwks = match req.uri().path() { + "/foo" => &foo_jwks, + "/bar" => &bar_jwks, + _ => { + return Response::builder() + .status(404) + .body(Full::new(Bytes::new())); + } + }; + let body = serde_json::to_vec(jwks).unwrap(); + Response::builder() + .status(200) + .body(Full::new(Bytes::from(body))) + } + }); + + let listener = TcpListener::bind("0.0.0.0:0").await.unwrap(); + let server = hyper1::server::conn::http1::Builder::new(); + let addr = listener.local_addr().unwrap(); + tokio::spawn(async move { + loop { + let (s, _) = listener.accept().await.unwrap(); + let serve = server.serve_connection(TokioIo::new(s), service.clone()); + tokio::spawn(serve.into_future()); + } + }); + + let client = reqwest::Client::new(); + + #[derive(Clone)] + struct Fetch(SocketAddr); + + impl FetchAuthRules for Fetch { + async fn fetch_auth_rules( + &self, + _role_name: RoleName, + ) -> anyhow::Result> { + Ok(vec![ + AuthRule { + id: "foo".to_owned(), + jwks_url: format!("http://{}/foo", self.0).parse().unwrap(), + audience: None, + }, + AuthRule { + id: "bar".to_owned(), + jwks_url: format!("http://{}/bar", self.0).parse().unwrap(), + audience: None, + }, + ]) + } + } + + let role_name = RoleName::from("user"); + + let jwk_cache = Arc::new(JwkCacheEntryLock::default()); + + for token in [jwt1, jwt2, jwt3, jwt4] { + jwk_cache + .check_jwt( + &RequestMonitoring::test(), + &token, + &client, + role_name.clone(), + &Fetch(addr), + ) + .await + .unwrap(); + } + } +} diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs new file mode 100644 index 0000000000..8124f568cf --- /dev/null +++ b/proxy/src/auth/backend/local.rs @@ -0,0 +1,77 @@ +use std::{collections::HashMap, net::SocketAddr}; + +use anyhow::Context; +use arc_swap::ArcSwapOption; + +use crate::{ + compute::ConnCfg, + console::{ + 
messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}, + NodeInfo, + }, + intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag}, + RoleName, +}; + +use super::jwt::{AuthRule, FetchAuthRules, JwkCache}; + +pub struct LocalBackend { + pub(crate) jwks_cache: JwkCache, + pub(crate) node_info: NodeInfo, +} + +impl LocalBackend { + pub fn new(postgres_addr: SocketAddr) -> Self { + LocalBackend { + jwks_cache: JwkCache::default(), + node_info: NodeInfo { + config: { + let mut cfg = ConnCfg::new(); + cfg.host(&postgres_addr.ip().to_string()); + cfg.port(postgres_addr.port()); + cfg + }, + // TODO(conrad): make this better reflect compute info rather than endpoint info. + aux: MetricsAuxInfo { + endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"), + project_id: ProjectIdTag::get_interner().get_or_intern("local"), + branch_id: BranchIdTag::get_interner().get_or_intern("local"), + cold_start_info: ColdStartInfo::WarmCached, + }, + allow_self_signed_compute: false, + }, + } + } +} + +#[derive(Clone, Copy)] +pub(crate) struct StaticAuthRules; + +pub static JWKS_ROLE_MAP: ArcSwapOption = ArcSwapOption::const_empty(); + +#[derive(Debug, Clone)] +pub struct JwksRoleSettings { + pub roles: HashMap, + pub project_id: ProjectIdInt, + pub branch_id: BranchIdInt, +} + +impl FetchAuthRules for StaticAuthRules { + async fn fetch_auth_rules(&self, role_name: RoleName) -> anyhow::Result> { + let mappings = JWKS_ROLE_MAP.load(); + let role_mappings = mappings + .as_deref() + .and_then(|m| m.roles.get(&role_name)) + .context("JWKs settings for this role were not configured")?; + let mut rules = vec![]; + for setting in &role_mappings.jwks { + rules.push(AuthRule { + id: setting.id.clone(), + jwks_url: setting.jwks_url.clone(), + audience: setting.jwt_audience.clone(), + }); + } + + Ok(rules) + } +} diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/web.rs similarity index 76% rename from proxy/src/auth/backend/link.rs rename to proxy/src/auth/backend/web.rs index a7ddd257b3..58a4bef62e 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/web.rs @@ -1,7 +1,8 @@ use crate::{ auth, compute, console::{self, provider::NodeInfo}, - error::UserFacingError, + context::RequestMonitoring, + error::{ReportableError, UserFacingError}, stream::PqStream, waiters, }; @@ -12,11 +13,7 @@ use tokio_postgres::config::SslMode; use tracing::{info, info_span}; #[derive(Debug, Error)] -pub enum LinkAuthError { - /// Authentication error reported by the console. 
- #[error("Authentication failed: {0}")] - AuthFailed(String), - +pub(crate) enum WebAuthError { #[error(transparent)] WaiterRegister(#[from] waiters::RegisterError), @@ -27,12 +24,18 @@ pub enum LinkAuthError { Io(#[from] std::io::Error), } -impl UserFacingError for LinkAuthError { +impl UserFacingError for WebAuthError { fn to_string_client(&self) -> String { - use LinkAuthError::*; + "Internal error".to_string() + } +} + +impl ReportableError for WebAuthError { + fn get_error_kind(&self) -> crate::error::ErrorKind { match self { - AuthFailed(_) => self.to_string(), - _ => "Internal error".to_string(), + Self::WaiterRegister(_) => crate::error::ErrorKind::Service, + Self::WaiterWait(_) => crate::error::ErrorKind::Service, + Self::Io(_) => crate::error::ErrorKind::ClientDisconnect, } } } @@ -49,14 +52,17 @@ fn hello_message(redirect_uri: &reqwest::Url, session_id: &str) -> String { ) } -pub fn new_psql_session_id() -> String { +pub(crate) fn new_psql_session_id() -> String { hex::encode(rand::random::<[u8; 8]>()) } pub(super) async fn authenticate( + ctx: &RequestMonitoring, link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result { + ctx.set_auth_method(crate::context::AuthMethod::Web); + // registering waiter can fail if we get unlucky with rng. // just try again. let (psql_session_id, waiter) = loop { @@ -68,7 +74,7 @@ pub(super) async fn authenticate( } }; - let span = info_span!("link", psql_session_id = &psql_session_id); + let span = info_span!("web", psql_session_id = &psql_session_id); let greeting = hello_message(link_uri, &psql_session_id); // Give user a URL to spawn a new database. @@ -81,7 +87,7 @@ pub(super) async fn authenticate( // Wait for web console response (see `mgmt`). info!(parent: &span, "waiting for console's reply..."); - let db_info = waiter.await.map_err(LinkAuthError::from)?; + let db_info = waiter.await.map_err(WebAuthError::from)?; client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; @@ -94,6 +100,11 @@ pub(super) async fn authenticate( .dbname(&db_info.dbname) .user(&db_info.user); + ctx.set_dbname(db_info.dbname.into()); + ctx.set_user(db_info.user.into()); + ctx.set_project(db_info.aux.clone()); + info!("woken up a compute node"); + // Backwards compatibility. pg_sni_proxy uses "--" in domain names // while direct connections do not. Once we migrate to pg_sni_proxy // everywhere, we can remove this. diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index bdb79f2517..0e91ae570a 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,18 +1,22 @@ //! User credentials used in authentication. 
use crate::{ - auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError, - metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, EndpointId, RoleName, + auth::password_hack::parse_endpoint_param, + context::RequestMonitoring, + error::{ReportableError, UserFacingError}, + metrics::{Metrics, SniKind}, + proxy::NeonOptions, + serverless::SERVERLESS_DRIVER_SNI, + EndpointId, RoleName, }; use itertools::Itertools; use pq_proto::StartupMessageParams; -use smol_str::SmolStr; use std::{collections::HashSet, net::IpAddr, str::FromStr}; use thiserror::Error; use tracing::{info, warn}; #[derive(Debug, Error, PartialEq, Eq, Clone)] -pub enum ComputeUserInfoParseError { +pub(crate) enum ComputeUserInfoParseError { #[error("Parameter '{0}' is missing in startup packet.")] MissingKey(&'static str), @@ -38,26 +42,32 @@ pub enum ComputeUserInfoParseError { impl UserFacingError for ComputeUserInfoParseError {} +impl ReportableError for ComputeUserInfoParseError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + crate::error::ErrorKind::User + } +} + /// Various client credentials which we use for authentication. /// Note that we don't store any kind of client key or password here. #[derive(Debug, Clone, PartialEq, Eq)] -pub struct ComputeUserInfoMaybeEndpoint { - pub user: RoleName, - pub endpoint_id: Option, - pub options: NeonOptions, +pub(crate) struct ComputeUserInfoMaybeEndpoint { + pub(crate) user: RoleName, + pub(crate) endpoint_id: Option, + pub(crate) options: NeonOptions, } impl ComputeUserInfoMaybeEndpoint { #[inline] - pub fn endpoint(&self) -> Option<&str> { + pub(crate) fn endpoint(&self) -> Option<&str> { self.endpoint_id.as_deref() } } -pub fn endpoint_sni<'a>( - sni: &'a str, +pub(crate) fn endpoint_sni( + sni: &str, common_names: &HashSet, -) -> Result<&'a str, ComputeUserInfoParseError> { +) -> Result, ComputeUserInfoParseError> { let Some((subdomain, common_name)) = sni.split_once('.') else { return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() }); }; @@ -66,27 +76,27 @@ pub fn endpoint_sni<'a>( cn: common_name.into(), }); } - Ok(subdomain) + if subdomain == SERVERLESS_DRIVER_SNI { + return Ok(None); + } + Ok(Some(EndpointId::from(subdomain))) } impl ComputeUserInfoMaybeEndpoint { - pub fn parse( - ctx: &mut RequestMonitoring, + pub(crate) fn parse( + ctx: &RequestMonitoring, params: &StartupMessageParams, sni: Option<&str>, common_names: Option<&HashSet>, ) -> Result { - use ComputeUserInfoParseError::*; - // Some parameters are stored in the startup message. - let get_param = |key| params.get(key).ok_or(MissingKey(key)); + let get_param = |key| { + params + .get(key) + .ok_or(ComputeUserInfoParseError::MissingKey(key)) + }; let user: RoleName = get_param("user")?.into(); - // record the values if we have them - ctx.set_application(params.get("application_name").map(SmolStr::from)); - ctx.set_user(user.clone()); - ctx.set_endpoint_id(sni.map(EndpointId::from)); - // Project name might be passed via PG's command-line options. let endpoint_option = params .options_raw() @@ -103,7 +113,7 @@ impl ComputeUserInfoMaybeEndpoint { let endpoint_from_domain = if let Some(sni_str) = sni { if let Some(cn) = common_names { - Some(EndpointId::from(endpoint_sni(sni_str, cn)?)) + endpoint_sni(sni_str, cn)? } else { None } @@ -114,31 +124,42 @@ impl ComputeUserInfoMaybeEndpoint { let endpoint = match (endpoint_option, endpoint_from_domain) { // Invariant: if we have both project name variants, they should match. 
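            // Concretely (values are illustrative): SNI "foo.localhost"
            // against common name "localhost" resolves to endpoint "foo", and
            // `options=endpoint=foo` resolves to the same; if both sources are
            // present and disagree, e.g. SNI "first.localhost" combined with
            // `options=project=second`, parsing fails with
            // `InconsistentProjectNames { domain: "first", option: "second" }`,
            // as exercised by the tests further down.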
(Some(option), Some(domain)) if option != domain => { - Some(Err(InconsistentProjectNames { domain, option })) + Some(Err(ComputeUserInfoParseError::InconsistentProjectNames { + domain, + option, + })) } // Invariant: project name may not contain certain characters. - (a, b) => a.or(b).map(|name| match project_name_valid(&name) { - false => Err(MalformedProjectName(name)), - true => Ok(name), + (a, b) => a.or(b).map(|name| { + if project_name_valid(name.as_ref()) { + Ok(name) + } else { + Err(ComputeUserInfoParseError::MalformedProjectName(name)) + } }), } .transpose()?; - info!(%user, project = endpoint.as_deref(), "credentials"); + if let Some(ep) = &endpoint { + ctx.set_endpoint_id(ep.clone()); + } + + let metrics = Metrics::get(); + info!(%user, "credentials"); if sni.is_some() { info!("Connection with sni"); - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["sni"]) - .inc(); + metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni); } else if endpoint.is_some() { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["no_sni"]) - .inc(); + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::NoSni); info!("Connection without sni"); } else { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["password_hack"]) - .inc(); + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::PasswordHack); info!("Connection with password hack"); } @@ -146,18 +167,18 @@ impl ComputeUserInfoMaybeEndpoint { Ok(Self { user, - endpoint_id: endpoint.map(EndpointId::from), + endpoint_id: endpoint, options, }) } } -pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &[IpPattern]) -> bool { +pub(crate) fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &[IpPattern]) -> bool { ip_list.is_empty() || ip_list.iter().any(|pattern| check_ip(peer_addr, pattern)) } #[derive(Debug, Clone, Eq, PartialEq)] -pub enum IpPattern { +pub(crate) enum IpPattern { Subnet(ipnet::IpNet), Range(IpAddr, IpAddr), Single(IpAddr), @@ -173,7 +194,7 @@ impl<'de> serde::de::Deserialize<'de> for IpPattern { impl<'de> serde::de::Visitor<'de> for StrVisitor { type Value = IpPattern; - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(formatter, "comma separated list with ip address, ip address range, or ip address subnet mask") } @@ -236,8 +257,8 @@ mod tests { fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. 
let options = StartupMessageParams::new([("user", "john_doe")]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id, None); @@ -251,8 +272,8 @@ mod tests { ("database", "world"), // should be ignored ("foo", "bar"), // should be ignored ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id, None); @@ -266,9 +287,9 @@ mod tests { let sni = Some("foo.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("foo")); assert_eq!(user_info.options.get_cache_key("foo"), "foo"); @@ -283,8 +304,8 @@ mod tests { ("options", "-ckey=1 project=bar -c geqo=off"), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); @@ -298,8 +319,8 @@ mod tests { ("options", "-ckey=1 endpoint=bar -c geqo=off"), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); @@ -316,8 +337,8 @@ mod tests { ), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert!(user_info.endpoint_id.is_none()); @@ -331,8 +352,8 @@ mod tests { ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"), ]); - let mut ctx = RequestMonitoring::test(); - let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + let ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert!(user_info.endpoint_id.is_none()); @@ -346,9 +367,9 @@ mod tests { let sni = Some("baz.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.user, "john_doe"); 
assert_eq!(user_info.endpoint_id.as_deref(), Some("baz")); @@ -361,16 +382,16 @@ mod tests { let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.a.com"); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.b.com"); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); Ok(()) @@ -384,10 +405,9 @@ mod tests { let sni = Some("second.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); - let err = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref()) - .expect_err("should fail"); + let ctx = RequestMonitoring::test(); + let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .expect_err("should fail"); match err { InconsistentProjectNames { domain, option } => { assert_eq!(option, "first"); @@ -404,10 +424,9 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["example.com".into()].into()); - let mut ctx = RequestMonitoring::test(); - let err = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref()) - .expect_err("should fail"); + let ctx = RequestMonitoring::test(); + let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .expect_err("should fail"); match err { UnknownCommonName { cn } => { assert_eq!(cn, "localhost"); @@ -425,9 +444,9 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["localhost".into()].into()); - let mut ctx = RequestMonitoring::test(); + let ctx = RequestMonitoring::test(); let user_info = - ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("project")); assert_eq!( user_info.options.get_cache_key("project"), diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 3151a77263..f7e2b5296e 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -4,26 +4,33 @@ use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload}; use crate::{ config::TlsServerEndPoint, console::AuthSecret, - sasl, scram, + context::RequestMonitoring, + intern::EndpointIdInt, + sasl, + scram::{self, threadpool::ThreadPool}, stream::{PqStream, Stream}, }; +use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; -use std::io; +use std::{io, sync::Arc}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; /// Every authentication selector is supposed to implement this trait. -pub trait AuthMethod { +pub(crate) trait AuthMethod { /// Any authentication selector should provide initial backend message /// containing auth method name and parameters, e.g. md5 salt. 
fn first_message(&self, channel_binding: bool) -> BeMessage<'_>; } /// Initial state of [`AuthFlow`]. -pub struct Begin; +pub(crate) struct Begin; /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`]. -pub struct Scram<'a>(pub &'a scram::ServerSecret); +pub(crate) struct Scram<'a>( + pub(crate) &'a scram::ServerSecret, + pub(crate) &'a RequestMonitoring, +); impl AuthMethod for Scram<'_> { #[inline(always)] @@ -40,7 +47,7 @@ impl AuthMethod for Scram<'_> { /// Use an ad hoc auth flow (for clients which don't support SNI) proposed in /// . -pub struct PasswordHack; +pub(crate) struct PasswordHack; impl AuthMethod for PasswordHack { #[inline(always)] @@ -51,7 +58,11 @@ impl AuthMethod for PasswordHack { /// Use clear-text password auth called `password` in docs /// -pub struct CleartextPassword(pub AuthSecret); +pub(crate) struct CleartextPassword { + pub(crate) pool: Arc, + pub(crate) endpoint: EndpointIdInt, + pub(crate) secret: AuthSecret, +} impl AuthMethod for CleartextPassword { #[inline(always)] @@ -62,7 +73,7 @@ impl AuthMethod for CleartextPassword { /// This wrapper for [`PqStream`] performs client authentication. #[must_use] -pub struct AuthFlow<'a, S, State> { +pub(crate) struct AuthFlow<'a, S, State> { /// The underlying stream which implements libpq's protocol. stream: &'a mut PqStream>, /// State might contain ancillary data (see [`Self::begin`]). @@ -73,7 +84,7 @@ pub struct AuthFlow<'a, S, State> { /// Initial state of the stream wrapper. impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { /// Create a new wrapper for client authentication. - pub fn new(stream: &'a mut PqStream>) -> Self { + pub(crate) fn new(stream: &'a mut PqStream>) -> Self { let tls_server_end_point = stream.get_ref().tls_server_end_point(); Self { @@ -84,7 +95,7 @@ impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { } /// Move to the next step by sending auth method's name & params to client. - pub async fn begin(self, method: M) -> io::Result> { + pub(crate) async fn begin(self, method: M) -> io::Result> { self.stream .write_message(&method.first_message(self.tls_server_end_point.supported())) .await?; @@ -99,7 +110,7 @@ impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { impl AuthFlow<'_, S, PasswordHack> { /// Perform user authentication. Raise an error in case authentication failed. - pub async fn get_password(self) -> super::Result { + pub(crate) async fn get_password(self) -> super::Result { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) @@ -118,13 +129,19 @@ impl AuthFlow<'_, S, PasswordHack> { impl AuthFlow<'_, S, CleartextPassword> { /// Perform user authentication. Raise an error in case authentication failed. - pub async fn authenticate(self) -> super::Result> { + pub(crate) async fn authenticate(self) -> super::Result> { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; - let outcome = validate_password_and_exchange(password, self.state.0)?; + let outcome = validate_password_and_exchange( + &self.state.pool, + self.state.endpoint, + password, + self.state.secret, + ) + .await?; if let sasl::Outcome::Success(_) = &outcome { self.stream.write_message_noflush(&Be::AuthenticationOk)?; @@ -137,7 +154,12 @@ impl AuthFlow<'_, S, CleartextPassword> { /// Stream wrapper for handling [SCRAM](crate::scram) auth. impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. 
/// Raise an error in case authentication failed.
-    pub async fn authenticate(self) -> super::Result<sasl::Outcome<scram::ScramKey>> {
+    pub(crate) async fn authenticate(self) -> super::Result<sasl::Outcome<scram::ScramKey>> {
+        let Scram(secret, ctx) = self.state;
+
+        // pause the timer while we communicate with the client
+        let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
+
         // Initial client message contains the chosen auth method's name.
         let msg = self.stream.read_password_message().await?;
         let sasl = sasl::FirstMessage::parse(&msg)
@@ -148,9 +170,13 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
             return Err(super::AuthError::bad_auth_method(sasl.method));
         }
 
+        match sasl.method {
+            SCRAM_SHA_256 => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256),
+            SCRAM_SHA_256_PLUS => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus),
+            _ => {}
+        }
         info!("client chooses {}", sasl.method);
 
-        let secret = self.state.0;
         let outcome = sasl::SaslStream::new(self.stream, sasl.message)
             .authenticate(scram::Exchange::new(
                 secret,
@@ -167,12 +193,14 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
     }
 }
 
-pub(super) fn validate_password_and_exchange(
+pub(crate) async fn validate_password_and_exchange(
+    pool: &ThreadPool,
+    endpoint: EndpointIdInt,
     password: &[u8],
     secret: AuthSecret,
 ) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
     match secret {
-        #[cfg(feature = "testing")]
+        #[cfg(any(test, feature = "testing"))]
         AuthSecret::Md5(_) => {
             // test only
             Ok(sasl::Outcome::Success(ComputeCredentialKeys::Password(
@@ -181,13 +209,7 @@ pub(super) fn validate_password_and_exchange(
         }
         // perform scram authentication as both client and server to validate the keys
         AuthSecret::Scram(scram_secret) => {
-            use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
-            let sasl_client = ScramSha256::new(password, ChannelBinding::unsupported());
-            let outcome = crate::scram::exchange(
-                &scram_secret,
-                sasl_client,
-                crate::config::TlsServerEndPoint::Undefined,
-            )?;
+            let outcome = crate::scram::exchange(pool, endpoint, &scram_secret, password).await?;
 
             let client_key = match outcome {
                 sasl::Outcome::Success(client_key) => client_key,
diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs
index 2ddf46fe25..8585b8ff48 100644
--- a/proxy/src/auth/password_hack.rs
+++ b/proxy/src/auth/password_hack.rs
@@ -1,5 +1,5 @@
 //! Payload for ad hoc authentication method for clients that don't support SNI.
-//! See the `impl` for [`super::backend::BackendType`].
+//! See the `impl` for [`super::backend::Backend`].
 //! Read more: .
 //! UPDATE (Mon Aug 8 13:20:34 UTC 2022): the payload format has been simplified.
@@ -7,13 +7,13 @@
 use bstr::ByteSlice;
 
 use crate::EndpointId;
 
-pub struct PasswordHackPayload {
-    pub endpoint: EndpointId,
-    pub password: Vec<u8>,
+pub(crate) struct PasswordHackPayload {
+    pub(crate) endpoint: EndpointId,
+    pub(crate) password: Vec<u8>,
 }
 
 impl PasswordHackPayload {
-    pub fn parse(bytes: &[u8]) -> Option<Self> {
+    pub(crate) fn parse(bytes: &[u8]) -> Option<Self> {
         // The format is `project=<endpoint>;<password>` or `project=<endpoint>$<password>`.
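        // Worked example mirroring the `auth_quirks_password_hack` test
        // earlier in this patch: the password bytes
        // b"endpoint=my-endpoint;my-secret-password" split on the first
        // separator, yielding endpoint "my-endpoint" and password
        // b"my-secret-password"; the `project=` prefix is likewise accepted
        // (see `parse_endpoint_param` below).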
let separators = [";", "$"]; for sep in separators { @@ -30,7 +30,7 @@ impl PasswordHackPayload { } } -pub fn parse_endpoint_param(bytes: &str) -> Option<&str> { +pub(crate) fn parse_endpoint_param(bytes: &str) -> Option<&str> { bytes .strip_prefix("project=") .or_else(|| bytes.strip_prefix("endpoint=")) diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs new file mode 100644 index 0000000000..08effeff99 --- /dev/null +++ b/proxy/src/bin/local_proxy.rs @@ -0,0 +1,316 @@ +use std::{ + net::SocketAddr, + path::{Path, PathBuf}, + pin::pin, + sync::Arc, + time::Duration, +}; + +use anyhow::{bail, ensure}; +use dashmap::DashMap; +use futures::{future::Either, FutureExt}; +use proxy::{ + auth::backend::local::{JwksRoleSettings, LocalBackend, JWKS_ROLE_MAP}, + cancellation::CancellationHandlerMain, + config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}, + console::{locks::ApiLocks, messages::JwksRoleMapping}, + http::health_server::AppMetrics, + metrics::{Metrics, ThreadPoolMetrics}, + rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo}, + scram::threadpool::ThreadPool, + serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions}, +}; + +project_git_version!(GIT_VERSION); +project_build_tag!(BUILD_TAG); + +use clap::Parser; +use tokio::{net::TcpListener, task::JoinSet}; +use tokio_util::sync::CancellationToken; +use tracing::{error, info, warn}; +use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; + +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +/// Neon proxy/router +#[derive(Parser)] +#[command(version = GIT_VERSION, about)] +struct LocalProxyCliArgs { + /// listen for incoming metrics connections on ip:port + #[clap(long, default_value = "127.0.0.1:7001")] + metrics: String, + /// listen for incoming http connections on ip:port + #[clap(long)] + http: String, + /// timeout for the TLS handshake + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + handshake_timeout: tokio::time::Duration, + /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] + connect_compute_lock: String, + #[clap(flatten)] + sql_over_http: SqlOverHttpArgs, + /// User rate limiter max number of requests per second. + /// + /// Provided in the form `@`. + /// Can be given multiple times for different bucket sizes. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] + user_rps_limit: Vec, + /// Whether the auth rate limiter actually takes effect (for testing) + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + auth_rate_limit_enabled: bool, + /// Authentication rate limiter max number of hashes per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] + auth_rate_limit: Vec, + /// The IP subnet to use when considering whether two IP addresses are considered the same. 
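    // For example, with the default prefix length of 64, the IPv6 addresses
    // abcd:abcd:abcd:abcd:1111::1 and abcd:abcd:abcd:abcd:2222::1 mask to the
    // same /64 bucket and therefore share one auth rate limit, matching the
    // `masked_ip` unit test earlier in this patch (addresses illustrative).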
+ #[clap(long, default_value_t = 64)] + auth_rate_limit_ip_subnet: u8, + /// Whether to retry the connection to the compute node + #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] + connect_to_compute_retry: String, + /// Address of the postgres server + #[clap(long, default_value = "127.0.0.1:5432")] + compute: SocketAddr, + /// File address of the local proxy config file + #[clap(long, default_value = "./localproxy.json")] + config_path: PathBuf, +} + +#[derive(clap::Args, Clone, Copy, Debug)] +struct SqlOverHttpArgs { + /// How many connections to pool for each endpoint. Excess connections are discarded + #[clap(long, default_value_t = 200)] + sql_over_http_pool_max_total_conns: usize, + + /// How long pooled connections should remain idle for before closing + #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] + sql_over_http_idle_timeout: tokio::time::Duration, + + #[clap(long, default_value_t = 100)] + sql_over_http_client_conn_threshold: u64, + + #[clap(long, default_value_t = 16)] + sql_over_http_cancel_set_shards: usize, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let _logging_guard = proxy::logging::init().await?; + let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + + Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); + + info!("Version: {GIT_VERSION}"); + info!("Build_tag: {BUILD_TAG}"); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); + + let jemalloc = match proxy::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + None + } + }; + + let args = LocalProxyCliArgs::parse(); + let config = build_config(&args)?; + + let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?; + let http_listener = TcpListener::bind(args.http).await?; + let shutdown = CancellationToken::new(); + + // todo: should scale with CU + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + LeakyBucketConfig { + rps: 10.0, + max: 100.0, + }, + 16, + )); + + refresh_config(args.config_path.clone()).await; + + let mut maintenance_tasks = JoinSet::new(); + maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), move || { + refresh_config(args.config_path.clone()).map(Ok) + })); + maintenance_tasks.spawn(proxy::http::health_server::task_main( + metrics_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: proxy::metrics::Metrics::get(), + }, + )); + + let task = serverless::task_main( + config, + http_listener, + shutdown.clone(), + Arc::new(CancellationHandlerMain::new( + Arc::new(DashMap::new()), + None, + proxy::metrics::CancellationSource::Local, + )), + endpoint_rate_limiter, + ); + + match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await { + // exit immediately on maintenance task completion + Either::Left((Some(res), _)) => match proxy::flatten_err(res)? {}, + // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) + Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), + // exit immediately on client task error + Either::Right((res, _)) => res?, + } + + Ok(()) +} + +/// ProxyConfig is created at proxy startup, and lives forever. 
+fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.connect_compute_lock.parse()?; + info!( + ?limiter, + shards, + ?epoch, + "Using NodeLocks (connect_compute)" + ); + let connect_compute_locks = ApiLocks::new( + "connect_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().proxy.connect_compute_lock, + )?; + + let http_config = HttpConfig { + accept_websockets: false, + pool_options: GlobalConnPoolOptions { + gc_epoch: Duration::from_secs(60), + pool_shards: 2, + idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, + opt_in: false, + + max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_total_conns, + max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, + }, + cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), + client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, + }; + + Ok(Box::leak(Box::new(ProxyConfig { + tls_config: None, + auth_backend: proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned( + LocalBackend::new(args.compute), + )), + metric_collection: None, + allow_self_signed_compute: false, + http_config, + authentication_config: AuthenticationConfig { + thread_pool: ThreadPool::new(0), + scram_protocol_timeout: Duration::from_secs(10), + rate_limiter_enabled: false, + rate_limiter: BucketRateLimiter::new(vec![]), + rate_limit_ip_subnet: 64, + }, + require_client_ip: false, + handshake_timeout: Duration::from_secs(10), + region: "local".into(), + wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, + connect_compute_locks, + connect_to_compute_retry_config: RetryConfig::parse( + RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES, + )?, + }))) +} + +async fn refresh_config(path: PathBuf) { + match refresh_config_inner(&path).await { + Ok(()) => {} + Err(e) => { + error!(error=?e, ?path, "could not read config file"); + } + } +} + +async fn refresh_config_inner(path: &Path) -> anyhow::Result<()> { + let bytes = tokio::fs::read(&path).await?; + let mut data: JwksRoleMapping = serde_json::from_slice(&bytes)?; + + let mut settings = None; + + for mapping in data.roles.values_mut() { + for jwks in &mut mapping.jwks { + ensure!( + jwks.jwks_url.has_authority() + && (jwks.jwks_url.scheme() == "http" || jwks.jwks_url.scheme() == "https"), + "Invalid JWKS url. Must be HTTP", + ); + + ensure!( + jwks.jwks_url + .host() + .is_some_and(|h| h != url::Host::Domain("")), + "Invalid JWKS url. No domain listed", + ); + + // clear username, password and ports + jwks.jwks_url.set_username("").expect( + "url can be a base and has a valid host and is not a file. should not error", + ); + jwks.jwks_url.set_password(None).expect( + "url can be a base and has a valid host and is not a file. should not error", + ); + // local testing is hard if we need to have a specific restricted port + if cfg!(not(feature = "testing")) { + jwks.jwks_url.set_port(None).expect( + "url can be a base and has a valid host and is not a file. should not error", + ); + } + + // clear query params + jwks.jwks_url.set_fragment(None); + jwks.jwks_url.query_pairs_mut().clear().finish(); + + if jwks.jwks_url.scheme() != "https" { + // local testing is hard if we need to set up https support. 
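The `refresh_config_inner` hunk above sanitizes user-supplied JWKS URLs in several steps. As a hedged, self-contained sketch of the same idea using the `url` crate (`normalize_jwks_url` and the boolean `testing` flag are stand-ins for the `cfg!(feature = "testing")` gating above):

```rust
// Sketch only: mirror the sanitization above — require http(s), strip
// credentials, query and fragment, and force https outside test builds.
use anyhow::ensure;
use url::Url;

fn normalize_jwks_url(mut u: Url, testing: bool) -> anyhow::Result<Url> {
    ensure!(
        u.has_authority() && matches!(u.scheme(), "http" | "https"),
        "Invalid JWKS url. Must be HTTP"
    );
    let _ = u.set_username(""); // Err only for URLs that cannot carry credentials
    let _ = u.set_password(None);
    u.set_fragment(None);
    u.query_pairs_mut().clear().finish();
    if u.scheme() == "http" && !testing {
        // upgrading one special scheme (http) to another (https) cannot fail
        u.set_scheme("https").expect("http -> https is always valid");
    }
    Ok(u)
}
```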
+ if cfg!(not(feature = "testing")) { + jwks.jwks_url + .set_scheme("https") + .expect("should not error to set the scheme to https if it was http"); + } else { + warn!(scheme = jwks.jwks_url.scheme(), "JWKS url is not HTTPS"); + } + } + + let (pr, br) = settings.get_or_insert((jwks.project_id, jwks.branch_id)); + ensure!( + *pr == jwks.project_id, + "inconsistent project IDs configured" + ); + ensure!(*br == jwks.branch_id, "inconsistent branch IDs configured"); + } + } + + if let Some((project_id, branch_id)) = settings { + JWKS_ROLE_MAP.store(Some(Arc::new(JwksRoleSettings { + roles: data.roles, + project_id, + branch_id, + }))); + } + + Ok(()) +} diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 1edbc1e7e7..20d2d3df9a 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -9,13 +9,14 @@ use futures::future::Either; use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; -use proxy::proxy::run_until_cancelled; +use proxy::metrics::{Metrics, ThreadPoolMetrics}; +use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; +use rustls::pki_types::PrivateKeyDer; use tokio::net::TcpListener; use anyhow::{anyhow, bail, ensure, Context}; -use clap::{self, Arg}; +use clap::Arg; use futures::TryFutureExt; -use proxy::console::messages::MetricsAuxInfo; use proxy::stream::{PqStream, Stream}; use tokio::io::{AsyncRead, AsyncWrite}; @@ -65,6 +66,8 @@ async fn main() -> anyhow::Result<()> { let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); + let args = cli().get_matches(); let destination: String = args.get_one::("dest").unwrap().parse()?; @@ -76,37 +79,40 @@ async fn main() -> anyhow::Result<()> { (Some(key_path), Some(cert_path)) => { let key = { let key_bytes = std::fs::read(key_path).context("TLS key file")?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) - .context(format!("Failed to read TLS keys at '{key_path}'"))?; + + let mut keys = + rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - keys.pop().map(rustls::PrivateKey).unwrap() + PrivateKeyDer::Pkcs8( + keys.pop() + .unwrap() + .context(format!("Failed to read TLS keys at '{key_path}'"))?, + ) }; let cert_chain_bytes = std::fs::read(cert_path) .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - let cert_chain = { + let cert_chain: Vec<_> = { rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .context(format!( - "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." - ))? - .into_iter() - .map(rustls::Certificate) - .collect_vec() + .try_collect() + .with_context(|| { + format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") + })? }; // needed for channel bindings let first_cert = cert_chain.first().context("missing certificate")?; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let tls_config = rustls::ServerConfig::builder() - .with_safe_default_cipher_suites() - .with_safe_default_kx_groups() - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])? - .with_no_client_auth() - .with_single_cert(cert_chain, key)? 
- .into(); + let tls_config = rustls::ServerConfig::builder_with_protocol_versions(&[ + &rustls::version::TLS13, + &rustls::version::TLS12, + ]) + .with_no_client_auth() + .with_single_cert(cert_chain, key)? + .into(); (tls_config, tls_server_end_point) } @@ -127,7 +133,9 @@ async fn main() -> anyhow::Result<()> { proxy_listener, cancellation_token.clone(), )); - let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token)); + let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || async { + Ok(()) + })); // the signal task cant ever succeed. // the main task can error, or can succeed on cancellation. @@ -171,16 +179,13 @@ async fn task_main( .context("failed to set socket option")?; info!(%peer_addr, "serving"); - let mut ctx = - RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni"); - handle_client( - &mut ctx, - dest_suffix, - tls_config, - tls_server_end_point, - socket, - ) - .await + let ctx = RequestMonitoring::new( + session_id, + peer_addr.ip(), + proxy::metrics::Protocol::SniRouter, + "sni", + ); + handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await } .unwrap_or_else(|e| { // Acknowledge that the task has finished with an error. @@ -202,6 +207,7 @@ async fn task_main( const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; async fn ssl_handshake( + ctx: &RequestMonitoring, raw_stream: S, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, @@ -212,10 +218,11 @@ async fn ssl_handshake( use pq_proto::FeStartupPacket::*; match msg { - SslRequest => { + SslRequest { direct: false } => { stream .write_message(&pq_proto::BeMessage::EncryptionResponse(true)) .await?; + // Upgrade raw stream into a secure TLS-backed stream. // NOTE: We've consumed `tls`; this fact will be used later. @@ -231,7 +238,10 @@ async fn ssl_handshake( } Ok(Stream::Tls { - tls: Box::new(raw.upgrade(tls_config).await?), + tls: Box::new( + raw.upgrade(tls_config, !ctx.has_private_peer_addr()) + .await?, + ), tls_server_end_point, }) } @@ -240,19 +250,21 @@ async fn ssl_handshake( ?unexpected, "unexpected startup packet, rejecting connection" ); - stream.throw_error_str(ERR_INSECURE_CONNECTION).await? + stream + .throw_error_str(ERR_INSECURE_CONNECTION, proxy::error::ErrorKind::User) + .await? } } } async fn handle_client( - ctx: &mut RequestMonitoring, + ctx: RequestMonitoring, dest_suffix: Arc, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { - let tls_stream = ssl_handshake(stream, tls_config, tls_server_end_point).await?; + let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?; // Cut off first part of the SNI domain // We receive required destination details in the format of @@ -269,8 +281,18 @@ async fn handle_client( info!("destination: {}", destination); - let client = tokio::net::TcpStream::connect(destination).await?; + let mut client = tokio::net::TcpStream::connect(destination).await?; - let metrics_aux: MetricsAuxInfo = Default::default(); - proxy::proxy::proxy_pass(ctx, tls_stream, client, metrics_aux).await + // doesn't yet matter as pg-sni-router doesn't report analytics logs + ctx.set_success(); + ctx.log_connect(); + + // Starting from here we only proxy the client's traffic. 
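`copy_bidirectional_client_compute` is the proxy's own copy loop. Purely to illustrate the error-attribution idea, a simplified version can be assembled from two `tokio::io::copy` halves; `ErrorSource` here is a stand-in for the proxy's enum, and the attribution is coarser than the real implementation (a failed copy is blamed on its reading side):

```rust
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};

// Stand-in for proxy::proxy::ErrorSource.
enum ErrorSource {
    Client(std::io::Error),
    Compute(std::io::Error),
}

// Rough sketch: pump bytes both ways and remember which side to blame.
async fn proxy_pass<C, P>(client: C, compute: P) -> Result<(), ErrorSource>
where
    C: AsyncRead + AsyncWrite + Unpin,
    P: AsyncRead + AsyncWrite + Unpin,
{
    let (mut client_r, mut client_w) = tokio::io::split(client);
    let (mut compute_r, mut compute_w) = tokio::io::split(compute);
    let client_to_compute = async {
        tokio::io::copy(&mut client_r, &mut compute_w)
            .await
            .map_err(ErrorSource::Client)?;
        compute_w.shutdown().await.map_err(ErrorSource::Compute)
    };
    let compute_to_client = async {
        tokio::io::copy(&mut compute_r, &mut client_w)
            .await
            .map_err(ErrorSource::Compute)?;
        client_w.shutdown().await.map_err(ErrorSource::Client)
    };
    tokio::try_join!(client_to_compute, compute_to_client)?;
    Ok(())
}
```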
+ info!("performing the proxy pass..."); + + match copy_bidirectional_client_compute(&mut tls_stream, &mut client).await { + Ok(_) => Ok(()), + Err(ErrorSource::Client(err)) => Err(err).context("client"), + Err(ErrorSource::Compute(err)) => Err(err).context("compute"), + } } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index ba113a89eb..7706a1f7cd 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,5 +1,18 @@ +use aws_config::environment::EnvironmentVariableCredentialsProvider; +use aws_config::imds::credentials::ImdsCredentialsProvider; +use aws_config::meta::credentials::CredentialsProviderChain; +use aws_config::meta::region::RegionProviderChain; +use aws_config::profile::ProfileFileCredentialsProvider; +use aws_config::provider_config::ProviderConfig; +use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; +use aws_config::Region; use futures::future::Either; use proxy::auth; +use proxy::auth::backend::AuthRateLimiter; +use proxy::auth::backend::MaybeOwned; +use proxy::cancellation::CancelMap; +use proxy::cancellation::CancellationHandler; +use proxy::config::remote_storage_from_toml; use proxy::config::AuthenticationConfig; use proxy::config::CacheOptions; use proxy::config::HttpConfig; @@ -7,24 +20,35 @@ use proxy::config::ProjectInfoCacheOptions; use proxy::console; use proxy::context::parquet::ParquetUploadArgs; use proxy::http; +use proxy::http::health_server::AppMetrics; +use proxy::metrics::Metrics; use proxy::rate_limiter::EndpointRateLimiter; +use proxy::rate_limiter::LeakyBucketConfig; use proxy::rate_limiter::RateBucketInfo; -use proxy::rate_limiter::RateLimiterConfig; +use proxy::rate_limiter::WakeComputeRateLimiter; +use proxy::redis::cancellation_publisher::RedisPublisherClient; +use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use proxy::redis::elasticache; use proxy::redis::notifications; +use proxy::scram::threadpool::ThreadPool; +use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::GlobalConnPoolOptions; use proxy::usage_metrics; use anyhow::bail; use proxy::config::{self, ProxyConfig}; use proxy::serverless; +use remote_storage::RemoteStorageConfig; +use std::net::SocketAddr; use std::pin::pin; use std::sync::Arc; -use std::{borrow::Cow, net::SocketAddr}; use tokio::net::TcpListener; +use tokio::sync::Mutex; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::info; use tracing::warn; +use tracing::Instrument; use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; project_git_version!(GIT_VERSION); @@ -32,12 +56,18 @@ project_build_tag!(BUILD_TAG); use clap::{Parser, ValueEnum}; +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + #[derive(Clone, Debug, ValueEnum)] -enum AuthBackend { +enum AuthBackendType { Console, #[cfg(feature = "testing")] Postgres, - Link, + // clap only shows the name, not the alias, in usage text. 
+ // TODO: swap name/alias and deprecate "link" + #[value(name("link"), alias("web"))] + Web, } /// Neon proxy/router @@ -50,8 +80,8 @@ struct ProxyCliArgs { /// listen for incoming client connections on ip:port #[clap(short, long, default_value = "127.0.0.1:4432")] proxy: String, - #[clap(value_enum, long, default_value_t = AuthBackend::Link)] - auth_backend: AuthBackend, + #[clap(value_enum, long, default_value_t = AuthBackendType::Web)] + auth_backend: AuthBackendType, /// listen for management callback connection on ip:port #[clap(short, long, default_value = "127.0.0.1:7000")] mgmt: String, @@ -61,7 +91,7 @@ struct ProxyCliArgs { /// listen for incoming wss connections on ip:port #[clap(long)] wss: Option, - /// redirect unauthenticated users to the given uri in case of link auth + /// redirect unauthenticated users to the given uri in case of web auth #[clap(short, long, default_value = "http://localhost:3000/psql_session/")] uri: String, /// cloud API endpoint for authenticating users @@ -84,6 +114,9 @@ struct ProxyCliArgs { /// path to directory with TLS certificates for client postgres connections #[clap(long)] certs_dir: Option, + /// timeout for the TLS handshake + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + handshake_timeout: tokio::time::Duration, /// http endpoint to receive periodic metric updates #[clap(long)] metric_collection_endpoint: Option, @@ -94,8 +127,11 @@ struct ProxyCliArgs { #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] wake_compute_cache: String, /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). - #[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] wake_compute_lock: String, + /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] + connect_compute_lock: String, /// Allow self-signed certificates for compute nodes (for testing) #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] allow_self_signed_compute: bool, @@ -104,47 +140,90 @@ struct ProxyCliArgs { /// timeout for scram authentication protocol #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] scram_protocol_timeout: tokio::time::Duration, + /// size of the threadpool for password hashing + #[clap(long, default_value_t = 4)] + scram_thread_pool_size: u8, /// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated. #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] require_client_ip: bool, /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour. - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_dynamic_rate_limiter: bool, - /// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`. 
- #[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)] - rate_limit_algorithm: proxy::rate_limiter::RateLimitAlgorithm, - /// Timeout for rate limiter. If it didn't manage to aquire a permit in this time, it will return an error. - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - rate_limiter_timeout: tokio::time::Duration, /// Endpoint rate limiter max number of requests per second. /// - /// Provided in the form '@'. + /// Provided in the form `@`. /// Can be given multiple times for different bucket sizes. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] endpoint_rps_limit: Vec, - /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`. - #[clap(long, default_value_t = 100)] - initial_limit: usize, - #[clap(flatten)] - aimd_config: proxy::rate_limiter::AimdConfig, + /// Wake compute rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + wake_compute_limit: Vec, + /// Whether the auth rate limiter actually takes effect (for testing) + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + auth_rate_limit_enabled: bool, + /// Authentication rate limiter max number of hashes per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] + auth_rate_limit: Vec, + /// The IP subnet to use when considering whether two IP addresses are considered the same. + #[clap(long, default_value_t = 64)] + auth_rate_limit_ip_subnet: u8, + /// Redis rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + redis_rps_limit: Vec, /// cache for `allowed_ips` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] allowed_ips_cache: String, /// cache for `role_secret` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] role_secret_cache: String, - /// disable ip check for http requests. If it is too time consuming, it could be turned off. - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - disable_ip_check_for_http: bool, - /// redis url for notifications. + /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections) #[clap(long)] redis_notifications: Option, + /// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain". 
+ #[clap(long, default_value = "irsa")] + redis_auth_type: String, + /// redis host for streaming connections (might be different from the notifications host) + #[clap(long)] + redis_host: Option, + /// redis port for streaming connections (might be different from the notifications host) + #[clap(long)] + redis_port: Option, + /// redis cluster name, used in aws elasticache + #[clap(long)] + redis_cluster_name: Option, + /// redis user_id, used in aws elasticache + #[clap(long)] + redis_user_id: Option, + /// aws region to retrieve credentials + #[clap(long, default_value_t = String::new())] + aws_region: String, /// cache for `project_info` (use `size=0` to disable) #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] project_info_cache: String, - + /// cache for all valid endpoints + #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] + endpoint_cache_config: String, #[clap(flatten)] parquet_upload: ParquetUploadArgs, + + /// interval for backup metric collection + #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] + metric_backup_collection_interval: std::time::Duration, + /// remote storage configuration for backup metric collection + /// Encoded as toml (same format as pageservers), eg + /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` + #[clap(long, value_parser = remote_storage_from_toml)] + metric_backup_collection_remote_storage: Option, + /// chunk size for backup metric collection + /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. + #[clap(long, default_value = "4194304")] + metric_backup_collection_chunk_size: usize, + /// Whether to retry the connection to the compute node + #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] + connect_to_compute_retry: String, + /// Whether to retry the wake_compute request + #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)] + wake_compute_retry: String, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -161,6 +240,10 @@ struct SqlOverHttpArgs { #[clap(long, default_value_t = 20)] sql_over_http_pool_max_conns_per_endpoint: usize, + /// How many connections to pool for each endpoint. 
Excess connections are discarded + #[clap(long, default_value_t = 20000)] + sql_over_http_pool_max_total_conns: usize, + /// How long pooled connections should remain idle for before closing #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] sql_over_http_idle_timeout: tokio::time::Duration, @@ -175,6 +258,12 @@ struct SqlOverHttpArgs { /// increase memory used by the pool #[clap(long, default_value_t = 128)] sql_over_http_pool_shards: usize, + + #[clap(long, default_value_t = 10000)] + sql_over_http_client_conn_threshold: u64, + + #[clap(long, default_value_t = 64)] + sql_over_http_cancel_set_shards: usize, } #[tokio::main] @@ -185,12 +274,93 @@ async fn main() -> anyhow::Result<()> { info!("Version: {GIT_VERSION}"); info!("Build_tag: {BUILD_TAG}"); - ::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); + + let jemalloc = match proxy::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + None + } + }; let args = ProxyCliArgs::parse(); let config = build_config(&args)?; info!("Authentication backend: {}", config.auth_backend); + info!("Using region: {}", args.aws_region); + + let region_provider = + RegionProviderChain::default_provider().or_else(Region::new(args.aws_region.clone())); + let provider_conf = + ProviderConfig::without_region().with_region(region_provider.region().await); + let aws_credentials_provider = { + // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new()) + // uses "AWS_PROFILE" / `aws sso login --profile ` + .or_else( + "profile-sso", + ProfileFileCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) + // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" + // needed to access remote extensions bucket + .or_else( + "token", + WebIdentityTokenCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) + // uses imds v2 + .or_else("imds", ImdsCredentialsProvider::builder().build()) + }; + let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new( + elasticache::AWSIRSAConfig::new( + args.aws_region.clone(), + args.redis_cluster_name, + args.redis_user_id, + ), + aws_credentials_provider, + )); + let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { + ("plain", redis_url) => match redis_url { + None => { + bail!("plain auth requires redis_notifications to be set"); + } + Some(url) => Some( + ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()), + ), + }, + ("irsa", _) => match (&args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host.to_string(), + port, + elasticache_credentials_provider.clone(), + ), + ), + (None, None) => { + warn!("irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client"); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }, + _ => { + bail!("unknown auth type given"); + } + }; + + let redis_notifications_client = if let Some(url) = args.redis_notifications { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string())) + } else { + regional_redis_client.clone() + }; // 
Check that we can bind to address before further initialization let http_address: SocketAddr = args.http.parse()?; @@ -206,7 +376,45 @@ async fn main() -> anyhow::Result<()> { let proxy_listener = TcpListener::bind(proxy_address).await?; let cancellation_token = CancellationToken::new(); - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); + let cancel_map = CancelMap::default(); + + let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone()); + RateBucketInfo::validate(redis_rps_limit)?; + + let redis_publisher = match &regional_redis_client { + Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( + redis_publisher.clone(), + args.region.clone(), + redis_rps_limit, + )?))), + None => None, + }; + let cancellation_handler = Arc::new(CancellationHandler::< + Option<Arc<Mutex<RedisPublisherClient>>>, + >::new( + cancel_map.clone(), + redis_publisher, + proxy::metrics::CancellationSource::FromClient, + )); + + // bit of a hack - find the min rps and max rps supported and turn it into + // leaky bucket config instead + let max = args + .endpoint_rps_limit + .iter() + .map(|x| x.rps()) + .max_by(f64::total_cmp) + .unwrap_or(EndpointRateLimiter::DEFAULT.max); + let rps = args + .endpoint_rps_limit + .iter() + .map(|x| x.rps()) + .min_by(f64::total_cmp) + .unwrap_or(EndpointRateLimiter::DEFAULT.rps); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + LeakyBucketConfig { rps, max }, + 64, + )); // client facing tasks. these will exit on error or on cancellation // cancellation returns Ok(()) @@ -215,6 +423,7 @@ config, proxy_listener, cancellation_token.clone(), + cancellation_handler.clone(), endpoint_rate_limiter.clone(), )); @@ -229,6 +438,7 @@ config, serverless_listener, cancellation_token.clone(), + cancellation_handler.clone(), endpoint_rate_limiter.clone(), )); } @@ -240,27 +450,63 @@ // maintenance tasks. these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(proxy::handle_signals(cancellation_token)); - maintenance_tasks.spawn(http::health_server::task_main(http_listener)); + maintenance_tasks.spawn(proxy::handle_signals( + cancellation_token.clone(), + || async { Ok(()) }, + )); + maintenance_tasks.spawn(http::health_server::task_main( + http_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: proxy::metrics::Metrics::get(), + }, + )); maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener)); if let Some(metrics_config) = &config.metric_collection { + // TODO: Add gc regardless of the metric collection being enabled.
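`task_backup` spawned below periodically pushes collected usage events to remote storage until shutdown. The general shape of such a cancellable maintenance loop, as a generic sketch rather than the actual task body, is:

```rust
use std::time::Duration;
use tokio_util::sync::CancellationToken;

// Generic sketch of a cancellable maintenance loop, not task_backup itself:
// do one unit of work per tick and exit cleanly once cancellation fires.
async fn periodic_task(period: Duration, cancel: CancellationToken) -> anyhow::Result<()> {
    let mut ticker = tokio::time::interval(period);
    loop {
        tokio::select! {
            _ = ticker.tick() => {
                // e.g. serialize buffered events and upload one chunk here
            }
            _ = cancel.cancelled() => return Ok(()),
        }
    }
}
```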
maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); + client_tasks.spawn(usage_metrics::task_backup( + &metrics_config.backup_metric_collection_config, + cancellation_token.clone(), + )); } - if let auth::BackendType::Console(api, _) = &config.auth_backend { - match &**api { - proxy::console::provider::ConsoleBackend::Console(api) => { - let cache = api.caches.project_info.clone(); - if let Some(url) = args.redis_notifications { - info!("Starting redis notifications listener ({url})"); - maintenance_tasks - .spawn(notifications::task_main(url.to_owned(), cache.clone())); + if let auth::Backend::Console(api, _) = &config.auth_backend { + if let proxy::console::provider::ConsoleBackend::Console(api) = &**api { + match (redis_notifications_client, regional_redis_client.clone()) { + (None, None) => {} + (client1, client2) => { + let cache = api.caches.project_info.clone(); + if let Some(client) = client1 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + if let Some(client) = client2 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } - #[cfg(feature = "testing")] - proxy::console::provider::ConsoleBackend::Postgres(_) => {} + if let Some(regional_redis_client) = regional_redis_client { + let cache = api.caches.endpoints_cache.clone(); + let con = regional_redis_client; + let span = tracing::info_span!("endpoints_cache"); + maintenance_tasks.spawn( + async move { cache.do_read(con, cancellation_token.clone()).await } + .instrument(span), + ); + } } } @@ -290,6 +536,9 @@ async fn main() -> anyhow::Result<()> { /// ProxyConfig is created at proxy startup, and lives forever. 
fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { + let thread_pool = ThreadPool::new(args.scram_thread_pool_size); + Metrics::install(thread_pool.metrics.clone()); + let tls_config = match (&args.tls_key, &args.tls_cert) { (Some(key_path), Some(cert_path)) => Some(config::configure_tls( key_path, @@ -303,6 +552,11 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { if args.allow_self_signed_compute { warn!("allowing self-signed compute certificates"); } + let backup_metric_collection_config = config::MetricBackupCollectionConfig { + interval: args.metric_backup_collection_interval, + remote_storage_config: args.metric_backup_collection_remote_storage.clone(), + chunk_size: args.metric_backup_collection_chunk_size, + }; let metric_collection = match ( &args.metric_collection_endpoint, @@ -311,6 +565,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig { endpoint: endpoint.parse()?, interval: humantime::parse_duration(interval)?, + backup_metric_collection_config, }), (None, None) => None, _ => bail!( @@ -318,78 +573,117 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { and metric-collection-interval must be specified" ), }; - let rate_limiter_config = RateLimiterConfig { - disable: args.disable_dynamic_rate_limiter, - algorithm: args.rate_limit_algorithm, - timeout: args.rate_limiter_timeout, - initial_limit: args.initial_limit, - aimd_config: Some(args.aimd_config), - }; + if !args.disable_dynamic_rate_limiter { + bail!("dynamic rate limiter should be disabled"); + } let auth_backend = match &args.auth_backend { - AuthBackend::Console => { + AuthBackendType::Console => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!( "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches::new( wake_compute_cache_config, project_info_cache_config, + endpoint_cache_config, ))); - let config::WakeComputeLockOptions { + let config::ConcurrencyLockOptions { shards, - permits, + limiter, epoch, timeout, } = args.wake_compute_lock.parse()?; - info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); - let locks = Box::leak(Box::new( - console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout) - .unwrap(), - )); - tokio::spawn(locks.garbage_collect_worker(epoch)); + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new(console::locks::ApiLocks::new( + "wake_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + )?)); + tokio::spawn(locks.garbage_collect_worker()); let url = args.auth_endpoint.parse()?; - let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config)); + let endpoint = http::Endpoint::new(url, http::new_client()); - let api = console::provider::neon::Api::new(endpoint, caches, locks); + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; 
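Each `RateBucketInfo` entry pairs a request rate with a bucket interval (the `rps@interval` CLI syntax above). To make the layered-bucket idea concrete, here is a hedged fixed-window approximation; the proxy's own validation and leaky-bucket logic is more refined than this:

```rust
use std::time::{Duration, Instant};

// Fixed-window sketch of layered rate buckets, e.g. 300@1s plus 1000@60s.
// Illustration only; not the proxy's RateBucketInfo implementation.
struct Bucket {
    window: Duration,
    max: u32,
    count: u32,
    window_start: Instant,
}

struct LayeredLimiter {
    buckets: Vec<Bucket>,
}

impl LayeredLimiter {
    /// Returns true if a request is allowed right now by every bucket.
    fn check(&mut self, now: Instant) -> bool {
        for b in &mut self.buckets {
            if now.duration_since(b.window_start) >= b.window {
                b.window_start = now;
                b.count = 0;
            }
            if b.count >= b.max {
                return false;
            }
        }
        // count the request only once all buckets have admitted it
        for b in &mut self.buckets {
            b.count += 1;
        }
        true
    }
}
```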
+ let wake_compute_endpoint_rate_limiter = + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + let api = console::provider::neon::Api::new( + endpoint, + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); let api = console::provider::ConsoleBackend::Console(api); - auth::BackendType::Console(Cow::Owned(api), ()) + auth::Backend::Console(MaybeOwned::Owned(api), ()) } #[cfg(feature = "testing")] - AuthBackend::Postgres => { + AuthBackendType::Postgres => { let url = args.auth_endpoint.parse()?; let api = console::provider::mock::Api::new(url); let api = console::provider::ConsoleBackend::Postgres(api); - auth::BackendType::Console(Cow::Owned(api), ()) + auth::Backend::Console(MaybeOwned::Owned(api), ()) } - AuthBackend::Link => { + AuthBackendType::Web => { let url = args.uri.parse()?; - auth::BackendType::Link(Cow::Owned(url)) + auth::Backend::Web(MaybeOwned::Owned(url), ()) } }; + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.connect_compute_lock.parse()?; + info!( + ?limiter, + shards, + ?epoch, + "Using NodeLocks (connect_compute)" + ); + let connect_compute_locks = console::locks::ApiLocks::new( + "connect_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().proxy.connect_compute_lock, + )?; + let http_config = HttpConfig { - request_timeout: args.sql_over_http.sql_over_http_timeout, + accept_websockets: true, pool_options: GlobalConnPoolOptions { max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, pool_shards: args.sql_over_http.sql_over_http_pool_shards, idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, opt_in: args.sql_over_http.sql_over_http_pool_opt_in, + max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, }, + cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), + client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, }; let authentication_config = AuthenticationConfig { + thread_pool, scram_protocol_timeout: args.scram_protocol_timeout, + rate_limiter_enabled: args.auth_rate_limit_enabled, + rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), + rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, }; - let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); - RateBucketInfo::validate(&mut endpoint_rps_limit)?; - let config = Box::leak(Box::new(ProxyConfig { tls_config, auth_backend, @@ -398,12 +692,17 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { http_config, authentication_config, require_client_ip: args.require_client_ip, - disable_ip_check_for_http: args.disable_ip_check_for_http, - endpoint_rps_limit, - // TODO: add this argument + handshake_timeout: args.handshake_timeout, region: args.region.clone(), + wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, + connect_compute_locks, + connect_to_compute_retry_config: config::RetryConfig::parse( + &args.connect_to_compute_retry, + )?, })); + tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); + Ok(config) } diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index fc5f416395..6c168144a7 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -1,6 +1,7 @@ -pub mod common; -pub mod project_info; +pub(crate) mod common; +pub(crate) mod endpoints; +pub(crate) mod project_info; mod timed_lru; -pub use common::{Cache, Cached}; -pub use timed_lru::TimedLru; +pub(crate) 
use common::{Cache, Cached}; +pub(crate) use timed_lru::TimedLru; diff --git a/proxy/src/cache/common.rs b/proxy/src/cache/common.rs index 2af6a70e90..b5caf94788 100644 --- a/proxy/src/cache/common.rs +++ b/proxy/src/cache/common.rs @@ -3,7 +3,7 @@ use std::ops::{Deref, DerefMut}; /// A generic trait which exposes types of cache's key and value, /// as well as the notion of cache entry invalidation. /// This is useful for [`Cached`]. -pub trait Cache { +pub(crate) trait Cache { /// Entry's key. type Key; @@ -24,27 +24,44 @@ impl Cache for &C { type LookupInfo = C::LookupInfo; fn invalidate(&self, info: &Self::LookupInfo) { - C::invalidate(self, info) + C::invalidate(self, info); } } /// Wrapper for convenient entry invalidation. -pub struct Cached::Value> { +pub(crate) struct Cached::Value> { /// Cache + lookup info. - pub token: Option<(C, C::LookupInfo)>, + pub(crate) token: Option<(C, C::LookupInfo)>, /// The value itself. - pub value: V, + pub(crate) value: V, } impl Cached { /// Place any entry into this wrapper; invalidation will be a no-op. - pub fn new_uncached(value: V) -> Self { + pub(crate) fn new_uncached(value: V) -> Self { Self { token: None, value } } + pub(crate) fn take_value(self) -> (Cached, V) { + ( + Cached { + token: self.token, + value: (), + }, + self.value, + ) + } + + pub(crate) fn map(self, f: impl FnOnce(V) -> U) -> Cached { + Cached { + token: self.token, + value: f(self.value), + } + } + /// Drop this entry from a cache if it's still there. - pub fn invalidate(self) -> V { + pub(crate) fn invalidate(self) -> V { if let Some((cache, info)) = &self.token { cache.invalidate(info); } @@ -52,7 +69,7 @@ impl Cached { } /// Tell if this entry is actually cached. - pub fn cached(&self) -> bool { + pub(crate) fn cached(&self) -> bool { self.token.is_some() } } diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs new file mode 100644 index 0000000000..f4762232d8 --- /dev/null +++ b/proxy/src/cache/endpoints.rs @@ -0,0 +1,247 @@ +use std::{ + convert::Infallible, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + time::Duration, +}; + +use dashmap::DashSet; +use redis::{ + streams::{StreamReadOptions, StreamReadReply}, + AsyncCommands, FromRedisValue, Value, +}; +use serde::Deserialize; +use tokio::sync::Mutex; +use tokio_util::sync::CancellationToken; +use tracing::info; + +use crate::{ + config::EndpointCacheConfig, + context::RequestMonitoring, + intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, + metrics::{Metrics, RedisErrors, RedisEventsCount}, + rate_limiter::GlobalRateLimiter, + redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, + EndpointId, +}; + +#[derive(Deserialize, Debug, Clone)] +pub(crate) struct ControlPlaneEventKey { + endpoint_created: Option, + branch_created: Option, + project_created: Option, +} +#[derive(Deserialize, Debug, Clone)] +struct EndpointCreated { + endpoint_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct BranchCreated { + branch_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct ProjectCreated { + project_id: String, +} + +pub struct EndpointsCache { + config: EndpointCacheConfig, + endpoints: DashSet, + branches: DashSet, + projects: DashSet, + ready: AtomicBool, + limiter: Arc>, +} + +impl EndpointsCache { + pub(crate) fn new(config: EndpointCacheConfig) -> Self { + Self { + limiter: Arc::new(Mutex::new(GlobalRateLimiter::new( + config.limiter_info.clone(), + ))), + config, + endpoints: DashSet::new(), + branches: DashSet::new(), + projects: 
DashSet::new(), + ready: AtomicBool::new(false), + } + } + pub(crate) async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool { + if !self.ready.load(Ordering::Acquire) { + return true; + } + let rejected = self.should_reject(endpoint); + ctx.set_rejected(rejected); + info!(?rejected, "check endpoint is valid, disabled cache"); + // If cache is disabled, just collect the metrics and return or + // If the limiter allows, we don't need to check the cache. + if self.config.disable_cache || self.limiter.lock().await.check() { + return true; + } + !rejected + } + fn should_reject(&self, endpoint: &EndpointId) -> bool { + if endpoint.is_endpoint() { + !self.endpoints.contains(&EndpointIdInt::from(endpoint)) + } else if endpoint.is_branch() { + !self + .branches + .contains(&BranchIdInt::from(&endpoint.as_branch())) + } else { + !self + .projects + .contains(&ProjectIdInt::from(&endpoint.as_project())) + } + } + fn insert_event(&self, key: ControlPlaneEventKey) { + // Do not do normalization here, we expect the events to be normalized. + if let Some(endpoint_created) = key.endpoint_created { + self.endpoints + .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::EndpointCreated); + } + if let Some(branch_created) = key.branch_created { + self.branches + .insert(BranchIdInt::from(&branch_created.branch_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::BranchCreated); + } + if let Some(project_created) = key.project_created { + self.projects + .insert(ProjectIdInt::from(&project_created.project_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::ProjectCreated); + } + } + pub async fn do_read( + &self, + mut con: ConnectionWithCredentialsProvider, + cancellation_token: CancellationToken, + ) -> anyhow::Result { + let mut last_id = "0-0".to_string(); + loop { + if let Err(e) = con.connect().await { + tracing::error!("error connecting to redis: {:?}", e); + self.ready.store(false, Ordering::Release); + } + if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await { + tracing::error!("error reading from redis: {:?}", e); + self.ready.store(false, Ordering::Release); + } + if cancellation_token.is_cancelled() { + info!("cancellation token is cancelled, exiting"); + tokio::time::sleep(Duration::from_secs(60 * 60 * 24 * 7)).await; + // 1 week. + } + tokio::time::sleep(self.config.retry_interval).await; + } + } + async fn read_from_stream( + &self, + con: &mut ConnectionWithCredentialsProvider, + last_id: &mut String, + ) -> anyhow::Result<()> { + tracing::info!("reading endpoints/branches/projects from redis"); + self.batch_read( + con, + StreamReadOptions::default().count(self.config.initial_batch_size), + last_id, + true, + ) + .await?; + tracing::info!("ready to filter user requests"); + self.ready.store(true, Ordering::Release); + self.batch_read( + con, + StreamReadOptions::default() + .count(self.config.default_batch_size) + .block(self.config.xread_timeout.as_millis() as usize), + last_id, + false, + ) + .await + } + fn parse_key_value(value: &Value) -> anyhow::Result { + let s: String = FromRedisValue::from_redis_value(value)?; + Ok(serde_json::from_str(&s)?) 
+ } + async fn batch_read( + &self, + conn: &mut ConnectionWithCredentialsProvider, + opts: StreamReadOptions, + last_id: &mut String, + return_when_finish: bool, + ) -> anyhow::Result<()> { + let mut total: usize = 0; + loop { + let mut res: StreamReadReply = conn + .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts) + .await?; + + if res.keys.is_empty() { + if return_when_finish { + if total != 0 { + break; + } + anyhow::bail!( + "Redis stream {} is empty, cannot be used to filter endpoints", + self.config.stream_name + ); + } + // If we are not returning when finish, we should wait for more data. + continue; + } + if res.keys.len() != 1 { + anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name); + } + + let res = res.keys.pop().expect("Checked length above"); + let len = res.ids.len(); + for x in res.ids { + total += 1; + for (_, v) in x.map { + let key = match Self::parse_key_value(&v) { + Ok(x) => x, + Err(e) => { + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: &self.config.stream_name, + }); + tracing::error!("error parsing value {v:?}: {e:?}"); + continue; + } + }; + self.insert_event(key); + } + if total.is_power_of_two() { + tracing::debug!("endpoints read {}", total); + } + *last_id = x.id; + } + if return_when_finish && len <= self.config.default_batch_size { + break; + } + } + tracing::info!("read {} endpoints/branches/projects from redis", total); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::ControlPlaneEventKey; + + #[test] + fn test() { + let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}"; + let _: ControlPlaneEventKey = serde_json::from_str(s).unwrap(); + } +} diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 6f37868a8c..ceae74a9a0 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -5,24 +5,30 @@ use std::{ time::Duration, }; +use async_trait::async_trait; use dashmap::DashMap; use rand::{thread_rng, Rng}; use smol_str::SmolStr; +use tokio::sync::Mutex; use tokio::time::Instant; use tracing::{debug, info}; use crate::{ - auth::IpPattern, config::ProjectInfoCacheOptions, console::AuthSecret, EndpointId, ProjectId, - RoleName, + auth::IpPattern, + config::ProjectInfoCacheOptions, + console::AuthSecret, + intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}, + EndpointId, RoleName, }; use super::{Cache, Cached}; -pub trait ProjectInfoCache { - fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId); - fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName); - fn enable_ttl(&self); - fn disable_ttl(&self); +#[async_trait] +pub(crate) trait ProjectInfoCache { + fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt); + fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt); + async fn decrement_active_listeners(&self); + async fn increment_active_listeners(&self); } struct Entry { @@ -31,7 +37,7 @@ struct Entry { } impl Entry { - pub fn new(value: T) -> Self { + pub(crate) fn new(value: T) -> Self { Self { created_at: Instant::now(), value, @@ -47,7 +53,7 @@ impl From for Entry { #[derive(Default)] struct EndpointInfo { - secret: std::collections::HashMap>>, + secret: std::collections::HashMap>>, allowed_ips: Option>>>, } @@ -58,13 +64,13 @@ impl EndpointInfo { Some(t) => t < created_at, } } - pub fn get_role_secret( + pub(crate) 
fn get_role_secret( &self, - role_name: &RoleName, + role_name: RoleNameInt, valid_since: Instant, ignore_cache_since: Option, ) -> Option<(Option, bool)> { - if let Some(secret) = self.secret.get(role_name) { + if let Some(secret) = self.secret.get(&role_name) { if valid_since < secret.created_at { return Some(( secret.value.clone(), @@ -75,7 +81,7 @@ impl EndpointInfo { None } - pub fn get_allowed_ips( + pub(crate) fn get_allowed_ips( &self, valid_since: Instant, ignore_cache_since: Option, @@ -90,11 +96,11 @@ impl EndpointInfo { } None } - pub fn invalidate_allowed_ips(&mut self) { + pub(crate) fn invalidate_allowed_ips(&mut self) { self.allowed_ips = None; } - pub fn invalidate_role_secret(&mut self, role_name: &RoleName) { - self.secret.remove(role_name); + pub(crate) fn invalidate_role_secret(&mut self, role_name: RoleNameInt) { + self.secret.remove(&role_name); } } @@ -106,21 +112,23 @@ impl EndpointInfo { /// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available? /// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache. pub struct ProjectInfoCacheImpl { - cache: DashMap, + cache: DashMap, - project2ep: DashMap>, + project2ep: DashMap>, config: ProjectInfoCacheOptions, start_time: Instant, ttl_disabled_since_us: AtomicU64, + active_listeners_lock: Mutex, } +#[async_trait] impl ProjectInfoCache for ProjectInfoCacheImpl { - fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId) { + fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) { info!("invalidating allowed ips for project `{}`", project_id); let endpoints = self .project2ep - .get(project_id) + .get(&project_id) .map(|kv| kv.value().clone()) .unwrap_or_default(); for endpoint_id in endpoints { @@ -129,14 +137,14 @@ impl ProjectInfoCache for ProjectInfoCacheImpl { } } } - fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName) { + fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt) { info!( "invalidating role secret for project_id `{}` and role_name `{}`", - project_id, role_name + project_id, role_name, ); let endpoints = self .project2ep - .get(project_id) + .get(&project_id) .map(|kv| kv.value().clone()) .unwrap_or_default(); for endpoint_id in endpoints { @@ -145,43 +153,58 @@ impl ProjectInfoCache for ProjectInfoCacheImpl { } } } - fn enable_ttl(&self) { - self.ttl_disabled_since_us - .store(u64::MAX, std::sync::atomic::Ordering::Relaxed); + async fn decrement_active_listeners(&self) { + let mut listeners_guard = self.active_listeners_lock.lock().await; + if *listeners_guard == 0 { + tracing::error!("active_listeners count is already 0, something is broken"); + return; + } + *listeners_guard -= 1; + if *listeners_guard == 0 { + self.ttl_disabled_since_us + .store(u64::MAX, std::sync::atomic::Ordering::SeqCst); + } } - fn disable_ttl(&self) { - let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64; - self.ttl_disabled_since_us - .store(new_ttl, std::sync::atomic::Ordering::Relaxed); + async fn increment_active_listeners(&self) { + let mut listeners_guard = self.active_listeners_lock.lock().await; + *listeners_guard += 1; + if *listeners_guard == 1 { + let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64; + self.ttl_disabled_since_us + .store(new_ttl, std::sync::atomic::Ordering::SeqCst); + } } } impl ProjectInfoCacheImpl { - pub fn 
new(config: ProjectInfoCacheOptions) -> Self { + pub(crate) fn new(config: ProjectInfoCacheOptions) -> Self { Self { cache: DashMap::new(), project2ep: DashMap::new(), config, ttl_disabled_since_us: AtomicU64::new(u64::MAX), start_time: Instant::now(), + active_listeners_lock: Mutex::new(0), } } - pub fn get_role_secret( + pub(crate) fn get_role_secret( &self, endpoint_id: &EndpointId, role_name: &RoleName, ) -> Option>> { + let endpoint_id = EndpointIdInt::get(endpoint_id)?; + let role_name = RoleNameInt::get(role_name)?; let (valid_since, ignore_cache_since) = self.get_cache_times(); - let endpoint_info = self.cache.get(endpoint_id)?; + let endpoint_info = self.cache.get(&endpoint_id)?; let (value, ignore_cache) = endpoint_info.get_role_secret(role_name, valid_since, ignore_cache_since)?; if !ignore_cache { let cached = Cached { token: Some(( self, - CachedLookupInfo::new_role_secret(endpoint_id.clone(), role_name.clone()), + CachedLookupInfo::new_role_secret(endpoint_id, role_name), )), value, }; @@ -189,62 +212,60 @@ impl ProjectInfoCacheImpl { } Some(Cached::new_uncached(value)) } - pub fn get_allowed_ips( + pub(crate) fn get_allowed_ips( &self, endpoint_id: &EndpointId, ) -> Option>>> { + let endpoint_id = EndpointIdInt::get(endpoint_id)?; let (valid_since, ignore_cache_since) = self.get_cache_times(); - let endpoint_info = self.cache.get(endpoint_id)?; + let endpoint_info = self.cache.get(&endpoint_id)?; let value = endpoint_info.get_allowed_ips(valid_since, ignore_cache_since); let (value, ignore_cache) = value?; if !ignore_cache { let cached = Cached { - token: Some((self, CachedLookupInfo::new_allowed_ips(endpoint_id.clone()))), + token: Some((self, CachedLookupInfo::new_allowed_ips(endpoint_id))), value, }; return Some(cached); } Some(Cached::new_uncached(value)) } - pub fn insert_role_secret( + pub(crate) fn insert_role_secret( &self, - project_id: &ProjectId, - endpoint_id: &EndpointId, - role_name: &RoleName, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, + role_name: RoleNameInt, secret: Option, ) { if self.cache.len() >= self.config.size { // If there are too many entries, wait until the next gc cycle. return; } - self.inser_project2endpoint(project_id, endpoint_id); - let mut entry = self.cache.entry(endpoint_id.clone()).or_default(); + self.insert_project2endpoint(project_id, endpoint_id); + let mut entry = self.cache.entry(endpoint_id).or_default(); if entry.secret.len() < self.config.max_roles { - entry.secret.insert(role_name.clone(), secret.into()); + entry.secret.insert(role_name, secret.into()); } } - pub fn insert_allowed_ips( + pub(crate) fn insert_allowed_ips( &self, - project_id: &ProjectId, - endpoint_id: &EndpointId, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, allowed_ips: Arc>, ) { if self.cache.len() >= self.config.size { // If there are too many entries, wait until the next gc cycle. 
return; } - self.inser_project2endpoint(project_id, endpoint_id); - self.cache - .entry(endpoint_id.clone()) - .or_default() - .allowed_ips = Some(allowed_ips.into()); + self.insert_project2endpoint(project_id, endpoint_id); + self.cache.entry(endpoint_id).or_default().allowed_ips = Some(allowed_ips.into()); } - fn inser_project2endpoint(&self, project_id: &ProjectId, endpoint_id: &EndpointId) { - if let Some(mut endpoints) = self.project2ep.get_mut(project_id) { - endpoints.insert(endpoint_id.clone()); + fn insert_project2endpoint(&self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt) { + if let Some(mut endpoints) = self.project2ep.get_mut(&project_id) { + endpoints.insert(endpoint_id); } else { self.project2ep - .insert(project_id.clone(), HashSet::from([endpoint_id.clone()])); + .insert(project_id, HashSet::from([endpoint_id])); } } fn get_cache_times(&self) -> (Instant, Option) { @@ -253,13 +274,13 @@ impl ProjectInfoCacheImpl { let ttl_disabled_since_us = self .ttl_disabled_since_us .load(std::sync::atomic::Ordering::Relaxed); - let ignore_cache_since = if ttl_disabled_since_us != u64::MAX { + let ignore_cache_since = if ttl_disabled_since_us == u64::MAX { + None + } else { let ignore_cache_since = self.start_time + Duration::from_micros(ttl_disabled_since_us); // We are fine if entry is not older than ttl or was added before we are getting notifications. valid_since = valid_since.min(ignore_cache_since); Some(ignore_cache_since) - } else { - None }; (valid_since, ignore_cache_since) } @@ -285,7 +306,7 @@ impl ProjectInfoCacheImpl { let mut removed = 0; let shard = self.project2ep.shards()[shard].write(); for (_, endpoints) in shard.iter() { - for endpoint in endpoints.get().iter() { + for endpoint in endpoints.get() { self.cache.remove(endpoint); removed += 1; } @@ -298,20 +319,20 @@ impl ProjectInfoCacheImpl { /// Lookup info for project info cache. /// This is used to invalidate cache entries. -pub struct CachedLookupInfo { +pub(crate) struct CachedLookupInfo { /// Search by this key. 
- endpoint_id: EndpointId, + endpoint_id: EndpointIdInt, lookup_type: LookupType, } impl CachedLookupInfo { - pub(self) fn new_role_secret(endpoint_id: EndpointId, role_name: RoleName) -> Self { + pub(self) fn new_role_secret(endpoint_id: EndpointIdInt, role_name: RoleNameInt) -> Self { Self { endpoint_id, lookup_type: LookupType::RoleSecret(role_name), } } - pub(self) fn new_allowed_ips(endpoint_id: EndpointId) -> Self { + pub(self) fn new_allowed_ips(endpoint_id: EndpointIdInt) -> Self { Self { endpoint_id, lookup_type: LookupType::AllowedIps, @@ -320,7 +341,7 @@ impl CachedLookupInfo { } enum LookupType { - RoleSecret(RoleName), + RoleSecret(RoleNameInt), AllowedIps, } @@ -335,7 +356,7 @@ impl Cache for ProjectInfoCacheImpl { match &key.lookup_type { LookupType::RoleSecret(role_name) => { if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) { - endpoint_info.invalidate_role_secret(role_name); + endpoint_info.invalidate_role_secret(*role_name); } } LookupType::AllowedIps => { @@ -350,8 +371,7 @@ impl Cache for ProjectInfoCacheImpl { #[cfg(test)] mod tests { use super::*; - use crate::{console::AuthSecret, scram::ServerSecret}; - use std::{sync::Arc, time::Duration}; + use crate::{scram::ServerSecret, ProjectId}; #[tokio::test] async fn test_project_info_cache_settings() { @@ -362,22 +382,33 @@ mod tests { ttl: Duration::from_secs(1), gc_interval: Duration::from_secs(600), }); - let project_id = "project".into(); - let endpoint_id = "endpoint".into(); + let project_id: ProjectId = "project".into(); + let endpoint_id: EndpointId = "endpoint".into(); let user1: RoleName = "user1".into(); let user2: RoleName = "user2".into(); - let secret1 = Some(AuthSecret::Scram(ServerSecret::mock( - user1.as_str(), - [1; 32], - ))); + let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); let secret2 = None; let allowed_ips = Arc::new(vec![ "127.0.0.1".parse().unwrap(), "127.0.0.2".parse().unwrap(), ]); - cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); - cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); - cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user1).into(), + secret1.clone(), + ); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user2).into(), + secret2.clone(), + ); + cache.insert_allowed_ips( + (&project_id).into(), + (&endpoint_id).into(), + allowed_ips.clone(), + ); let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); assert!(cached.cached()); @@ -388,11 +419,13 @@ mod tests { // Shouldn't add more than 2 roles. 
let user3: RoleName = "user3".into(); - let secret3 = Some(AuthSecret::Scram(ServerSecret::mock( - user3.as_str(), - [3; 32], - ))); - cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone()); + let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32]))); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user3).into(), + secret3.clone(), + ); assert!(cache.get_role_secret(&endpoint_id, &user3).is_none()); let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); @@ -417,28 +450,36 @@ mod tests { ttl: Duration::from_secs(1), gc_interval: Duration::from_secs(600), })); - cache.clone().disable_ttl(); + cache.clone().increment_active_listeners().await; tokio::time::advance(Duration::from_secs(2)).await; - let project_id = "project".into(); - let endpoint_id = "endpoint".into(); + let project_id: ProjectId = "project".into(); + let endpoint_id: EndpointId = "endpoint".into(); let user1: RoleName = "user1".into(); let user2: RoleName = "user2".into(); - let secret1 = Some(AuthSecret::Scram(ServerSecret::mock( - user1.as_str(), - [1; 32], - ))); - let secret2 = Some(AuthSecret::Scram(ServerSecret::mock( - user2.as_str(), - [2; 32], - ))); + let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); + let secret2 = Some(AuthSecret::Scram(ServerSecret::mock([2; 32]))); let allowed_ips = Arc::new(vec![ "127.0.0.1".parse().unwrap(), "127.0.0.2".parse().unwrap(), ]); - cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); - cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); - cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user1).into(), + secret1.clone(), + ); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user2).into(), + secret2.clone(), + ); + cache.insert_allowed_ips( + (&project_id).into(), + (&endpoint_id).into(), + allowed_ips.clone(), + ); tokio::time::advance(Duration::from_secs(2)).await; // Nothing should be invalidated. @@ -457,7 +498,7 @@ mod tests { assert_eq!(cached.value, secret2); // The only way to invalidate this value is to invalidate via the api. 
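That API path is `invalidate_role_secret_for_project`, exercised on the next line; the other path is the token carried by every cached read. A condensed sketch of that `Cached`-token pattern (simplified types, not the proxy's exact signatures):

```rust
use std::collections::HashMap;
use std::sync::Mutex;

struct SecretCache {
    map: Mutex<HashMap<(u32, u32), String>>, // (endpoint, role) -> secret
}

/// A cached read plus the key needed to invalidate exactly that entry.
struct Cached<'a, V> {
    token: Option<(&'a SecretCache, (u32, u32))>,
    value: V,
}

impl<V> Cached<'_, V> {
    fn invalidate(self) -> V {
        if let Some((cache, key)) = self.token {
            cache.map.lock().unwrap().remove(&key);
        }
        self.value
    }
}

impl SecretCache {
    fn get_role_secret(&self, endpoint: u32, role: u32) -> Option<Cached<'_, String>> {
        let value = self.map.lock().unwrap().get(&(endpoint, role))?.clone();
        Some(Cached {
            token: Some((self, (endpoint, role))),
            value,
        })
    }
}
```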
- cache.invalidate_role_secret_for_project(&project_id, &user2); + cache.invalidate_role_secret_for_project((&project_id).into(), (&user2).into()); assert!(cache.get_role_secret(&endpoint_id, &user2).is_none()); let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); @@ -466,7 +507,7 @@ mod tests { } #[tokio::test] - async fn test_disable_ttl_invalidate_added_before() { + async fn test_increment_active_listeners_invalidate_added_before() { tokio::time::pause(); let cache = Arc::new(ProjectInfoCacheImpl::new(ProjectInfoCacheOptions { size: 2, @@ -475,26 +516,30 @@ mod tests { gc_interval: Duration::from_secs(600), })); - let project_id = "project".into(); - let endpoint_id = "endpoint".into(); + let project_id: ProjectId = "project".into(); + let endpoint_id: EndpointId = "endpoint".into(); let user1: RoleName = "user1".into(); let user2: RoleName = "user2".into(); - let secret1 = Some(AuthSecret::Scram(ServerSecret::mock( - user1.as_str(), - [1; 32], - ))); - let secret2 = Some(AuthSecret::Scram(ServerSecret::mock( - user2.as_str(), - [2; 32], - ))); + let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); + let secret2 = Some(AuthSecret::Scram(ServerSecret::mock([2; 32]))); let allowed_ips = Arc::new(vec![ "127.0.0.1".parse().unwrap(), "127.0.0.2".parse().unwrap(), ]); - cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); - cache.clone().disable_ttl(); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user1).into(), + secret1.clone(), + ); + cache.clone().increment_active_listeners().await; tokio::time::advance(Duration::from_millis(100)).await; - cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user2).into(), + secret2.clone(), + ); // Added before ttl was disabled + ttl should be still cached. let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); @@ -508,7 +553,11 @@ mod tests { assert!(cache.get_role_secret(&endpoint_id, &user2).is_none()); // Added after ttl was disabled + ttl should not be cached. - cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); + cache.insert_allowed_ips( + (&project_id).into(), + (&endpoint_id).into(), + allowed_ips.clone(), + ); let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); assert!(!cached.cached()); diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 3b21381bb9..8bb482f7c6 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -39,7 +39,7 @@ use super::{common::Cached, *}; /// /// * It's possible for an entry that has not yet expired entry to be evicted /// before expired items. That's a bit wasteful, but probably fine in practice. -pub struct TimedLru { +pub(crate) struct TimedLru { /// Cache's name for tracing. name: &'static str, @@ -58,19 +58,21 @@ impl Cache for TimedLru { type LookupInfo = LookupInfo; fn invalidate(&self, info: &Self::LookupInfo) { - self.invalidate_raw(info) + self.invalidate_raw(info); } } struct Entry { created_at: Instant, expires_at: Instant, + ttl: Duration, + update_ttl_on_retrieval: bool, value: T, } impl TimedLru { /// Construct a new LRU cache with timed entries. - pub fn new( + pub(crate) fn new( name: &'static str, capacity: usize, ttl: Duration, @@ -122,7 +124,6 @@ impl TimedLru { Q: Hash + Eq + ?Sized, { let now = Instant::now(); - let deadline = now.checked_add(self.ttl).expect("time overflow"); // Do costly things before taking the lock. 
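The removed cache-wide `deadline` computation here reappears inside the lock, now derived from the entry's own `ttl`: this change moves `ttl` and `update_ttl_on_retrieval` onto each `Entry` so `insert_raw_ttl` can give individual entries their own lifetime. The read-path consequence, sketched with a simplified entry type:

```rust
use std::time::{Duration, Instant};

struct Entry<T> {
    expires_at: Instant,
    ttl: Duration,
    update_ttl_on_retrieval: bool,
    value: T,
}

/// On a hit, optionally push the deadline out by the entry's own TTL,
/// not by a cache-wide one.
fn on_hit<T: Clone>(now: Instant, entry: &mut Entry<T>) -> Option<T> {
    if now >= entry.expires_at {
        return None; // expired; the caller would evict it
    }
    if entry.update_ttl_on_retrieval {
        entry.expires_at = now.checked_add(entry.ttl).expect("time overflow");
    }
    Some(entry.value.clone())
}
```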
let mut cache = self.cache.lock(); @@ -142,7 +143,8 @@ impl TimedLru { let (created_at, expires_at) = (entry.created_at, entry.expires_at); // Update the deadline and the entry's position in the LRU list. - if self.update_ttl_on_retrieval { + let deadline = now.checked_add(raw_entry.get().ttl).expect("time overflow"); + if raw_entry.get().update_ttl_on_retrieval { raw_entry.get_mut().expires_at = deadline; } raw_entry.to_back(); @@ -162,12 +164,27 @@ impl TimedLru { /// existed, return the previous value and its creation timestamp. #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] fn insert_raw(&self, key: K, value: V) -> (Instant, Option) { + self.insert_raw_ttl(key, value, self.ttl, self.update_ttl_on_retrieval) + } + + /// Insert an entry to the cache. If an entry with the same key already + /// existed, return the previous value and its creation timestamp. + #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] + fn insert_raw_ttl( + &self, + key: K, + value: V, + ttl: Duration, + update: bool, + ) -> (Instant, Option) { let created_at = Instant::now(); - let expires_at = created_at.checked_add(self.ttl).expect("time overflow"); + let expires_at = created_at.checked_add(ttl).expect("time overflow"); let entry = Entry { created_at, expires_at, + ttl, + update_ttl_on_retrieval: update, value, }; @@ -190,12 +207,16 @@ impl TimedLru { } impl TimedLru { - pub fn insert(&self, key: K, value: V) -> (Option, Cached<&Self>) { - let (created_at, old) = self.insert_raw(key.clone(), value.clone()); + pub(crate) fn insert_ttl(&self, key: K, value: V, ttl: Duration) { + self.insert_raw_ttl(key, value, ttl, false); + } + + pub(crate) fn insert_unit(&self, key: K, value: V) -> (Option, Cached<&Self, ()>) { + let (created_at, old) = self.insert_raw(key.clone(), value); let cached = Cached { token: Some((self, LookupInfo { created_at, key })), - value, + value: (), }; (old, cached) @@ -204,7 +225,7 @@ impl TimedLru { impl TimedLru { /// Retrieve a cached entry in convenient wrapper. - pub fn get(&self, key: &Q) -> Option> + pub(crate) fn get(&self, key: &Q) -> Option> where K: Borrow + Clone, Q: Hash + Eq + ?Sized, @@ -221,32 +242,10 @@ impl TimedLru { } }) } - - /// Retrieve a cached entry in convenient wrapper, ignoring its TTL. - pub fn get_ignoring_ttl(&self, key: &Q) -> Option> - where - K: Borrow, - Q: Hash + Eq + ?Sized, - { - let mut cache = self.cache.lock(); - cache - .get(key) - .map(|entry| Cached::new_uncached(entry.value.clone())) - } - - /// Remove an entry from the cache. - pub fn remove(&self, key: &Q) -> Option - where - K: Borrow + Clone, - Q: Hash + Eq + ?Sized, - { - let mut cache = self.cache.lock(); - cache.remove(key).map(|entry| entry.value) - } } /// Lookup information for key invalidation. -pub struct LookupInfo { +pub(crate) struct LookupInfo { /// Time of creation of a cache [`Entry`]. 
/// We use this during invalidation lookups to prevent eviction of a newer /// entry sharing the same key (it might've been inserted by a different diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index a5eb3544b4..71a2a16af8 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,73 +1,147 @@ -use anyhow::{bail, Context}; use dashmap::DashMap; use pq_proto::CancelKeyData; -use std::net::SocketAddr; +use std::{net::SocketAddr, sync::Arc}; +use thiserror::Error; use tokio::net::TcpStream; +use tokio::sync::Mutex; use tokio_postgres::{CancelToken, NoTls}; use tracing::info; +use uuid::Uuid; + +use crate::{ + error::ReportableError, + metrics::{CancellationRequest, CancellationSource, Metrics}, + redis::cancellation_publisher::{ + CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, + }, +}; + +pub type CancelMap = Arc>>; +pub type CancellationHandlerMain = CancellationHandler>>>; +pub(crate) type CancellationHandlerMainInternal = Option>>; /// Enables serving `CancelRequest`s. -#[derive(Default)] -pub struct CancelMap(DashMap>); +/// +/// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances. +pub struct CancellationHandler
<P>
{ + map: CancelMap, + client: P, + /// This field used for the monitoring purposes. + /// Represents the source of the cancellation request. + from: CancellationSource, +} -impl CancelMap { - /// Cancel a running query for the corresponding connection. - pub async fn cancel_session(&self, key: CancelKeyData) -> anyhow::Result<()> { - // NB: we should immediately release the lock after cloning the token. - let cancel_closure = self - .0 - .get(&key) - .and_then(|x| x.clone()) - .with_context(|| format!("query cancellation key not found: {key}"))?; +#[derive(Debug, Error)] +pub(crate) enum CancelError { + #[error("{0}")] + IO(#[from] std::io::Error), + #[error("{0}")] + Postgres(#[from] tokio_postgres::Error), +} - info!("cancelling query per user's request using key {key}"); - cancel_closure.try_cancel_query().await +impl ReportableError for CancelError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + CancelError::IO(_) => crate::error::ErrorKind::Compute, + CancelError::Postgres(e) if e.as_db_error().is_some() => { + crate::error::ErrorKind::Postgres + } + CancelError::Postgres(_) => crate::error::ErrorKind::Compute, + } } +} +impl CancellationHandler
<P>
{ /// Run async action within an ephemeral session identified by [`CancelKeyData`]. - pub async fn with_session<'a, F, R, V>(&'a self, f: F) -> anyhow::Result<V> - where - F: FnOnce(Session<'a>) -> R, - R: std::future::Future<Output = anyhow::Result<V>>, - { + pub(crate) fn get_session(self: Arc<Self>) -> Session
<P>
{ // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't // expose it and we don't want to do another roundtrip to query // for it. The client will be able to notice that this is not the // actual backend_pid, but backend_pid is not used for anything // so it doesn't matter. - let key = rand::random(); + let key = loop { + let key = rand::random(); - // Random key collisions are unlikely to happen here, but they're still possible, - // which is why we have to take care not to rewrite an existing key. - match self.0.entry(key) { - dashmap::mapref::entry::Entry::Occupied(_) => { - bail!("query cancellation key already exists: {key}") + // Random key collisions are unlikely to happen here, but they're still possible, + // which is why we have to take care not to rewrite an existing key. + match self.map.entry(key) { + dashmap::mapref::entry::Entry::Occupied(_) => continue, + dashmap::mapref::entry::Entry::Vacant(e) => { + e.insert(None); + } } - dashmap::mapref::entry::Entry::Vacant(e) => { - e.insert(None); - } - } - - // This will guarantee that the session gets dropped - // as soon as the future is finished. - scopeguard::defer! { - self.0.remove(&key); - info!("dropped query cancellation key {key}"); - } + break key; + }; info!("registered new query cancellation key {key}"); - let session = Session::new(key, self); - f(session).await + Session { + key, + cancellation_handler: self, + } + } + /// Try to cancel a running query for the corresponding connection. + /// If the cancellation key is not found, it will be published to Redis. + pub(crate) async fn cancel_session( + &self, + key: CancelKeyData, + session_id: Uuid, + ) -> Result<(), CancelError> { + // NB: we should immediately release the lock after cloning the token. + let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { + tracing::warn!("query cancellation key not found: {key}"); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::NotFound, + }); + match self.client.try_publish(key, session_id).await { + Ok(()) => {} // do nothing + Err(e) => { + return Err(CancelError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + e.to_string(), + ))); + } + } + return Ok(()); + }; + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::Found, + }); + info!("cancelling query per user's request using key {key}"); + cancel_closure.try_cancel_query().await } #[cfg(test)] - fn contains(&self, session: &Session) -> bool { - self.0.contains_key(&session.key) + fn contains(&self, session: &Session
<P>
) -> bool { + self.map.contains_key(&session.key) } #[cfg(test)] fn is_empty(&self) -> bool { - self.0.is_empty() + self.map.is_empty() + } +} + +impl CancellationHandler<()> { + pub fn new(map: CancelMap, from: CancellationSource) -> Self { + Self { + map, + client: (), + from, + } + } +} + +impl CancellationHandler>>> { + pub fn new(map: CancelMap, client: Option>>, from: CancellationSource) -> Self { + Self { map, client, from } } } @@ -81,79 +155,82 @@ pub struct CancelClosure { } impl CancelClosure { - pub fn new(socket_addr: SocketAddr, cancel_token: CancelToken) -> Self { + pub(crate) fn new(socket_addr: SocketAddr, cancel_token: CancelToken) -> Self { Self { socket_addr, cancel_token, } } - /// Cancels the query running on user's compute node. - pub async fn try_cancel_query(self) -> anyhow::Result<()> { + pub(crate) async fn try_cancel_query(self) -> Result<(), CancelError> { let socket = TcpStream::connect(self.socket_addr).await?; self.cancel_token.cancel_query_raw(socket, NoTls).await?; - + info!("query was cancelled"); Ok(()) } } /// Helper for registering query cancellation tokens. -pub struct Session<'a> { +pub(crate) struct Session
<P>
{ /// The user-facing key identifying this session. key: CancelKeyData, /// The [`CancelMap`] this session belongs to. - cancel_map: &'a CancelMap, + cancellation_handler: Arc>, } -impl<'a> Session<'a> { - fn new(key: CancelKeyData, cancel_map: &'a CancelMap) -> Self { - Self { key, cancel_map } +impl
<P>
Session
<P>
{ + /// Store the cancel token for the given session. + /// This enables query cancellation in `crate::proxy::prepare_client_connection`. + pub(crate) fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { + info!("enabling query cancellation for this session"); + self.cancellation_handler + .map + .insert(self.key, Some(cancel_closure)); + + self.key } } -impl Session<'_> { - /// Store the cancel token for the given session. - /// This enables query cancellation in `crate::proxy::prepare_client_connection`. - pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData { - info!("enabling query cancellation for this session"); - self.cancel_map.0.insert(self.key, Some(cancel_closure)); - - self.key +impl
<P>
Drop for Session
<P>
{ + fn drop(&mut self) { + self.cancellation_handler.map.remove(&self.key); + info!("dropped query cancellation key {}", &self.key); } } #[cfg(test)] mod tests { use super::*; - use once_cell::sync::Lazy; #[tokio::test] async fn check_session_drop() -> anyhow::Result<()> { - static CANCEL_MAP: Lazy = Lazy::new(Default::default); - - let (tx, rx) = tokio::sync::oneshot::channel(); - let task = tokio::spawn(CANCEL_MAP.with_session(|session| async move { - assert!(CANCEL_MAP.contains(&session)); - - tx.send(()).expect("failed to send"); - futures::future::pending::<()>().await; // sleep forever - - Ok(()) - })); - - // Wait until the task has been spawned. - rx.await.context("failed to hear from the task")?; - - // Drop the session's entry by cancelling the task. - task.abort(); - let error = task.await.expect_err("task should have failed"); - if !error.is_cancelled() { - anyhow::bail!(error); - } + let cancellation_handler = Arc::new(CancellationHandler::<()>::new( + CancelMap::default(), + CancellationSource::FromRedis, + )); + let session = cancellation_handler.clone().get_session(); + assert!(cancellation_handler.contains(&session)); + drop(session); // Check that the session has been dropped. - assert!(CANCEL_MAP.is_empty()); + assert!(cancellation_handler.is_empty()); Ok(()) } + + #[tokio::test] + async fn cancel_session_noop_regression() { + let handler = + CancellationHandler::<()>::new(CancelMap::default(), CancellationSource::Local); + handler + .cancel_session( + CancelKeyData { + backend_pid: 0, + cancel_key: 0, + }, + Uuid::new_v4(), + ) + .await + .unwrap(); + } } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index aef1aab733..8d3cb8ee3c 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,22 +1,29 @@ use crate::{ - auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError, - context::RequestMonitoring, error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE, + auth::parse_endpoint_param, + cancellation::CancelClosure, + console::{errors::WakeComputeError, messages::MetricsAuxInfo, provider::ApiLockError}, + context::RequestMonitoring, + error::{ReportableError, UserFacingError}, + metrics::{Metrics, NumDbConnectionsGuard}, proxy::neon_option, + Host, }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; -use metrics::IntCounterPairGuard; +use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; -use std::{io, net::SocketAddr, time::Duration}; +use rustls::{client::danger::ServerCertVerifier, pki_types::InvalidDnsNameError}; +use std::{io, net::SocketAddr, sync::Arc, time::Duration}; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::tls::MakeTlsConnect; +use tokio_postgres_rustls::MakeRustlsConnect; use tracing::{error, info, warn}; const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; #[derive(Debug, Error)] -pub enum ConnectionError { +pub(crate) enum ConnectionError { /// This error doesn't seem to reveal any secrets; for instance, /// `tokio_postgres::error::Kind` doesn't contain ip addresses and such. 
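One behavioural note on the cancellation change above: registration is now RAII. `get_session` inserts a random key and the `Drop` impl removes it, replacing the old `scopeguard::defer!` inside `with_session`, so the key is cleaned up however the connection ends. The same shape in miniature (a sketch, using a plain `Mutex<HashSet>` in place of the shared `DashMap`):

```rust
use std::collections::HashSet;
use std::sync::{Arc, Mutex};

#[derive(Default)]
struct Registry {
    keys: Mutex<HashSet<u64>>,
}

struct Session {
    key: u64,
    registry: Arc<Registry>,
}

impl Registry {
    fn get_session(self: Arc<Self>) -> Session {
        let key = loop {
            let key = rand_key();
            // Retry on the (unlikely) collision instead of failing,
            // mirroring the collision loop in `get_session` above.
            if self.keys.lock().unwrap().insert(key) {
                break key;
            }
        };
        Session { key, registry: self }
    }
}

impl Drop for Session {
    fn drop(&mut self) {
        self.registry.keys.lock().unwrap().remove(&self.key);
    }
}

// stand-in for `rand::random()`
fn rand_key() -> u64 {
    use std::time::{SystemTime, UNIX_EPOCH};
    SystemTime::now().duration_since(UNIX_EPOCH).unwrap().subsec_nanos() as u64
}

fn main() {
    let registry = Arc::new(Registry::default());
    let session = registry.clone().get_session();
    assert_eq!(registry.keys.lock().unwrap().len(), 1);
    drop(session); // same property the `check_session_drop` test asserts
    assert!(registry.keys.lock().unwrap().is_empty());
}
```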
#[error("{COULD_NOT_CONNECT}: {0}")] @@ -26,19 +33,21 @@ pub enum ConnectionError { CouldNotConnect(#[from] io::Error), #[error("{COULD_NOT_CONNECT}: {0}")] - TlsError(#[from] native_tls::Error), + TlsError(#[from] InvalidDnsNameError), #[error("{COULD_NOT_CONNECT}: {0}")] WakeComputeError(#[from] WakeComputeError), + + #[error("error acquiring resource permit: {0}")] + TooManyConnectionAttempts(#[from] ApiLockError), } impl UserFacingError for ConnectionError { fn to_string_client(&self) -> String { - use ConnectionError::*; match self { // This helps us drop irrelevant library-specific prefixes. // TODO: propagate severity level and other parameters. - Postgres(err) => match err.as_db_error() { + ConnectionError::Postgres(err) => match err.as_db_error() { Some(err) => { let msg = err.message(); @@ -52,30 +61,47 @@ impl UserFacingError for ConnectionError { } None => err.to_string(), }, - WakeComputeError(err) => err.to_string_client(), + ConnectionError::WakeComputeError(err) => err.to_string_client(), + ConnectionError::TooManyConnectionAttempts(_) => { + "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() + } _ => COULD_NOT_CONNECT.to_owned(), } } } +impl ReportableError for ConnectionError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ConnectionError::Postgres(e) if e.as_db_error().is_some() => { + crate::error::ErrorKind::Postgres + } + ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute, + ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute, + ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute, + ConnectionError::WakeComputeError(e) => e.get_error_kind(), + ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(), + } + } +} + /// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`. -pub type ScramKeys = tokio_postgres::config::ScramKeys<32>; +pub(crate) type ScramKeys = tokio_postgres::config::ScramKeys<32>; /// A config for establishing a connection to compute node. /// Eventually, `tokio_postgres` will be replaced with something better. /// Newtype allows us to implement methods on top of it. -#[derive(Clone)] -#[repr(transparent)] -pub struct ConnCfg(Box); +#[derive(Clone, Default)] +pub(crate) struct ConnCfg(Box); /// Creation and initialization routines. impl ConnCfg { - pub fn new() -> Self { - Self(Default::default()) + pub(crate) fn new() -> Self { + Self::default() } /// Reuse password or auth keys from the other config. - pub fn reuse_password(&mut self, other: &Self) { + pub(crate) fn reuse_password(&mut self, other: Self) { if let Some(password) = other.get_password() { self.password(password); } @@ -85,16 +111,26 @@ impl ConnCfg { } } + pub(crate) fn get_host(&self) -> Result { + match self.0.get_hosts() { + [tokio_postgres::config::Host::Tcp(s)] => Ok(s.into()), + // we should not have multiple address or unix addresses. + _ => Err(WakeComputeError::BadComputeAddress( + "invalid compute address".into(), + )), + } + } + /// Apply startup message params to the connection config. - pub fn set_startup_params(&mut self, params: &StartupMessageParams) { + pub(crate) fn set_startup_params(&mut self, params: &StartupMessageParams) { // Only set `user` if it's not present in the config. - // Link auth flow takes username from the console's response. + // Web auth flow takes username from the console's response. 
if let (None, Some(user)) = (self.get_user(), params.get("user")) { self.user(user); } // Only set `dbname` if it's not present in the config. - // Link auth flow takes dbname from the console's response. + // Web auth flow takes dbname from the console's response. if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) { self.dbname(dbname); } @@ -147,12 +183,6 @@ impl std::ops::DerefMut for ConnCfg { } } -impl Default for ConnCfg { - fn default() -> Self { - Self::new() - } -} - impl ConnCfg { /// Establish a raw TCP connection to the compute node. async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> { @@ -225,43 +255,62 @@ impl ConnCfg { } } -pub struct PostgresConnection { +pub(crate) struct PostgresConnection { /// Socket connected to a compute node. - pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream< + pub(crate) stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream< tokio::net::TcpStream, - postgres_native_tls::TlsStream, + tokio_postgres_rustls::RustlsStream, >, /// PostgreSQL connection parameters. - pub params: std::collections::HashMap, + pub(crate) params: std::collections::HashMap, /// Query cancellation token. - pub cancel_closure: CancelClosure, + pub(crate) cancel_closure: CancelClosure, + /// Labels for proxy's metrics. + pub(crate) aux: MetricsAuxInfo, - _guage: IntCounterPairGuard, + _guage: NumDbConnectionsGuard<'static>, } impl ConnCfg { /// Connect to a corresponding compute node. - pub async fn connect( + pub(crate) async fn connect( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, allow_self_signed_compute: bool, + aux: MetricsAuxInfo, timeout: Duration, ) -> Result { + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (socket_addr, stream, host) = self.connect_raw(timeout).await?; + drop(pause); - let tls_connector = native_tls::TlsConnector::builder() - .danger_accept_invalid_certs(allow_self_signed_compute) - .build() - .unwrap(); - let mut mk_tls = postgres_native_tls::MakeTlsConnector::new(tls_connector); - let tls = MakeTlsConnect::::make_tls_connect(&mut mk_tls, host)?; + let client_config = if allow_self_signed_compute { + // Allow all certificates for creating the connection + let verifier = Arc::new(AcceptEverythingVerifier); + rustls::ClientConfig::builder() + .dangerous() + .with_custom_certificate_verifier(verifier) + } else { + let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); + rustls::ClientConfig::builder().with_root_certificates(root_store) + }; + let client_config = client_config.with_no_client_auth(); + + let mut mk_tls = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); + let tls = >::make_tls_connect( + &mut mk_tls, + host, + )?; // connect_raw() will not use TLS if sslmode is "disable" + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (client, connection) = self.0.connect_raw(stream, tls).await?; - tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); + drop(pause); + tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); let stream = connection.stream.into_inner(); info!( + cold_start_info = ctx.cold_start_info().as_str(), "connected to compute node at {host} ({socket_addr}) sslmode={:?}", self.0.get_ssl_mode() ); @@ -279,9 +328,8 @@ impl ConnCfg { stream, params, cancel_closure, - _guage: NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(), + aux, + _guage: 
Metrics::get().proxy.db_connections.guard(ctx.protocol()), }; Ok(connection) @@ -305,6 +353,58 @@ fn filtered_options(params: &StartupMessageParams) -> Option { Some(options) } +fn load_certs() -> Result, io::Error> { + let der_certs = rustls_native_certs::load_native_certs()?; + let mut store = rustls::RootCertStore::empty(); + store.add_parsable_certificates(der_certs); + Ok(Arc::new(store)) +} +static TLS_ROOTS: OnceCell> = OnceCell::new(); + +#[derive(Debug)] +struct AcceptEverythingVerifier; +impl ServerCertVerifier for AcceptEverythingVerifier { + fn supported_verify_schemes(&self) -> Vec { + use rustls::SignatureScheme; + // The schemes for which `SignatureScheme::supported_in_tls13` returns true. + vec![ + SignatureScheme::ECDSA_NISTP521_SHA512, + SignatureScheme::ECDSA_NISTP384_SHA384, + SignatureScheme::ECDSA_NISTP256_SHA256, + SignatureScheme::RSA_PSS_SHA512, + SignatureScheme::RSA_PSS_SHA384, + SignatureScheme::RSA_PSS_SHA256, + SignatureScheme::ED25519, + ] + } + fn verify_server_cert( + &self, + _end_entity: &rustls::pki_types::CertificateDer<'_>, + _intermediates: &[rustls::pki_types::CertificateDer<'_>], + _server_name: &rustls::pki_types::ServerName<'_>, + _ocsp_response: &[u8], + _now: rustls::pki_types::UnixTime, + ) -> Result { + Ok(rustls::client::danger::ServerCertVerified::assertion()) + } + fn verify_tls12_signature( + &self, + _message: &[u8], + _cert: &rustls::pki_types::CertificateDer<'_>, + _dss: &rustls::DigitallySignedStruct, + ) -> Result { + Ok(rustls::client::danger::HandshakeSignatureValid::assertion()) + } + fn verify_tls13_signature( + &self, + _message: &[u8], + _cert: &rustls::pki_types::CertificateDer<'_>, + _dss: &rustls::DigitallySignedStruct, + ) -> Result { + Ok(rustls::client::danger::HandshakeSignatureValid::assertion()) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 2c46458a49..d7fc6eee22 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,6 +1,18 @@ -use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions}; +use crate::{ + auth::{self, backend::AuthRateLimiter}, + console::locks::ApiLocks, + rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}, + scram::threadpool::ThreadPool, + serverless::{cancel_set::CancelSet, GlobalConnPoolOptions}, + Host, +}; use anyhow::{bail, ensure, Context, Ok}; -use rustls::{sign, Certificate, PrivateKey}; +use itertools::Itertools; +use remote_storage::RemoteStorageConfig; +use rustls::{ + crypto::ring::sign, + pki_types::{CertificateDer, PrivateKeyDer}, +}; use sha2::{Digest, Sha256}; use std::{ collections::{HashMap, HashSet}, @@ -13,21 +25,24 @@ use x509_parser::oid_registry; pub struct ProxyConfig { pub tls_config: Option, - pub auth_backend: auth::BackendType<'static, ()>, + pub auth_backend: auth::Backend<'static, (), ()>, pub metric_collection: Option, pub allow_self_signed_compute: bool, pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, pub require_client_ip: bool, - pub disable_ip_check_for_http: bool, - pub endpoint_rps_limit: Vec, pub region: String, + pub handshake_timeout: Duration, + pub wake_compute_retry_config: RetryConfig, + pub connect_compute_locks: ApiLocks, + pub connect_to_compute_retry_config: RetryConfig, } #[derive(Debug)] pub struct MetricCollectionConfig { pub endpoint: reqwest::Url, pub interval: Duration, + pub backup_metric_collection_config: MetricBackupCollectionConfig, } pub struct TlsConfig { @@ -37,12 +52,18 @@ pub struct TlsConfig { } 
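A small pattern worth calling out from the compute-TLS hunk above: `TLS_ROOTS.get_or_try_init(load_certs)` memoizes the native root store, since loading it scans the filesystem on every call. The same shape with a stand-in loader (assuming the `once_cell` crate, as the patch does; the path is illustrative):

```rust
use once_cell::sync::OnceCell;
use std::{io, sync::Arc};

static ROOTS: OnceCell<Arc<Vec<u8>>> = OnceCell::new();

fn roots() -> io::Result<Arc<Vec<u8>>> {
    ROOTS
        .get_or_try_init(|| {
            // stand-in for rustls_native_certs::load_native_certs();
            // runs at most once per process, even under concurrent calls
            std::fs::read("/etc/ssl/certs/ca-certificates.crt").map(Arc::new)
        })
        .map(Arc::clone)
}
```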
pub struct HttpConfig { - pub request_timeout: tokio::time::Duration, + pub accept_websockets: bool, pub pool_options: GlobalConnPoolOptions, + pub cancel_set: CancelSet, + pub client_conn_threshold: u64, } pub struct AuthenticationConfig { + pub thread_pool: Arc, pub scram_protocol_timeout: tokio::time::Duration, + pub rate_limiter_enabled: bool, + pub rate_limiter: AuthRateLimiter, + pub rate_limit_ip_subnet: u8, } impl TlsConfig { @@ -51,6 +72,9 @@ impl TlsConfig { } } +/// +pub const PG_ALPN_PROTOCOL: &[u8] = b"postgresql"; + /// Configure TLS for the main endpoint. pub fn configure_tls( key_path: &str, @@ -86,17 +110,18 @@ pub fn configure_tls( let cert_resolver = Arc::new(cert_resolver); - let config = rustls::ServerConfig::builder() - .with_safe_default_cipher_suites() - .with_safe_default_kx_groups() - // allow TLS 1.2 to be compatible with older client libraries - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])? - .with_no_client_auth() - .with_cert_resolver(cert_resolver.clone()) - .into(); + // allow TLS 1.2 to be compatible with older client libraries + let mut config = rustls::ServerConfig::builder_with_protocol_versions(&[ + &rustls::version::TLS13, + &rustls::version::TLS12, + ]) + .with_no_client_auth() + .with_cert_resolver(cert_resolver.clone()); + + config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()]; Ok(TlsConfig { - config, + config: Arc::new(config), common_names, cert_resolver, }) @@ -131,14 +156,14 @@ pub enum TlsServerEndPoint { } impl TlsServerEndPoint { - pub fn new(cert: &Certificate) -> anyhow::Result { + pub fn new(cert: &CertificateDer<'_>) -> anyhow::Result { let sha256_oids = [ // I'm explicitly not adding MD5 or SHA1 here... They're bad. oid_registry::OID_SIG_ECDSA_WITH_SHA256, oid_registry::OID_PKCS1_SHA256WITHRSA, ]; - let pem = x509_parser::parse_x509_certificate(&cert.0) + let pem = x509_parser::parse_x509_certificate(cert) .context("Failed to parse PEM object from cerficiate")? 
.1; @@ -148,8 +173,7 @@ impl TlsServerEndPoint { let oid = pem.signature_algorithm.oid(); let alg = reg.get(oid); if sha256_oids.contains(oid) { - let tls_server_end_point: [u8; 32] = - Sha256::new().chain_update(&cert.0).finalize().into(); + let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into(); info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding"); Ok(Self::Sha256(tls_server_end_point)) } else { @@ -163,7 +187,7 @@ impl TlsServerEndPoint { } } -#[derive(Default)] +#[derive(Default, Debug)] pub struct CertResolver { certs: HashMap, TlsServerEndPoint)>, default: Option<(Arc, TlsServerEndPoint)>, @@ -183,11 +207,14 @@ impl CertResolver { let priv_key = { let key_bytes = std::fs::read(key_path) .context(format!("Failed to read TLS keys at '{key_path}'"))?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) - .context(format!("Failed to parse TLS keys at '{key_path}'"))?; + let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - keys.pop().map(rustls::PrivateKey).unwrap() + PrivateKeyDer::Pkcs8( + keys.pop() + .unwrap() + .context(format!("Failed to parse TLS keys at '{key_path}'"))?, + ) }; let cert_chain_bytes = std::fs::read(cert_path) @@ -195,14 +222,10 @@ impl CertResolver { let cert_chain = { rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() .with_context(|| { - format!( - "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." - ) + format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") })? - .into_iter() - .map(rustls::Certificate) - .collect() }; self.add_cert(priv_key, cert_chain, is_default) @@ -210,21 +233,21 @@ impl CertResolver { pub fn add_cert( &mut self, - priv_key: PrivateKey, - cert_chain: Vec, + priv_key: PrivateKeyDer<'static>, + cert_chain: Vec>, is_default: bool, ) -> anyhow::Result<()> { let key = sign::any_supported_type(&priv_key).context("invalid private key")?; let first_cert = &cert_chain[0]; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let pem = x509_parser::parse_x509_certificate(&first_cert.0) + let pem = x509_parser::parse_x509_certificate(first_cert) .context("Failed to parse PEM object from cerficiate")? .1; let common_name = pem.subject().to_string(); - // We only use non-wildcard certificates in link proxy so it seems okay to treat them the same as + // We only use non-wildcard certificates in web auth proxy so it seems okay to treat them the same as // wildcard ones as we don't use SNI there. That treatment only affects certificate selection, so // verify-full will still check wildcard match. Old coding here just ignored non-wildcard common names // and passed None instead, which blows up number of cases downstream code should handle. Proper coding @@ -256,7 +279,7 @@ impl CertResolver { impl rustls::server::ResolvesServerCert for CertResolver { fn resolve( &self, - client_hello: rustls::server::ClientHello, + client_hello: rustls::server::ClientHello<'_>, ) -> Option> { self.resolve(client_hello.server_name()).map(|x| x.0) } @@ -295,11 +318,96 @@ impl CertResolver { // a) Instead of multi-cert approach use single cert with extra // domains listed in Subject Alternative Name (SAN). // b) Deploy separate proxy instances for extra domains. 
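For context on `resolve` above: resolvers of this shape conventionally try the exact SNI name first, then the wildcard covering its parent domain, then the default certificate. An illustrative sketch of that lookup order (not the proxy's exact logic, which also threads through the `TlsServerEndPoint` channel-binding data):

```rust
use std::collections::HashMap;

struct Resolver {
    certs: HashMap<String, u32>, // common name -> certificate handle (stand-in)
    default: Option<u32>,
}

impl Resolver {
    fn resolve(&self, sni: Option<&str>) -> Option<u32> {
        let Some(name) = sni else {
            // No SNI (older clients): fall back to the default certificate.
            return self.default;
        };
        if let Some(&cert) = self.certs.get(name) {
            return Some(cert); // exact common-name match
        }
        // e.g. "ep-foo.region.example.com" -> "*.region.example.com"
        let wildcard = name.split_once('.').map(|(_, domain)| format!("*.{domain}"));
        wildcard
            .and_then(|w| self.certs.get(&w).copied())
            .or(self.default)
    }
}
```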
- self.default.as_ref().cloned() + self.default.clone() } } } +#[derive(Debug)] +pub struct EndpointCacheConfig { + /// Batch size to receive all endpoints on the startup. + pub initial_batch_size: usize, + /// Batch size to receive endpoints. + pub default_batch_size: usize, + /// Timeouts for the stream read operation. + pub xread_timeout: Duration, + /// Stream name to read from. + pub stream_name: String, + /// Limiter info (to distinguish when to enable cache). + pub limiter_info: Vec, + /// Disable cache. + /// If true, cache is ignored, but reports all statistics. + pub disable_cache: bool, + /// Retry interval for the stream read operation. + pub retry_interval: Duration, +} + +impl EndpointCacheConfig { + /// Default options for [`crate::console::provider::NodeInfoCache`]. + /// Notice that by default the limiter is empty, which means that cache is disabled. + pub const CACHE_DEFAULT_OPTIONS: &'static str = + "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; + + /// Parse cache options passed via cmdline. + /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. + fn parse(options: &str) -> anyhow::Result { + let mut initial_batch_size = None; + let mut default_batch_size = None; + let mut xread_timeout = None; + let mut stream_name = None; + let mut limiter_info = vec![]; + let mut disable_cache = false; + let mut retry_interval = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "initial_batch_size" => initial_batch_size = Some(value.parse()?), + "default_batch_size" => default_batch_size = Some(value.parse()?), + "xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?), + "stream_name" => stream_name = Some(value.to_string()), + "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?), + "disable_cache" => disable_cache = value.parse()?, + "retry_interval" => retry_interval = Some(humantime::parse_duration(value)?), + unknown => bail!("unknown key: {unknown}"), + } + } + RateBucketInfo::validate(&mut limiter_info)?; + + Ok(Self { + initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?, + default_batch_size: default_batch_size.context("missing `default_batch_size`")?, + xread_timeout: xread_timeout.context("missing `xread_timeout`")?, + stream_name: stream_name.context("missing `stream_name`")?, + disable_cache, + limiter_info, + retry_interval: retry_interval.context("missing `retry_interval`")?, + }) + } +} + +impl FromStr for EndpointCacheConfig { + type Err = anyhow::Error; + + fn from_str(options: &str) -> Result { + let error = || format!("failed to parse endpoint cache options '{options}'"); + Self::parse(options).with_context(error) + } +} +#[derive(Debug)] +pub struct MetricBackupCollectionConfig { + pub interval: Duration, + pub remote_storage_config: Option, + pub chunk_size: usize, +} + +pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { + RemoteStorageConfig::from_toml(&s.parse()?) +} + /// Helper for cmdline cache options parsing. #[derive(Debug)] pub struct CacheOptions { @@ -415,27 +523,92 @@ impl FromStr for ProjectInfoCacheOptions { } } +/// This is a config for connect to compute and wake compute. +#[derive(Clone, Copy, Debug)] +pub struct RetryConfig { + /// Number of times we should retry. 
+ pub max_retries: u32, + /// Retry duration is base_delay * backoff_factor ^ n, where n starts at 0 + pub base_delay: tokio::time::Duration, + /// Exponential base for retry wait duration + pub backoff_factor: f64, +} + +impl RetryConfig { + /// Default options for RetryConfig. + + /// Total delay for 5 retries with 200ms base delay and 2 backoff factor is about 6s. + pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str = + "num_retries=5,base_retry_wait_duration=200ms,retry_wait_exponent_base=2"; + /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s. + /// Cplane has timeout of 60s on each request. 8m7s in total. + pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str = + "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6"; + + /// Parse retry options passed via cmdline. + /// Example: [`Self::CONNECT_TO_COMPUTE_DEFAULT_VALUES`]. + pub fn parse(options: &str) -> anyhow::Result { + let mut num_retries = None; + let mut base_retry_wait_duration = None; + let mut retry_wait_exponent_base = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "num_retries" => num_retries = Some(value.parse()?), + "base_retry_wait_duration" => { + base_retry_wait_duration = Some(humantime::parse_duration(value)?); + } + "retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?), + unknown => bail!("unknown key: {unknown}"), + } + } + + Ok(Self { + max_retries: num_retries.context("missing `num_retries`")?, + base_delay: base_retry_wait_duration.context("missing `base_retry_wait_duration`")?, + backoff_factor: retry_wait_exponent_base + .context("missing `retry_wait_exponent_base`")?, + }) + } +} + /// Helper for cmdline cache options parsing. -pub struct WakeComputeLockOptions { +#[derive(serde::Deserialize)] +pub struct ConcurrencyLockOptions { /// The number of shards the lock map should have pub shards: usize, /// The number of allowed concurrent requests for each endpoitn - pub permits: usize, + #[serde(flatten)] + pub limiter: RateLimiterConfig, /// Garbage collection epoch + #[serde(deserialize_with = "humantime_serde::deserialize")] pub epoch: Duration, /// Lock timeout + #[serde(deserialize_with = "humantime_serde::deserialize")] pub timeout: Duration, } -impl WakeComputeLockOptions { +impl ConcurrencyLockOptions { /// Default options for [`crate::console::provider::ApiLocks`]. pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0"; + /// Default options for [`crate::console::provider::ApiLocks`]. + pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str = + "shards=64,permits=100,epoch=10m,timeout=10ms"; // pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s"; /// Parse lock options passed via cmdline. /// Example: [`Self::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK`]. 
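The totals quoted in these doc comments follow from the stated formula: 200 ms * (2^0 + ... + 2^4) = 6.2 s, and 100 ms * (1.6^0 + ... + 1.6^7) ≈ 7.0 s. A quick sketch of the arithmetic:

```rust
use std::time::Duration;

/// Sum of base_delay * backoff_factor^n for n in 0..max_retries,
/// i.e. the worst-case time spent sleeping between attempts.
fn total_backoff(max_retries: u32, base_delay: Duration, backoff_factor: f64) -> Duration {
    (0..max_retries)
        .map(|n| base_delay.mul_f64(backoff_factor.powi(n as i32)))
        .sum()
}

fn main() {
    // ~6.2s, matching "about 6s" for the connect-to-compute defaults
    println!("{:?}", total_backoff(5, Duration::from_millis(200), 2.0));
    // ~7.0s, matching "about 7s" for the wake-compute defaults
    println!("{:?}", total_backoff(8, Duration::from_millis(100), 1.6));
}
```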
fn parse(options: &str) -> anyhow::Result { + let options = options.trim(); + if options.starts_with('{') && options.ends_with('}') { + return Ok(serde_json::from_str(options)?); + } + let mut shards = None; let mut permits = None; let mut epoch = None; @@ -462,9 +635,13 @@ impl WakeComputeLockOptions { shards = Some(2); } + let permits = permits.context("missing `permits`")?; let out = Self { shards: shards.context("missing `shards`")?, - permits: permits.context("missing `permits`")?, + limiter: RateLimiterConfig { + algorithm: RateLimitAlgorithm::Fixed, + initial_limit: permits, + }, epoch: epoch.context("missing `epoch`")?, timeout: timeout.context("missing `timeout`")?, }; @@ -479,7 +656,7 @@ impl WakeComputeLockOptions { } } -impl FromStr for WakeComputeLockOptions { +impl FromStr for ConcurrencyLockOptions { type Err = anyhow::Error; fn from_str(options: &str) -> Result { @@ -490,6 +667,8 @@ impl FromStr for WakeComputeLockOptions { #[cfg(test)] mod tests { + use crate::rate_limiter::Aimd; + use super::*; #[test] @@ -515,38 +694,70 @@ mod tests { #[test] fn test_parse_lock_options() -> anyhow::Result<()> { - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, - permits, + limiter, shards, timeout, } = "shards=32,permits=4,epoch=10m,timeout=1s".parse()?; assert_eq!(epoch, Duration::from_secs(10 * 60)); assert_eq!(timeout, Duration::from_secs(1)); assert_eq!(shards, 32); - assert_eq!(permits, 4); + assert_eq!(limiter.initial_limit, 4); + assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed); - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, - permits, + limiter, shards, timeout, } = "epoch=60s,shards=16,timeout=100ms,permits=8".parse()?; assert_eq!(epoch, Duration::from_secs(60)); assert_eq!(timeout, Duration::from_millis(100)); assert_eq!(shards, 16); - assert_eq!(permits, 8); + assert_eq!(limiter.initial_limit, 8); + assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed); - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, - permits, + limiter, shards, timeout, } = "permits=0".parse()?; assert_eq!(epoch, Duration::ZERO); assert_eq!(timeout, Duration::ZERO); assert_eq!(shards, 2); - assert_eq!(permits, 0); + assert_eq!(limiter.initial_limit, 0); + assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed); + + Ok(()) + } + + #[test] + fn test_parse_json_lock_options() -> anyhow::Result<()> { + let ConcurrencyLockOptions { + epoch, + limiter, + shards, + timeout, + } = r#"{"shards":32,"initial_limit":44,"aimd":{"min":5,"max":500,"inc":10,"dec":0.9,"utilisation":0.8},"epoch":"10m","timeout":"1s"}"# + .parse()?; + assert_eq!(epoch, Duration::from_secs(10 * 60)); + assert_eq!(timeout, Duration::from_secs(1)); + assert_eq!(shards, 32); + assert_eq!(limiter.initial_limit, 44); + assert_eq!( + limiter.algorithm, + RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 5, + max: 500, + dec: 0.9, + inc: 10, + utilisation: 0.8 + } + }, + ); Ok(()) } diff --git a/proxy/src/console.rs b/proxy/src/console.rs index fd3c46b946..87d8e781aa 100644 --- a/proxy/src/console.rs +++ b/proxy/src/console.rs @@ -6,11 +6,11 @@ pub mod messages; /// Wrappers for console APIs and their mocks. pub mod provider; -pub use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo}; +pub(crate) use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo}; /// Various cache-related types. pub mod caches { - pub use super::provider::{ApiCaches, NodeInfoCache}; + pub use super::provider::ApiCaches; } /// Various cache-related types. 
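Before the diff moves on to messages.rs, note that the JSON form of `ConcurrencyLockOptions` leans on `humantime_serde::deserialize` to accept strings like "10m" and "10ms". A minimal sketch of that deserialization (the `LockOpts` struct is a hypothetical stand-in; `humantime_serde` and `serde_json` are the real crates):

```rust
use serde::Deserialize;
use std::time::Duration;

#[derive(Deserialize, Debug)]
struct LockOpts {
    shards: usize,
    #[serde(deserialize_with = "humantime_serde::deserialize")]
    epoch: Duration,
    #[serde(deserialize_with = "humantime_serde::deserialize")]
    timeout: Duration,
}

fn main() {
    let opts: LockOpts =
        serde_json::from_str(r#"{"shards":64,"epoch":"10m","timeout":"10ms"}"#).unwrap();
    assert_eq!(opts.epoch, Duration::from_secs(600));
    assert_eq!(opts.timeout, Duration::from_millis(10));
}
```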
diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 6ef9bcf4eb..a48c7316f6 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,24 +1,231 @@ -use serde::Deserialize; -use std::fmt; +use measured::FixedCardinalityLabel; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::fmt::{self, Display}; use crate::auth::IpPattern; -use crate::{BranchId, EndpointId, ProjectId}; +use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; +use crate::proxy::retry::CouldRetry; +use crate::RoleName; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. -#[derive(Debug, Deserialize)] -pub struct ConsoleError { - pub error: Box, +#[derive(Debug, Deserialize, Clone)] +pub(crate) struct ConsoleError { + pub(crate) error: Box, + #[serde(skip)] + pub(crate) http_status_code: http::StatusCode, + pub(crate) status: Option, +} + +impl ConsoleError { + pub(crate) fn get_reason(&self) -> Reason { + self.status + .as_ref() + .and_then(|s| s.details.error_info.as_ref()) + .map_or(Reason::Unknown, |e| e.reason) + } + + pub(crate) fn get_user_facing_message(&self) -> String { + use super::provider::errors::REQUEST_FAILED; + self.status + .as_ref() + .and_then(|s| s.details.user_facing_message.as_ref()) + .map_or_else(|| { + // Ask @neondatabase/control-plane for review before adding more. + match self.http_status_code { + http::StatusCode::NOT_FOUND => { + // Status 404: failed to get a project-related resource. + format!("{REQUEST_FAILED}: endpoint cannot be found") + } + http::StatusCode::NOT_ACCEPTABLE => { + // Status 406: endpoint is disabled (we don't allow connections). + format!("{REQUEST_FAILED}: endpoint is disabled") + } + http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => { + // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded. + format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.") + } + _ => REQUEST_FAILED.to_owned(), + } + }, |m| m.message.clone().into()) + } +} + +impl Display for ConsoleError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let msg: &str = self + .status + .as_ref() + .and_then(|s| s.details.user_facing_message.as_ref()) + .map_or_else(|| self.error.as_ref(), |m| m.message.as_ref()); + write!(f, "{msg}") + } +} + +impl CouldRetry for ConsoleError { + fn could_retry(&self) -> bool { + // If the error message does not have a status, + // the error is unknown and probably should not retry automatically + let Some(status) = &self.status else { + return false; + }; + + // retry if the retry info is set. + if status.details.retry_info.is_some() { + return true; + } + + // if no retry info set, attempt to use the error code to guess the retry state. + let reason = status + .details + .error_info + .map_or(Reason::Unknown, |e| e.reason); + + reason.can_retry() + } +} + +#[derive(Debug, Deserialize, Clone)] +#[allow(dead_code)] +pub(crate) struct Status { + pub(crate) code: Box, + pub(crate) message: Box, + pub(crate) details: Details, +} + +#[derive(Debug, Deserialize, Clone)] +pub(crate) struct Details { + pub(crate) error_info: Option, + pub(crate) retry_info: Option, + pub(crate) user_facing_message: Option, +} + +#[derive(Copy, Clone, Debug, Deserialize)] +pub(crate) struct ErrorInfo { + pub(crate) reason: Reason, + // Schema could also have `metadata` field, but it's not structured. Skip it for now. 
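`get_reason` above can fall back to `Reason::Unknown` because, as the enum further down shows, deserialization never fails on a reason code this proxy build has not heard of. That is the `#[serde(other)]` plus `#[default]` pattern in miniature (a self-contained sketch with one real variant):

```rust
use serde::Deserialize;

#[derive(Debug, Default, Deserialize, PartialEq)]
enum Reason {
    #[serde(rename = "PROJECT_NOT_FOUND")]
    ProjectNotFound,
    // Any tag this build does not know lands here instead of
    // failing deserialization of the whole console response.
    #[default]
    #[serde(other)]
    Unknown,
}

fn main() {
    let known: Reason = serde_json::from_str(r#""PROJECT_NOT_FOUND""#).unwrap();
    let future: Reason = serde_json::from_str(r#""SOME_REASON_ADDED_LATER""#).unwrap();
    assert_eq!(known, Reason::ProjectNotFound);
    assert_eq!(future, Reason::Unknown);
}
```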
+} + +#[derive(Clone, Copy, Debug, Deserialize, Default)] +pub(crate) enum Reason { + /// RoleProtected indicates that the role is protected and the attempted operation is not permitted on protected roles. + #[serde(rename = "ROLE_PROTECTED")] + RoleProtected, + /// ResourceNotFound indicates that a resource (project, endpoint, branch, etc.) wasn't found, + /// usually due to the provided ID not being correct or because the subject doesn't have enough permissions to + /// access the requested resource. + /// Prefer a more specific reason if possible, e.g., ProjectNotFound, EndpointNotFound, etc. + #[serde(rename = "RESOURCE_NOT_FOUND")] + ResourceNotFound, + /// ProjectNotFound indicates that the project wasn't found, usually due to the provided ID not being correct, + /// or that the subject doesn't have enough permissions to access the requested project. + #[serde(rename = "PROJECT_NOT_FOUND")] + ProjectNotFound, + /// EndpointNotFound indicates that the endpoint wasn't found, usually due to the provided ID not being correct, + /// or that the subject doesn't have enough permissions to access the requested endpoint. + #[serde(rename = "ENDPOINT_NOT_FOUND")] + EndpointNotFound, + /// BranchNotFound indicates that the branch wasn't found, usually due to the provided ID not being correct, + /// or that the subject doesn't have enough permissions to access the requested branch. + #[serde(rename = "BRANCH_NOT_FOUND")] + BranchNotFound, + /// RateLimitExceeded indicates that the rate limit for the operation has been exceeded. + #[serde(rename = "RATE_LIMIT_EXCEEDED")] + RateLimitExceeded, + /// NonDefaultBranchComputeTimeExceeded indicates that the compute time quota of non-default branches has been + /// exceeded. + #[serde(rename = "NON_PRIMARY_BRANCH_COMPUTE_TIME_EXCEEDED")] + NonDefaultBranchComputeTimeExceeded, + /// ActiveTimeQuotaExceeded indicates that the active time quota was exceeded. + #[serde(rename = "ACTIVE_TIME_QUOTA_EXCEEDED")] + ActiveTimeQuotaExceeded, + /// ComputeTimeQuotaExceeded indicates that the compute time quota was exceeded. + #[serde(rename = "COMPUTE_TIME_QUOTA_EXCEEDED")] + ComputeTimeQuotaExceeded, + /// WrittenDataQuotaExceeded indicates that the written data quota was exceeded. + #[serde(rename = "WRITTEN_DATA_QUOTA_EXCEEDED")] + WrittenDataQuotaExceeded, + /// DataTransferQuotaExceeded indicates that the data transfer quota was exceeded. + #[serde(rename = "DATA_TRANSFER_QUOTA_EXCEEDED")] + DataTransferQuotaExceeded, + /// LogicalSizeQuotaExceeded indicates that the logical size quota was exceeded. + #[serde(rename = "LOGICAL_SIZE_QUOTA_EXCEEDED")] + LogicalSizeQuotaExceeded, + /// RunningOperations indicates that the project already has some running operations + /// and scheduling of new ones is prohibited. + #[serde(rename = "RUNNING_OPERATIONS")] + RunningOperations, + /// ConcurrencyLimitReached indicates that the concurrency limit for an action was reached. + #[serde(rename = "CONCURRENCY_LIMIT_REACHED")] + ConcurrencyLimitReached, + /// LockAlreadyTaken indicates that the we attempted to take a lock that was already taken. 
+ #[serde(rename = "LOCK_ALREADY_TAKEN")] + LockAlreadyTaken, + #[default] + #[serde(other)] + Unknown, +} + +impl Reason { + pub(crate) fn is_not_found(self) -> bool { + matches!( + self, + Reason::ResourceNotFound + | Reason::ProjectNotFound + | Reason::EndpointNotFound + | Reason::BranchNotFound + ) + } + + pub(crate) fn can_retry(self) -> bool { + match self { + // do not retry role protected errors + // not a transitive error + Reason::RoleProtected => false, + // on retry, it will still not be found + Reason::ResourceNotFound + | Reason::ProjectNotFound + | Reason::EndpointNotFound + | Reason::BranchNotFound => false, + // we were asked to go away + Reason::RateLimitExceeded + | Reason::NonDefaultBranchComputeTimeExceeded + | Reason::ActiveTimeQuotaExceeded + | Reason::ComputeTimeQuotaExceeded + | Reason::WrittenDataQuotaExceeded + | Reason::DataTransferQuotaExceeded + | Reason::LogicalSizeQuotaExceeded => false, + // transitive error. control plane is currently busy + // but might be ready soon + Reason::RunningOperations + | Reason::ConcurrencyLimitReached + | Reason::LockAlreadyTaken => true, + // unknown error. better not retry it. + Reason::Unknown => false, + } + } +} + +#[derive(Copy, Clone, Debug, Deserialize)] +#[allow(dead_code)] +pub(crate) struct RetryInfo { + pub(crate) retry_delay_ms: u64, +} + +#[derive(Debug, Deserialize, Clone)] +pub(crate) struct UserFacingMessage { + pub(crate) message: Box, } /// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`]. /// Returned by the `/proxy_get_role_secret` API method. #[derive(Deserialize)] -pub struct GetRoleSecret { - pub role_secret: Box, - pub allowed_ips: Option>, - pub project_id: Option, +pub(crate) struct GetRoleSecret { + pub(crate) role_secret: Box, + pub(crate) allowed_ips: Option>, + pub(crate) project_id: Option, } // Manually implement debug to omit sensitive info. @@ -31,21 +238,21 @@ impl fmt::Debug for GetRoleSecret { /// Response which holds compute node's `host:port` pair. /// Returned by the `/proxy_wake_compute` API method. #[derive(Debug, Deserialize)] -pub struct WakeCompute { - pub address: Box, - pub aux: MetricsAuxInfo, +pub(crate) struct WakeCompute { + pub(crate) address: Box, + pub(crate) aux: MetricsAuxInfo, } -/// Async response which concludes the link auth flow. +/// Async response which concludes the web auth flow. /// Also known as `kickResponse` in the console. #[derive(Debug, Deserialize)] -pub struct KickSession<'a> { +pub(crate) struct KickSession<'a> { /// Session ID is assigned by the proxy. - pub session_id: &'a str, + pub(crate) session_id: &'a str, /// Compute node connection params. #[serde(deserialize_with = "KickSession::parse_db_info")] - pub result: DatabaseInfo, + pub(crate) result: DatabaseInfo, } impl KickSession<'_> { @@ -68,20 +275,20 @@ impl KickSession<'_> { /// Compute node connection params. #[derive(Deserialize)] -pub struct DatabaseInfo { - pub host: Box, - pub port: u16, - pub dbname: Box, - pub user: Box, +pub(crate) struct DatabaseInfo { + pub(crate) host: Box, + pub(crate) port: u16, + pub(crate) dbname: Box, + pub(crate) user: Box, /// Console always provides a password, but it might /// be inconvenient for debug with local PG instance. - pub password: Option>, - pub aux: MetricsAuxInfo, + pub(crate) password: Option>, + pub(crate) aux: MetricsAuxInfo, } // Manually implement debug to omit sensitive info. 
impl fmt::Debug for DatabaseInfo { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("DatabaseInfo") .field("host", &self.host) .field("port", &self.port) @@ -93,38 +300,71 @@ impl fmt::Debug for DatabaseInfo { /// Various labels for prometheus metrics. /// Also known as `ProxyMetricsAuxInfo` in the console. -#[derive(Debug, Deserialize, Clone, Default)] -pub struct MetricsAuxInfo { - pub endpoint_id: EndpointId, - pub project_id: ProjectId, - pub branch_id: BranchId, +#[derive(Debug, Deserialize, Clone)] +pub(crate) struct MetricsAuxInfo { + pub(crate) endpoint_id: EndpointIdInt, + pub(crate) project_id: ProjectIdInt, + pub(crate) branch_id: BranchIdInt, + #[serde(default)] + pub(crate) cold_start_info: ColdStartInfo, } -impl MetricsAuxInfo { - /// Definitions of labels for traffic metric. - pub const TRAFFIC_LABELS: &'static [&'static str] = &[ - // Received (rx) / sent (tx). - "direction", - // ID of a project. - "project_id", - // ID of an endpoint within a project. - "endpoint_id", - // ID of a branch within a project (snapshot). - "branch_id", - ]; +#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, FixedCardinalityLabel)] +#[serde(rename_all = "snake_case")] +pub enum ColdStartInfo { + #[default] + Unknown, + /// Compute was already running + Warm, + #[serde(rename = "pool_hit")] + #[label(rename = "pool_hit")] + /// Compute was not running but there was an available VM + VmPoolHit, + #[serde(rename = "pool_miss")] + #[label(rename = "pool_miss")] + /// Compute was not running and there were no VMs available + VmPoolMiss, - /// Values of labels for traffic metric. - // TODO: add more type safety (validate arity & positions). - pub fn traffic_labels(&self, direction: &'static str) -> [&str; 4] { - [ - direction, - &self.project_id, - &self.endpoint_id, - &self.branch_id, - ] + // not provided by control plane + /// Connection available from HTTP pool + HttpPoolHit, + /// Cached connection info + WarmCached, +} + +impl ColdStartInfo { + pub(crate) fn as_str(self) -> &'static str { + match self { + ColdStartInfo::Unknown => "unknown", + ColdStartInfo::Warm => "warm", + ColdStartInfo::VmPoolHit => "pool_hit", + ColdStartInfo::VmPoolMiss => "pool_miss", + ColdStartInfo::HttpPoolHit => "http_pool_hit", + ColdStartInfo::WarmCached => "warm_cached", + } } } +#[derive(Debug, Deserialize, Clone)] +pub struct JwksRoleMapping { + pub roles: HashMap, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct EndpointJwksResponse { + pub jwks: Vec, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct JwksSettings { + pub id: String, + pub project_id: ProjectIdInt, + pub branch_id: BranchIdInt, + pub jwks_url: url::Url, + pub provider_name: String, + pub jwt_audience: Option, +} + #[cfg(test)] mod tests { use super::*; @@ -135,6 +375,7 @@ mod tests { "endpoint_id": "endpoint", "project_id": "project", "branch_id": "branch", + "cold_start_info": "unknown", }) } @@ -154,7 +395,7 @@ mod tests { } } }); - let _: KickSession = serde_json::from_str(&json.to_string())?; + let _: KickSession<'_> = serde_json::from_str(&json.to_string())?; Ok(()) } diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index 373138b09e..2ed4f5f206 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -4,27 +4,28 @@ use crate::{ }; use anyhow::Context; use once_cell::sync::Lazy; -use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; +use postgres_backend::{AuthType, 
PostgresBackend, PostgresBackendTCP, QueryError}; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use std::{convert::Infallible, future}; +use std::convert::Infallible; use tokio::net::{TcpListener, TcpStream}; +use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, Instrument}; static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); /// Give caller an opportunity to wait for the cloud's reply. -pub fn get_waiter( +pub(crate) fn get_waiter( psql_session_id: impl Into, ) -> Result, waiters::RegisterError> { CPLANE_WAITERS.register(psql_session_id.into()) } -pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::NotifyError> { +pub(crate) fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::NotifyError> { CPLANE_WAITERS.notify(psql_session_id, msg) } /// Console management API listener task. -/// It spawns console response handlers needed for the link auth. +/// It spawns console response handlers needed for the web auth. pub async fn task_main(listener: TcpListener) -> anyhow::Result { scopeguard::defer! { info!("mgmt has shut down"); @@ -67,11 +68,13 @@ pub async fn task_main(listener: TcpListener) -> anyhow::Result { async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> { let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?; - pgbackend.run(&mut MgmtHandler, future::pending::<()>).await + pgbackend + .run(&mut MgmtHandler, &CancellationToken::new()) + .await } /// A message received by `mgmt` when a compute node is ready. -pub type ComputeReady = DatabaseInfo; +pub(crate) type ComputeReady = DatabaseInfo; // TODO: replace with an http-based protocol. struct MgmtHandler; @@ -90,7 +93,8 @@ impl postgres_backend::Handler for MgmtHandler { } fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> { - let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?; + let resp: KickSession<'_> = + serde_json::from_str(query).context("Failed to parse query as json")?; let span = info_span!("event", session_id = resp.session_id); let _enter = span.enter(); diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index a6dfbd79db..12a6e2f12a 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -1,43 +1,47 @@ -#[cfg(feature = "testing")] +#[cfg(any(test, feature = "testing"))] pub mod mock; pub mod neon; -use super::messages::MetricsAuxInfo; +use super::messages::{ConsoleError, MetricsAuxInfo}; use crate::{ - auth::{backend::ComputeUserInfo, IpPattern}, - cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, + auth::{ + backend::{ComputeCredentialKeys, ComputeUserInfo}, + IpPattern, + }, + cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, - config::{CacheOptions, ProjectInfoCacheOptions}, + config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, context::RequestMonitoring, - scram, EndpointCacheKey, ProjectId, + error::ReportableError, + intern::ProjectIdInt, + metrics::ApiLockMetrics, + rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}, + scram, EndpointCacheKey, }; -use async_trait::async_trait; use dashmap::DashMap; -use std::{sync::Arc, time::Duration}; -use tokio::sync::{OwnedSemaphorePermit, Semaphore}; +use std::{hash::Hash, sync::Arc, time::Duration}; use tokio::time::Instant; use tracing::info; -pub mod errors { +pub(crate) mod errors { use crate::{ - error::{io_error, UserFacingError}, - 
http, - proxy::retry::ShouldRetry, + console::messages::{self, ConsoleError, Reason}, + error::{io_error, ErrorKind, ReportableError, UserFacingError}, + proxy::retry::CouldRetry, }; use thiserror::Error; + use super::ApiLockError; + /// A go-to error message which doesn't leak any detail. - const REQUEST_FAILED: &str = "Console request failed"; + pub(crate) const REQUEST_FAILED: &str = "Console request failed"; /// Common console API error. #[derive(Debug, Error)] - pub enum ApiError { + pub(crate) enum ApiError { /// Error returned by the console itself. - #[error("{REQUEST_FAILED} with {}: {}", .status, .text)] - Console { - status: http::StatusCode, - text: Box, - }, + #[error("{REQUEST_FAILED} with {0}")] + Console(ConsoleError), /// Various IO errors like broken pipe or malformed payload. #[error("{REQUEST_FAILED}: {0}")] @@ -46,66 +50,85 @@ pub mod errors { impl ApiError { /// Returns HTTP status code if it's the reason for failure. - pub fn http_status_code(&self) -> Option { - use ApiError::*; + pub(crate) fn get_reason(&self) -> messages::Reason { match self { - Console { status, .. } => Some(*status), - _ => None, + ApiError::Console(e) => e.get_reason(), + ApiError::Transport(_) => messages::Reason::Unknown, } } } impl UserFacingError for ApiError { fn to_string_client(&self) -> String { - use ApiError::*; match self { // To minimize risks, only select errors are forwarded to users. - // Ask @neondatabase/control-plane for review before adding more. - Console { status, .. } => match *status { - http::StatusCode::NOT_FOUND => { - // Status 404: failed to get a project-related resource. - format!("{REQUEST_FAILED}: endpoint cannot be found") - } - http::StatusCode::NOT_ACCEPTABLE => { - // Status 406: endpoint is disabled (we don't allow connections). - format!("{REQUEST_FAILED}: endpoint is disabled") - } - http::StatusCode::LOCKED => { - // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded. - format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contact our support") - } - _ => REQUEST_FAILED.to_owned(), - }, - _ => REQUEST_FAILED.to_owned(), + ApiError::Console(c) => c.get_user_facing_message(), + ApiError::Transport(_) => REQUEST_FAILED.to_owned(), } } } - impl ShouldRetry for ApiError { + impl ReportableError for ApiError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ApiError::Console(e) => match e.get_reason() { + Reason::RoleProtected => ErrorKind::User, + Reason::ResourceNotFound => ErrorKind::User, + Reason::ProjectNotFound => ErrorKind::User, + Reason::EndpointNotFound => ErrorKind::User, + Reason::BranchNotFound => ErrorKind::User, + Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit, + Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::User, + Reason::ActiveTimeQuotaExceeded => ErrorKind::User, + Reason::ComputeTimeQuotaExceeded => ErrorKind::User, + Reason::WrittenDataQuotaExceeded => ErrorKind::User, + Reason::DataTransferQuotaExceeded => ErrorKind::User, + Reason::LogicalSizeQuotaExceeded => ErrorKind::User, + Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane, + Reason::LockAlreadyTaken => ErrorKind::ControlPlane, + Reason::RunningOperations => ErrorKind::ControlPlane, + Reason::Unknown => match &e { + ConsoleError { + http_status_code: + http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, + .. + } => crate::error::ErrorKind::User, + ConsoleError { + http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY, + error, + .. 
+ } if error + .contains("compute time quota of non-primary branches is exceeded") => + { + crate::error::ErrorKind::User + } + ConsoleError { + http_status_code: http::StatusCode::LOCKED, + error, + .. + } if error.contains("quota exceeded") + || error.contains("the limit for current plan reached") => + { + crate::error::ErrorKind::User + } + ConsoleError { + http_status_code: http::StatusCode::TOO_MANY_REQUESTS, + .. + } => crate::error::ErrorKind::ServiceRateLimit, + ConsoleError { .. } => crate::error::ErrorKind::ControlPlane, + }, + }, + ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane, + } + } + } + + impl CouldRetry for ApiError { fn could_retry(&self) -> bool { match self { // retry some transport errors Self::Transport(io) => io.could_retry(), - // retry some temporary failures because the compute was in a bad state - // (bad request can be returned when the endpoint was in transition) - Self::Console { - status: http::StatusCode::BAD_REQUEST, - .. - } => true, - // locked can be returned when the endpoint was in transition - // or when quotas are exceeded. don't retry when quotas are exceeded - Self::Console { - status: http::StatusCode::LOCKED, - ref text, - } => { - // written data quota exceeded - // data transfer quota exceeded - // compute time quota exceeded - // logical size quota exceeded - !text.contains("quota exceeded") - && !text.contains("the limit for current plan reached") - } - _ => false, + Self::Console(e) => e.could_retry(), } } } @@ -123,7 +146,7 @@ pub mod errors { } #[derive(Debug, Error)] - pub enum GetAuthInfoError { + pub(crate) enum GetAuthInfoError { // We shouldn't include the actual secret here. #[error("Console responded with a malformed auth secret")] BadSecret, @@ -141,25 +164,37 @@ pub mod errors { impl UserFacingError for GetAuthInfoError { fn to_string_client(&self) -> String { - use GetAuthInfoError::*; match self { // We absolutely should not leak any secrets! - BadSecret => REQUEST_FAILED.to_owned(), + Self::BadSecret => REQUEST_FAILED.to_owned(), // However, API might return a meaningful error. - ApiError(e) => e.to_string_client(), + Self::ApiError(e) => e.to_string_client(), } } } + + impl ReportableError for GetAuthInfoError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + Self::BadSecret => crate::error::ErrorKind::ControlPlane, + Self::ApiError(_) => crate::error::ErrorKind::ControlPlane, + } + } + } + #[derive(Debug, Error)] - pub enum WakeComputeError { + pub(crate) enum WakeComputeError { #[error("Console responded with a malformed compute address: {0}")] BadComputeAddress(Box), #[error(transparent)] ApiError(ApiError), - #[error("Timeout waiting to acquire wake compute lock")] - TimeoutError, + #[error("Too many connections attempts")] + TooManyConnections, + + #[error("error acquiring resource permit: {0}")] + TooManyConnectionAttempts(#[from] ApiLockError), } // This allows more useful interactions than `#[from]`. @@ -169,28 +204,42 @@ pub mod errors { } } - impl From for WakeComputeError { - fn from(_: tokio::sync::AcquireError) -> Self { - WakeComputeError::TimeoutError - } - } - impl From for WakeComputeError { - fn from(_: tokio::time::error::Elapsed) -> Self { - WakeComputeError::TimeoutError - } - } - impl UserFacingError for WakeComputeError { fn to_string_client(&self) -> String { - use WakeComputeError::*; match self { // We shouldn't show user the address even if it's broken. // Besides, user is unlikely to care about this detail. 
-                BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
+                Self::BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
 
                 // However, API might return a meaningful error.
-                ApiError(e) => e.to_string_client(),
+                Self::ApiError(e) => e.to_string_client(),
 
-                TimeoutError => "timeout while acquiring the compute resource lock".to_owned(),
+                Self::TooManyConnections => self.to_string(),
+
+                Self::TooManyConnectionAttempts(_) => {
+                    "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
+                }
             }
         }
     }
+
+    impl ReportableError for WakeComputeError {
+        fn get_error_kind(&self) -> crate::error::ErrorKind {
+            match self {
+                Self::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
+                Self::ApiError(e) => e.get_error_kind(),
+                Self::TooManyConnections => crate::error::ErrorKind::RateLimit,
+                Self::TooManyConnectionAttempts(e) => e.get_error_kind(),
+            }
+        }
+    }
+
+    impl CouldRetry for WakeComputeError {
+        fn could_retry(&self) -> bool {
+            match self {
+                Self::BadComputeAddress(_) => false,
+                Self::ApiError(e) => e.could_retry(),
+                Self::TooManyConnections => false,
+                Self::TooManyConnectionAttempts(_) => false,
+            }
+        }
+    }
@@ -198,8 +247,8 @@
 /// Auth secret which is managed by the cloud.
 #[derive(Clone, Eq, PartialEq, Debug)]
-pub enum AuthSecret {
-    #[cfg(feature = "testing")]
+pub(crate) enum AuthSecret {
+    #[cfg(any(test, feature = "testing"))]
     /// Md5 hash of user's password.
     Md5([u8; 16]),
 
@@ -208,110 +257,145 @@
 }
 
 #[derive(Default)]
-pub struct AuthInfo {
-    pub secret: Option<AuthSecret>,
+pub(crate) struct AuthInfo {
+    pub(crate) secret: Option<AuthSecret>,
     /// List of IP addresses allowed for the authorization.
-    pub allowed_ips: Vec<IpPattern>,
+    pub(crate) allowed_ips: Vec<IpPattern>,
     /// Project ID. This is used for cache invalidation.
-    pub project_id: Option<ProjectId>,
+    pub(crate) project_id: Option<ProjectIdInt>,
 }
 
 /// Info for establishing a connection to a compute node.
 /// This is what we get after auth succeeded, but not before!
 #[derive(Clone)]
-pub struct NodeInfo {
+pub(crate) struct NodeInfo {
     /// Compute node connection params.
     /// It's sad that we have to clone this, but this will improve
     /// once we migrate to a bespoke connection logic.
-    pub config: compute::ConnCfg,
+    pub(crate) config: compute::ConnCfg,
 
     /// Labels for proxy's metrics.
- pub aux: MetricsAuxInfo, + pub(crate) aux: MetricsAuxInfo, /// Whether we should accept self-signed certificates (for testing) - pub allow_self_signed_compute: bool, + pub(crate) allow_self_signed_compute: bool, } -pub type NodeInfoCache = TimedLru; -pub type CachedNodeInfo = Cached<&'static NodeInfoCache>; -pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; -pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; +impl NodeInfo { + pub(crate) async fn connect( + &self, + ctx: &RequestMonitoring, + timeout: Duration, + ) -> Result { + self.config + .connect( + ctx, + self.allow_self_signed_compute, + self.aux.clone(), + timeout, + ) + .await + } + pub(crate) fn reuse_settings(&mut self, other: Self) { + self.allow_self_signed_compute = other.allow_self_signed_compute; + self.config.reuse_password(other.config); + } + + pub(crate) fn set_keys(&mut self, keys: &ComputeCredentialKeys) { + match keys { + ComputeCredentialKeys::Password(password) => self.config.password(password), + ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys), + ComputeCredentialKeys::None => &mut self.config, + }; + } +} + +pub(crate) type NodeInfoCache = TimedLru>>; +pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; +pub(crate) type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; +pub(crate) type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; /// This will allocate per each call, but the http requests alone /// already require a few allocations, so it should be fine. -#[async_trait] -pub trait Api { +pub(crate) trait Api { /// Get the client's auth secret for authentication. /// Returns option because user not found situation is special. /// We still have to mock the scram to avoid leaking information that user doesn't exist. async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result; - async fn get_allowed_ips( + async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, - ) -> Result; + ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; /// Wake up the compute node and return the corresponding connection info. async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result; } -#[derive(Clone)] +#[non_exhaustive] pub enum ConsoleBackend { /// Current Cloud API (V2). Console(neon::Api), /// Local mock of Cloud API (V2). 
- #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] Postgres(mock::Api), + /// Internal testing + #[cfg(test)] + #[allow(private_interfaces)] + Test(Box), } -#[async_trait] impl Api for ConsoleBackend { async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - use ConsoleBackend::*; match self { - Console(api) => api.get_role_secret(ctx, user_info).await, - #[cfg(feature = "testing")] - Postgres(api) => api.get_role_secret(ctx, user_info).await, + Self::Console(api) => api.get_role_secret(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Self::Postgres(api) => api.get_role_secret(ctx, user_info).await, + #[cfg(test)] + Self::Test(_) => { + unreachable!("this function should never be called in the test backend") + } } } - async fn get_allowed_ips( + async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, - ) -> Result { - use ConsoleBackend::*; + ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { match self { - Console(api) => api.get_allowed_ips(ctx, user_info).await, - #[cfg(feature = "testing")] - Postgres(api) => api.get_allowed_ips(ctx, user_info).await, + Self::Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Self::Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + #[cfg(test)] + Self::Test(api) => api.get_allowed_ips_and_secret(), } } async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - use ConsoleBackend::*; - match self { - Console(api) => api.wake_compute(ctx, user_info).await, - #[cfg(feature = "testing")] - Postgres(api) => api.wake_compute(ctx, user_info).await, + Self::Console(api) => api.wake_compute(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Self::Postgres(api) => api.wake_compute(ctx, user_info).await, + #[cfg(test)] + Self::Test(api) => api.wake_compute(), } } } @@ -319,15 +403,18 @@ impl Api for ConsoleBackend { /// Various caches for [`console`](super). pub struct ApiCaches { /// Cache for the `wake_compute` API method. - pub node_info: NodeInfoCache, + pub(crate) node_info: NodeInfoCache, /// Cache which stores project_id -> endpoint_ids mapping. pub project_info: Arc, + /// List of all valid endpoints. + pub endpoints_cache: Arc, } impl ApiCaches { pub fn new( wake_compute_cache_config: CacheOptions, project_info_cache_config: ProjectInfoCacheOptions, + endpoint_cache_config: EndpointCacheConfig, ) -> Self { Self { node_info: NodeInfoCache::new( @@ -337,85 +424,59 @@ impl ApiCaches { true, ), project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), + endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)), } } } /// Various caches for [`console`](super). 
-pub struct ApiLocks { +pub struct ApiLocks { name: &'static str, - node_locks: DashMap>, - permits: usize, + node_locks: DashMap>, + config: RateLimiterConfig, timeout: Duration, - registered: prometheus::IntCounter, - unregistered: prometheus::IntCounter, - reclamation_lag: prometheus::Histogram, - lock_acquire_lag: prometheus::Histogram, + epoch: std::time::Duration, + metrics: &'static ApiLockMetrics, } -impl ApiLocks { +#[derive(Debug, thiserror::Error)] +pub(crate) enum ApiLockError { + #[error("timeout acquiring resource permit")] + TimeoutError(#[from] tokio::time::error::Elapsed), +} + +impl ReportableError for ApiLockError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit, + } + } +} + +impl ApiLocks { pub fn new( name: &'static str, - permits: usize, + config: RateLimiterConfig, shards: usize, timeout: Duration, + epoch: std::time::Duration, + metrics: &'static ApiLockMetrics, ) -> prometheus::Result { - let registered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_registered", - "Number of semaphores registered in this api lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(registered.clone()))?; - let unregistered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_unregistered", - "Number of semaphores unregistered in this api lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(unregistered.clone()))?; - let reclamation_lag = prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "reclamation_lag_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 1us -> 65ms - // benchmarks on my mac indicate it's usually in the range of 256us and 512us - .buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?), - )?; - prometheus::register(Box::new(reclamation_lag.clone()))?; - let lock_acquire_lag = prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "semaphore_acquire_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 0.1ms -> 6s - .buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?), - )?; - prometheus::register(Box::new(lock_acquire_lag.clone()))?; - Ok(Self { name, node_locks: DashMap::with_shard_amount(shards), - permits, + config, timeout, - lock_acquire_lag, - registered, - unregistered, - reclamation_lag, + epoch, + metrics, }) } - pub async fn get_wake_compute_permit( - &self, - key: &EndpointCacheKey, - ) -> Result { - if self.permits == 0 { - return Ok(WakeComputePermit { permit: None }); + pub(crate) async fn get_permit(&self, key: &K) -> Result { + if self.config.initial_limit == 0 { + return Ok(WakeComputePermit { + permit: Token::disabled(), + }); } let now = Instant::now(); let semaphore = { @@ -426,28 +487,27 @@ impl ApiLocks { self.node_locks .entry(key.clone()) .or_insert_with(|| { - self.registered.inc(); - Arc::new(Semaphore::new(self.permits)) + self.metrics.semaphores_registered.inc(); + DynamicLimiter::new(self.config) }) .clone() } }; - let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await; + let permit = semaphore.acquire_timeout(self.timeout).await; - self.lock_acquire_lag - .observe((Instant::now() - now).as_secs_f64()); - - Ok(WakeComputePermit { - permit: Some(permit??), - }) + self.metrics + .semaphore_acquire_seconds + .observe(now.elapsed().as_secs_f64()); + info!("acquired permit {:?}", 
now.elapsed().as_secs_f64()); + Ok(WakeComputePermit { permit: permit? }) } - pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) { - if self.permits == 0 { + pub async fn garbage_collect_worker(&self) { + if self.config.initial_limit == 0 { return; } - - let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32); + let mut interval = + tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32); loop { for (i, shard) in self.node_locks.shards().iter().enumerate() { interval.tick().await; @@ -460,25 +520,34 @@ impl ApiLocks { "performing epoch reclamation on api lock" ); let mut lock = shard.write(); - let timer = self.reclamation_lag.start_timer(); + let timer = self.metrics.reclamation_lag_seconds.start_timer(); let count = lock .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) .count(); drop(lock); - self.unregistered.inc_by(count as u64); - timer.observe_duration() + self.metrics.semaphores_unregistered.inc_by(count as u64); + timer.observe(); } } } } -pub struct WakeComputePermit { - // None if the lock is disabled - permit: Option, +pub(crate) struct WakeComputePermit { + permit: Token, } impl WakeComputePermit { - pub fn should_check_cache(&self) -> bool { - self.permit.is_some() + pub(crate) fn should_check_cache(&self) -> bool { + !self.permit.is_disabled() + } + pub(crate) fn release(self, outcome: Outcome) { + self.permit.release(outcome); + } + pub(crate) fn release_result(self, res: Result) -> Result { + match res { + Ok(_) => self.release(Outcome::Success), + Err(_) => self.release(Outcome::Overload), + } + res } } diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 55f395a403..08b87cd87a 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -4,11 +4,16 @@ use super::{ errors::{ApiError, GetAuthInfoError, WakeComputeError}, AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, }; -use crate::console::provider::{CachedAllowedIps, CachedRoleSecret}; use crate::context::RequestMonitoring; use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; use crate::{auth::IpPattern, cache::Cached}; -use async_trait::async_trait; +use crate::{ + console::{ + messages::MetricsAuxInfo, + provider::{CachedAllowedIps, CachedRoleSecret}, + }, + BranchId, EndpointId, ProjectId, +}; use futures::TryFutureExt; use std::{str::FromStr, sync::Arc}; use thiserror::Error; @@ -43,7 +48,7 @@ impl Api { Self { endpoint } } - pub fn url(&self) -> &str { + pub(crate) fn url(&self) -> &str { self.endpoint.as_str() } @@ -59,7 +64,7 @@ impl Api { tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; tokio::spawn(connection); - let secret = match get_execute_postgres_query( + let secret = if let Some(entry) = get_execute_postgres_query( &client, "select rolpassword from pg_catalog.pg_authid where rolname = $1", &[&&*user_info.user], @@ -67,15 +72,12 @@ impl Api { ) .await? 
{ - Some(entry) => { - info!("got a secret: {entry}"); // safe since it's not a prod scenario - let secret = scram::ServerSecret::parse(&entry).map(AuthSecret::Scram); - secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5)) - } - None => { - warn!("user '{}' does not exist", user_info.user); - None - } + info!("got a secret: {entry}"); // safe since it's not a prod scenario + let secret = scram::ServerSecret::parse(&entry).map(AuthSecret::Scram); + secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5)) + } else { + warn!("user '{}' does not exist", user_info.user); + None }; let allowed_ips = match get_execute_postgres_query( &client, @@ -115,7 +117,12 @@ impl Api { let node = NodeInfo { config, - aux: Default::default(), + aux: MetricsAuxInfo { + endpoint_id: (&EndpointId::from("endpoint")).into(), + project_id: (&ProjectId::from("project")).into(), + branch_id: (&BranchId::from("branch")).into(), + cold_start_info: crate::console::messages::ColdStartInfo::Warm, + }, allow_self_signed_compute: false, }; @@ -132,24 +139,22 @@ async fn get_execute_postgres_query( let rows = client.query(query, params).await?; // We can get at most one row, because `rolname` is unique. - let row = match rows.first() { - Some(row) => row, + let Some(row) = rows.first() else { // This means that the user doesn't exist, so there can be no secret. // However, this is still a *valid* outcome which is very similar // to getting `404 Not found` from the Neon console. - None => return Ok(None), + return Ok(None); }; let entry = row.try_get(idx).map_err(MockApiError::PasswordNotSet)?; Ok(Some(entry)) } -#[async_trait] impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { Ok(CachedRoleSecret::new_uncached( @@ -157,25 +162,26 @@ impl super::Api for Api { )) } - async fn get_allowed_ips( + async fn get_allowed_ips_and_secret( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, user_info: &ComputeUserInfo, - ) -> Result { - Ok(Cached::new_uncached(Arc::new( - self.do_get_auth_info(user_info).await?.allowed_ips, - ))) + ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + Ok(( + Cached::new_uncached(Arc::new( + self.do_get_auth_info(user_info).await?.allowed_ips, + )), + None, + )) } #[tracing::instrument(skip_all)] async fn wake_compute( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _user_info: &ComputeUserInfo, ) -> Result { - self.do_wake_compute() - .map_ok(CachedNodeInfo::new_uncached) - .await + self.do_wake_compute().map_ok(Cached::new_uncached).await } } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 33618faed8..b004bf4ecf 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -6,24 +6,27 @@ use super::{ ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, }; -use crate::{auth::backend::ComputeUserInfo, compute, http, scram}; use crate::{ - cache::Cached, - context::RequestMonitoring, - metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}, + auth::backend::ComputeUserInfo, + compute, + console::messages::{ColdStartInfo, Reason}, + http, + metrics::{CacheOutcome, Metrics}, + rate_limiter::WakeComputeRateLimiter, + scram, EndpointCacheKey, }; -use async_trait::async_trait; +use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; -use std::sync::Arc; +use 
std::{sync::Arc, time::Duration}; use tokio::time::Instant; use tokio_postgres::config::SslMode; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{debug, error, info, info_span, warn, Instrument}; -#[derive(Clone)] pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, - locks: &'static ApiLocks, + pub(crate) locks: &'static ApiLocks, + pub(crate) wake_compute_endpoint_rate_limiter: Arc, jwt: String, } @@ -32,30 +35,38 @@ impl Api { pub fn new( endpoint: http::Endpoint, caches: &'static ApiCaches, - locks: &'static ApiLocks, + locks: &'static ApiLocks, + wake_compute_endpoint_rate_limiter: Arc, ) -> Self { - let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { - Ok(v) => v, - Err(_) => "".to_string(), - }; + let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN").unwrap_or_default(); Self { endpoint, caches, locks, + wake_compute_endpoint_rate_limiter, jwt, } } - pub fn url(&self) -> &str { + pub(crate) fn url(&self) -> &str { self.endpoint.url().as_str() } async fn do_get_auth_info( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let request_id = uuid::Uuid::new_v4().to_string(); + if !self + .caches + .endpoints_cache + .is_valid(ctx, &user_info.endpoint.normalize()) + .await + { + info!("endpoint is not valid, skipping the request"); + return Ok(AuthInfo::default()); + } + let request_id = ctx.session_id().to_string(); let application_name = ctx.console_application_name(); async { let request = self @@ -63,7 +74,7 @@ impl Api { .get("proxy_get_role_secret") .header("X-Request-ID", &request_id) .header("Authorization", format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id)]) + .query(&[("session_id", ctx.session_id())]) .query(&[ ("application_name", application_name.as_str()), ("project", user_info.endpoint.as_str()), @@ -73,15 +84,21 @@ impl Api { info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; + drop(pause); info!(duration = ?start.elapsed(), "received http response"); let body = match parse_body::(response).await { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. 
- Err(e) => match e.http_status_code() { - Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()), - _otherwise => return Err(e.into()), - }, + // TODO(anna): retry + Err(e) => { + return if e.get_reason().is_not_found() { + Ok(AuthInfo::default()) + } else { + Err(e.into()) + } + } }; let secret = if body.role_secret.is_empty() { @@ -93,7 +110,10 @@ impl Api { Some(secret) }; let allowed_ips = body.allowed_ips.unwrap_or_default(); - ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64); + Metrics::get() + .proxy + .allowed_ips_number + .observe(allowed_ips.len() as f64); Ok(AuthInfo { secret, allowed_ips, @@ -107,10 +127,10 @@ impl Api { async fn do_wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let request_id = uuid::Uuid::new_v4().to_string(); + let request_id = ctx.session_id().to_string(); let application_name = ctx.console_application_name(); async { let mut request_builder = self @@ -118,7 +138,7 @@ impl Api { .get("proxy_wake_compute") .header("X-Request-ID", &request_id) .header("Authorization", format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id)]) + .query(&[("session_id", ctx.session_id())]) .query(&[ ("application_name", application_name.as_str()), ("project", user_info.endpoint.as_str()), @@ -133,7 +153,9 @@ impl Api { info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; + drop(pause); info!(duration = ?start.elapsed(), "received http response"); let body = parse_body::(response).await?; @@ -163,102 +185,176 @@ impl Api { } } -#[async_trait] impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let ep = &user_info.endpoint; + let normalized_ep = &user_info.endpoint.normalize(); let user = &user_info.user; - if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) { + if let Some(role_secret) = self + .caches + .project_info + .get_role_secret(normalized_ep, user) + { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( - &project_id, - ep, - user, + project_id, + normalized_ep_int, + user.into(), auth_info.secret.clone(), ); self.caches.project_info.insert_allowed_ips( - &project_id, - ep, + project_id, + normalized_ep_int, Arc::new(auth_info.allowed_ips), ); + ctx.set_project_id(project_id); } // When we just got a secret, we don't need to invalidate it. 
Ok(Cached::new_uncached(auth_info.secret)) } - async fn get_allowed_ips( + async fn get_allowed_ips_and_secret( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, - ) -> Result { - let ep = &user_info.endpoint; - if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["hit"]) - .inc(); - return Ok(allowed_ips); + ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Hit); + return Ok((allowed_ips, None)); } - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["miss"]) - .inc(); + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Miss); let auth_info = self.do_get_auth_info(ctx, user_info).await?; let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( - &project_id, - ep, - user, + project_id, + normalized_ep_int, + user.into(), auth_info.secret.clone(), ); - self.caches - .project_info - .insert_allowed_ips(&project_id, ep, allowed_ips.clone()); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); + ctx.set_project_id(project_id); } - Ok(Cached::new_uncached(allowed_ips)) + Ok(( + Cached::new_uncached(allowed_ips), + Some(Cached::new_uncached(auth_info.secret)), + )) } #[tracing::instrument(skip_all)] async fn wake_compute( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { let key = user_info.endpoint_cache_key(); + macro_rules! check_cache { + () => { + if let Some(cached) = self.caches.node_info.get(&key) { + let (cached, info) = cached.take_value(); + let info = info.map_err(|c| { + info!(key = &*key, "found cached wake_compute error"); + WakeComputeError::ApiError(ApiError::Console(*c)) + })?; + + debug!(key = &*key, "found cached compute node info"); + ctx.set_project(info.aux.clone()); + return Ok(cached.map(|()| info)); + } + }; + } + // Every time we do a wakeup http request, the compute node will stay up // for some time (highly depends on the console's scale-to-zero policy); // The connection info remains the same during that period of time, // which means that we might cache it to reduce the load and latency. 
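// --- Editorial example (not part of the diff): the negative-caching idea ----
// used by the wake_compute hunk below, in miniature. Successes go into the
// node-info cache re-tagged as WarmCached, while non-retryable errors are
// cached with a short TTL (Duration::from_secs(30) below) so a broken
// endpoint does not hammer the control plane. This toy HashMap cache only
// illustrates the TTL mechanics; the crate itself uses its TimedLru, and
// every name here is invented.
use std::collections::HashMap;
use std::time::{Duration, Instant};

struct TtlCache<V> {
    entries: HashMap<String, (Instant, V)>,
}

impl<V: Clone> TtlCache<V> {
    fn new() -> Self {
        Self { entries: HashMap::new() }
    }

    // Store a value together with its expiry deadline.
    fn insert_ttl(&mut self, key: String, value: V, ttl: Duration) {
        self.entries.insert(key, (Instant::now() + ttl, value));
    }

    // A hit past its deadline is evicted and reported as a miss.
    fn get(&mut self, key: &str) -> Option<V> {
        let expired = match self.entries.get(key) {
            Some((deadline, _)) => *deadline <= Instant::now(),
            None => return None,
        };
        if expired {
            self.entries.remove(key);
            return None;
        }
        self.entries.get(key).map(|(_, v)| v.clone())
    }
}

fn main() {
    let mut cache: TtlCache<Result<&'static str, &'static str>> = TtlCache::new();

    // A quota error is worth remembering briefly...
    cache.insert_ttl("ep-foo".into(), Err("quota exceeded"), Duration::from_secs(30));
    assert_eq!(cache.get("ep-foo"), Some(Err("quota exceeded")));

    // ...while an entry past its deadline behaves like a miss.
    cache.insert_ttl("ep-bar".into(), Ok("10.0.0.1:5432"), Duration::from_secs(0));
    assert_eq!(cache.get("ep-bar"), None);
}
// --- End of editorial example ------------------------------------------------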
- if let Some(cached) = self.caches.node_info.get(&key) { - info!(key = &*key, "found cached compute node info"); - return Ok(cached); - } + check_cache!(); - let permit = self.locks.get_wake_compute_permit(&key).await?; + let permit = self.locks.get_permit(&key).await?; // after getting back a permit - it's possible the cache was filled // double check if permit.should_check_cache() { - if let Some(cached) = self.caches.node_info.get(&key) { - info!(key = &*key, "found cached compute node info"); - return Ok(cached); - } + check_cache!(); } - let node = self.do_wake_compute(ctx, user_info).await?; - let (_, cached) = self.caches.node_info.insert(key.clone(), node); - info!(key = &*key, "created a cache entry for compute node info"); + // check rate limit + if !self + .wake_compute_endpoint_rate_limiter + .check(user_info.endpoint.normalize_intern(), 1) + { + return Err(WakeComputeError::TooManyConnections); + } - Ok(cached) + let node = permit.release_result(self.do_wake_compute(ctx, user_info).await); + match node { + Ok(node) => { + ctx.set_project(node.aux.clone()); + debug!(key = &*key, "created a cache entry for woken compute node"); + + let mut stored_node = node.clone(); + // store the cached node as 'warm_cached' + stored_node.aux.cold_start_info = ColdStartInfo::WarmCached; + + let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node)); + + Ok(cached.map(|()| node)) + } + Err(err) => match err { + WakeComputeError::ApiError(ApiError::Console(err)) => { + let Some(status) = &err.status else { + return Err(WakeComputeError::ApiError(ApiError::Console(err))); + }; + + let reason = status + .details + .error_info + .map_or(Reason::Unknown, |x| x.reason); + + // if we can retry this error, do not cache it. + if reason.can_retry() { + return Err(WakeComputeError::ApiError(ApiError::Console(err))); + } + + // at this point, we should only have quota errors. + debug!( + key = &*key, + "created a cache entry for the wake compute error" + ); + + self.caches.node_info.insert_ttl( + key, + Err(Box::new(err.clone())), + Duration::from_secs(30), + ); + + Err(WakeComputeError::ApiError(ApiError::Console(err))) + } + err => return Err(err), + }, + } } } @@ -272,19 +368,24 @@ async fn parse_body serde::Deserialize<'a>>( info!("request succeeded, processing the body"); return Ok(response.json().await?); } + let s = response.bytes().await?; + // Log plaintext to be able to detect, whether there are some cases not covered by the error struct. + info!("response_error plaintext: {:?}", s); // Don't throw an error here because it's not as important // as the fact that the request itself has failed. 
-    let body = response.json().await.unwrap_or_else(|e| {
+    let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| {
         warn!("failed to parse error body: {e}");
         ConsoleError {
             error: "reason unclear (malformed error message)".into(),
+            http_status_code: status,
+            status: None,
         }
     });
+    body.http_status_code = status;
 
-    let text = body.error;
-    error!("console responded with an error ({status}): {text}");
-    Err(ApiError::Console { status, text })
+    error!("console responded with an error ({status}): {body:?}");
+    Err(ApiError::Console(body))
 }
 
 fn parse_host_port(input: &str) -> Option<(&str, u16)> {
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index 9e2ea10031..72e1fa1cee 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -2,118 +2,386 @@
 use chrono::Utc;
 use once_cell::sync::OnceCell;
+use pq_proto::StartupMessageParams;
 use smol_str::SmolStr;
 use std::net::IpAddr;
 use tokio::sync::mpsc;
+use tracing::{field::display, info, info_span, Span};
+use try_lock::TryLock;
 use uuid::Uuid;
 
 use crate::{
-    console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer, BranchId,
-    EndpointId, ProjectId, RoleName,
+    console::messages::{ColdStartInfo, MetricsAuxInfo},
+    error::ErrorKind,
+    intern::{BranchIdInt, ProjectIdInt},
+    metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting},
+    DbName, EndpointId, RoleName,
 };
+
+use self::parquet::RequestData;
 
 pub mod parquet;
 
-static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestMonitoring>> = OnceCell::new();
+pub(crate) static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::new();
+pub(crate) static LOG_CHAN_DISCONNECT: OnceCell<mpsc::WeakUnboundedSender<RequestData>> =
+    OnceCell::new();
 
-#[derive(Clone)]
 /// Context data for a single request to connect to a database.
 ///
 /// This data should **not** be used for connection logic, only for observability and limiting purposes.
 /// All connection logic should instead use strongly typed state machines, not a bunch of Options.
-pub struct RequestMonitoring {
-    pub peer_addr: IpAddr,
-    pub session_id: Uuid,
-    pub protocol: &'static str,
+pub struct RequestMonitoring(
+    /// To allow easier use of the ctx object, we have interior mutability.
+    /// I would typically use a RefCell but that would break the `Send` requirements
+    /// so we need something with thread-safety. `TryLock` is a cheap alternative
+    /// that offers similar semantics to a `RefCell` but with synchronisation.
+    TryLock<RequestMonitoringInner>,
+);
+
+struct RequestMonitoringInner {
+    pub(crate) peer_addr: IpAddr,
+    pub(crate) session_id: Uuid,
+    pub(crate) protocol: Protocol,
     first_packet: chrono::DateTime<Utc>,
     region: &'static str,
+    pub(crate) span: Span,
 
     // filled in as they are discovered
-    project: Option<ProjectId>,
-    branch: Option<BranchId>,
+    project: Option<ProjectIdInt>,
+    branch: Option<BranchIdInt>,
     endpoint_id: Option<EndpointId>,
+    dbname: Option<DbName>,
     user: Option<RoleName>,
     application: Option<SmolStr>,
     error_kind: Option<ErrorKind>,
+    pub(crate) auth_method: Option<AuthMethod>,
     success: bool,
+    pub(crate) cold_start_info: ColdStartInfo,
+    pg_options: Option<StartupMessageParams>,
 
     // extra
     // This sender is here to keep the request monitoring channel open while requests are taking place.
-    sender: Option<mpsc::UnboundedSender<RequestMonitoring>>,
-    pub latency_timer: LatencyTimer,
+    sender: Option<mpsc::UnboundedSender<RequestData>>,
+    // This sender is only used to log the length of the session in case of success.
+    disconnect_sender: Option<mpsc::UnboundedSender<RequestData>>,
+    pub(crate) latency_timer: LatencyTimer,
+    // Whether the proxy decided that it's not a valid endpoint and rejected it before going to cplane.
+ rejected: Option, + disconnect_timestamp: Option>, +} + +#[derive(Clone, Debug)] +pub(crate) enum AuthMethod { + // aka passwordless, fka link + Web, + ScramSha256, + ScramSha256Plus, + Cleartext, } impl RequestMonitoring { pub fn new( session_id: Uuid, peer_addr: IpAddr, - protocol: &'static str, + protocol: Protocol, region: &'static str, ) -> Self { - Self { + let span = info_span!( + "connect_request", + %protocol, + ?session_id, + %peer_addr, + ep = tracing::field::Empty, + role = tracing::field::Empty, + ); + + let inner = RequestMonitoringInner { peer_addr, session_id, protocol, first_packet: Utc::now(), region, + span, project: None, branch: None, endpoint_id: None, + dbname: None, user: None, application: None, error_kind: None, + auth_method: None, success: false, + rejected: None, + cold_start_info: ColdStartInfo::Unknown, + pg_options: None, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), + disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), - } + disconnect_timestamp: None, + }; + + Self(TryLock::new(inner)) } #[cfg(test)] - pub fn test() -> Self { - RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), "test", "test") + pub(crate) fn test() -> Self { + RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test") } - pub fn console_application_name(&self) -> String { + pub(crate) fn console_application_name(&self) -> String { + let this = self.0.try_lock().expect("should not deadlock"); format!( "{}/{}", - self.application.as_deref().unwrap_or_default(), - self.protocol + this.application.as_deref().unwrap_or_default(), + this.protocol ) } - pub fn set_project(&mut self, x: MetricsAuxInfo) { - self.branch = Some(x.branch_id); - self.endpoint_id = Some(x.endpoint_id); - self.project = Some(x.project_id); + pub(crate) fn set_rejected(&self, rejected: bool) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.rejected = Some(rejected); } - pub fn set_endpoint_id(&mut self, endpoint_id: Option) { - self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone()); + pub(crate) fn set_cold_start_info(&self, info: ColdStartInfo) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_cold_start_info(info); } - pub fn set_application(&mut self, app: Option) { - self.application = app.or_else(|| self.application.clone()); + pub(crate) fn set_db_options(&self, options: StartupMessageParams) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.set_application(options.get("application_name").map(SmolStr::from)); + if let Some(user) = options.get("user") { + this.set_user(user.into()); + } + if let Some(dbname) = options.get("database") { + this.set_dbname(dbname.into()); + } + + this.pg_options = Some(options); } - pub fn set_user(&mut self, user: RoleName) { + pub(crate) fn set_project(&self, x: MetricsAuxInfo) { + let mut this = self.0.try_lock().expect("should not deadlock"); + if this.endpoint_id.is_none() { + this.set_endpoint_id(x.endpoint_id.as_str().into()); + } + this.branch = Some(x.branch_id); + this.project = Some(x.project_id); + this.set_cold_start_info(x.cold_start_info); + } + + pub(crate) fn set_project_id(&self, project_id: ProjectIdInt) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.project = Some(project_id); + } + + pub(crate) fn set_endpoint_id(&self, endpoint_id: EndpointId) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_endpoint_id(endpoint_id); + } + + 
pub(crate) fn set_dbname(&self, dbname: DbName) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_dbname(dbname); + } + + pub(crate) fn set_user(&self, user: RoleName) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_user(user); + } + + pub(crate) fn set_auth_method(&self, auth_method: AuthMethod) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.auth_method = Some(auth_method); + } + + pub fn has_private_peer_addr(&self) -> bool { + self.0 + .try_lock() + .expect("should not deadlock") + .has_private_peer_addr() + } + + pub(crate) fn set_error_kind(&self, kind: ErrorKind) { + let mut this = self.0.try_lock().expect("should not deadlock"); + // Do not record errors from the private address to metrics. + if !this.has_private_peer_addr() { + Metrics::get().proxy.errors_total.inc(kind); + } + if let Some(ep) = &this.endpoint_id { + let metric = &Metrics::get().proxy.endpoints_affected_by_errors; + let label = metric.with_labels(kind); + metric.get_metric(label).measure(ep); + } + this.error_kind = Some(kind); + } + + pub fn set_success(&self) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.success = true; + } + + pub fn log_connect(&self) { + self.0 + .try_lock() + .expect("should not deadlock") + .log_connect(); + } + + pub(crate) fn protocol(&self) -> Protocol { + self.0.try_lock().expect("should not deadlock").protocol + } + + pub(crate) fn span(&self) -> Span { + self.0.try_lock().expect("should not deadlock").span.clone() + } + + pub(crate) fn session_id(&self) -> Uuid { + self.0.try_lock().expect("should not deadlock").session_id + } + + pub(crate) fn peer_addr(&self) -> IpAddr { + self.0.try_lock().expect("should not deadlock").peer_addr + } + + pub(crate) fn cold_start_info(&self) -> ColdStartInfo { + self.0 + .try_lock() + .expect("should not deadlock") + .cold_start_info + } + + pub(crate) fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause<'_> { + LatencyTimerPause { + ctx: self, + start: tokio::time::Instant::now(), + waiting_for, + } + } + + pub(crate) fn success(&self) { + self.0 + .try_lock() + .expect("should not deadlock") + .latency_timer + .success(); + } +} + +pub(crate) struct LatencyTimerPause<'a> { + ctx: &'a RequestMonitoring, + start: tokio::time::Instant, + waiting_for: Waiting, +} + +impl Drop for LatencyTimerPause<'_> { + fn drop(&mut self) { + self.ctx + .0 + .try_lock() + .expect("should not deadlock") + .latency_timer + .unpause(self.start, self.waiting_for); + } +} + +impl RequestMonitoringInner { + fn set_cold_start_info(&mut self, info: ColdStartInfo) { + self.cold_start_info = info; + self.latency_timer.cold_start_info(info); + } + + fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { + if self.endpoint_id.is_none() { + self.span.record("ep", display(&endpoint_id)); + let metric = &Metrics::get().proxy.connecting_endpoints; + let label = metric.with_labels(self.protocol); + metric.get_metric(label).measure(&endpoint_id); + self.endpoint_id = Some(endpoint_id); + } + } + + fn set_application(&mut self, app: Option) { + if let Some(app) = app { + self.application = Some(app); + } + } + + fn set_dbname(&mut self, dbname: DbName) { + self.dbname = Some(dbname); + } + + fn set_user(&mut self, user: RoleName) { + self.span.record("role", display(&user)); self.user = Some(user); } - pub fn set_success(&mut self) { - self.success = true; + fn has_private_peer_addr(&self) -> bool { + match self.peer_addr { + IpAddr::V4(ip) => ip.is_private(), + IpAddr::V6(_) 
=> false, + } } - pub fn log(&mut self) { + fn log_connect(&mut self) { + let outcome = if self.success { + ConnectOutcome::Success + } else { + ConnectOutcome::Failed + }; + if let Some(rejected) = self.rejected { + let ep = self + .endpoint_id + .as_ref() + .map(|x| x.as_str()) + .unwrap_or_default(); + // This makes sense only if cache is disabled + info!( + ?outcome, + ?rejected, + ?ep, + "check endpoint is valid with outcome" + ); + Metrics::get() + .proxy + .invalid_endpoints_total + .inc(InvalidEndpointsGroup { + protocol: self.protocol, + rejected: rejected.into(), + outcome, + }); + } if let Some(tx) = self.sender.take() { - let _: Result<(), _> = tx.send(self.clone()); + let _: Result<(), _> = tx.send(RequestData::from(&*self)); + } + } + + fn log_disconnect(&mut self) { + // If we are here, it's guaranteed that the user successfully connected to the endpoint. + // Here we log the length of the session. + self.disconnect_timestamp = Some(Utc::now()); + if let Some(tx) = self.disconnect_sender.take() { + let _: Result<(), _> = tx.send(RequestData::from(&*self)); } } } -impl Drop for RequestMonitoring { +impl Drop for RequestMonitoringInner { fn drop(&mut self) { - self.log() + if self.sender.is_some() { + self.log_connect(); + } else { + self.log_disconnect(); + } } } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 1e9e723938..c6f83fd069 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -1,7 +1,7 @@ use std::{sync::Arc, time::SystemTime}; use anyhow::Context; -use bytes::BytesMut; +use bytes::{buf::Writer, BufMut, BytesMut}; use chrono::{Datelike, Timelike}; use futures::{Stream, StreamExt}; use parquet::{ @@ -13,21 +13,28 @@ use parquet::{ }, record::RecordWriter, }; -use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig}; +use pq_proto::StartupMessageParams; +use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; +use serde::ser::SerializeMap; use tokio::{sync::mpsc, time}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use super::{RequestMonitoring, LOG_CHAN}; +use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT}; + +use super::{RequestMonitoringInner, LOG_CHAN}; #[derive(clap::Args, Clone, Debug)] pub struct ParquetUploadArgs { /// Storage location to upload the parquet files to. /// Encoded as toml (same format as pageservers), eg /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` - #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] - parquet_upload_remote_storage: OptRemoteStorageConfig, + #[clap(long, value_parser = remote_storage_from_toml)] + parquet_upload_remote_storage: Option, + + #[clap(long, value_parser = remote_storage_from_toml)] + parquet_upload_disconnect_events_remote_storage: Option, /// How many rows to include in a row group #[clap(long, default_value_t = 8192)] @@ -50,14 +57,6 @@ pub struct ParquetUploadArgs { parquet_upload_compression: Compression, } -/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get -/// runtime type errors from the value parser we use. -type OptRemoteStorageConfig = Option; - -fn remote_storage_from_toml(s: &str) -> anyhow::Result { - RemoteStorageConfig::from_toml(&s.parse()?) -} - // Occasional network issues and such can cause remote operations to fail, and // that's expected. 
If an upload fails, we log it at info-level, and retry. // But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN @@ -74,7 +73,7 @@ pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; // * after each rowgroup write, we check the length of the file and upload to s3 if large enough #[derive(parquet_derive::ParquetRecordWriter)] -struct RequestData { +pub(crate) struct RequestData { region: &'static str, protocol: &'static str, /// Must be UTC. The derive macro doesn't like the timezones @@ -84,19 +83,43 @@ struct RequestData { username: Option, application_name: Option, endpoint_id: Option, + database: Option, project: Option, branch: Option, + pg_options: Option, + auth_method: Option<&'static str>, error: Option<&'static str>, /// Success is counted if we form an HTTP response with sql rows inside /// Or if we make it to proxy_pass success: bool, + /// Indicates if the cplane started the new compute node for this request. + cold_start_info: &'static str, /// Tracks time from session start (HTTP request/libpq TCP handshake) /// Through to success/failure duration_us: u64, + /// If the session was successful after the disconnect, one more event will be created with `disconnect_timestamp` filled in. + disconnect_timestamp: Option, } -impl From for RequestData { - fn from(value: RequestMonitoring) -> Self { +struct Options<'a> { + options: &'a StartupMessageParams, +} + +impl<'a> serde::Serialize for Options<'a> { + fn serialize(&self, s: S) -> Result + where + S: serde::Serializer, + { + let mut state = s.serialize_map(None)?; + for (k, v) in self.options.iter() { + state.serialize_entry(k, v)?; + } + state.end() + } +} + +impl From<&RequestMonitoringInner> for RequestData { + fn from(value: &RequestMonitoringInner) -> Self { Self { session_id: value.session_id, peer_addr: value.peer_addr.to_string(), @@ -104,16 +127,29 @@ impl From for RequestData { username: value.user.as_deref().map(String::from), application_name: value.application.as_deref().map(String::from), endpoint_id: value.endpoint_id.as_deref().map(String::from), + database: value.dbname.as_deref().map(String::from), project: value.project.as_deref().map(String::from), branch: value.branch.as_deref().map(String::from), - protocol: value.protocol, + pg_options: value + .pg_options + .as_ref() + .and_then(|options| serde_json::to_string(&Options { options }).ok()), + auth_method: value.auth_method.as_ref().map(|x| match x { + super::AuthMethod::Web => "web", + super::AuthMethod::ScramSha256 => "scram_sha_256", + super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", + super::AuthMethod::Cleartext => "cleartext", + }), + protocol: value.protocol.as_str(), region: value.region, - error: value.error_kind.as_ref().map(|e| e.to_str()), + error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, + cold_start_info: value.cold_start_info.as_str(), duration_us: SystemTime::from(value.first_packet) .elapsed() .unwrap_or_default() .as_micros() as u64, // 584 millennia... good enough + disconnect_timestamp: value.disconnect_timestamp.map(|x| x.naive_utc()), } } } @@ -135,8 +171,9 @@ pub async fn worker( LOG_CHAN.set(tx.downgrade()).unwrap(); // setup row stream that will close on cancellation + let cancellation_token2 = cancellation_token.clone(); tokio::spawn(async move { - cancellation_token.cancelled().await; + cancellation_token2.cancelled().await; // dropping this sender will cause the channel to close only once // all the remaining inflight requests have been completed.
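// (Once the last sender is gone, the receiver stream yields `None`; at that
// point `worker_inner` uploads whatever rows are still buffered and exits.)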
drop(tx); @@ -144,8 +181,9 @@ pub async fn worker( let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx)); let rx = rx.map(RequestData::from); - let storage = - GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?; + let storage = GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?; let properties = WriterProperties::builder() .set_data_page_size_limit(config.parquet_upload_page_size) @@ -161,9 +199,39 @@ pub async fn worker( test_remote_failures: 0, }; - worker_inner(storage, rx, parquet_config).await + // TODO(anna): consider moving this to a separate function. + if let Some(disconnect_events_storage_config) = + config.parquet_upload_disconnect_events_remote_storage + { + let (tx_disconnect, mut rx_disconnect) = mpsc::unbounded_channel(); + LOG_CHAN_DISCONNECT.set(tx_disconnect.downgrade()).unwrap(); + + // setup row stream that will close on cancellation + tokio::spawn(async move { + cancellation_token.cancelled().await; + // dropping this sender will cause the channel to close only once + // all the remaining inflight requests have been completed. + drop(tx_disconnect); + }); + let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx)); + let rx_disconnect = rx_disconnect.map(RequestData::from); + + let storage_disconnect = + GenericRemoteStorage::from_config(&disconnect_events_storage_config) + .await + .context("remote storage for disconnect events init")?; + let parquet_config_disconnect = parquet_config.clone(); + tokio::try_join!( + worker_inner(storage, rx, parquet_config), + worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect) + ) + .map(|_| ()) + } else { + worker_inner(storage, rx, parquet_config).await + } } +#[derive(Clone, Debug)] struct ParquetConfig { propeties: WriterPropertiesPtr, rows_per_group: usize, @@ -192,8 +260,9 @@ async fn worker_inner( let mut rows = Vec::with_capacity(config.rows_per_group); let schema = rows.as_slice().schema()?; - let file = BytesWriter::default(); - let mut w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?; + let buffer = BytesMut::new(); + let w = buffer.writer(); + let mut w = SerializedFileWriter::new(w, schema.clone(), config.propeties.clone())?; let mut last_upload = time::Instant::now(); @@ -221,20 +290,23 @@ async fn worker_inner( } if !w.flushed_row_groups().is_empty() { - let _: BytesWriter = upload_parquet(w, len, &storage).await?; + let _: Writer = upload_parquet(w, len, &storage).await?; } Ok(()) } -async fn flush_rows( +async fn flush_rows( rows: Vec, - mut w: SerializedFileWriter, + mut w: SerializedFileWriter, ) -> anyhow::Result<( Vec, - SerializedFileWriter, + SerializedFileWriter, RowGroupMetaDataPtr, -)> { +)> +where + W: std::io::Write + Send + 'static, +{ let span = Span::current(); let (mut rows, w, rg_meta) = tokio::task::spawn_blocking(move || { let _enter = span.enter(); @@ -258,10 +330,10 @@ async fn flush_rows( } async fn upload_parquet( - w: SerializedFileWriter, + mut w: SerializedFileWriter>, len: i64, storage: &GenericRemoteStorage, -) -> anyhow::Result { +) -> anyhow::Result> { let len_uncompressed = w .flushed_row_groups() .iter() @@ -270,11 +342,16 @@ async fn upload_parquet( // I don't know how compute intensive this is, although it probably isn't much... better be safe than sorry. 
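// To keep the async executor responsive, `finish()` runs on a blocking thread;
// the `BytesMut` is then taken back out of the writer so the same allocation
// can be reused for the next parquet file (hence the `Ok(buffer.writer())`
// return at the end of this function).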
// finish method only available on the fork: https://github.com/apache/arrow-rs/issues/5253 - let (mut file, metadata) = tokio::task::spawn_blocking(move || w.finish()) + let (mut buffer, metadata) = + tokio::task::spawn_blocking(move || -> parquet::errors::Result<_> { + let metadata = w.finish()?; + let buffer = std::mem::take(w.inner_mut().get_mut()); + Ok((buffer, metadata)) + }) .await .unwrap()?; - let data = file.buf.split().freeze(); + let data = buffer.split().freeze(); let compression = len as f64 / len_uncompressed as f64; let size = data.len(); @@ -300,39 +377,32 @@ async fn upload_parquet( let path = RemotePath::from_string(&format!( "{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet" ))?; - backoff::retry( + let cancel = CancellationToken::new(); + let maybe_err = backoff::retry( || async { let stream = futures::stream::once(futures::future::ready(Ok(data.clone()))); - storage.upload(stream, data.len(), &path, None).await + storage + .upload(stream, data.len(), &path, None, &cancel) + .await }, - |_e| false, + TimeoutOrCancel::caused_by_cancel, FAILED_UPLOAD_WARN_THRESHOLD, FAILED_UPLOAD_MAX_RETRIES, "request_data_upload", // we don't want cancellation to interrupt here, so we make a dummy cancel token - backoff::Cancel::new(CancellationToken::new(), || anyhow::anyhow!("Cancelled")), + &cancel, ) .await - .context("request_data_upload")?; + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("request_data_upload") + .err(); - Ok(file) -} - -// why doesn't BytesMut impl io::Write? -#[derive(Default)] -struct BytesWriter { - buf: BytesMut, -} - -impl std::io::Write for BytesWriter { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - self.buf.extend_from_slice(buf); - Ok(buf.len()) + if let Some(err) = maybe_err { + tracing::warn!(%id, %err, "failed to upload request data"); } - fn flush(&mut self) -> std::io::Result<()> { - Ok(()) - } + Ok(buffer.writer()) } #[cfg(test)] @@ -414,7 +484,9 @@ mod tests { ) .unwrap(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, - }) + upload_storage_class: None, + }), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }) ); assert_eq!(parquet_upload.parquet_upload_row_group_size, 100); @@ -434,20 +506,26 @@ mod tests { RequestData { session_id: uuid::Builder::from_random_bytes(rng.gen()).into_uuid(), peer_addr: Ipv4Addr::from(rng.gen::<[u8; 4]>()).to_string(), - timestamp: chrono::NaiveDateTime::from_timestamp_millis( + timestamp: chrono::DateTime::from_timestamp_millis( rng.gen_range(1703862754..1803862754), ) - .unwrap(), + .unwrap() + .naive_utc(), application_name: Some("test".to_owned()), username: Some(hex::encode(rng.gen::<[u8; 4]>())), endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())), + database: Some(hex::encode(rng.gen::<[u8; 16]>())), project: Some(hex::encode(rng.gen::<[u8; 16]>())), branch: Some(hex::encode(rng.gen::<[u8; 16]>())), + pg_options: None, + auth_method: None, protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)], region: "us-east-1", error: None, success: rng.gen(), + cold_start_info: "no", duration_us: rng.gen_range(0..30_000_000), + disconnect_timestamp: None, } } @@ -464,9 +542,14 @@ mod tests { rx: impl Stream, ) -> Vec<(u64, usize, i64)> { let remote_storage_config = RemoteStorageConfig { - storage: RemoteStorageKind::LocalFs(tmpdir.to_path_buf()), + storage: RemoteStorageKind::LocalFs { + local_path: tmpdir.to_path_buf(), + }, + timeout: std::time::Duration::from_secs(120), }; - let storage = 
GenericRemoteStorage::from_config(&remote_storage_config).unwrap(); + let storage = GenericRemoteStorage::from_config(&remote_storage_config) + .await + .unwrap(); worker_inner(storage, rx, config).await.unwrap(); @@ -515,50 +598,16 @@ mod tests { assert_eq!( file_stats, [ - (1087635, 3, 6000), - (1087288, 3, 6000), - (1087444, 3, 6000), - (1087572, 3, 6000), - (1087468, 3, 6000), - (1087500, 3, 6000), - (1087533, 3, 6000), - (1087566, 3, 6000), - (362671, 1, 2000) - ], - ); - - tmpdir.close().unwrap(); - } - - #[tokio::test] - async fn verify_parquet_min_compression() { - let tmpdir = camino_tempfile::tempdir().unwrap(); - - let config = ParquetConfig { - propeties: Arc::new( - WriterProperties::builder() - .set_compression(parquet::basic::Compression::ZSTD(ZstdLevel::default())) - .build(), - ), - rows_per_group: 2_000, - file_size: 1_000_000, - max_duration: time::Duration::from_secs(20 * 60), - test_remote_failures: 0, - }; - - let rx = random_stream(50_000); - let file_stats = run_test(tmpdir.path(), config, rx).await; - - // with compression, there are fewer files with more rows per file - assert_eq!( - file_stats, - [ - (1028637, 5, 10000), - (1031969, 5, 10000), - (1019900, 5, 10000), - (1020365, 5, 10000), - (1025010, 5, 10000) - ], + (1315874, 3, 6000), + (1315867, 3, 6000), + (1315927, 3, 6000), + (1315884, 3, 6000), + (1316014, 3, 6000), + (1315856, 3, 6000), + (1315648, 3, 6000), + (1315884, 3, 6000), + (438913, 1, 2000) + ] ); tmpdir.close().unwrap(); @@ -589,12 +638,12 @@ mod tests { assert_eq!( file_stats, [ - (1210770, 6, 12000), - (1211036, 6, 12000), - (1210990, 6, 12000), - (1210861, 6, 12000), - (202073, 1, 2000) - ], + (1208861, 5, 10000), + (1208592, 5, 10000), + (1208885, 5, 10000), + (1208873, 5, 10000), + (1209128, 5, 10000) + ] ); tmpdir.close().unwrap(); @@ -618,16 +667,16 @@ mod tests { assert_eq!( file_stats, [ - (1087635, 3, 6000), - (1087288, 3, 6000), - (1087444, 3, 6000), - (1087572, 3, 6000), - (1087468, 3, 6000), - (1087500, 3, 6000), - (1087533, 3, 6000), - (1087566, 3, 6000), - (362671, 1, 2000) - ], + (1315874, 3, 6000), + (1315867, 3, 6000), + (1315927, 3, 6000), + (1315884, 3, 6000), + (1316014, 3, 6000), + (1315856, 3, 6000), + (1315648, 3, 6000), + (1315884, 3, 6000), + (438913, 1, 2000) + ] ); tmpdir.close().unwrap(); @@ -653,7 +702,7 @@ mod tests { while let Some(r) = s.next().await { tx.send(r).unwrap(); } - time::sleep(time::Duration::from_secs(70)).await + time::sleep(time::Duration::from_secs(70)).await; } }); @@ -663,7 +712,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(545264, 2, 3001), (545025, 2, 3000), (544857, 2, 2999)], + [(659836, 2, 3001), (659550, 2, 3000), (659346, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 5b2dd7ecfd..53f9f75c5b 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -1,12 +1,14 @@ use std::{error::Error as StdError, fmt, io}; +use measured::FixedCardinalityLabel; + /// Upcast (almost) any error into an opaque [`io::Error`]. -pub fn io_error(e: impl Into>) -> io::Error { +pub(crate) fn io_error(e: impl Into>) -> io::Error { io::Error::new(io::ErrorKind::Other, e) } /// A small combinator for pluggable error logging. 
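/// (Illustrative usage, not from the original source: a call such as
/// `fallible_op().map_err(log_error)?` logs the error once at the point of
/// propagation while leaving the error value itself unchanged.)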
-pub fn log_error(e: E) -> E { +pub(crate) fn log_error(e: E) -> E { tracing::error!("{e}"); e } @@ -17,7 +19,7 @@ pub fn log_error(e: E) -> E { /// NOTE: This trait should not be implemented for [`anyhow::Error`], since it /// is way too convenient and tends to proliferate all across the codebase, /// ultimately leading to accidental leaks of sensitive data. -pub trait UserFacingError: fmt::Display { +pub(crate) trait UserFacingError: ReportableError { /// Format the error for client, stripping all sensitive info. /// /// Although this might be a no-op for many types, it's highly @@ -29,36 +31,63 @@ pub trait UserFacingError: fmt::Display { } } -#[derive(Clone)] +#[derive(Copy, Clone, Debug, Eq, PartialEq, FixedCardinalityLabel)] +#[label(singleton = "type")] pub enum ErrorKind { /// Wrong password, unknown endpoint, protocol violation, etc... User, /// Network error between user and proxy. Not necessarily user error - Disconnect, + #[label(rename = "clientdisconnect")] + ClientDisconnect, - /// Proxy self-imposed rate limits + /// Proxy self-imposed user rate limits + #[label(rename = "ratelimit")] RateLimit, + /// Proxy self-imposed service-wise rate limits + #[label(rename = "serviceratelimit")] + ServiceRateLimit, + /// internal errors Service, /// Error communicating with control plane + #[label(rename = "controlplane")] ControlPlane, + /// Postgres error + Postgres, + /// Error communicating with compute Compute, } impl ErrorKind { - pub fn to_str(&self) -> &'static str { + pub(crate) fn to_metric_label(self) -> &'static str { match self { - ErrorKind::User => "request failed due to user error", - ErrorKind::Disconnect => "client disconnected", - ErrorKind::RateLimit => "request cancelled due to rate limit", - ErrorKind::Service => "internal service error", - ErrorKind::ControlPlane => "non-retryable control plane error", - ErrorKind::Compute => "non-retryable compute error (or exhausted retry capacity)", + ErrorKind::User => "user", + ErrorKind::ClientDisconnect => "clientdisconnect", + ErrorKind::RateLimit => "ratelimit", + ErrorKind::ServiceRateLimit => "serviceratelimit", + ErrorKind::Service => "service", + ErrorKind::ControlPlane => "controlplane", + ErrorKind::Postgres => "postgres", + ErrorKind::Compute => "compute", + } + } +} + +pub(crate) trait ReportableError: fmt::Display + Send + 'static { + fn get_error_kind(&self) -> ErrorKind; +} + +impl ReportableError for tokio_postgres::error::Error { + fn get_error_kind(&self) -> ErrorKind { + if self.as_db_error().is_some() { + ErrorKind::Postgres + } else { + ErrorKind::Compute } } } diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 59e1492ed4..c77d95f47d 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -4,44 +4,48 @@ pub mod health_server; -use std::{sync::Arc, time::Duration}; +use std::time::Duration; -use futures::FutureExt; -pub use reqwest::{Request, Response, StatusCode}; -pub use reqwest_middleware::{ClientWithMiddleware, Error}; -pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; -use tokio::time::Instant; -use tracing::trace; +use anyhow::bail; +use bytes::Bytes; +use http_body_util::BodyExt; +use hyper1::body::Body; +use serde::de::DeserializeOwned; -use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl}; +pub(crate) use reqwest::{Request, Response}; +pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; +pub(crate) use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; + +use crate::{ + metrics::{ConsoleRequest, 
Metrics}, + url::ApiUrl, +}; use reqwest_middleware::RequestBuilder; /// This is the preferred way to create new http clients, /// because it takes care of observability (OpenTelemetry). /// We deliberately don't want to replace this with a public static. -pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> ClientWithMiddleware { +pub fn new_client() -> ClientWithMiddleware { let client = reqwest::ClientBuilder::new() - .dns_resolver(Arc::new(GaiResolver::default())) - .connection_verbose(true) .build() .expect("Failed to create http client"); reqwest_middleware::ClientBuilder::new(client) .with(reqwest_tracing::TracingMiddleware::default()) - .with(rate_limiter::Limiter::new(rate_limiter_config)) .build() } -pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware { +pub(crate) fn new_client_with_timeout( + request_timeout: Duration, + total_retry_duration: Duration, +) -> ClientWithMiddleware { let timeout_client = reqwest::ClientBuilder::new() - .dns_resolver(Arc::new(GaiResolver::default())) - .connection_verbose(true) - .timeout(default_timout) + .timeout(request_timeout) .build() .expect("Failed to create http client with timeout"); let retry_policy = - ExponentialBackoff::builder().build_with_total_retry_duration(default_timout); + ExponentialBackoff::builder().build_with_total_retry_duration(total_retry_duration); reqwest_middleware::ClientBuilder::new(timeout_client) .with(reqwest_tracing::TracingMiddleware::default()) @@ -76,59 +80,56 @@ impl Endpoint { } #[inline(always)] - pub fn url(&self) -> &ApiUrl { + pub(crate) fn url(&self) -> &ApiUrl { &self.endpoint } /// Return a [builder](RequestBuilder) for a `GET` request, /// appending a single `path` segment to the base endpoint URL. - pub fn get(&self, path: &str) -> RequestBuilder { + pub(crate) fn get(&self, path: &str) -> RequestBuilder { let mut url = self.endpoint.clone(); url.path_segments_mut().push(path); self.client.get(url.into_inner()) } /// Execute a [request](reqwest::Request). - pub async fn execute(&self, request: Request) -> Result { - let path = request.url().path().to_string(); - let start = Instant::now(); - let res = self.client.execute(request).await; - CONSOLE_REQUEST_LATENCY - .with_label_values(&[&path]) - .observe(start.elapsed().as_secs_f64()); - res + pub(crate) async fn execute(&self, request: Request) -> Result { + let _timer = Metrics::get() + .proxy + .console_request_latency + .start_timer(ConsoleRequest { + request: request.url().path(), + }); + + self.client.execute(request).await } } -/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html -use hyper::{ - client::connect::dns::{GaiResolver as HyperGaiResolver, Name}, - service::Service, -}; -use reqwest::dns::{Addrs, Resolve, Resolving}; -#[derive(Debug)] -pub struct GaiResolver(HyperGaiResolver); +pub(crate) async fn parse_json_body_with_limit( + mut b: impl Body + Unpin, + limit: usize, +) -> anyhow::Result { + // We could use `b.limited().collect().await.to_bytes()` here + // but this ends up being slightly more efficient as far as I can tell. -impl Default for GaiResolver { - fn default() -> Self { - Self(HyperGaiResolver::new()) - } -} + // check the lower bound of the size hint. + // in reqwest, this value is influenced by the Content-Length header. 
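// As a consequence, a declared Content-Length that already exceeds `limit` is
// rejected before any body bytes are read, while bodies with no useful size
// hint (e.g. chunked transfer encoding) fall through to the per-frame check
// below.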
+ let lower_bound = match usize::try_from(b.size_hint().lower()) { + Ok(bound) if bound <= limit => bound, + _ => bail!("Content length exceeds limit of {limit} bytes"), + }; + let mut bytes = Vec::with_capacity(lower_bound); -impl Resolve for GaiResolver { - fn resolve(&self, name: Name) -> Resolving { - let this = &mut self.0.clone(); - let start = Instant::now(); - Box::pin( - Service::::call(this, name.clone()).map(move |result| { - let resolve_duration = start.elapsed(); - trace!(duration = ?resolve_duration, addr = %name, "resolve host complete"); - result - .map(|addrs| -> Addrs { Box::new(addrs) }) - .map_err(|err| -> Box { Box::new(err) }) - }), - ) + while let Some(frame) = b.frame().await.transpose()? { + if let Ok(data) = frame.into_data() { + if bytes.len() + data.len() > limit { + bail!("Content length exceeds limit of {limit} bytes") + } + bytes.extend_from_slice(&data); + } } + + Ok(serde_json::from_slice::(&bytes)?) } #[cfg(test)] diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index 6186ddde0d..cae9eb5b97 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -1,23 +1,49 @@ use anyhow::{anyhow, bail}; -use hyper::{Body, Request, Response, StatusCode}; -use std::{convert::Infallible, net::TcpListener}; -use tracing::info; -use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService}; +use hyper::{header::CONTENT_TYPE, Body, Request, Response, StatusCode}; +use measured::{text::BufferedTextEncoder, MetricGroup}; +use metrics::NeonMetrics; +use std::{ + convert::Infallible, + net::TcpListener, + sync::{Arc, Mutex}, +}; +use tracing::{info, info_span}; +use utils::http::{ + endpoint::{self, request_span}, + error::ApiError, + json::json_response, + RouterBuilder, RouterService, +}; + +use crate::jemalloc; async fn status_handler(_: Request) -> Result, ApiError> { json_response(StatusCode::OK, "") } -fn make_router() -> RouterBuilder { - endpoint::make_router().get("/v1/status", status_handler) +fn make_router(metrics: AppMetrics) -> RouterBuilder { + let state = Arc::new(Mutex::new(PrometheusHandler { + encoder: BufferedTextEncoder::new(), + metrics, + })); + + endpoint::make_router() + .get("/metrics", move |r| { + let state = state.clone(); + request_span(r, move |b| prometheus_metrics_handler(b, state)) + }) + .get("/v1/status", status_handler) } -pub async fn task_main(http_listener: TcpListener) -> anyhow::Result { +pub async fn task_main( + http_listener: TcpListener, + metrics: AppMetrics, +) -> anyhow::Result { scopeguard::defer! { info!("http has shut down"); } - let service = || RouterService::new(make_router().build()?); + let service = || RouterService::new(make_router(metrics).build()?); hyper::Server::from_tcp(http_listener)? .serve(service().map_err(|e| anyhow!(e))?) 
@@ -25,3 +51,57 @@ pub async fn task_main(http_listener: TcpListener) -> anyhow::Result bail!("hyper server without shutdown handling cannot shutdown successfully"); } + +struct PrometheusHandler { + encoder: BufferedTextEncoder, + metrics: AppMetrics, +} + +#[derive(MetricGroup)] +pub struct AppMetrics { + #[metric(namespace = "jemalloc")] + pub jemalloc: Option, + #[metric(flatten)] + pub neon_metrics: NeonMetrics, + #[metric(flatten)] + pub proxy: &'static crate::metrics::Metrics, +} + +async fn prometheus_metrics_handler( + _req: Request, + state: Arc>, +) -> Result, ApiError> { + let started_at = std::time::Instant::now(); + + let span = info_span!("blocking"); + let body = tokio::task::spawn_blocking(move || { + let _span = span.entered(); + + let mut state = state.lock().unwrap(); + let PrometheusHandler { encoder, metrics } = &mut *state; + + metrics + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + + let body = encoder.finish(); + + tracing::info!( + bytes = body.len(), + elapsed_ms = started_at.elapsed().as_millis(), + "responded /metrics" + ); + + body + }) + .await + .unwrap(); + + let response = Response::builder() + .status(200) + .header(CONTENT_TYPE, "text/plain; version=0.0.4") + .body(Body::from(body)) + .unwrap(); + + Ok(response) +} diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs new file mode 100644 index 0000000000..e5144cfe2e --- /dev/null +++ b/proxy/src/intern.rs @@ -0,0 +1,250 @@ +use std::{ + hash::BuildHasherDefault, marker::PhantomData, num::NonZeroUsize, ops::Index, sync::OnceLock, +}; + +use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo}; +use rustc_hash::FxHasher; + +use crate::{BranchId, EndpointId, ProjectId, RoleName}; + +pub trait InternId: Sized + 'static { + fn get_interner() -> &'static StringInterner; +} + +pub struct StringInterner { + inner: ThreadedRodeo>, + _id: PhantomData, +} + +#[derive(PartialEq, Debug, Clone, Copy, Eq, Hash)] +pub struct InternedString { + inner: Spur, + _id: PhantomData, +} + +impl std::fmt::Display for InternedString { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.as_str().fmt(f) + } +} + +impl InternedString { + pub(crate) fn as_str(&self) -> &'static str { + Id::get_interner().inner.resolve(&self.inner) + } + pub(crate) fn get(s: &str) -> Option { + Id::get_interner().get(s) + } +} + +impl AsRef for InternedString { + fn as_ref(&self) -> &str { + self.as_str() + } +} + +impl std::ops::Deref for InternedString { + type Target = str; + fn deref(&self) -> &str { + self.as_str() + } +} + +impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString { + fn deserialize>(d: D) -> Result { + struct Visitor(PhantomData); + impl<'de, Id: InternId> serde::de::Visitor<'de> for Visitor { + type Value = InternedString; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("a string") + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + Ok(Id::get_interner().get_or_intern(v)) + } + } + d.deserialize_str(Visitor::(PhantomData)) + } +} + +impl serde::Serialize for InternedString { + fn serialize(&self, s: S) -> Result { + self.as_str().serialize(s) + } +} + +impl StringInterner { + pub(crate) fn new() -> Self { + StringInterner { + inner: ThreadedRodeo::with_capacity_memory_limits_and_hasher( + Capacity::new(2500, NonZeroUsize::new(1 << 16).unwrap()), + // unbounded + MemoryLimits::for_memory_usage(usize::MAX), + BuildHasherDefault::::default(), + ), + _id: 
PhantomData, + } + } + + #[cfg(test)] + fn len(&self) -> usize { + self.inner.len() + } + + #[cfg(test)] + fn current_memory_usage(&self) -> usize { + self.inner.current_memory_usage() + } + + pub(crate) fn get_or_intern(&self, s: &str) -> InternedString { + InternedString { + inner: self.inner.get_or_intern(s), + _id: PhantomData, + } + } + + pub(crate) fn get(&self, s: &str) -> Option> { + Some(InternedString { + inner: self.inner.get(s)?, + _id: PhantomData, + }) + } +} + +impl Index> for StringInterner { + type Output = str; + + fn index(&self, index: InternedString) -> &Self::Output { + self.inner.resolve(&index.inner) + } +} + +impl Default for StringInterner { + fn default() -> Self { + Self::new() + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub(crate) struct RoleNameTag; +impl InternId for RoleNameTag { + fn get_interner() -> &'static StringInterner { + static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub(crate) type RoleNameInt = InternedString; +impl From<&RoleName> for RoleNameInt { + fn from(value: &RoleName) -> Self { + RoleNameTag::get_interner().get_or_intern(value) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct EndpointIdTag; +impl InternId for EndpointIdTag { + fn get_interner() -> &'static StringInterner { + static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub type EndpointIdInt = InternedString; +impl From<&EndpointId> for EndpointIdInt { + fn from(value: &EndpointId) -> Self { + EndpointIdTag::get_interner().get_or_intern(value) + } +} +impl From for EndpointIdInt { + fn from(value: EndpointId) -> Self { + EndpointIdTag::get_interner().get_or_intern(&value) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct BranchIdTag; +impl InternId for BranchIdTag { + fn get_interner() -> &'static StringInterner { + static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub type BranchIdInt = InternedString; +impl From<&BranchId> for BranchIdInt { + fn from(value: &BranchId) -> Self { + BranchIdTag::get_interner().get_or_intern(value) + } +} +impl From for BranchIdInt { + fn from(value: BranchId) -> Self { + BranchIdTag::get_interner().get_or_intern(&value) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct ProjectIdTag; +impl InternId for ProjectIdTag { + fn get_interner() -> &'static StringInterner { + static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub type ProjectIdInt = InternedString; +impl From<&ProjectId> for ProjectIdInt { + fn from(value: &ProjectId) -> Self { + ProjectIdTag::get_interner().get_or_intern(value) + } +} +impl From for ProjectIdInt { + fn from(value: ProjectId) -> Self { + ProjectIdTag::get_interner().get_or_intern(&value) + } +} + +#[cfg(test)] +mod tests { + use std::sync::OnceLock; + + use crate::intern::StringInterner; + + use super::InternId; + + struct MyId; + impl InternId for MyId { + fn get_interner() -> &'static StringInterner { + pub(crate) static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } + } + + #[test] + fn push_many_strings() { + use rand::{rngs::StdRng, Rng, SeedableRng}; + use rand_distr::Zipf; + + let endpoint_dist = Zipf::new(500000, 0.8).unwrap(); + let endpoints = StdRng::seed_from_u64(272488357).sample_iter(endpoint_dist); + + let interner = MyId::get_interner(); + + const N: usize = 100_000; + let mut verify = 
Vec::with_capacity(N); + for endpoint in endpoints.take(N) { + let endpoint = format!("ep-string-interning-{endpoint}"); + let key = interner.get_or_intern(&endpoint); + verify.push((endpoint, key)); + } + + for (s, key) in verify { + assert_eq!(interner[key], s); + } + + // 2031616/59861 = 34 bytes per string + assert_eq!(interner.len(), 59_861); + // will have other overhead for the internal hashmaps that are not accounted for. + assert_eq!(interner.current_memory_usage(), 2_031_616); + } +} diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs new file mode 100644 index 0000000000..d307d80f4a --- /dev/null +++ b/proxy/src/jemalloc.rs @@ -0,0 +1,116 @@ +use std::marker::PhantomData; + +use measured::{ + label::NoLabels, + metric::{ + gauge::GaugeState, group::Encoding, name::MetricNameEncoder, MetricEncoding, + MetricFamilyEncoding, MetricType, + }, + text::TextEncoder, + LabelGroup, MetricGroup, +}; +use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version}; + +pub struct MetricRecorder { + epoch: epoch_mib, + inner: Metrics, +} + +#[derive(MetricGroup)] +struct Metrics { + active_bytes: JemallocGaugeFamily, + allocated_bytes: JemallocGaugeFamily, + mapped_bytes: JemallocGaugeFamily, + metadata_bytes: JemallocGaugeFamily, + resident_bytes: JemallocGaugeFamily, + retained_bytes: JemallocGaugeFamily, +} + +impl MetricGroup for MetricRecorder +where + Metrics: MetricGroup, +{ + fn collect_group_into(&self, enc: &mut Enc) -> Result<(), Enc::Err> { + if self.epoch.advance().is_ok() { + self.inner.collect_group_into(enc)?; + } + Ok(()) + } +} + +impl MetricRecorder { + pub fn new() -> Result { + tracing::info!( + config = config::malloc_conf::read()?, + version = version::read()?, + "starting jemalloc recorder" + ); + + Ok(Self { + epoch: epoch::mib()?, + inner: Metrics { + active_bytes: JemallocGaugeFamily(stats::active::mib()?), + allocated_bytes: JemallocGaugeFamily(stats::allocated::mib()?), + mapped_bytes: JemallocGaugeFamily(stats::mapped::mib()?), + metadata_bytes: JemallocGaugeFamily(stats::metadata::mib()?), + resident_bytes: JemallocGaugeFamily(stats::resident::mib()?), + retained_bytes: JemallocGaugeFamily(stats::retained::mib()?), + }, + }) + } +} + +struct JemallocGauge(PhantomData); + +impl Default for JemallocGauge { + fn default() -> Self { + JemallocGauge(PhantomData) + } +} +impl MetricType for JemallocGauge { + type Metadata = T; +} + +struct JemallocGaugeFamily(T); +impl MetricFamilyEncoding for JemallocGaugeFamily +where + JemallocGauge: MetricEncoding, +{ + fn collect_family_into(&self, name: impl MetricNameEncoder, enc: &mut T) -> Result<(), T::Err> { + JemallocGauge::write_type(&name, enc)?; + JemallocGauge(PhantomData).collect_into(&self.0, NoLabels, name, enc) + } +} + +macro_rules! 
jemalloc_gauge { + ($stat:ident, $mib:ident) => { + impl MetricEncoding> for JemallocGauge { + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + GaugeState::write_type(name, enc) + } + + fn collect_into( + &self, + mib: &stats::$mib, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + if let Ok(v) = mib.read() { + GaugeState::new(v as i64).collect_into(&(), labels, name, enc)?; + } + Ok(()) + } + } + }; +} + +jemalloc_gauge!(active, active_mib); +jemalloc_gauge!(allocated, allocated_mib); +jemalloc_gauge!(mapped, mapped_mib); +jemalloc_gauge!(metadata, metadata_mib); +jemalloc_gauge!(resident, resident_mib); +jemalloc_gauge!(retained, retained_mib); diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index a9e4a38302..8d7e586b3d 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -1,8 +1,86 @@ -#![deny(clippy::undocumented_unsafe_blocks)] +// rustc lints/lint groups +// https://doc.rust-lang.org/rustc/lints/groups.html +#![deny( + deprecated, + future_incompatible, + // TODO: consider let_underscore + nonstandard_style, + rust_2024_compatibility +)] +#![warn(clippy::all, clippy::pedantic, clippy::cargo)] +// List of denied lints from the clippy::restriction group. +// https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction +#![warn( + clippy::undocumented_unsafe_blocks, + // TODO: Enable once all individual checks are enabled. + //clippy::as_conversions, + clippy::dbg_macro, + clippy::empty_enum_variants_with_brackets, + clippy::exit, + clippy::float_cmp_const, + clippy::lossy_float_literal, + clippy::macro_use_imports, + clippy::manual_ok_or, + // TODO: consider clippy::map_err_ignore + // TODO: consider clippy::mem_forget + clippy::rc_mutex, + clippy::rest_pat_in_fully_bound_structs, + clippy::string_add, + clippy::string_to_string, + clippy::todo, + // TODO: consider clippy::unimplemented + // TODO: consider clippy::unwrap_used +)] +// List of permanently allowed lints. +#![allow( + // It's ok to cast bool to u8, etc. + clippy::cast_lossless, + // Seems unavoidable. + clippy::multiple_crate_versions, + // While #[must_use] is a great feature this check is too noisy. + clippy::must_use_candidate, + // Inline consts, structs, fns, imports, etc. are ok if they're used by + // the following statement(s). + clippy::items_after_statements, +)] +// List of temporarily allowed lints. +// TODO: Switch to except() once stable with 1.81. +// TODO: fix code and reduce list or move to permanent list above. +#![allow( + clippy::cargo_common_metadata, + clippy::cast_possible_truncation, + clippy::cast_possible_wrap, + clippy::cast_precision_loss, + clippy::cast_sign_loss, + clippy::doc_markdown, + clippy::implicit_hasher, + clippy::inline_always, + clippy::match_same_arms, + clippy::match_wild_err_arm, + clippy::missing_errors_doc, + clippy::missing_panics_doc, + clippy::module_name_repetitions, + clippy::needless_pass_by_value, + clippy::needless_raw_string_hashes, + clippy::redundant_closure_for_method_calls, + clippy::return_self_not_must_use, + clippy::similar_names, + clippy::single_match_else, + clippy::struct_excessive_bools, + clippy::struct_field_names, + clippy::too_many_lines, + clippy::unreadable_literal, + clippy::unused_async, + clippy::unused_self, + clippy::wildcard_imports +)] +// List of temporarily allowed lints to unblock beta/nightly. 
+#![allow(unknown_lints, clippy::manual_inspect)] -use std::convert::Infallible; +use std::{convert::Infallible, future::Future}; use anyhow::{bail, Context}; +use intern::{EndpointIdInt, EndpointIdTag, InternId}; use tokio::task::JoinError; use tokio_util::sync::CancellationToken; use tracing::warn; @@ -16,6 +94,8 @@ pub mod console; pub mod context; pub mod error; pub mod http; +pub mod intern; +pub mod jemalloc; pub mod logging; pub mod metrics; pub mod parse; @@ -32,7 +112,14 @@ pub mod usage_metrics; pub mod waiters; /// Handle unix signals appropriately. -pub async fn handle_signals(token: CancellationToken) -> anyhow::Result { +pub async fn handle_signals( + token: CancellationToken, + mut refresh_config: F, +) -> anyhow::Result +where + F: FnMut() -> Fut, + Fut: Future>, +{ use tokio::signal::unix::{signal, SignalKind}; let mut hangup = signal(SignalKind::hangup())?; @@ -43,7 +130,8 @@ pub async fn handle_signals(token: CancellationToken) -> anyhow::Result { - warn!("received SIGHUP; config reload is not supported"); + warn!("received SIGHUP"); + refresh_config().await?; } // Shut down the whole application. _ = interrupt.recv() => { @@ -69,7 +157,8 @@ macro_rules! smol_str_wrapper { pub struct $name(smol_str::SmolStr); impl $name { - pub fn as_str(&self) -> &str { + #[allow(unused)] + pub(crate) fn as_str(&self) -> &str { self.0.as_str() } } @@ -125,6 +214,26 @@ macro_rules! smol_str_wrapper { }; } +const POOLER_SUFFIX: &str = "-pooler"; + +impl EndpointId { + fn normalize(&self) -> Self { + if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) { + stripped.into() + } else { + self.clone() + } + } + + fn normalize_intern(&self) -> EndpointIdInt { + if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) { + EndpointIdTag::get_interner().get_or_intern(stripped) + } else { + self.into() + } + } +} + // 90% of role name strings are 20 characters or less. smol_str_wrapper!(RoleName); // 50% of endpoint strings are 23 characters or less. @@ -138,3 +247,25 @@ smol_str_wrapper!(ProjectId); smol_str_wrapper!(EndpointCacheKey); smol_str_wrapper!(DbName); + +// postgres hostname, will likely be an ip addr:port pair +smol_str_wrapper!(Host); + +// Endpoints are a bit tricky. Rarely, they might be branches or projects.
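// Illustrative, made-up examples: "ep-mild-dew-123456" would be treated as an
// endpoint, "br-wild-sun-654321" as a branch, and a bare name with neither
// prefix as a project id.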
+impl EndpointId { + pub(crate) fn is_endpoint(&self) -> bool { + self.0.starts_with("ep-") + } + pub(crate) fn is_branch(&self) -> bool { + self.0.starts_with("br-") + } + // pub(crate) fn is_project(&self) -> bool { + // !self.is_endpoint() && !self.is_branch() + // } + pub(crate) fn as_branch(&self) -> BranchId { + BranchId(self.0.clone()) + } + pub(crate) fn as_project(&self) -> ProjectId { + ProjectId(self.0.clone()) + } +} diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 3405b8cbc6..3b30ad8b46 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -15,7 +15,8 @@ use tracing_subscriber::{ pub async fn init() -> anyhow::Result { let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) - .from_env_lossy(); + .from_env_lossy() + .add_directive("azure_core::policies::transport=off".parse().unwrap()); let fmt_layer = tracing_subscriber::fmt::layer() .with_ansi(false) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 6e4cbb3f3a..2da7eac580 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,238 +1,621 @@ -use ::metrics::{ - exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec, - IntCounterPairVec, IntCounterVec, -}; -use prometheus::{ - register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec, - IntGaugeVec, +use std::sync::{Arc, OnceLock}; + +use lasso::ThreadedRodeo; +use measured::{ + label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet}, + metric::{histogram::Thresholds, name::MetricName}, + Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, + MetricGroup, }; +use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; -use once_cell::sync::Lazy; -use tokio::time; +use tokio::time::{self, Instant}; -pub static NUM_DB_CONNECTIONS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_db_connections_total", - "Number of opened connections to a database.", - "proxy_closed_db_connections_total", - "Number of closed connections to a database.", - &["protocol"], - ) - .unwrap() -}); +use crate::console::messages::ColdStartInfo; -pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_client_connections_total", - "Number of opened connections from a client.", - "proxy_closed_client_connections_total", - "Number of closed connections from a client.", - &["protocol"], - ) - .unwrap() -}); +#[derive(MetricGroup)] +#[metric(new(thread_pool: Arc))] +pub struct Metrics { + #[metric(namespace = "proxy")] + #[metric(init = ProxyMetrics::new(thread_pool))] + pub proxy: ProxyMetrics, -pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_accepted_connections_total", - "Number of client connections accepted.", - "proxy_closed_connections_total", - "Number of client connections closed.", - &["protocol"], - ) - .unwrap() -}); - -pub static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_compute_connection_latency_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // http/ws/tcp, true/false, true/false, success/failure - // 3 * 2 * 2 * 2 = 24 counters - &["protocol", "cache_miss", "pool_miss", "outcome"], - // largest bucket = 2^16 * 0.5ms = 32s - exponential_buckets(0.0005, 2.0, 16).unwrap(), - ) - .unwrap() -}); - -pub static CONSOLE_REQUEST_LATENCY: Lazy = 
Lazy::new(|| { - register_histogram_vec!( - "proxy_console_request_latency", - "Time it took for proxy to establish a connection to the compute endpoint", - // proxy_wake_compute/proxy_get_role_info - &["request"], - // largest bucket = 2^16 * 0.2ms = 13s - exponential_buckets(0.0002, 2.0, 16).unwrap(), - ) - .unwrap() -}); - -pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_allowed_ips_cache_misses", - "Number of cache hits/misses for allowed ips", - // hit/miss - &["outcome"], - ) - .unwrap() -}); - -pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_control_plane_token_acquire_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // largest bucket = 3^16 * 0.05ms = 2.15s - exponential_buckets(0.00005, 3.0, 16).unwrap(), - ) - .unwrap() -}); - -pub static RATE_LIMITER_LIMIT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "semaphore_control_plane_limit", - "Current limit of the semaphore control plane", - &["limit"], // 2 counters - ) - .unwrap() -}); - -pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_accepted_connections_by_sni", - "Number of connections (per sni).", - &["kind"], - ) - .unwrap() -}); - -pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_allowed_ips_number", - "Number of allowed ips", - vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0], - ) - .unwrap() -}); - -#[derive(Clone)] -pub struct LatencyTimer { - // time since the stopwatch was started - start: Option, - // accumulated time on the stopwatch - pub accumulated: std::time::Duration, - // label data - protocol: &'static str, - cache_miss: bool, - pool_miss: bool, - outcome: &'static str, + #[metric(namespace = "wake_compute_lock")] + pub wake_compute_lock: ApiLockMetrics, } -pub struct LatencyTimerPause<'a> { - timer: &'a mut LatencyTimer, +static SELF: OnceLock = OnceLock::new(); +impl Metrics { + pub fn install(thread_pool: Arc) { + SELF.set(Metrics::new(thread_pool)) + .ok() + .expect("proxy metrics must not be installed more than once"); + } + + pub fn get() -> &'static Self { + #[cfg(test)] + return SELF.get_or_init(|| Metrics::new(Arc::new(ThreadPoolMetrics::new(0)))); + + #[cfg(not(test))] + SELF.get() + .expect("proxy metrics must be installed by the main() function") + } +} + +#[derive(MetricGroup)] +#[metric(new(thread_pool: Arc))] +pub struct ProxyMetrics { + #[metric(flatten)] + pub db_connections: CounterPairVec, + #[metric(flatten)] + pub client_connections: CounterPairVec, + #[metric(flatten)] + pub connection_requests: CounterPairVec, + #[metric(flatten)] + pub http_endpoint_pools: HttpEndpointPools, + + /// Time it took for proxy to establish a connection to the compute endpoint. + // largest bucket = 2^16 * 0.5ms = 32s + #[metric(metadata = Thresholds::exponential_buckets(0.0005, 2.0))] + pub compute_connection_latency_seconds: HistogramVec, + + /// Time it took for proxy to receive a response from control plane. + #[metric( + // largest bucket = 2^16 * 0.2ms = 13s + metadata = Thresholds::exponential_buckets(0.0002, 2.0), + )] + pub console_request_latency: HistogramVec, + + /// Time it takes to acquire a token to call console plane. + // largest bucket = 3^16 * 0.05ms = 2.15s + #[metric(metadata = Thresholds::exponential_buckets(0.00005, 3.0))] + pub control_plane_token_acquire_seconds: Histogram<16>, + + /// Size of the HTTP request body lengths. 
+ // smallest bucket = 16 bytes + // largest bucket = 4^12 * 16 bytes = 256MB + #[metric(metadata = Thresholds::exponential_buckets(16.0, 4.0))] + pub http_conn_content_length_bytes: HistogramVec, 12>, + + /// Time it takes to reclaim unused connection pools. + #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub http_pool_reclaimation_lag_seconds: Histogram<16>, + + /// Number of opened connections to a database. + pub http_pool_opened_connections: Gauge, + + /// Number of cache hits/misses for allowed ips. + pub allowed_ips_cache_misses: CounterVec>, + + /// Number of allowed ips + #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] + pub allowed_ips_number: Histogram<10>, + + /// Number of connections (per sni). + pub accepted_connections_by_sni: CounterVec>, + + /// Number of connection failures (per kind). + pub connection_failures_total: CounterVec>, + + /// Number of wake-up failures (per kind). + pub connection_failures_breakdown: CounterVec, + + /// Number of bytes sent/received between all clients and backends. + pub io_bytes: CounterVec>, + + /// Number of errors by a given classification. + pub errors_total: CounterVec>, + + /// Number of cancellation requests (per found/not_found). + pub cancellation_requests_total: CounterVec, + + /// Number of errors by a given classification + pub redis_errors_total: CounterVec, + + /// Number of TLS handshake failures + pub tls_handshake_failures: Counter, + + /// Number of connection requests affected by authentication rate limits + pub requests_auth_rate_limits_total: Counter, + + /// HLL approximate cardinality of endpoints that are connecting + pub connecting_endpoints: HyperLogLogVec, 32>, + + /// Number of endpoints affected by errors of a given classification + pub endpoints_affected_by_errors: HyperLogLogVec, 32>, + + /// Number of endpoints affected by authentication rate limits + pub endpoints_auth_rate_limits: HyperLogLog<32>, + + /// Number of invalid endpoints (per protocol, per rejected). + pub invalid_endpoints_total: CounterVec, + + /// Number of retries (per outcome, per retry_type). + #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))] + pub retries_metric: HistogramVec, + + /// Number of events consumed from redis (per event type). 
+ pub redis_events_count: CounterVec>, + + #[metric(namespace = "connect_compute_lock")] + pub connect_compute_lock: ApiLockMetrics, + + #[metric(namespace = "scram_pool")] + #[metric(init = thread_pool)] + pub scram_pool: Arc, +} + +#[derive(MetricGroup)] +#[metric(new())] +pub struct ApiLockMetrics { + /// Number of semaphores registered in this api lock + pub semaphores_registered: Counter, + /// Number of semaphores unregistered in this api lock + pub semaphores_unregistered: Counter, + /// Time it takes to reclaim unused semaphores in the api lock + #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub reclamation_lag_seconds: Histogram<16>, + /// Time it takes to acquire a semaphore lock + #[metric(metadata = Thresholds::exponential_buckets(1e-4, 2.0))] + pub semaphore_acquire_seconds: Histogram<16>, +} + +impl Default for ApiLockMetrics { + fn default() -> Self { + Self::new() + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum HttpDirection { + Request, + Response, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum Direction { + Tx, + Rx, +} + +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +#[label(singleton = "protocol")] +pub enum Protocol { + Http, + Ws, + Tcp, + SniRouter, +} + +impl Protocol { + pub fn as_str(&self) -> &'static str { + match self { + Protocol::Http => "http", + Protocol::Ws => "ws", + Protocol::Tcp => "tcp", + Protocol::SniRouter => "sni_router", + } + } +} + +impl std::fmt::Display for Protocol { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum Bool { + True, + False, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "outcome")] +pub enum Outcome { + Success, + Failed, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "outcome")] +pub enum CacheOutcome { + Hit, + Miss, +} + +#[derive(LabelGroup)] +#[label(set = ConsoleRequestSet)] +pub struct ConsoleRequest<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub request: &'a str, +} + +#[derive(MetricGroup, Default)] +pub struct HttpEndpointPools { + /// Number of endpoints we have registered pools for + pub http_pool_endpoints_registered_total: Counter, + /// Number of endpoints we have unregistered pools for + pub http_pool_endpoints_unregistered_total: Counter, +} + +pub struct HttpEndpointPoolsGuard<'a> { + dec: &'a Counter, +} + +impl Drop for HttpEndpointPoolsGuard<'_> { + fn drop(&mut self) { + self.dec.inc(); + } +} + +impl HttpEndpointPools { + pub fn guard(&self) -> HttpEndpointPoolsGuard<'_> { + self.http_pool_endpoints_registered_total.inc(); + HttpEndpointPoolsGuard { + dec: &self.http_pool_endpoints_unregistered_total, + } + } +} +pub struct NumDbConnectionsGauge; +impl CounterPairAssoc for NumDbConnectionsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_db_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_db_connections_total"); + const INC_HELP: &'static str = "Number of opened connections to a database."; + const DEC_HELP: &'static str = "Number of closed connections to a database."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumDbConnectionsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumDbConnectionsGauge>; + +pub struct NumClientConnectionsGauge; +impl CounterPairAssoc for NumClientConnectionsGauge { + const INC_NAME: &'static 
MetricName = MetricName::from_str("opened_client_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_client_connections_total"); + const INC_HELP: &'static str = "Number of opened connections from a client."; + const DEC_HELP: &'static str = "Number of closed connections from a client."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumClientConnectionsGuard<'a> = + metrics::MeasuredCounterPairGuard<'a, NumClientConnectionsGauge>; + +pub struct NumConnectionRequestsGauge; +impl CounterPairAssoc for NumConnectionRequestsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("accepted_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_connections_total"); + const INC_HELP: &'static str = "Number of client connections accepted."; + const DEC_HELP: &'static str = "Number of client connections closed."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumConnectionRequestsGuard<'a> = + metrics::MeasuredCounterPairGuard<'a, NumConnectionRequestsGauge>; + +#[derive(LabelGroup)] +#[label(set = ComputeConnectionLatencySet)] +pub struct ComputeConnectionLatencyGroup { + protocol: Protocol, + cold_start_info: ColdStartInfo, + outcome: ConnectOutcome, + excluded: LatencyExclusions, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum LatencyExclusions { + Client, + ClientAndCplane, + ClientCplaneCompute, + ClientCplaneComputeRetry, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum SniKind { + Sni, + NoSni, + PasswordHack, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum ConnectionFailureKind { + ComputeCached, + ComputeUncached, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum WakeupFailureKind { + BadComputeAddress, + ApiTransportError, + QuotaExceeded, + ApiConsoleLocked, + ApiConsoleBadRequest, + ApiConsoleOtherServerError, + ApiConsoleOtherError, + TimeoutError, +} + +#[derive(LabelGroup)] +#[label(set = ConnectionFailuresBreakdownSet)] +pub struct ConnectionFailuresBreakdownGroup { + pub kind: WakeupFailureKind, + pub retry: Bool, +} + +#[derive(LabelGroup, Copy, Clone)] +#[label(set = RedisErrorsSet)] +pub struct RedisErrors<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub channel: &'a str, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum CancellationSource { + FromClient, + FromRedis, + Local, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum CancellationOutcome { + NotFound, + Found, +} + +#[derive(LabelGroup)] +#[label(set = CancellationRequestSet)] +pub struct CancellationRequest { + pub source: CancellationSource, + pub kind: CancellationOutcome, +} + +#[derive(Clone, Copy)] +pub enum Waiting { + Cplane, + Client, + Compute, + RetryTimeout, +} + +#[derive(Default)] +struct Accumulated { + cplane: time::Duration, + client: time::Duration, + compute: time::Duration, + retry: time::Duration, +} + +pub struct LatencyTimer { + // time since the stopwatch was started + start: time::Instant, + // time since the stopwatch was stopped + stop: Option, + // accumulated time on the stopwatch + accumulated: Accumulated, + // label data + protocol: Protocol, + cold_start_info: ColdStartInfo, + outcome: ConnectOutcome, } impl LatencyTimer { - pub fn new(protocol: &'static str) -> Self { + pub fn new(protocol: Protocol) -> Self { Self { - start: Some(time::Instant::now()), - accumulated: std::time::Duration::ZERO, + start: 
time::Instant::now(), + stop: None, + accumulated: Accumulated::default(), protocol, - cache_miss: false, - // by default we don't do pooling - pool_miss: true, + cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified - outcome: "failed", + outcome: ConnectOutcome::Failed, } } - pub fn pause(&mut self) -> LatencyTimerPause<'_> { - // stop the stopwatch and record the time that we have accumulated - let start = self.start.take().expect("latency timer should be started"); - self.accumulated += start.elapsed(); - LatencyTimerPause { timer: self } + pub fn unpause(&mut self, start: Instant, waiting_for: Waiting) { + let dur = start.elapsed(); + match waiting_for { + Waiting::Cplane => self.accumulated.cplane += dur, + Waiting::Client => self.accumulated.client += dur, + Waiting::Compute => self.accumulated.compute += dur, + Waiting::RetryTimeout => self.accumulated.retry += dur, + } } - pub fn cache_miss(&mut self) { - self.cache_miss = true; - } - - pub fn pool_hit(&mut self) { - self.pool_miss = false; + pub fn cold_start_info(&mut self, cold_start_info: ColdStartInfo) { + self.cold_start_info = cold_start_info; } pub fn success(&mut self) { // stop the stopwatch and record the time that we have accumulated - let start = self.start.take().expect("latency timer should be started"); - self.accumulated += start.elapsed(); + self.stop = Some(time::Instant::now()); // success - self.outcome = "success"; + self.outcome = ConnectOutcome::Success; } } -impl Drop for LatencyTimerPause<'_> { - fn drop(&mut self) { - // start the stopwatch again - self.timer.start = Some(time::Instant::now()); - } +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +pub enum ConnectOutcome { + Success, + Failed, } impl Drop for LatencyTimer { fn drop(&mut self) { - let duration = - self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated; - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - bool_to_str(self.cache_miss), - bool_to_str(self.pool_miss), - self.outcome, - ]) - .observe(duration.as_secs_f64()) + let duration = self + .stop + .unwrap_or_else(time::Instant::now) + .duration_since(self.start); + + let metric = &Metrics::get().proxy.compute_connection_latency_seconds; + + // Excluding client communication from the accumulated time. + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::Client, + }, + duration + .saturating_sub(self.accumulated.client) + .as_secs_f64(), + ); + + // Exclude client and cplane communication from the accumulated time. + let accumulated_total = self.accumulated.client + self.accumulated.cplane; + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientAndCplane, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); + + // Exclude client, cplane, and compute communication from the accumulated time. + let accumulated_total = + self.accumulated.client + self.accumulated.cplane + self.accumulated.compute; + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientCplaneCompute, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); + + // Exclude client, cplane, compute, and retry communication from the accumulated time.
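// Worked example with made-up numbers: for a 5.0s total connection duration
// with client = 3.0s, cplane = 1.0s, compute = 0.5s and retry = 0.2s, this
// last series records 5.0 - (3.0 + 1.0 + 0.5 + 0.2) = 0.3s.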
+ let accumulated_total = self.accumulated.client + + self.accumulated.cplane + + self.accumulated.compute + + self.accumulated.retry; + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientCplaneComputeRetry, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); } } -pub static NUM_CONNECTION_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_total", - "Number of connection failures (per kind).", - &["kind"], - ) - .unwrap() -}); - -pub static NUM_WAKEUP_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_breakdown", - "Number of wake-up failures (per kind).", - &["retry", "kind"], - ) - .unwrap() -}); - -pub static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_io_bytes_per_client", - "Number of bytes sent/received between client and backend.", - crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS, - ) - .unwrap() -}); - -pub static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_io_bytes", - "Number of bytes sent/received between all clients and backends.", - &["direction"], - ) - .unwrap() -}); - -pub const fn bool_to_str(x: bool) -> &'static str { - if x { - "true" - } else { - "false" +impl From for Bool { + fn from(value: bool) -> Self { + if value { + Bool::True + } else { + Bool::False + } } } + +#[derive(LabelGroup)] +#[label(set = InvalidEndpointsSet)] +pub struct InvalidEndpointsGroup { + pub protocol: Protocol, + pub rejected: Bool, + pub outcome: ConnectOutcome, +} + +#[derive(LabelGroup)] +#[label(set = RetriesMetricSet)] +pub struct RetriesMetricGroup { + pub outcome: ConnectOutcome, + pub retry_type: RetryType, +} + +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +pub enum RetryType { + WakeCompute, + ConnectToCompute, +} + +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +#[label(singleton = "event")] +pub enum RedisEventsCount { + EndpointCreated, + BranchCreated, + ProjectCreated, + CancelSession, + PasswordUpdate, + AllowedIpsUpdate, +} + +pub struct ThreadPoolWorkers(usize); +#[derive(Copy, Clone)] +pub struct ThreadPoolWorkerId(pub usize); + +impl LabelValue for ThreadPoolWorkerId { + fn visit(&self, v: V) -> V::Output { + v.write_int(self.0 as i64) + } +} + +impl LabelGroup for ThreadPoolWorkerId { + fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) { + v.write_value(LabelName::from_str("worker"), self); + } +} + +impl LabelGroupSet for ThreadPoolWorkers { + type Group<'a> = ThreadPoolWorkerId; + + fn cardinality(&self) -> Option { + Some(self.0) + } + + fn encode_dense(&self, value: Self::Unique) -> Option { + Some(value) + } + + fn decode_dense(&self, value: usize) -> Self::Group<'_> { + ThreadPoolWorkerId(value) + } + + type Unique = usize; + + fn encode(&self, value: Self::Group<'_>) -> Option { + Some(value.0) + } + + fn decode(&self, value: &Self::Unique) -> Self::Group<'_> { + ThreadPoolWorkerId(*value) + } +} + +impl LabelSet for ThreadPoolWorkers { + type Value<'a> = ThreadPoolWorkerId; + + fn dynamic_cardinality(&self) -> Option { + Some(self.0) + } + + fn encode(&self, value: Self::Value<'_>) -> Option { + (value.0 < self.0).then_some(value.0) + } + + fn decode(&self, value: usize) -> Self::Value<'_> { + ThreadPoolWorkerId(value) + } +} + +impl FixedCardinalitySet for ThreadPoolWorkers { + fn cardinality(&self) -> 
usize { + self.0 + } +} + +#[derive(MetricGroup)] +#[metric(new(workers: usize))] +pub struct ThreadPoolMetrics { + #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))] + pub worker_task_turns_total: CounterVec, + #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))] + pub worker_task_skips_total: CounterVec, +} diff --git a/proxy/src/parse.rs b/proxy/src/parse.rs index 0d03574901..8c0f251066 100644 --- a/proxy/src/parse.rs +++ b/proxy/src/parse.rs @@ -2,14 +2,14 @@ use std::ffi::CStr; -pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> { +pub(crate) fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> { let cstr = CStr::from_bytes_until_nul(bytes).ok()?; let (_, other) = bytes.split_at(cstr.to_bytes_with_nul().len()); Some((cstr, other)) } /// See . -pub fn split_at_const(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> { +pub(crate) fn split_at_const(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> { (bytes.len() >= N).then(|| { let (head, tail) = bytes.split_at(N); (head.try_into().unwrap(), tail) diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 1d8931be85..17764f78d1 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -1,43 +1,26 @@ //! Proxy Protocol V2 implementation use std::{ - future::poll_fn, - future::Future, io, net::SocketAddr, - pin::{pin, Pin}, - task::{ready, Context, Poll}, + pin::Pin, + task::{Context, Poll}, }; -use bytes::{Buf, BytesMut}; -use hyper::server::conn::{AddrIncoming, AddrStream}; +use bytes::BytesMut; use pin_project_lite::pin_project; -use tls_listener::AsyncAccept; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; -pub struct ProxyProtocolAccept { - pub incoming: AddrIncoming, -} - pin_project! { - pub struct WithClientIp { + /// A chained [`AsyncRead`] with [`AsyncWrite`] passthrough + pub(crate) struct ChainRW { #[pin] - pub inner: T, + pub(crate) inner: T, buf: BytesMut, - tlv_bytes: u16, - state: ProxyParse, } } -#[derive(Clone, PartialEq, Debug)] -enum ProxyParse { - NotStarted, - - Finished(SocketAddr), - None, -} - -impl AsyncWrite for WithClientIp { +impl AsyncWrite for ChainRW { #[inline] fn poll_write( self: Pin<&mut Self>, @@ -72,285 +55,174 @@ impl AsyncWrite for WithClientIp { } } -impl WithClientIp { - pub fn new(inner: T) -> Self { - WithClientIp { - inner, - buf: BytesMut::with_capacity(128), - tlv_bytes: 0, - state: ProxyParse::NotStarted, - } - } - - pub fn client_addr(&self) -> Option { - match self.state { - ProxyParse::Finished(socket) => Some(socket), - _ => None, - } - } -} - -impl WithClientIp { - pub async fn wait_for_addr(&mut self) -> io::Result> { - match self.state { - ProxyParse::NotStarted => { - let mut pin = Pin::new(&mut *self); - let addr = poll_fn(|cx| pin.as_mut().poll_client_ip(cx)).await?; - match addr { - Some(addr) => self.state = ProxyParse::Finished(addr), - None => self.state = ProxyParse::None, - } - Ok(addr) - } - ProxyParse::Finished(addr) => Ok(Some(addr)), - ProxyParse::None => Ok(None), - } - } -} - /// Proxy Protocol Version 2 Header const HEADER: [u8; 12] = [ 0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A, ]; -impl WithClientIp { - /// implementation of - /// Version 2 (Binary Format) - fn poll_client_ip( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - // The binary header format starts with a constant 12 bytes block containing the protocol signature : - // \x0D \x0A \x0D \x0A \x00 \x0D \x0A \x51 \x55 \x49 \x54 \x0A - while self.buf.len() < 16 { - let mut this = 
self.as_mut().project(); - let bytes_read = pin!(this.inner.read_buf(this.buf)).poll(cx)?; +pub(crate) async fn read_proxy_protocol<T: AsyncRead + Unpin>( + mut read: T, +) -> std::io::Result<(ChainRW<T>, Option<SocketAddr>)> { + let mut buf = BytesMut::with_capacity(128); + while buf.len() < 16 { + let bytes_read = read.read_buf(&mut buf).await?; - // exit for bad header - let len = usize::min(self.buf.len(), HEADER.len()); - if self.buf[..len] != HEADER[..len] { - return Poll::Ready(Ok(None)); - } - - // if no more bytes available then exit - if ready!(bytes_read) == 0 { - return Poll::Ready(Ok(None)); - }; + // exit for bad header + let len = usize::min(buf.len(), HEADER.len()); + if buf[..len] != HEADER[..len] { + return Ok((ChainRW { inner: read, buf }, None)); } - // The next byte (the 13th one) is the protocol version and command. - // The highest four bits contains the version. As of this specification, it must - // always be sent as \x2 and the receiver must only accept this value. - let vc = self.buf[12]; - let version = vc >> 4; - let command = vc & 0b1111; - if version != 2 { - return Poll::Ready(Err(io::Error::new( + // if no more bytes available then exit + if bytes_read == 0 { + return Ok((ChainRW { inner: read, buf }, None)); + }; + } + + let header = buf.split_to(16); + + // The next byte (the 13th one) is the protocol version and command. + // The highest four bits contain the version. As of this specification, it must + // always be sent as \x2 and the receiver must only accept this value. + let vc = header[12]; + let version = vc >> 4; + let command = vc & 0b1111; + if version != 2 { + return Err(io::Error::new( + io::ErrorKind::Other, + "invalid proxy protocol version. expected version 2", + )); + } + match command { + // the connection was established on purpose by the proxy + // without being relayed. The connection endpoints are the sender and the + // receiver. Such connections exist when the proxy sends health-checks to the + // server. The receiver must accept this connection as valid and must use the + // real connection endpoints and discard the protocol block including the + // family which is ignored. + 0 => {} + // the connection was established on behalf of another node, + // and reflects the original connection endpoints. The receiver must then use + // the information provided in the protocol block to get the original address. + 1 => {} + // other values are unassigned and must not be emitted by senders. Receivers + // must drop connections presenting unexpected values here. + _ => { + return Err(io::Error::new( io::ErrorKind::Other, - "invalid proxy protocol version. expected version 2", - ))); + "invalid proxy protocol command. expected local (0) or proxy (1)", + )) } - match command { - // the connection was established on purpose by the proxy - // without being relayed. The connection endpoints are the sender and the - // receiver. Such connections exist when the proxy sends health-checks to the - // server. The receiver must accept this connection as valid and must use the - // real connection endpoints and discard the protocol block including the - // family which is ignored. - 0 => {} - // the connection was established on behalf of another node, - // and reflects the original connection endpoints. The receiver must then use - // the information provided in the protocol block to get original the address. - 1 => {} - // other values are unassigned and must not be emitted by senders. Receivers - // must drop connections presenting unexpected values here.
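For reference, the byte layout `read_proxy_protocol` consumes can be assembled by hand. A hypothetical test helper for the TCP-over-IPv4 case (\x11), following the header fields quoted from the spec above; it is not part of this diff:

```rust
/// Build a minimal PROXY protocol v2 header announcing a TCP/IPv4 client.
fn ppv2_ipv4_header(src: [u8; 4], dst: [u8; 4], src_port: u16, dst_port: u16) -> Vec<u8> {
    let mut buf = Vec::with_capacity(28);
    // 12-byte protocol signature
    buf.extend_from_slice(&[
        0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A,
    ]);
    buf.push(0x21); // version 2 (high nibble), command PROXY = 1 (low nibble)
    buf.push(0x11); // AF_INET (high nibble), TCP (low nibble)
    buf.extend_from_slice(&12u16.to_be_bytes()); // address block length: 2*4 + 2*2
    buf.extend_from_slice(&src);
    buf.extend_from_slice(&dst);
    buf.extend_from_slice(&src_port.to_be_bytes());
    buf.extend_from_slice(&dst_port.to_be_bytes());
    buf
}
```

Chaining such a header in front of payload bytes and feeding it to `read_proxy_protocol` should yield the source address, which is essentially what the tests at the end of this file do with fixed byte arrays.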
- _ => { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::Other, - "invalid proxy protocol command. expected local (0) or proxy (1)", - ))) - } - }; + }; - // The 14th byte contains the transport protocol and address family. The highest 4 - // bits contain the address family, the lowest 4 bits contain the protocol. - let ft = self.buf[13]; - let address_length = match ft { - // - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET - // protocol family. Address length is 2*4 + 2*2 = 12 bytes. - // - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET - // protocol family. Address length is 2*4 + 2*2 = 12 bytes. - 0x11 | 0x12 => 12, - // - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6 - // protocol family. Address length is 2*16 + 2*2 = 36 bytes. - // - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6 - // protocol family. Address length is 2*16 + 2*2 = 36 bytes. - 0x21 | 0x22 => 36, - // unspecified or unix stream. ignore the addresses - _ => 0, - }; + // The 14th byte contains the transport protocol and address family. The highest 4 + // bits contain the address family, the lowest 4 bits contain the protocol. + let ft = header[13]; + let address_length = match ft { + // - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET + // protocol family. Address length is 2*4 + 2*2 = 12 bytes. + // - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET + // protocol family. Address length is 2*4 + 2*2 = 12 bytes. + 0x11 | 0x12 => 12, + // - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6 + // protocol family. Address length is 2*16 + 2*2 = 36 bytes. + // - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6 + // protocol family. Address length is 2*16 + 2*2 = 36 bytes. + 0x21 | 0x22 => 36, + // unspecified or unix stream. ignore the addresses + _ => 0, + }; - // The 15th and 16th bytes is the address length in bytes in network endian order. - // It is used so that the receiver knows how many address bytes to skip even when - // it does not implement the presented protocol. Thus the length of the protocol - // header in bytes is always exactly 16 + this value. When a sender presents a - // LOCAL connection, it should not present any address so it sets this field to - // zero. Receivers MUST always consider this field to skip the appropriate number - // of bytes and must not assume zero is presented for LOCAL connections. When a - // receiver accepts an incoming connection showing an UNSPEC address family or - // protocol, it may or may not decide to log the address information if present. - let remaining_length = u16::from_be_bytes(self.buf[14..16].try_into().unwrap()); - if remaining_length < address_length { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::Other, - "invalid proxy protocol length. not enough to fit requested IP addresses", - ))); + // The 15th and 16th bytes is the address length in bytes in network endian order. + // It is used so that the receiver knows how many address bytes to skip even when + // it does not implement the presented protocol. Thus the length of the protocol + // header in bytes is always exactly 16 + this value. When a sender presents a + // LOCAL connection, it should not present any address so it sets this field to + // zero. 
Receivers MUST always consider this field to skip the appropriate number + // of bytes and must not assume zero is presented for LOCAL connections. When a + // receiver accepts an incoming connection showing an UNSPEC address family or + // protocol, it may or may not decide to log the address information if present. + let remaining_length = u16::from_be_bytes(header[14..16].try_into().unwrap()); + if remaining_length < address_length { + return Err(io::Error::new( + io::ErrorKind::Other, + "invalid proxy protocol length. not enough to fit requested IP addresses", + )); + } + drop(header); + + while buf.len() < remaining_length as usize { + if read.read_buf(&mut buf).await? == 0 { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "stream closed while waiting for proxy protocol addresses", + )); } - - while self.buf.len() < 16 + address_length as usize { - let mut this = self.as_mut().project(); - if ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?) == 0 { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "stream closed while waiting for proxy protocol addresses", - ))); - } - } - - let this = self.as_mut().project(); - - // we are sure this is a proxy protocol v2 entry and we have read all the bytes we need - // discard the header we have parsed - this.buf.advance(16); - - // Starting from the 17th byte, addresses are presented in network byte order. - // The address order is always the same : - // - source layer 3 address in network byte order - // - destination layer 3 address in network byte order - // - source layer 4 address if any, in network byte order (port) - // - destination layer 4 address if any, in network byte order (port) - let addresses = this.buf.split_to(address_length as usize); - let socket = match address_length { - 12 => { - let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap(); - let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap()); - Some(SocketAddr::from((src_addr, src_port))) - } - 36 => { - let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap(); - let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap()); - Some(SocketAddr::from((src_addr, src_port))) - } - _ => None, - }; - - *this.tlv_bytes = remaining_length - address_length; - self.as_mut().skip_tlv_inner(); - - Poll::Ready(Ok(socket)) } - #[cold] - fn read_ip(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let ip = ready!(self.as_mut().poll_client_ip(cx)?); - match ip { - Some(x) => *self.as_mut().project().state = ProxyParse::Finished(x), - None => *self.as_mut().project().state = ProxyParse::None, + // Starting from the 17th byte, addresses are presented in network byte order. 
+ // The address order is always the same : + // - source layer 3 address in network byte order + // - destination layer 3 address in network byte order + // - source layer 4 address if any, in network byte order (port) + // - destination layer 4 address if any, in network byte order (port) + let addresses = buf.split_to(remaining_length as usize); + let socket = match address_length { + 12 => { + let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap(); + let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap()); + Some(SocketAddr::from((src_addr, src_port))) } - Poll::Ready(Ok(())) - } + 36 => { + let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap(); + let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap()); + Some(SocketAddr::from((src_addr, src_port))) + } + _ => None, + }; - #[cold] - fn skip_tlv(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let mut this = self.as_mut().project(); - // we know that this.buf is empty - debug_assert_eq!(this.buf.len(), 0); - - this.buf.reserve((*this.tlv_bytes).clamp(0, 1024) as usize); - ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?); - self.skip_tlv_inner(); - - Poll::Ready(Ok(())) - } - - fn skip_tlv_inner(self: Pin<&mut Self>) { - let tlv_bytes_read = match u16::try_from(self.buf.len()) { - // we read more than u16::MAX therefore we must have read the full tlv_bytes - Err(_) => self.tlv_bytes, - // we might not have read the full tlv bytes yet - Ok(n) => u16::min(n, self.tlv_bytes), - }; - let this = self.project(); - *this.tlv_bytes -= tlv_bytes_read; - this.buf.advance(tlv_bytes_read as usize); - } + Ok((ChainRW { inner: read, buf }, socket)) } -impl AsyncRead for WithClientIp { +impl AsyncRead for ChainRW { #[inline] fn poll_read( - mut self: Pin<&mut Self>, + self: Pin<&mut Self>, cx: &mut Context<'_>, buf: &mut ReadBuf<'_>, ) -> Poll> { - // I'm assuming these 3 comparisons will be easy to branch predict. - // especially with the cold attributes - // which should make this read wrapper almost invisible - - if let ProxyParse::NotStarted = self.state { - ready!(self.as_mut().read_ip(cx)?); - } - - while self.tlv_bytes > 0 { - ready!(self.as_mut().skip_tlv(cx)?) 
- } - - let this = self.project(); - if this.buf.is_empty() { - this.inner.poll_read(cx, buf) + if self.buf.is_empty() { + self.project().inner.poll_read(cx, buf) } else { - // we know that tlv_bytes is 0 - debug_assert_eq!(*this.tlv_bytes, 0); - - let write = usize::min(this.buf.len(), buf.remaining()); - let slice = this.buf.split_to(write).freeze(); - buf.put_slice(&slice); - - // reset the allocation so it can be freed - if this.buf.is_empty() { - *this.buf = BytesMut::new(); - } - - Poll::Ready(Ok(())) + self.read_from_buf(buf) } } } -impl AsyncAccept for ProxyProtocolAccept { - type Connection = WithClientIp; +impl ChainRW { + #[cold] + fn read_from_buf(self: Pin<&mut Self>, buf: &mut ReadBuf<'_>) -> Poll> { + debug_assert!(!self.buf.is_empty()); + let this = self.project(); - type Error = io::Error; + let write = usize::min(this.buf.len(), buf.remaining()); + let slice = this.buf.split_to(write).freeze(); + buf.put_slice(&slice); - fn poll_accept( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?); - let Some(conn) = conn else { - return Poll::Ready(None); - }; + // reset the allocation so it can be freed + if this.buf.is_empty() { + *this.buf = BytesMut::new(); + } - Poll::Ready(Some(Ok(WithClientIp::new(conn)))) + Poll::Ready(Ok(())) } } #[cfg(test)] mod tests { - use std::pin::pin; - use tokio::io::AsyncReadExt; - use crate::protocol2::{ProxyParse, WithClientIp}; + use crate::protocol2::read_proxy_protocol; #[tokio::test] async fn test_ipv4() { @@ -372,16 +244,15 @@ mod tests { let extra_data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); - assert_eq!( - read.state, - ProxyParse::Finished(([127, 0, 0, 1], 65535).into()) - ); + assert_eq!(addr, Some(([127, 0, 0, 1], 65535).into())); } #[tokio::test] @@ -404,17 +275,17 @@ mod tests { let extra_data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); assert_eq!( - read.state, - ProxyParse::Finished( - ([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into() - ) + addr, + Some(([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into()) ); } @@ -422,24 +293,24 @@ mod tests { async fn test_invalid() { let data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(data.as_slice())); + let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(read.state, ProxyParse::None); + assert_eq!(addr, None); } #[tokio::test] async fn test_short() { let data = [0x55; 10]; - let mut read = pin!(WithClientIp::new(data.as_slice())); + let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(read.state, ProxyParse::None); + assert_eq!(addr, None); } #[tokio::test] @@ -465,15 +336,14 @@ mod tests { let extra_data = [0xaa; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let 
(mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); - assert_eq!( - read.state, - ProxyParse::Finished(([55, 56, 57, 58], 65535).into()) - ); + assert_eq!(addr, Some(([55, 56, 57, 58], 65535).into())); } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 087cc7f7a9..ff199ac701 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -1,43 +1,47 @@ #[cfg(test)] mod tests; -pub mod connect_compute; -pub mod retry; +pub(crate) mod connect_compute; +mod copy_bidirectional; +pub(crate) mod handshake; +pub(crate) mod passthrough; +pub(crate) mod retry; +pub(crate) mod wake_compute; +pub use copy_bidirectional::copy_bidirectional_client_compute; +pub use copy_bidirectional::ErrorSource; use crate::{ auth, - cancellation::{self, CancelMap}, + cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}, compute, - config::{AuthenticationConfig, ProxyConfig, TlsConfig}, - console::messages::MetricsAuxInfo, + config::{ProxyConfig, TlsConfig}, context::RequestMonitoring, - metrics::{ - NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER, - NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE, - }, - protocol2::WithClientIp, + error::ReportableError, + metrics::{Metrics, NumClientConnectionsGuard}, + protocol2::read_proxy_protocol, + proxy::handshake::{handshake, HandshakeData}, rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, - usage_metrics::{Ids, USAGE_METRICS}, EndpointCacheKey, }; -use anyhow::{bail, Context}; use futures::TryFutureExt; use itertools::Itertools; use once_cell::sync::OnceCell; -use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; +use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; use smol_str::{format_smolstr, SmolStr}; use std::sync::Arc; +use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, Instrument}; -use utils::measured_stream::MeasuredStream; +use tracing::{error, info, Instrument}; -use self::connect_compute::{connect_to_compute, TcpMechanism}; +use self::{ + connect_compute::{connect_to_compute, TcpMechanism}, + passthrough::ProxyPassthrough, +}; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; -const ERR_PROTO_VIOLATION: &str = "protocol violation"; pub async fn run_until_cancelled( f: F, @@ -58,6 +62,7 @@ pub async fn task_main( config: &'static ProxyConfig, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ @@ -69,57 +74,91 @@ pub async fn task_main( socket2::SockRef::from(&listener).set_keepalive(true)?; let connections = tokio_util::task::task_tracker::TaskTracker::new(); - let cancel_map = Arc::new(CancelMap::default()); while let Some(accept_result) = run_until_cancelled(listener.accept(), &cancellation_token).await { let (socket, peer_addr) = accept_result?; + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Tcp); + let session_id = uuid::Uuid::new_v4(); - let cancel_map = Arc::clone(&cancel_map); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + let cancellation_handler = Arc::clone(&cancellation_handler); - connections.spawn( - async move { - info!("accepted postgres client connection"); + tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); + let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); - let mut socket = WithClientIp::new(socket); - let mut peer_addr = peer_addr.ip(); - if let Some(addr) = socket.wait_for_addr().await? { - peer_addr = addr.ip(); - tracing::Span::current().record("peer_addr", &tracing::field::display(addr)); - } else if config.require_client_ip { - bail!("missing required client IP"); + connections.spawn(async move { + let (socket, peer_addr) = match read_proxy_protocol(socket).await { + Ok((socket, Some(addr))) => (socket, addr.ip()), + Err(e) => { + error!("per-client task finished with an error: {e:#}"); + return; } + Ok((_socket, None)) if config.require_client_ip => { + error!("missing required client IP"); + return; + } + Ok((socket, None)) => (socket, peer_addr.ip()), + }; - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region); + match socket.inner.set_nodelay(true) { + Ok(()) => {} + Err(e) => { + error!("per-client task finished with an error: failed to set socket option: {e:#}"); + return; + } + }; - socket - .inner - .set_nodelay(true) - .context("failed to set socket option")?; + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Tcp, + &config.region, + ); + let span = ctx.span(); + let startup = Box::pin( handle_client( config, - &mut ctx, - &cancel_map, + &ctx, + cancellation_handler, socket, ClientMode::Tcp, - endpoint_rate_limiter, + endpoint_rate_limiter2, + conn_gauge, ) - .await + .instrument(span.clone()), + ); + let res = startup.await; + + match res { + Err(e) => { + // todo: log and push to ctx the error kind + ctx.set_error_kind(e.get_error_kind()); + error!(parent: &span, "per-client task finished with an error: {e:#}"); + } + Ok(None) => { + ctx.set_success(); + } + Ok(Some(p)) => { + ctx.set_success(); + ctx.log_connect(); + match p.proxy_pass().instrument(span.clone()).await { + Ok(()) => {} + Err(ErrorSource::Client(e)) => { + error!(parent: &span, "per-client task finished with an IO error from the client: {e:#}"); + } + Err(ErrorSource::Compute(e)) => { + error!(parent: &span, "per-client task finished with an IO error from the compute: {e:#}"); + } + } + } } - .instrument(info_span!( - "handle_client", - ?session_id, - peer_addr = tracing::field::Empty - )) - .unwrap_or_else(move |e| { - // Acknowledge that the task has finished with an error. 
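The accept loop above combines `run_until_cancelled` with a `TaskTracker` so that shutdown waits for in-flight clients instead of aborting them. The bare shape of that pattern, with the per-client work stubbed out (a sketch, not the proxy's actual task body):

```rust
use tokio_util::{sync::CancellationToken, task::task_tracker::TaskTracker};

async fn accept_loop(listener: tokio::net::TcpListener, cancel: CancellationToken) {
    let connections = TaskTracker::new();
    loop {
        tokio::select! {
            _ = cancel.cancelled() => break,
            accepted = listener.accept() => {
                let Ok((socket, peer_addr)) = accepted else { break };
                connections.spawn(async move {
                    // per-client task: handshake, auth, passthrough;
                    // errors are logged here, never propagated to the loop
                    let _ = (socket, peer_addr);
                });
            }
        }
    }
    connections.close();
    connections.wait().await; // drain in-flight connections before returning
}
```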
- error!(?session_id, "per-client task finished with an error: {e:#}"); - }), - ); + }); } connections.close(); @@ -131,21 +170,21 @@ pub async fn task_main( Ok(()) } -pub enum ClientMode { +pub(crate) enum ClientMode { Tcp, Websockets { hostname: Option }, } /// Abstracts the logic of handling TCP vs WS clients impl ClientMode { - fn allow_cleartext(&self) -> bool { + pub(crate) fn allow_cleartext(&self) -> bool { match self { ClientMode::Tcp => false, ClientMode::Websockets { .. } => true, } } - fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { + pub(crate) fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { match self { ClientMode::Tcp => config.allow_self_signed_compute, ClientMode::Websockets { .. } => false, @@ -168,164 +207,152 @@ impl ClientMode { } } -pub async fn handle_client( - config: &'static ProxyConfig, - ctx: &mut RequestMonitoring, - cancel_map: &CancelMap, - stream: S, - mode: ClientMode, - endpoint_rate_limiter: Arc, -) -> anyhow::Result<()> { - info!( - protocol = ctx.protocol, - "handling interactive connection from client" - ); - - let proto = ctx.protocol; - let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&[proto]) - .guard(); - let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&[proto]) - .guard(); - - let tls = config.tls_config.as_ref(); - - let pause = ctx.latency_timer.pause(); - let do_handshake = handshake(stream, mode.handshake_tls(tls), cancel_map); - let (mut stream, params) = match do_handshake.await? { - Some(x) => x, - None => return Ok(()), // it's a cancellation request - }; - drop(pause); - - // Extract credentials which we're going to use for auth. - let user_info = { - let hostname = mode.hostname(stream.get_ref()); - - let common_names = tls.map(|tls| &tls.common_names); - let result = config - .auth_backend - .as_ref() - .map(|_| { - auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names) - }) - .transpose(); - - match result { - Ok(user_info) => user_info, - Err(e) => stream.throw_error(e).await?, - } - }; - - ctx.set_endpoint_id(user_info.get_endpoint()); - - let client = Client::new( - stream, - user_info, - ¶ms, - mode.allow_self_signed_compute(config), - endpoint_rate_limiter, - ); - cancel_map - .with_session(|session| { - client.connect_to_db(ctx, session, mode, &config.authentication_config) - }) - .await +#[derive(Debug, Error)] +// almost all errors should be reported to the user, but there's a few cases where we cannot +// 1. Cancellation: we are not allowed to tell the client any cancellation statuses for security reasons +// 2. Handshake: handshake reports errors if it can, otherwise if the handshake fails due to protocol violation, +// we cannot be sure the client even understands our error message +// 3. PrepareClient: The client disconnected, so we can't tell them anyway... +pub(crate) enum ClientRequestError { + #[error("{0}")] + Cancellation(#[from] cancellation::CancelError), + #[error("{0}")] + Handshake(#[from] handshake::HandshakeError), + #[error("{0}")] + HandshakeTimeout(#[from] tokio::time::error::Elapsed), + #[error("{0}")] + PrepareClient(#[from] std::io::Error), + #[error("{0}")] + ReportedError(#[from] crate::stream::ReportedError), } -/// Establish a (most probably, secure) connection with the client. -/// For better testing experience, `stream` can be any object satisfying the traits. 
-/// It's easier to work with owned `stream` here as we need to upgrade it to TLS; -/// we also take an extra care of propagating only the select handshake errors to client. -#[tracing::instrument(skip_all)] -async fn handshake( - stream: S, - mut tls: Option<&TlsConfig>, - cancel_map: &CancelMap, -) -> anyhow::Result>, StartupMessageParams)>> { - // Client may try upgrading to each protocol only once - let (mut tried_ssl, mut tried_gss) = (false, false); - - let mut stream = PqStream::new(Stream::from_raw(stream)); - loop { - let msg = stream.read_startup_packet().await?; - info!("received {msg:?}"); - - use FeStartupPacket::*; - match msg { - SslRequest => match stream.get_ref() { - Stream::Raw { .. } if !tried_ssl => { - tried_ssl = true; - - // We can't perform TLS handshake without a config - let enc = tls.is_some(); - stream.write_message(&Be::EncryptionResponse(enc)).await?; - if let Some(tls) = tls.take() { - // Upgrade raw stream into a secure TLS-backed stream. - // NOTE: We've consumed `tls`; this fact will be used later. - - let (raw, read_buf) = stream.into_inner(); - // TODO: Normally, client doesn't send any data before - // server says TLS handshake is ok and read_buf is empy. - // However, you could imagine pipelining of postgres - // SSLRequest + TLS ClientHello in one hunk similar to - // pipelining in our node js driver. We should probably - // support that by chaining read_buf with the stream. - if !read_buf.is_empty() { - bail!("data is sent before server replied with EncryptionResponse"); - } - let tls_stream = raw.upgrade(tls.to_server_config()).await?; - - let (_, tls_server_end_point) = tls - .cert_resolver - .resolve(tls_stream.get_ref().1.server_name()) - .context("missing certificate")?; - - stream = PqStream::new(Stream::Tls { - tls: Box::new(tls_stream), - tls_server_end_point, - }); - } - } - _ => bail!(ERR_PROTO_VIOLATION), - }, - GssEncRequest => match stream.get_ref() { - Stream::Raw { .. } if !tried_gss => { - tried_gss = true; - - // Currently, we don't support GSSAPI - stream.write_message(&Be::EncryptionResponse(false)).await?; - } - _ => bail!(ERR_PROTO_VIOLATION), - }, - StartupMessage { params, .. } => { - // Check that the config has been consumed during upgrade - // OR we didn't provide it at all (for dev purposes). 
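The removed handshake kept one invariant the new `handshake` module preserves: each encryption upgrade (SSL, GSS) may be attempted at most once before the startup packet, and anything else is a protocol violation. That rule in isolation, as a hypothetical standalone state machine:

```rust
struct HandshakeState {
    tried_ssl: bool,
    tried_gss: bool,
}

enum Packet {
    SslRequest,
    GssEncRequest,
    Startup,
}

impl HandshakeState {
    // Returns the response to send, or an error on a repeated upgrade attempt.
    fn on_packet(&mut self, p: Packet, tls_available: bool) -> Result<&'static str, &'static str> {
        match p {
            Packet::SslRequest if !self.tried_ssl => {
                self.tried_ssl = true;
                Ok(if tls_available { "S" } else { "N" }) // EncryptionResponse
            }
            Packet::GssEncRequest if !self.tried_gss => {
                self.tried_gss = true;
                Ok("N") // GSSAPI is not supported
            }
            Packet::Startup => Ok("proceed to authentication"),
            _ => Err("protocol violation: repeated upgrade request"),
        }
    }
}
```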
- if tls.is_some() { - stream.throw_error_str(ERR_INSECURE_CONNECTION).await?; - } - - info!(session_type = "normal", "successful handshake"); - break Ok(Some((stream, params))); - } - CancelRequest(cancel_key_data) => { - cancel_map.cancel_session(cancel_key_data).await?; - - info!(session_type = "cancellation", "successful handshake"); - break Ok(None); - } +impl ReportableError for ClientRequestError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ClientRequestError::Cancellation(e) => e.get_error_kind(), + ClientRequestError::Handshake(e) => e.get_error_kind(), + ClientRequestError::HandshakeTimeout(_) => crate::error::ErrorKind::RateLimit, + ClientRequestError::ReportedError(e) => e.get_error_kind(), + ClientRequestError::PrepareClient(_) => crate::error::ErrorKind::ClientDisconnect, } } } +pub(crate) async fn handle_client( + config: &'static ProxyConfig, + ctx: &RequestMonitoring, + cancellation_handler: Arc, + stream: S, + mode: ClientMode, + endpoint_rate_limiter: Arc, + conn_gauge: NumClientConnectionsGuard<'static>, +) -> Result>, ClientRequestError> { + info!( + protocol = %ctx.protocol(), + "handling interactive connection from client" + ); + + let metrics = &Metrics::get().proxy; + let proto = ctx.protocol(); + let request_gauge = metrics.connection_requests.guard(proto); + + let tls = config.tls_config.as_ref(); + + let record_handshake_error = !ctx.has_private_peer_addr(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); + let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error); + let (mut stream, params) = + match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { + HandshakeData::Startup(stream, params) => (stream, params), + HandshakeData::Cancel(cancel_key_data) => { + return Ok(cancellation_handler + .cancel_session(cancel_key_data, ctx.session_id()) + .await + .map(|()| None)?) + } + }; + drop(pause); + + ctx.set_db_options(params.clone()); + + let hostname = mode.hostname(stream.get_ref()); + + let common_names = tls.map(|tls| &tls.common_names); + + // Extract credentials which we're going to use for auth. + let result = config + .auth_backend + .as_ref() + .map(|()| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names)) + .transpose(); + + let user_info = match result { + Ok(user_info) => user_info, + Err(e) => stream.throw_error(e).await?, + }; + + let user = user_info.get_user().to_owned(); + let user_info = match user_info + .authenticate( + ctx, + &mut stream, + mode.allow_cleartext(), + &config.authentication_config, + endpoint_rate_limiter, + ) + .await + { + Ok(auth_result) => auth_result, + Err(e) => { + let db = params.get("database"); + let app = params.get("application_name"); + let params_span = tracing::info_span!("", ?user, ?db, ?app); + + return stream.throw_error(e).instrument(params_span).await?; + } + }; + + let mut node = connect_to_compute( + ctx, + &TcpMechanism { + params: ¶ms, + locks: &config.connect_compute_locks, + }, + &user_info, + mode.allow_self_signed_compute(config), + config.wake_compute_retry_config, + config.connect_to_compute_retry_config, + ) + .or_else(|e| stream.throw_error(e)) + .await?; + + let session = cancellation_handler.get_session(); + prepare_client_connection(&node, &session, &mut stream).await?; + + // Before proxy passing, forward to compute whatever data is left in the + // PqStream input buffer. 
Normally there is none, but our serverless npm + // driver in pipeline mode sends startup, password and first query + // immediately after opening the connection. + let (stream, read_buf) = stream.into_inner(); + node.stream.write_all(&read_buf).await?; + + Ok(Some(ProxyPassthrough { + client: stream, + aux: node.aux.clone(), + compute: node, + _req: request_gauge, + _conn: conn_gauge, + _cancel: session, + })) +} + /// Finish client connection initialization: confirm auth success, send params, etc. #[tracing::instrument(skip_all)] -async fn prepare_client_connection( +async fn prepare_client_connection
<P>( node: &compute::PostgresConnection, - session: cancellation::Session<'_>, + session: &cancellation::Session<P>
, stream: &mut PqStream, -) -> anyhow::Result<()> { +) -> Result<(), std::io::Error> { // Register compute's query cancellation token and produce a new, unique one. // The new token (cancel_key_data) will be sent to the client. let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone()); @@ -349,165 +376,25 @@ async fn prepare_client_connection( Ok(()) } -/// Forward bytes in both directions (client <-> compute). -#[tracing::instrument(skip_all)] -pub async fn proxy_pass( - ctx: &mut RequestMonitoring, - client: impl AsyncRead + AsyncWrite + Unpin, - compute: impl AsyncRead + AsyncWrite + Unpin, - aux: MetricsAuxInfo, -) -> anyhow::Result<()> { - ctx.set_success(); - ctx.log(); - - let usage = USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id.clone(), - branch_id: aux.branch_id.clone(), - }); - - let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); - let m_sent2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx")); - let mut client = MeasuredStream::new( - client, - |_| {}, - |cnt| { - // Number of bytes we sent to the client (outbound). - m_sent.inc_by(cnt as u64); - m_sent2.inc_by(cnt as u64); - usage.record_egress(cnt as u64); - }, - ); - - let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]); - let m_recv2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx")); - let mut compute = MeasuredStream::new( - compute, - |_| {}, - |cnt| { - // Number of bytes the client sent to the compute node (inbound). - m_recv.inc_by(cnt as u64); - m_recv2.inc_by(cnt as u64); - }, - ); - - // Starting from here we only proxy the client's traffic. - info!("performing the proxy pass..."); - let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?; - - Ok(()) -} - -/// Thin connection context. -struct Client<'a, S> { - /// The underlying libpq protocol stream. - stream: PqStream>, - /// Client credentials that we care about. - user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>, - /// KV-dictionary with PostgreSQL connection params. - params: &'a StartupMessageParams, - /// Allow self-signed certificates (for testing). - allow_self_signed_compute: bool, - /// Rate limiter for endpoints - endpoint_rate_limiter: Arc, -} - -impl<'a, S> Client<'a, S> { - /// Construct a new connection context. - fn new( - stream: PqStream>, - user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>, - params: &'a StartupMessageParams, - allow_self_signed_compute: bool, - endpoint_rate_limiter: Arc, - ) -> Self { - Self { - stream, - user_info, - params, - allow_self_signed_compute, - endpoint_rate_limiter, - } - } -} - -impl Client<'_, S> { - /// Let the client authenticate and connect to the designated compute node. - // Instrumentation logs endpoint name everywhere. Doesn't work for link - // auth; strictly speaking we don't know endpoint name in its case. 
- #[tracing::instrument(name = "", fields(ep = %self.user_info.get_endpoint().unwrap_or_default()), skip_all)] - async fn connect_to_db( - self, - ctx: &mut RequestMonitoring, - session: cancellation::Session<'_>, - mode: ClientMode, - config: &'static AuthenticationConfig, - ) -> anyhow::Result<()> { - let Self { - mut stream, - user_info, - params, - allow_self_signed_compute, - endpoint_rate_limiter, - } = self; - - // check rate limit - if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep) { - return stream - .throw_error(auth::AuthError::too_many_connections()) - .await; - } - } - - let user = user_info.get_user().to_owned(); - let auth_result = match user_info - .authenticate(ctx, &mut stream, mode.allow_cleartext(), config) - .await - { - Ok(auth_result) => auth_result, - Err(e) => { - let db = params.get("database"); - let app = params.get("application_name"); - let params_span = tracing::info_span!("", ?user, ?db, ?app); - - return stream.throw_error(e).instrument(params_span).await; - } - }; - - let (mut node_info, user_info) = auth_result; - - node_info.allow_self_signed_compute = allow_self_signed_compute; - - let aux = node_info.aux.clone(); - let mut node = connect_to_compute(ctx, &TcpMechanism { params }, node_info, &user_info) - .or_else(|e| stream.throw_error(e)) - .await?; - - prepare_client_connection(&node, session, &mut stream).await?; - // Before proxy passing, forward to compute whatever data is left in the - // PqStream input buffer. Normally there is none, but our serverless npm - // driver in pipeline mode sends startup, password and first query - // immediately after opening the connection. - let (stream, read_buf) = stream.into_inner(); - node.stream.write_all(&read_buf).await?; - proxy_pass(ctx, stream, node.stream, aux).await - } -} - #[derive(Debug, Clone, PartialEq, Eq, Default)] -pub struct NeonOptions(Vec<(SmolStr, SmolStr)>); +pub(crate) struct NeonOptions(Vec<(SmolStr, SmolStr)>); impl NeonOptions { - pub fn parse_params(params: &StartupMessageParams) -> Self { + pub(crate) fn parse_params(params: &StartupMessageParams) -> Self { params .options_raw() .map(Self::parse_from_iter) .unwrap_or_default() } - pub fn parse_options_raw(options: &str) -> Self { + pub(crate) fn parse_options_raw(options: &str) -> Self { Self::parse_from_iter(StartupMessageParams::parse_options_raw(options)) } + pub(crate) fn is_ephemeral(&self) -> bool { + // Currently, neon endpoint options are all reserved for ephemeral endpoints. 
+ !self.0.is_empty() + } + fn parse_from_iter<'a>(options: impl Iterator) -> Self { let mut options = options .filter_map(neon_option) @@ -517,7 +404,7 @@ impl NeonOptions { Self(options) } - pub fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey { + pub(crate) fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey { // prefix + format!(" {k}:{v}") // kinda jank because SmolStr is immutable std::iter::once(prefix) @@ -528,7 +415,7 @@ impl NeonOptions { /// DeepObject format /// `paramName[prop1]=value1¶mName[prop2]=value2&...` - pub fn to_deep_object(&self) -> Vec<(SmolStr, SmolStr)> { + pub(crate) fn to_deep_object(&self) -> Vec<(SmolStr, SmolStr)> { self.0 .iter() .map(|(k, v)| (format_smolstr!("options[{}]", k), v.clone())) @@ -536,7 +423,7 @@ impl NeonOptions { } } -pub fn neon_option(bytes: &str) -> Option<(&str, &str)> { +pub(crate) fn neon_option(bytes: &str) -> Option<(&str, &str)> { static RE: OnceCell = OnceCell::new(); let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").unwrap()); diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 8bbe88aa51..613548d4a0 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -1,61 +1,53 @@ use crate::{ - auth, + auth::backend::ComputeCredentialKeys, compute::{self, PostgresConnection}, - console::{self, errors::WakeComputeError, Api}, + config::RetryConfig, + console::{self, errors::WakeComputeError, locks::ApiLocks, CachedNodeInfo, NodeInfo}, context::RequestMonitoring, - metrics::{bool_to_str, NUM_CONNECTION_FAILURES, NUM_WAKEUP_FAILURES}, - proxy::retry::{retry_after, ShouldRetry}, + error::ReportableError, + metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType}, + proxy::{ + retry::{retry_after, should_retry, CouldRetry}, + wake_compute::wake_compute, + }, + Host, }; use async_trait::async_trait; -use hyper::StatusCode; use pq_proto::StartupMessageParams; -use std::ops::ControlFlow; use tokio::time; use tracing::{error, info, warn}; +use super::retry::ShouldRetryWakeCompute; + const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); /// If we couldn't connect, a cached connection info might be to blame /// (e.g. the compute node's address might've changed at the wrong time). /// Invalidate the cache entry (if any) to prevent subsequent errors. #[tracing::instrument(name = "invalidate_cache", skip_all)] -pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg { +pub(crate) fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { let is_cached = node_info.cached(); if is_cached { warn!("invalidating stalled compute node info cache entry"); } - let label = match is_cached { - true => "compute_cached", - false => "compute_uncached", + let label = if is_cached { + ConnectionFailureKind::ComputeCached + } else { + ConnectionFailureKind::ComputeUncached }; - NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); + Metrics::get().proxy.connection_failures_total.inc(label); - node_info.invalidate().config -} - -/// Try to connect to the compute node once. 
-#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)] -async fn connect_to_compute_once( - ctx: &mut RequestMonitoring, - node_info: &console::CachedNodeInfo, - timeout: time::Duration, -) -> Result { - let allow_self_signed_compute = node_info.allow_self_signed_compute; - - node_info - .config - .connect(ctx, allow_self_signed_compute, timeout) - .await + node_info.invalidate() } #[async_trait] -pub trait ConnectMechanism { +pub(crate) trait ConnectMechanism { type Connection; - type ConnectError; + type ConnectError: ReportableError; type Error: From; async fn connect_once( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result; @@ -63,9 +55,22 @@ pub trait ConnectMechanism { fn update_connect_config(&self, conf: &mut compute::ConnCfg); } -pub struct TcpMechanism<'a> { +#[async_trait] +pub(crate) trait ComputeConnectBackend { + async fn wake_compute( + &self, + ctx: &RequestMonitoring, + ) -> Result; + + fn get_keys(&self) -> &ComputeCredentialKeys; +} + +pub(crate) struct TcpMechanism<'a> { /// KV-dictionary with PostgreSQL connection params. - pub params: &'a StartupMessageParams, + pub(crate) params: &'a StartupMessageParams, + + /// connect_to_compute concurrency lock + pub(crate) locks: &'static ApiLocks, } #[async_trait] @@ -74,13 +79,16 @@ impl ConnectMechanism for TcpMechanism<'_> { type ConnectError = compute::ConnectionError; type Error = compute::ConnectionError; + #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] async fn connect_once( &self, - ctx: &mut RequestMonitoring, + ctx: &RequestMonitoring, node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { - connect_to_compute_once(ctx, node_info, timeout).await + let host = node_info.config.get_host()?; + let permit = self.locks.get_permit(&host).await?; + permit.release_result(node_info.connect(ctx, timeout).await) } fn update_connect_config(&self, config: &mut compute::ConnCfg) { @@ -88,156 +96,121 @@ impl ConnectMechanism for TcpMechanism<'_> { } } -fn report_error(e: &WakeComputeError, retry: bool) { - use crate::console::errors::ApiError; - let retry = bool_to_str(retry); - let kind = match e { - WakeComputeError::BadComputeAddress(_) => "bad_compute_address", - WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error", - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::LOCKED, - ref text, - }) if text.contains("written data quota exceeded") - || text.contains("the limit for current plan reached") => - { - "quota_exceeded" - } - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::LOCKED, - .. - }) => "api_console_locked", - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::BAD_REQUEST, - .. - }) => "api_console_bad_request", - WakeComputeError::ApiError(ApiError::Console { status, .. }) - if status.is_server_error() => - { - "api_console_other_server_error" - } - WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error", - WakeComputeError::TimeoutError => "timeout_error", - }; - NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); -} - /// Try to connect to the compute node, retrying if necessary. -/// This function might update `node_info`, so we take it by `&mut`. 
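`connect_once` above now acquires a per-host permit from `ApiLocks` before dialing, bounding concurrent connection attempts to any one compute host. `ApiLocks` itself is not shown in this diff; a rough approximation using tokio semaphores, with all names and the per-host limit assumed:

```rust
use std::{collections::HashMap, sync::Arc};
use tokio::sync::{Mutex, OwnedSemaphorePermit, Semaphore};

#[derive(Default)]
struct HostLocks {
    // one semaphore per compute host name
    locks: Mutex<HashMap<String, Arc<Semaphore>>>,
}

impl HostLocks {
    async fn get_permit(&self, host: &str) -> OwnedSemaphorePermit {
        let sem = self
            .locks
            .lock()
            .await
            .entry(host.to_owned())
            .or_insert_with(|| Arc::new(Semaphore::new(8))) // assumed per-host cap
            .clone();
        sem.acquire_owned().await.expect("semaphore is never closed")
    }
}
```

`release_result` in the diff presumably also records the dial outcome when the permit is released; this sketch models only the admission step.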
#[tracing::instrument(skip_all)] -pub async fn connect_to_compute<M: ConnectMechanism>( - ctx: &mut RequestMonitoring, +pub(crate) async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>( + ctx: &RequestMonitoring, mechanism: &M, - mut node_info: console::CachedNodeInfo, - user_info: &auth::BackendType<'_, auth::backend::ComputeUserInfo>, + user_info: &B, + allow_self_signed_compute: bool, + wake_compute_retry_config: RetryConfig, + connect_to_compute_retry_config: RetryConfig, ) -> Result<M::Connection, M::Error> where - M::ConnectError: ShouldRetry + std::fmt::Debug, + M::ConnectError: CouldRetry + ShouldRetryWakeCompute + std::fmt::Debug, M::Error: From<WakeComputeError>, { + let mut num_retries = 0; + let mut node_info = + wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; + + node_info.set_keys(user_info.get_keys()); + node_info.allow_self_signed_compute = allow_self_signed_compute; + // let mut node_info = credentials.get_node_info(ctx, user_info).await?; mechanism.update_connect_config(&mut node_info.config); + let retry_type = RetryType::ConnectToCompute; // try once - let (config, err) = match mechanism + let err = match mechanism .connect_once(ctx, &node_info, CONNECT_TIMEOUT) .await { Ok(res) => { - ctx.latency_timer.success(); + ctx.success(); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + num_retries.into(), + ); return Ok(res); } - Err(e) => { - error!(error = ?e, "could not connect to compute node"); - (invalidate_cache(node_info), e) - } + Err(e) => e, }; - ctx.latency_timer.cache_miss(); + error!(error = ?err, "could not connect to compute node"); - let mut num_retries = 1; - - // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node - info!("compute node's state has likely changed; requesting a wake-up"); - let node_info = loop { - let wake_res = match user_info { - auth::BackendType::Console(api, user_info) => api.wake_compute(ctx, user_info).await, - // nothing to do? - auth::BackendType::Link(_) => return Err(err.into()), - // test backend - #[cfg(test)] - auth::BackendType::Test(x) => x.wake_compute(), - }; - - match handle_try_wake(wake_res, num_retries) { - Err(e) => { - error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); - report_error(&e, false); - return Err(e.into()); - } - // failed to wake up but we can continue to retry - Ok(ControlFlow::Continue(e)) => { - report_error(&e, true); - warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); - } - // successfully woke up a compute node and can break the wakeup loop - Ok(ControlFlow::Break(mut node_info)) => { - node_info.config.reuse_password(&config); - mechanism.update_connect_config(&mut node_info.config); - break node_info; - } + let node_info = if !node_info.cached() || !err.should_retry_wake_compute() { + // If we just received this from cplane and didn't get it from the cache, we shouldn't retry. + // No need to retrieve a new node_info, just return the old one.
+ if should_retry(&err, num_retries, connect_to_compute_retry_config) { + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + num_retries.into(), + ); + return Err(err.into()); } + node_info + } else { + // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node + info!("compute node's state has likely changed; requesting a wake-up"); + let old_node_info = invalidate_cache(node_info); + let mut node_info = + wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; + node_info.reuse_settings(old_node_info); - let wait_duration = retry_after(num_retries); - num_retries += 1; - - time::sleep(wait_duration).await; + mechanism.update_connect_config(&mut node_info.config); + node_info }; // now that we have a new node, try connect to it repeatedly. // this can error for a few reasons, for instance: // * DNS connection settings haven't quite propagated yet info!("wake_compute success. attempting to connect"); + num_retries = 1; loop { match mechanism .connect_once(ctx, &node_info, CONNECT_TIMEOUT) .await { Ok(res) => { - ctx.latency_timer.success(); + ctx.success(); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + num_retries.into(), + ); + info!(?num_retries, "connected to compute node after"); return Ok(res); } Err(e) => { - let retriable = e.should_retry(num_retries); - if !retriable { - error!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); + if !should_retry(&e, num_retries, connect_to_compute_retry_config) { + error!(error = ?e, num_retries, retriable = false, "couldn't connect to compute node"); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + num_retries.into(), + ); return Err(e.into()); } - warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); - } - } - let wait_duration = retry_after(num_retries); + warn!(error = ?e, num_retries, retriable = true, "couldn't connect to compute node"); + } + }; + + let wait_duration = retry_after(num_retries, connect_to_compute_retry_config); num_retries += 1; + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout); time::sleep(wait_duration).await; - } -} - -/// Attempts to wake up the compute node. -/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable -/// * Returns Ok(Break(node)) if the wakeup succeeded -/// * Returns Err(e) if there was an error -pub fn handle_try_wake( - result: Result, - num_retries: u32, -) -> Result, WakeComputeError> { - match result { - Err(err) => match &err { - WakeComputeError::ApiError(api) if api.should_retry(num_retries) => { - Ok(ControlFlow::Continue(err)) - } - _ => Err(err), - }, - // Ready to try again. 
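Between attempts the loop sleeps for `retry_after(num_retries, config)` while the latency timer is paused, so backoff does not pollute the connection-latency histogram. `RetryConfig`'s fields are not part of this hunk; one plausible exponential-backoff shape, with field names assumed:

```rust
use std::time::Duration;

struct RetryConfig {
    base_delay: Duration,
    backoff_factor: f64,
    max_retries: u32,
}

// Delay grows geometrically with the attempt number: base * factor^(n-1).
fn retry_after(num_retries: u32, config: &RetryConfig) -> Duration {
    config
        .base_delay
        .mul_f64(config.backoff_factor.powi(num_retries.saturating_sub(1) as i32))
}

// Retry only while the attempt budget is not exhausted.
fn should_retry_attempt(num_retries: u32, config: &RetryConfig) -> bool {
    num_retries < config.max_retries
}
```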
- Ok(new) => Ok(ControlFlow::Break(new)), + drop(pause); } } diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs new file mode 100644 index 0000000000..4ebda013ac --- /dev/null +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -0,0 +1,305 @@ +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; +use tracing::info; + +use std::future::poll_fn; +use std::io; +use std::pin::Pin; +use std::task::{ready, Context, Poll}; + +#[derive(Debug)] +enum TransferState { + Running(CopyBuffer), + ShuttingDown(u64), + Done(u64), +} + +#[derive(Debug)] +pub(crate) enum ErrorDirection { + Read(io::Error), + Write(io::Error), +} + +impl ErrorSource { + fn from_client(err: ErrorDirection) -> ErrorSource { + match err { + ErrorDirection::Read(client) => Self::Client(client), + ErrorDirection::Write(compute) => Self::Compute(compute), + } + } + fn from_compute(err: ErrorDirection) -> ErrorSource { + match err { + ErrorDirection::Write(client) => Self::Client(client), + ErrorDirection::Read(compute) => Self::Compute(compute), + } + } +} + +#[derive(Debug)] +pub enum ErrorSource { + Client(io::Error), + Compute(io::Error), +} + +fn transfer_one_direction( + cx: &mut Context<'_>, + state: &mut TransferState, + r: &mut A, + w: &mut B, +) -> Poll> +where + A: AsyncRead + AsyncWrite + Unpin + ?Sized, + B: AsyncRead + AsyncWrite + Unpin + ?Sized, +{ + let mut r = Pin::new(r); + let mut w = Pin::new(w); + loop { + match state { + TransferState::Running(buf) => { + let count = ready!(buf.poll_copy(cx, r.as_mut(), w.as_mut()))?; + *state = TransferState::ShuttingDown(count); + } + TransferState::ShuttingDown(count) => { + ready!(w.as_mut().poll_shutdown(cx)).map_err(ErrorDirection::Write)?; + *state = TransferState::Done(*count); + } + TransferState::Done(count) => return Poll::Ready(Ok(*count)), + } + } +} + +#[tracing::instrument(skip_all)] +pub async fn copy_bidirectional_client_compute( + client: &mut Client, + compute: &mut Compute, +) -> Result<(u64, u64), ErrorSource> +where + Client: AsyncRead + AsyncWrite + Unpin + ?Sized, + Compute: AsyncRead + AsyncWrite + Unpin + ?Sized, +{ + let mut client_to_compute = TransferState::Running(CopyBuffer::new()); + let mut compute_to_client = TransferState::Running(CopyBuffer::new()); + + poll_fn(|cx| { + let mut client_to_compute_result = + transfer_one_direction(cx, &mut client_to_compute, client, compute) + .map_err(ErrorSource::from_client)?; + let mut compute_to_client_result = + transfer_one_direction(cx, &mut compute_to_client, compute, client) + .map_err(ErrorSource::from_compute)?; + + // Early termination checks from compute to client. + if let TransferState::Done(_) = compute_to_client { + if let TransferState::Running(buf) = &client_to_compute { + info!("Compute is done, terminate client"); + // Initiate shutdown + client_to_compute = TransferState::ShuttingDown(buf.amt); + client_to_compute_result = + transfer_one_direction(cx, &mut client_to_compute, client, compute) + .map_err(ErrorSource::from_client)?; + } + } + + // Early termination checks from client to compute. + if let TransferState::Done(_) = client_to_compute { + if let TransferState::Running(buf) = &compute_to_client { + info!("Client is done, terminate compute"); + // Initiate shutdown + compute_to_client = TransferState::ShuttingDown(buf.amt); + compute_to_client_result = + transfer_one_direction(cx, &mut compute_to_client, compute, client) + .map_err(ErrorSource::from_compute)?; + } + } + + // It is not a problem if ready! returns early ... 
(comment remains the same) + let client_to_compute = ready!(client_to_compute_result); + let compute_to_client = ready!(compute_to_client_result); + + Poll::Ready(Ok((client_to_compute, compute_to_client))) + }) + .await +} + +#[derive(Debug)] +pub(super) struct CopyBuffer { + read_done: bool, + need_flush: bool, + pos: usize, + cap: usize, + amt: u64, + buf: Box<[u8]>, +} +const DEFAULT_BUF_SIZE: usize = 1024; + +impl CopyBuffer { + pub(super) fn new() -> Self { + Self { + read_done: false, + need_flush: false, + pos: 0, + cap: 0, + amt: 0, + buf: vec![0; DEFAULT_BUF_SIZE].into_boxed_slice(), + } + } + + fn poll_fill_buf( + &mut self, + cx: &mut Context<'_>, + reader: Pin<&mut R>, + ) -> Poll> + where + R: AsyncRead + ?Sized, + { + let me = &mut *self; + let mut buf = ReadBuf::new(&mut me.buf); + buf.set_filled(me.cap); + + let res = reader.poll_read(cx, &mut buf); + if let Poll::Ready(Ok(())) = res { + let filled_len = buf.filled().len(); + me.read_done = me.cap == filled_len; + me.cap = filled_len; + } + res + } + + fn poll_write_buf( + &mut self, + cx: &mut Context<'_>, + mut reader: Pin<&mut R>, + mut writer: Pin<&mut W>, + ) -> Poll> + where + R: AsyncRead + ?Sized, + W: AsyncWrite + ?Sized, + { + let me = &mut *self; + match writer.as_mut().poll_write(cx, &me.buf[me.pos..me.cap]) { + Poll::Pending => { + // Top up the buffer towards full if we can read a bit more + // data - this should improve the chances of a large write + if !me.read_done && me.cap < me.buf.len() { + ready!(me.poll_fill_buf(cx, reader.as_mut())).map_err(ErrorDirection::Read)?; + } + Poll::Pending + } + res @ Poll::Ready(_) => res.map_err(ErrorDirection::Write), + } + } + + pub(super) fn poll_copy( + &mut self, + cx: &mut Context<'_>, + mut reader: Pin<&mut R>, + mut writer: Pin<&mut W>, + ) -> Poll> + where + R: AsyncRead + ?Sized, + W: AsyncWrite + ?Sized, + { + loop { + // If our buffer is empty, then we need to read some data to + // continue. + if self.pos == self.cap && !self.read_done { + self.pos = 0; + self.cap = 0; + + match self.poll_fill_buf(cx, reader.as_mut()) { + Poll::Ready(Ok(())) => (), + Poll::Ready(Err(err)) => return Poll::Ready(Err(ErrorDirection::Read(err))), + Poll::Pending => { + // Try flushing when the reader has no progress to avoid deadlock + // when the reader depends on buffered writer. + if self.need_flush { + ready!(writer.as_mut().poll_flush(cx)) + .map_err(ErrorDirection::Write)?; + self.need_flush = false; + } + + return Poll::Pending; + } + } + } + + // If our buffer has some data, let's write it out! + while self.pos < self.cap { + let i = ready!(self.poll_write_buf(cx, reader.as_mut(), writer.as_mut()))?; + if i == 0 { + return Poll::Ready(Err(ErrorDirection::Write(io::Error::new( + io::ErrorKind::WriteZero, + "write zero byte into writer", + )))); + } + self.pos += i; + self.amt += i as u64; + self.need_flush = true; + } + + // If pos larger than cap, this loop will never stop. + // In particular, user's wrong poll_write implementation returning + // incorrect written length may lead to thread blocking. + debug_assert!( + self.pos <= self.cap, + "writer returned length larger than input slice" + ); + + // If we've written all the data and we've seen EOF, flush out the + // data and finish the transfer. 
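+ // (`self.amt`, the running total of bytes copied in this direction, is
+ // what the caller ultimately receives.)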
+ if self.pos == self.cap && self.read_done { + ready!(writer.as_mut().poll_flush(cx)).map_err(ErrorDirection::Write)?; + return Poll::Ready(Ok(self.amt)); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tokio::io::AsyncWriteExt; + + #[tokio::test] + async fn test_client_to_compute() { + let (mut client_client, mut client_proxy) = tokio::io::duplex(8); // Create a mock duplex stream + let (mut compute_proxy, mut compute_client) = tokio::io::duplex(32); // Create a mock duplex stream + + // Simulate 'a' finishing while there's still data for 'b' + client_client.write_all(b"hello").await.unwrap(); + client_client.shutdown().await.unwrap(); + compute_client.write_all(b"Neon").await.unwrap(); + compute_client.shutdown().await.unwrap(); + + let result = copy_bidirectional_client_compute(&mut client_proxy, &mut compute_proxy) + .await + .unwrap(); + + // Assert correct transferred amounts + let (client_to_compute_count, compute_to_client_count) = result; + assert_eq!(client_to_compute_count, 5); // 'hello' was transferred + assert_eq!(compute_to_client_count, 4); // response only partially transferred or not at all + } + + #[tokio::test] + async fn test_compute_to_client() { + let (mut client_client, mut client_proxy) = tokio::io::duplex(32); // Create a mock duplex stream + let (mut compute_proxy, mut compute_client) = tokio::io::duplex(8); // Create a mock duplex stream + + // Simulate 'a' finishing while there's still data for 'b' + compute_client.write_all(b"hello").await.unwrap(); + compute_client.shutdown().await.unwrap(); + client_client + .write_all(b"Neon Serverless Postgres") + .await + .unwrap(); + + let result = copy_bidirectional_client_compute(&mut client_proxy, &mut compute_proxy) + .await + .unwrap(); + + // Assert correct transferred amounts + let (client_to_compute_count, compute_to_client_count) = result; + assert_eq!(compute_to_client_count, 5); // 'hello' was transferred + assert!(client_to_compute_count <= 8); // response only partially transferred or not at all + } +} diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs new file mode 100644 index 0000000000..5996b11c11 --- /dev/null +++ b/proxy/src/proxy/handshake.rs @@ -0,0 +1,257 @@ +use bytes::Buf; +use pq_proto::{ + framed::Framed, BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion, + StartupMessageParams, +}; +use thiserror::Error; +use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::{info, warn}; + +use crate::{ + auth::endpoint_sni, + config::{TlsConfig, PG_ALPN_PROTOCOL}, + context::RequestMonitoring, + error::ReportableError, + metrics::Metrics, + proxy::ERR_INSECURE_CONNECTION, + stream::{PqStream, Stream, StreamUpgradeError}, +}; + +#[derive(Error, Debug)] +pub(crate) enum HandshakeError { + #[error("data is sent before server replied with EncryptionResponse")] + EarlyData, + + #[error("protocol violation")] + ProtocolViolation, + + #[error("missing certificate")] + MissingCertificate, + + #[error("{0}")] + StreamUpgradeError(#[from] StreamUpgradeError), + + #[error("{0}")] + Io(#[from] std::io::Error), + + #[error("{0}")] + ReportedError(#[from] crate::stream::ReportedError), +} + +impl ReportableError for HandshakeError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + HandshakeError::EarlyData => crate::error::ErrorKind::User, + HandshakeError::ProtocolViolation => crate::error::ErrorKind::User, + // This error should not happen, but will if we have no default certificate and + // the client sends no SNI extension. 
+ // If they provide SNI then we can be sure there is a certificate that matches. + HandshakeError::MissingCertificate => crate::error::ErrorKind::Service, + HandshakeError::StreamUpgradeError(upgrade) => match upgrade { + StreamUpgradeError::AlreadyTls => crate::error::ErrorKind::Service, + StreamUpgradeError::Io(_) => crate::error::ErrorKind::ClientDisconnect, + }, + HandshakeError::Io(_) => crate::error::ErrorKind::ClientDisconnect, + HandshakeError::ReportedError(e) => e.get_error_kind(), + } + } +} + +pub(crate) enum HandshakeData { + Startup(PqStream>, StartupMessageParams), + Cancel(CancelKeyData), +} + +/// Establish a (most probably, secure) connection with the client. +/// For better testing experience, `stream` can be any object satisfying the traits. +/// It's easier to work with owned `stream` here as we need to upgrade it to TLS; +/// we also take an extra care of propagating only the select handshake errors to client. +#[tracing::instrument(skip_all)] +pub(crate) async fn handshake( + ctx: &RequestMonitoring, + stream: S, + mut tls: Option<&TlsConfig>, + record_handshake_error: bool, +) -> Result, HandshakeError> { + // Client may try upgrading to each protocol only once + let (mut tried_ssl, mut tried_gss) = (false, false); + + const PG_PROTOCOL_EARLIEST: ProtocolVersion = ProtocolVersion::new(3, 0); + const PG_PROTOCOL_LATEST: ProtocolVersion = ProtocolVersion::new(3, 0); + + let mut stream = PqStream::new(Stream::from_raw(stream)); + loop { + let msg = stream.read_startup_packet().await?; + match msg { + FeStartupPacket::SslRequest { direct } => match stream.get_ref() { + Stream::Raw { .. } if !tried_ssl => { + tried_ssl = true; + + // We can't perform TLS handshake without a config + let have_tls = tls.is_some(); + if !direct { + stream + .write_message(&Be::EncryptionResponse(have_tls)) + .await?; + } else if !have_tls { + return Err(HandshakeError::ProtocolViolation); + } + + if let Some(tls) = tls.take() { + // Upgrade raw stream into a secure TLS-backed stream. + // NOTE: We've consumed `tls`; this fact will be used later. + + let Framed { + stream: raw, + read_buf, + write_buf, + } = stream.framed; + + let Stream::Raw { raw } = raw else { + return Err(HandshakeError::StreamUpgradeError( + StreamUpgradeError::AlreadyTls, + )); + }; + + let mut read_buf = read_buf.reader(); + let mut res = Ok(()); + let accept = tokio_rustls::TlsAcceptor::from(tls.to_server_config()) + .accept_with(raw, |session| { + // push the early data to the tls session + while !read_buf.get_ref().is_empty() { + match session.read_tls(&mut read_buf) { + Ok(_) => {} + Err(e) => { + res = Err(e); + break; + } + } + } + }); + + res?; + + let read_buf = read_buf.into_inner(); + if !read_buf.is_empty() { + return Err(HandshakeError::EarlyData); + } + + let tls_stream = accept.await.inspect_err(|_| { + if record_handshake_error { + Metrics::get().proxy.tls_handshake_failures.inc(); + } + })?; + + let conn_info = tls_stream.get_ref().1; + + // try parse endpoint + let ep = conn_info + .server_name() + .and_then(|sni| endpoint_sni(sni, &tls.common_names).ok().flatten()); + if let Some(ep) = ep { + ctx.set_endpoint_id(ep); + } + + // check the ALPN, if exists, as required. 
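+ // Only the expected Postgres ALPN value (or no ALPN at all) is accepted;
+ // anything else is rejected below as a protocol violation.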
+ match conn_info.alpn_protocol() { + None | Some(PG_ALPN_PROTOCOL) => {} + Some(other) => { + let alpn = String::from_utf8_lossy(other); + warn!(%alpn, "unexpected ALPN"); + return Err(HandshakeError::ProtocolViolation); + } + } + + let (_, tls_server_end_point) = tls + .cert_resolver + .resolve(conn_info.server_name()) + .ok_or(HandshakeError::MissingCertificate)?; + + stream = PqStream { + framed: Framed { + stream: Stream::Tls { + tls: Box::new(tls_stream), + tls_server_end_point, + }, + read_buf, + write_buf, + }, + }; + } + } + _ => return Err(HandshakeError::ProtocolViolation), + }, + FeStartupPacket::GssEncRequest => match stream.get_ref() { + Stream::Raw { .. } if !tried_gss => { + tried_gss = true; + + // Currently, we don't support GSSAPI + stream.write_message(&Be::EncryptionResponse(false)).await?; + } + _ => return Err(HandshakeError::ProtocolViolation), + }, + FeStartupPacket::StartupMessage { params, version } + if PG_PROTOCOL_EARLIEST <= version && version <= PG_PROTOCOL_LATEST => + { + // Check that the config has been consumed during upgrade + // OR we didn't provide it at all (for dev purposes). + if tls.is_some() { + return stream + .throw_error_str(ERR_INSECURE_CONNECTION, crate::error::ErrorKind::User) + .await?; + } + + info!( + ?version, + ?params, + session_type = "normal", + "successful handshake" + ); + break Ok(HandshakeData::Startup(stream, params)); + } + // downgrade protocol version + FeStartupPacket::StartupMessage { params, version } + if version.major() == 3 && version > PG_PROTOCOL_LATEST => + { + warn!(?version, "unsupported minor version"); + + // no protocol extensions are supported. + // + let mut unsupported = vec![]; + for (k, _) in params.iter() { + if k.starts_with("_pq_.") { + unsupported.push(k); + } + } + + // TODO: remove unsupported options so we don't send them to compute. + + stream + .write_message(&Be::NegotiateProtocolVersion { + version: PG_PROTOCOL_LATEST, + options: &unsupported, + }) + .await?; + + info!( + ?version, + session_type = "normal", + "successful handshake; unsupported minor version requested" + ); + break Ok(HandshakeData::Startup(stream, params)); + } + FeStartupPacket::StartupMessage { version, .. } => { + warn!( + ?version, + session_type = "normal", + "unsuccessful handshake; unsupported version" + ); + return Err(HandshakeError::ProtocolViolation); + } + FeStartupPacket::CancelRequest(cancel_key_data) => { + info!(session_type = "cancellation", "successful handshake"); + break Ok(HandshakeData::Cancel(cancel_key_data)); + } + } + } +} diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs new file mode 100644 index 0000000000..c17108de0a --- /dev/null +++ b/proxy/src/proxy/passthrough.rs @@ -0,0 +1,78 @@ +use crate::{ + cancellation, + compute::PostgresConnection, + console::messages::MetricsAuxInfo, + metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}, + stream::Stream, + usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}, +}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::info; +use utils::measured_stream::MeasuredStream; + +use super::copy_bidirectional::ErrorSource; + +/// Forward bytes in both directions (client <-> compute). 
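+/// Both directions are wrapped in `MeasuredStream`s: bytes flowing back to the
+/// client are recorded as egress for usage metering, while bytes arriving from
+/// the client feed only the per-direction IO byte metrics.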
+#[tracing::instrument(skip_all)] +pub(crate) async fn proxy_pass( + client: impl AsyncRead + AsyncWrite + Unpin, + compute: impl AsyncRead + AsyncWrite + Unpin, + aux: MetricsAuxInfo, +) -> Result<(), ErrorSource> { + let usage = USAGE_METRICS.register(Ids { + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, + }); + + let metrics = &Metrics::get().proxy.io_bytes; + let m_sent = metrics.with_labels(Direction::Tx); + let mut client = MeasuredStream::new( + client, + |_| {}, + |cnt| { + // Number of bytes we sent to the client (outbound). + metrics.get_metric(m_sent).inc_by(cnt as u64); + usage.record_egress(cnt as u64); + }, + ); + + let m_recv = metrics.with_labels(Direction::Rx); + let mut compute = MeasuredStream::new( + compute, + |_| {}, + |cnt| { + // Number of bytes the client sent to the compute node (inbound). + metrics.get_metric(m_recv).inc_by(cnt as u64); + }, + ); + + // Starting from here we only proxy the client's traffic. + info!("performing the proxy pass..."); + let _ = crate::proxy::copy_bidirectional::copy_bidirectional_client_compute( + &mut client, + &mut compute, + ) + .await?; + + Ok(()) +} + +pub(crate) struct ProxyPassthrough { + pub(crate) client: Stream, + pub(crate) compute: PostgresConnection, + pub(crate) aux: MetricsAuxInfo, + + pub(crate) _req: NumConnectionRequestsGuard<'static>, + pub(crate) _conn: NumClientConnectionsGuard<'static>, + pub(crate) _cancel: cancellation::Session
<P>
, +} + +impl ProxyPassthrough { + pub(crate) async fn proxy_pass(self) -> Result<(), ErrorSource> { + let res = proxy_pass(self.client, self.compute.stream, self.aux).await; + if let Err(err) = self.compute.cancel_closure.try_cancel_query().await { + tracing::error!(?err, "could not cancel the query in the database"); + } + res + } +} diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index a85ed380b0..15895d37e6 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -1,24 +1,23 @@ -use crate::compute; +use crate::{compute, config::RetryConfig}; use std::{error::Error, io}; use tokio::time; -/// Number of times we should retry the `/proxy_wake_compute` http request. -/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0 -pub const NUM_RETRIES_CONNECT: u32 = 16; -const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25); -const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2; - -pub trait ShouldRetry { +pub(crate) trait CouldRetry { + /// Returns true if the error could be retried fn could_retry(&self) -> bool; - fn should_retry(&self, num_retries: u32) -> bool { - match self { - _ if num_retries >= NUM_RETRIES_CONNECT => false, - err => err.could_retry(), - } - } } -impl ShouldRetry for io::Error { +pub(crate) trait ShouldRetryWakeCompute { + /// Returns true if we need to invalidate the cache for this node. + /// If false, we can continue retrying with the current node cache. + fn should_retry_wake_compute(&self) -> bool; +} + +pub(crate) fn should_retry(err: &impl CouldRetry, num_retries: u32, config: RetryConfig) -> bool { + num_retries < config.max_retries && err.could_retry() +} + +impl CouldRetry for io::Error { fn could_retry(&self) -> bool { use std::io::ErrorKind; matches!( @@ -28,7 +27,7 @@ impl ShouldRetry for io::Error { } } -impl ShouldRetry for tokio_postgres::error::DbError { +impl CouldRetry for tokio_postgres::error::DbError { fn could_retry(&self) -> bool { use tokio_postgres::error::SqlState; matches!( @@ -40,8 +39,25 @@ impl ShouldRetry for tokio_postgres::error::DbError { ) } } +impl ShouldRetryWakeCompute for tokio_postgres::error::DbError { + fn should_retry_wake_compute(&self) -> bool { + use tokio_postgres::error::SqlState; + // Here are errors that happens after the user successfully authenticated to the database. + // TODO: there are pgbouncer errors that should be retried, but they are not listed here. 
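+ // Note the negation below: these SQLSTATEs come from a compute that is up
+ // and reachable, so the cached node info is still valid and waking the
+ // compute again would not help.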
+ !matches!( + self.code(), + &SqlState::TOO_MANY_CONNECTIONS + | &SqlState::OUT_OF_MEMORY + | &SqlState::SYNTAX_ERROR + | &SqlState::T_R_SERIALIZATION_FAILURE + | &SqlState::INVALID_CATALOG_NAME + | &SqlState::INVALID_SCHEMA_NAME + | &SqlState::INVALID_PARAMETER_VALUE + ) + } +} -impl ShouldRetry for tokio_postgres::Error { +impl CouldRetry for tokio_postgres::Error { fn could_retry(&self) -> bool { if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { io::Error::could_retry(io_err) @@ -52,17 +68,41 @@ impl ShouldRetry for tokio_postgres::Error { } } } - -impl ShouldRetry for compute::ConnectionError { - fn could_retry(&self) -> bool { - match self { - compute::ConnectionError::Postgres(err) => err.could_retry(), - compute::ConnectionError::CouldNotConnect(err) => err.could_retry(), - _ => false, +impl ShouldRetryWakeCompute for tokio_postgres::Error { + fn should_retry_wake_compute(&self) -> bool { + if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { + tokio_postgres::error::DbError::should_retry_wake_compute(db_err) + } else { + // likely an IO error. Possible the compute has shutdown and the + // cache is stale. + true } } } -pub fn retry_after(num_retries: u32) -> time::Duration { - BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1)) +impl CouldRetry for compute::ConnectionError { + fn could_retry(&self) -> bool { + match self { + compute::ConnectionError::Postgres(err) => err.could_retry(), + compute::ConnectionError::CouldNotConnect(err) => err.could_retry(), + compute::ConnectionError::WakeComputeError(err) => err.could_retry(), + _ => false, + } + } +} +impl ShouldRetryWakeCompute for compute::ConnectionError { + fn should_retry_wake_compute(&self) -> bool { + match self { + compute::ConnectionError::Postgres(err) => err.should_retry_wake_compute(), + // the cache entry was not checked for validity + compute::ConnectionError::TooManyConnectionAttempts(_) => false, + _ => true, + } + } +} + +pub(crate) fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration { + config + .base_delay + .mul_f64(config.backoff_factor.powi((num_retries as i32) - 1)) } diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index a552a857b9..4264dbae0f 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -2,17 +2,26 @@ mod mitm; +use std::time::Duration; + use super::connect_compute::ConnectMechanism; -use super::retry::ShouldRetry; +use super::retry::CouldRetry; use super::*; -use crate::auth::backend::{ComputeUserInfo, TestBackend}; -use crate::auth::IpPattern; -use crate::config::CertResolver; +use crate::auth::backend::{ + ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend, +}; +use crate::config::{CertResolver, RetryConfig}; +use crate::console::messages::{ConsoleError, Details, MetricsAuxInfo, Status}; +use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend, NodeInfoCache}; use crate::console::{self, CachedNodeInfo, NodeInfo}; -use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; -use crate::{auth, http, sasl, scram}; +use crate::error::ErrorKind; +use crate::{sasl, scram, BranchId, EndpointId, ProjectId}; +use anyhow::{bail, Context}; use async_trait::async_trait; +use http::StatusCode; +use retry::{retry_after, ShouldRetryWakeCompute}; use rstest::rstest; +use rustls::pki_types; use tokio_postgres::config::SslMode; use tokio_postgres::tls::{MakeTlsConnect, NoTls}; use tokio_postgres_rustls::{MakeRustlsConnect, 
RustlsStream}; @@ -21,7 +30,11 @@ use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; fn generate_certs( hostname: &str, common_name: &str, -) -> anyhow::Result<(rustls::Certificate, rustls::Certificate, rustls::PrivateKey)> { +) -> anyhow::Result<( + pki_types::CertificateDer<'static>, + pki_types::CertificateDer<'static>, + pki_types::PrivateKeyDer<'static>, +)> { let ca = rcgen::Certificate::from_params({ let mut params = rcgen::CertificateParams::default(); params.is_ca = rcgen::IsCa::Ca(rcgen::BasicConstraints::Unconstrained); @@ -38,9 +51,9 @@ fn generate_certs( })?; Ok(( - rustls::Certificate(ca.serialize_der()?), - rustls::Certificate(cert.serialize_der_with_signer(&ca)?), - rustls::PrivateKey(cert.serialize_private_key_der()), + pki_types::CertificateDer::from(ca.serialize_der()?), + pki_types::CertificateDer::from(cert.serialize_der_with_signer(&ca)?), + pki_types::PrivateKeyDer::Pkcs8(cert.serialize_private_key_der().into()), )) } @@ -75,9 +88,8 @@ fn generate_tls_config<'a>( let tls_config = { let config = rustls::ServerConfig::builder() - .with_safe_defaults() .with_no_client_auth() - .with_single_cert(vec![cert.clone()], key.clone())? + .with_single_cert(vec![cert.clone()], key.clone_key())? .into(); let mut cert_resolver = CertResolver::new(); @@ -94,10 +106,9 @@ fn generate_tls_config<'a>( let client_config = { let config = rustls::ClientConfig::builder() - .with_safe_defaults() .with_root_certificates({ let mut store = rustls::RootCertStore::empty(); - store.add(&ca)?; + store.add(ca)?; store }) .with_no_client_auth(); @@ -125,15 +136,15 @@ impl TestAuth for NoAuth {} struct Scram(scram::ServerSecret); impl Scram { - fn new(password: &str) -> anyhow::Result { - let salt = rand::random::<[u8; 16]>(); - let secret = scram::ServerSecret::build(password, &salt, 256) + async fn new(password: &str) -> anyhow::Result { + let secret = scram::ServerSecret::build(password) + .await .context("failed to generate scram secret")?; Ok(Scram(secret)) } - fn mock(user: &str) -> Self { - Scram(scram::ServerSecret::mock(user, rand::random())) + fn mock() -> Self { + Scram(scram::ServerSecret::mock(rand::random())) } } @@ -144,7 +155,7 @@ impl TestAuth for Scram { stream: &mut PqStream>, ) -> anyhow::Result<()> { let outcome = auth::AuthFlow::new(stream) - .begin(auth::Scram(&self.0)) + .begin(auth::Scram(&self.0, &RequestMonitoring::test())) .await? .authenticate() .await?; @@ -163,11 +174,12 @@ async fn dummy_proxy( tls: Option, auth: impl TestAuth + Send, ) -> anyhow::Result<()> { - let cancel_map = CancelMap::default(); - let client = WithClientIp::new(client); - let (mut stream, _params) = handshake(client, tls.as_ref(), &cancel_map) - .await? - .context("handshake failed")?; + let (client, _) = read_proxy_protocol(client).await?; + let mut stream = + match handshake(&RequestMonitoring::test(), client, tls.as_ref(), false).await? 
{ + HandshakeData::Startup(stream, _) => stream, + HandshakeData::Cancel(_) => bail!("cancellation not supported"), + }; auth.authenticate(&mut stream).await?; @@ -275,7 +287,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new(password)?, + Scram::new(password).await?, )); let (_client, _conn) = tokio_postgres::Config::new() @@ -299,7 +311,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new("password")?, + Scram::new("password").await?, )); let (_client, _conn) = tokio_postgres::Config::new() @@ -320,11 +332,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> { let (client_config, server_config) = generate_tls_config("generic-project-name.localhost", "localhost")?; - let proxy = tokio::spawn(dummy_proxy( - client, - Some(server_config), - Scram::mock("user"), - )); + let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), Scram::mock())); use rand::{distributions::Alphanumeric, Rng}; let password: String = rand::thread_rng() @@ -354,11 +362,15 @@ async fn scram_auth_mock() -> anyhow::Result<()> { #[test] fn connect_compute_total_wait() { let mut total_wait = tokio::time::Duration::ZERO; - for num_retries in 1..NUM_RETRIES_CONNECT { - total_wait += retry_after(num_retries); + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + for num_retries in 1..config.max_retries { + total_wait += retry_after(num_retries, config); } - assert!(total_wait < tokio::time::Duration::from_secs(12)); - assert!(total_wait > tokio::time::Duration::from_secs(10)); + assert!(f64::abs(total_wait.as_secs_f64() - 15.0) < 0.1); } #[derive(Clone, Copy, Debug)] @@ -371,9 +383,11 @@ enum ConnectAction { Fail, } +#[derive(Clone)] struct TestConnectMechanism { counter: Arc>, sequence: Vec, + cache: &'static NodeInfoCache, } impl TestConnectMechanism { @@ -392,6 +406,12 @@ impl TestConnectMechanism { Self { counter: Arc::new(std::sync::Mutex::new(0)), sequence, + cache: Box::leak(Box::new(NodeInfoCache::new( + "test", + 1, + Duration::from_secs(100), + false, + ))), } } } @@ -402,21 +422,33 @@ struct TestConnection; #[derive(Debug)] struct TestConnectError { retryable: bool, + kind: crate::error::ErrorKind, +} + +impl ReportableError for TestConnectError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + self.kind + } } impl std::fmt::Display for TestConnectError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self) + write!(f, "{self:?}") } } impl std::error::Error for TestConnectError {} -impl ShouldRetry for TestConnectError { +impl CouldRetry for TestConnectError { fn could_retry(&self) -> bool { self.retryable } } +impl ShouldRetryWakeCompute for TestConnectError { + fn should_retry_wake_compute(&self) -> bool { + true + } +} #[async_trait] impl ConnectMechanism for TestConnectMechanism { @@ -426,7 +458,7 @@ impl ConnectMechanism for TestConnectMechanism { async fn connect_once( &self, - _ctx: &mut RequestMonitoring, + _ctx: &RequestMonitoring, _node_info: &console::CachedNodeInfo, _timeout: std::time::Duration, ) -> Result { @@ -435,9 +467,15 @@ impl ConnectMechanism for TestConnectMechanism { *counter += 1; match action { ConnectAction::Connect => Ok(TestConnection), - ConnectAction::Retry => Err(TestConnectError { retryable: true }), - ConnectAction::Fail => Err(TestConnectError { 
retryable: false }), - x => panic!("expecting action {:?}, connect is called instead", x), + ConnectAction::Retry => Err(TestConnectError { + retryable: true, + kind: ErrorKind::Compute, + }), + ConnectAction::Fail => Err(TestConnectError { + retryable: false, + kind: ErrorKind::Compute, + }), + x => panic!("expecting action {x:?}, connect is called instead"), } } @@ -450,56 +488,90 @@ impl TestBackend for TestConnectMechanism { let action = self.sequence[*counter]; *counter += 1; match action { - ConnectAction::Wake => Ok(helper_create_cached_node_info()), + ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeFail => { - let err = console::errors::ApiError::Console { - status: http::StatusCode::FORBIDDEN, - text: "TEST".into(), - }; + let err = console::errors::ApiError::Console(ConsoleError { + http_status_code: StatusCode::BAD_REQUEST, + error: "TEST".into(), + status: None, + }); assert!(!err.could_retry()); Err(console::errors::WakeComputeError::ApiError(err)) } ConnectAction::WakeRetry => { - let err = console::errors::ApiError::Console { - status: http::StatusCode::BAD_REQUEST, - text: "TEST".into(), - }; + let err = console::errors::ApiError::Console(ConsoleError { + http_status_code: StatusCode::BAD_REQUEST, + error: "TEST".into(), + status: Some(Status { + code: "error".into(), + message: "error".into(), + details: Details { + error_info: None, + retry_info: Some(console::messages::RetryInfo { retry_delay_ms: 1 }), + user_facing_message: None, + }, + }), + }); assert!(err.could_retry()); Err(console::errors::WakeComputeError::ApiError(err)) } - x => panic!("expecting action {:?}, wake_compute is called instead", x), + x => panic!("expecting action {x:?}, wake_compute is called instead"), } } - fn get_allowed_ips(&self) -> Result, console::errors::GetAuthInfoError> { + fn get_allowed_ips_and_secret( + &self, + ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError> + { unimplemented!("not used in tests") } } -fn helper_create_cached_node_info() -> CachedNodeInfo { +fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { let node = NodeInfo { config: compute::ConnCfg::new(), - aux: Default::default(), + aux: MetricsAuxInfo { + endpoint_id: (&EndpointId::from("endpoint")).into(), + project_id: (&ProjectId::from("project")).into(), + branch_id: (&BranchId::from("branch")).into(), + cold_start_info: crate::console::messages::ColdStartInfo::Warm, + }, allow_self_signed_compute: false, }; - CachedNodeInfo::new_uncached(node) + let (_, node2) = cache.insert_unit("key".into(), Ok(node.clone())); + node2.map(|()| node) } fn helper_create_connect_info( mechanism: &TestConnectMechanism, -) -> (CachedNodeInfo, auth::BackendType<'_, ComputeUserInfo>) { - let cache = helper_create_cached_node_info(); - let user_info = auth::BackendType::Test(mechanism); - (cache, user_info) +) -> auth::Backend<'static, ComputeCredentials, &()> { + let user_info = auth::Backend::Console( + MaybeOwned::Owned(ConsoleBackend::Test(Box::new(mechanism.clone()))), + ComputeCredentials { + info: ComputeUserInfo { + endpoint: "endpoint".into(), + user: "user".into(), + options: NeonOptions::parse_options_raw(""), + }, + keys: ComputeCredentialKeys::Password("password".into()), + }, + ); + user_info } #[tokio::test] async fn connect_to_compute_success() { + let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Connect]); - let (cache, user_info) 
= helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let ctx = RequestMonitoring::test(); + let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -507,11 +579,17 @@ async fn connect_to_compute_success() { #[tokio::test] async fn connect_to_compute_retry() { + let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let ctx = RequestMonitoring::test(); + let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -520,11 +598,17 @@ async fn connect_to_compute_retry() { /// Test that we don't retry if the error is not retryable. #[tokio::test] async fn connect_to_compute_non_retry_1() { + let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let ctx = RequestMonitoring::test(); + let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); + let user_info = helper_create_connect_info(&mechanism); + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); @@ -533,11 +617,17 @@ async fn connect_to_compute_non_retry_1() { /// Even for non-retryable errors, we should retry at least once. #[tokio::test] async fn connect_to_compute_non_retry_2() { + let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let ctx = RequestMonitoring::test(); + let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -546,28 +636,50 @@ async fn connect_to_compute_non_retry_2() { /// Retry for at most `NUM_RETRIES_CONNECT` times. 
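+/// (The cap now comes from `RetryConfig::max_retries` rather than the removed
+/// `NUM_RETRIES_CONNECT` constant. With base_delay = 1s and backoff_factor = 2.0,
+/// as used throughout these tests, `retry_after` yields waits of 1s, 2s, 4s and
+/// 8s, matching the ~15s total asserted in `connect_compute_total_wait`.)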
#[tokio::test] async fn connect_to_compute_non_retry_3() { - assert_eq!(NUM_RETRIES_CONNECT, 16); + let _ = env_logger::try_init(); + tokio::time::pause(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![ - Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, - Retry, Retry, Retry, Retry, /* the 17th time */ Retry, - ]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) - .await - .unwrap_err(); + let ctx = RequestMonitoring::test(); + let mechanism = + TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]); + let user_info = helper_create_connect_info(&mechanism); + let wake_compute_retry_config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 1, + backoff_factor: 2.0, + }; + let connect_to_compute_retry_config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute( + &ctx, + &mechanism, + &user_info, + false, + wake_compute_retry_config, + connect_to_compute_retry_config, + ) + .await + .unwrap_err(); mechanism.verify(); } /// Should retry wake compute. #[tokio::test] async fn wake_retry() { + let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let ctx = RequestMonitoring::test(); + let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -576,11 +688,17 @@ async fn wake_retry() { /// Wake failed with a non-retryable error. #[tokio::test] async fn wake_non_retry() { + let _ = env_logger::try_init(); use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let ctx = RequestMonitoring::test(); + let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); + let user_info = helper_create_connect_info(&mechanism); + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index a0a84a1dc0..33a2162bc7 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -1,7 +1,7 @@ //! Man-in-the-middle tests //! //! Channel binding should prevent a proxy server -//! - that has access to create valid certificates - +//! *that has access to create valid certificates* //! from controlling the TLS connection. 
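+//!
+//! (Channel binding ties the SCRAM exchange to the TLS server certificate the
+//! client actually observed, so a middlebox terminating TLS with its own valid
+//! certificate still cannot complete authentication.)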
use std::fmt::Debug; @@ -11,7 +11,6 @@ use bytes::{Bytes, BytesMut}; use futures::{SinkExt, StreamExt}; use postgres_protocol::message::frontend; use tokio::io::{AsyncReadExt, DuplexStream}; -use tokio_postgres::config::SslMode; use tokio_postgres::tls::TlsConnect; use tokio_util::codec::{Decoder, Encoder}; @@ -35,12 +34,18 @@ async fn proxy_mitm( tokio::spawn(async move { // begin handshake with end_server let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await; - // process handshake with end_client - let (end_client, startup) = - handshake(client1, Some(&server_config1), &CancelMap::default()) - .await - .unwrap() - .unwrap(); + let (end_client, startup) = match handshake( + &RequestMonitoring::test(), + client1, + Some(&server_config1), + false, + ) + .await + .unwrap() + { + HandshakeData::Startup(stream, params) => (stream, params), + HandshakeData::Cancel(_) => panic!("cancellation not supported"), + }; let mut end_server = tokio_util::codec::Framed::new(end_server, PgFrame); let (end_client, buf) = end_client.framed.into_inner(); @@ -63,7 +68,7 @@ async fn proxy_mitm( end_client.send(Bytes::from_static(b"R\0\0\0\x17\0\0\0\x0aSCRAM-SHA-256\0\0")).await.unwrap(); continue; } - end_client.send(message).await.unwrap() + end_client.send(message).await.unwrap(); } _ => break, } @@ -83,7 +88,7 @@ async fn proxy_mitm( end_server.send(buf.freeze()).await.unwrap(); continue; } - end_server.send(message).await.unwrap() + end_server.send(message).await.unwrap(); } _ => break, } @@ -97,7 +102,7 @@ async fn proxy_mitm( } /// taken from tokio-postgres -pub async fn connect_tls(mut stream: S, tls: T) -> T::Stream +pub(crate) async fn connect_tls(mut stream: S, tls: T) -> T::Stream where S: AsyncRead + AsyncWrite + Unpin, T: TlsConnect, @@ -110,9 +115,7 @@ where let mut buf = [0]; stream.read_exact(&mut buf).await.unwrap(); - if buf[0] != b'S' { - panic!("ssl not supported by server"); - } + assert!(buf[0] == b'S', "ssl not supported by server"); tls.connect(stream).await.unwrap() } @@ -151,7 +154,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new("password")?, + Scram::new("password").await?, )); let _client_err = tokio_postgres::Config::new() @@ -234,7 +237,7 @@ async fn connect_failure( let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new("password")?, + Scram::new("password").await?, )); let _client_err = tokio_postgres::Config::new() diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs new file mode 100644 index 0000000000..9b8ac6d29d --- /dev/null +++ b/proxy/src/proxy/wake_compute.rs @@ -0,0 +1,125 @@ +use crate::config::RetryConfig; +use crate::console::messages::{ConsoleError, Reason}; +use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; +use crate::context::RequestMonitoring; +use crate::metrics::{ + ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, + WakeupFailureKind, +}; +use crate::proxy::retry::{retry_after, should_retry}; +use hyper1::StatusCode; +use tracing::{error, info, warn}; + +use super::connect_compute::ComputeConnectBackend; + +pub(crate) async fn wake_compute( + num_retries: &mut u32, + ctx: &RequestMonitoring, + api: &B, + config: RetryConfig, +) -> Result { + let retry_type = RetryType::WakeCompute; + loop { + match api.wake_compute(ctx).await { + Err(e) if !should_retry(&e, *num_retries, config) => { + error!(error = ?e, 
num_retries, retriable = false, "couldn't wake compute node"); + report_error(&e, false); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + (*num_retries).into(), + ); + return Err(e); + } + Err(e) => { + warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); + report_error(&e, true); + } + Ok(n) => { + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + (*num_retries).into(), + ); + info!(?num_retries, "compute node woken up after"); + return Ok(n); + } + } + + let wait_duration = retry_after(*num_retries, config); + *num_retries += 1; + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout); + tokio::time::sleep(wait_duration).await; + drop(pause); + } +} + +fn report_error(e: &WakeComputeError, retry: bool) { + use crate::console::errors::ApiError; + let kind = match e { + WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress, + WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError, + WakeComputeError::ApiError(ApiError::Console(e)) => match e.get_reason() { + Reason::RoleProtected => WakeupFailureKind::ApiConsoleBadRequest, + Reason::ResourceNotFound => WakeupFailureKind::ApiConsoleBadRequest, + Reason::ProjectNotFound => WakeupFailureKind::ApiConsoleBadRequest, + Reason::EndpointNotFound => WakeupFailureKind::ApiConsoleBadRequest, + Reason::BranchNotFound => WakeupFailureKind::ApiConsoleBadRequest, + Reason::RateLimitExceeded => WakeupFailureKind::ApiConsoleLocked, + Reason::NonDefaultBranchComputeTimeExceeded => WakeupFailureKind::QuotaExceeded, + Reason::ActiveTimeQuotaExceeded => WakeupFailureKind::QuotaExceeded, + Reason::ComputeTimeQuotaExceeded => WakeupFailureKind::QuotaExceeded, + Reason::WrittenDataQuotaExceeded => WakeupFailureKind::QuotaExceeded, + Reason::DataTransferQuotaExceeded => WakeupFailureKind::QuotaExceeded, + Reason::LogicalSizeQuotaExceeded => WakeupFailureKind::QuotaExceeded, + Reason::ConcurrencyLimitReached => WakeupFailureKind::ApiConsoleLocked, + Reason::LockAlreadyTaken => WakeupFailureKind::ApiConsoleLocked, + Reason::RunningOperations => WakeupFailureKind::ApiConsoleLocked, + Reason::Unknown => match e { + ConsoleError { + http_status_code: StatusCode::LOCKED, + ref error, + .. + } if error.contains("written data quota exceeded") + || error.contains("the limit for current plan reached") => + { + WakeupFailureKind::QuotaExceeded + } + ConsoleError { + http_status_code: StatusCode::UNPROCESSABLE_ENTITY, + ref error, + .. + } if error.contains("compute time quota of non-primary branches is exceeded") => { + WakeupFailureKind::QuotaExceeded + } + ConsoleError { + http_status_code: StatusCode::LOCKED, + .. + } => WakeupFailureKind::ApiConsoleLocked, + ConsoleError { + http_status_code: StatusCode::BAD_REQUEST, + .. + } => WakeupFailureKind::ApiConsoleBadRequest, + ConsoleError { + http_status_code, .. + } if http_status_code.is_server_error() => { + WakeupFailureKind::ApiConsoleOtherServerError + } + ConsoleError { .. 
} => WakeupFailureKind::ApiConsoleOtherError, + }, + }, + WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked, + WakeComputeError::TooManyConnectionAttempts(_) => WakeupFailureKind::TimeoutError, + }; + Metrics::get() + .proxy + .connection_failures_breakdown + .inc(ConnectionFailuresBreakdownGroup { + kind, + retry: retry.into(), + }); +} diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index b26386d159..6e38f89458 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -1,7 +1,14 @@ -mod aimd; +mod leaky_bucket; mod limit_algorithm; mod limiter; -pub use aimd::Aimd; -pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; -pub use limiter::Limiter; -pub use limiter::{EndpointRateLimiter, RateBucketInfo}; + +#[cfg(test)] +pub(crate) use limit_algorithm::aimd::Aimd; + +pub(crate) use limit_algorithm::{ + DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, +}; +pub(crate) use limiter::GlobalRateLimiter; + +pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter}; +pub use limiter::{BucketRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; diff --git a/proxy/src/rate_limiter/aimd.rs b/proxy/src/rate_limiter/aimd.rs deleted file mode 100644 index 2c14a54a6c..0000000000 --- a/proxy/src/rate_limiter/aimd.rs +++ /dev/null @@ -1,166 +0,0 @@ -use std::usize; - -use async_trait::async_trait; - -use super::limit_algorithm::{AimdConfig, LimitAlgorithm, Sample}; - -use super::limiter::Outcome; - -/// Loss-based congestion avoidance. -/// -/// Additive-increase, multiplicative decrease. -/// -/// Adds available currency when: -/// 1. no load-based errors are observed, and -/// 2. the utilisation of the current limit is high. -/// -/// Reduces available concurrency by a factor when load-based errors are detected. -pub struct Aimd { - min_limit: usize, - max_limit: usize, - decrease_factor: f32, - increase_by: usize, - min_utilisation_threshold: f32, -} - -impl Aimd { - pub fn new(config: AimdConfig) -> Self { - Self { - min_limit: config.aimd_min_limit, - max_limit: config.aimd_max_limit, - decrease_factor: config.aimd_decrease_factor, - increase_by: config.aimd_increase_by, - min_utilisation_threshold: config.aimd_min_utilisation_threshold, - } - } -} - -#[async_trait] -impl LimitAlgorithm for Aimd { - async fn update(&mut self, old_limit: usize, sample: Sample) -> usize { - use Outcome::*; - match sample.outcome { - Success => { - let utilisation = sample.in_flight as f32 / old_limit as f32; - - if utilisation > self.min_utilisation_threshold { - let limit = old_limit + self.increase_by; - limit.clamp(self.min_limit, self.max_limit) - } else { - old_limit - } - } - Overload => { - let limit = old_limit as f32 * self.decrease_factor; - - // Floor instead of round, so the limit reduces even with small numbers. - // E.g. 
round(2 * 0.9) = 2, but floor(2 * 0.9) = 1 - let limit = limit.floor() as usize; - - limit.clamp(self.min_limit, self.max_limit) - } - } - } -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use tokio::sync::Notify; - - use super::*; - - use crate::rate_limiter::{Limiter, RateLimiterConfig}; - - #[tokio::test] - async fn should_decrease_limit_on_overload() { - let config = RateLimiterConfig { - initial_limit: 10, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let release_notifier = Arc::new(Notify::new()); - - let limiter = Limiter::new(config).with_release_notifier(release_notifier.clone()); - - let token = limiter.try_acquire().unwrap(); - limiter.release(token, Some(Outcome::Overload)).await; - release_notifier.notified().await; - assert_eq!(limiter.state().limit(), 5, "overload: decrease"); - } - - #[tokio::test] - async fn should_increase_limit_on_success_when_using_gt_util_threshold() { - let config = RateLimiterConfig { - initial_limit: 4, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - aimd_increase_by: 1, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - let _token = limiter.try_acquire().unwrap(); - let _token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - assert_eq!(limiter.state().limit(), 5, "success: increase"); - } - - #[tokio::test] - async fn should_not_change_limit_on_success_when_using_lt_util_threshold() { - let config = RateLimiterConfig { - initial_limit: 4, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - assert_eq!( - limiter.state().limit(), - 4, - "success: ignore when < half limit" - ); - } - - #[tokio::test] - async fn should_not_change_limit_when_no_outcome() { - let config = RateLimiterConfig { - initial_limit: 10, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - limiter.release(token, None).await; - assert_eq!(limiter.state().limit(), 10, "ignore"); - } -} diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs new file mode 100644 index 0000000000..bf4d85f2e4 --- /dev/null +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -0,0 +1,145 @@ +use std::{ + hash::Hash, + sync::atomic::{AtomicUsize, Ordering}, +}; + +use ahash::RandomState; +use dashmap::DashMap; +use rand::{thread_rng, Rng}; +use tokio::time::Instant; +use tracing::info; +use utils::leaky_bucket::LeakyBucketState; + +use crate::intern::EndpointIdInt; + +// Simple per-endpoint rate limiter. 
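+// With the default config below (600 rps, bucket max 1500), an endpoint can
+// burst roughly 1500 connection attempts and then sustain 600 per second
+// before `check` starts rejecting.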
+pub type EndpointRateLimiter = LeakyBucketRateLimiter; + +pub struct LeakyBucketRateLimiter { + map: DashMap, + config: utils::leaky_bucket::LeakyBucketConfig, + access_count: AtomicUsize, +} + +impl LeakyBucketRateLimiter { + pub const DEFAULT: LeakyBucketConfig = LeakyBucketConfig { + rps: 600.0, + max: 1500.0, + }; + + pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self { + Self { + map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards), + config: config.into(), + access_count: AtomicUsize::new(0), + } + } + + /// Check that number of connections to the endpoint is below `max_rps` rps. + pub(crate) fn check(&self, key: K, n: u32) -> bool { + let now = Instant::now(); + + if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 { + self.do_gc(now); + } + + let mut entry = self + .map + .entry(key) + .or_insert_with(|| LeakyBucketState { empty_at: now }); + + entry.add_tokens(&self.config, now, n as f64).is_ok() + } + + fn do_gc(&self, now: Instant) { + info!( + "cleaning up bucket rate limiter, current size = {}", + self.map.len() + ); + let n = self.map.shards().len(); + let shard = thread_rng().gen_range(0..n); + self.map.shards()[shard] + .write() + .retain(|_, value| !value.get().bucket_is_empty(now)); + } +} + +pub struct LeakyBucketConfig { + pub rps: f64, + pub max: f64, +} + +#[cfg(test)] +impl LeakyBucketConfig { + pub(crate) fn new(rps: f64, max: f64) -> Self { + assert!(rps > 0.0, "rps must be positive"); + assert!(max > 0.0, "max must be positive"); + Self { rps, max } + } +} + +impl From for utils::leaky_bucket::LeakyBucketConfig { + fn from(config: LeakyBucketConfig) -> Self { + utils::leaky_bucket::LeakyBucketConfig::new(config.rps, config.max) + } +} + +#[cfg(test)] +#[allow(clippy::float_cmp)] +mod tests { + use std::time::Duration; + + use tokio::time::Instant; + use utils::leaky_bucket::LeakyBucketState; + + use super::LeakyBucketConfig; + + #[tokio::test(start_paused = true)] + async fn check() { + let config: utils::leaky_bucket::LeakyBucketConfig = + LeakyBucketConfig::new(500.0, 2000.0).into(); + assert_eq!(config.cost, Duration::from_millis(2)); + assert_eq!(config.bucket_width, Duration::from_secs(4)); + + let mut bucket = LeakyBucketState { + empty_at: Instant::now(), + }; + + // should work for 2000 requests this second + for _ in 0..2000 { + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); + } + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); + assert_eq!(bucket.empty_at - Instant::now(), config.bucket_width); + + // in 1ms we should drain 0.5 tokens. 
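+ // (each token costs 2ms at 500 rps, per the `config.cost` assertion above)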
+ // make sure we don't lose any tokens + tokio::time::advance(Duration::from_millis(1)).await; + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); + tokio::time::advance(Duration::from_millis(1)).await; + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); + + // in 10ms we should drain 5 tokens + tokio::time::advance(Duration::from_millis(10)).await; + for _ in 0..5 { + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); + } + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); + + // in 10s we should drain 5000 tokens + // but cap is only 2000 + tokio::time::advance(Duration::from_secs(10)).await; + for _ in 0..2000 { + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); + } + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); + + // should sustain 500rps + for _ in 0..2000 { + tokio::time::advance(Duration::from_millis(10)).await; + for _ in 0..5 { + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); + } + } + } +} diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index 5cd2d5ebb7..25607b7e10 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -1,98 +1,262 @@ //! Algorithms for controlling concurrency limits. -use async_trait::async_trait; -use std::time::Duration; +use parking_lot::Mutex; +use std::{pin::pin, sync::Arc, time::Duration}; +use tokio::{ + sync::Notify, + time::{error::Elapsed, Instant}, +}; -use super::{limiter::Outcome, Aimd}; +use self::aimd::Aimd; -/// An algorithm for controlling a concurrency limit. -#[async_trait] -pub trait LimitAlgorithm: Send + Sync + 'static { - /// Update the concurrency limit in response to a new job completion. - async fn update(&mut self, old_limit: usize, sample: Sample) -> usize; +pub(crate) mod aimd; + +/// Whether a job succeeded or failed as a result of congestion/overload. +/// +/// Errors not considered to be caused by overload should be ignored. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum Outcome { + /// The job succeeded, or failed in a way unrelated to overload. + Success, + /// The job failed because of overload, e.g. it timed out or an explicit backpressure signal + /// was observed. + Overload, } -/// The result of a job (or jobs), including the [Outcome] (loss) and latency (delay). -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Sample { +/// An algorithm for controlling a concurrency limit. +pub(crate) trait LimitAlgorithm: Send + Sync + 'static { + /// Update the concurrency limit in response to a new job completion. + fn update(&self, old_limit: usize, sample: Sample) -> usize; +} + +/// The result of a job (or jobs), including the [`Outcome`] (loss) and latency (delay). +#[derive(Debug, Clone, PartialEq, Eq, Copy)] +pub(crate) struct Sample { pub(crate) latency: Duration, /// Jobs in flight when the sample was taken. 
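+ /// (AIMD divides this by the current limit to measure utilisation.)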
pub(crate) in_flight: usize, pub(crate) outcome: Outcome, } -#[derive(Clone, Copy, Debug, Default, clap::ValueEnum)] -pub enum RateLimitAlgorithm { - Fixed, +#[derive(Clone, Copy, Debug, Default, serde::Deserialize, PartialEq)] +#[serde(rename_all = "snake_case")] +pub(crate) enum RateLimitAlgorithm { #[default] - Aimd, + Fixed, + Aimd { + #[serde(flatten)] + conf: Aimd, + }, } -pub struct Fixed; +pub(crate) struct Fixed; -#[async_trait] impl LimitAlgorithm for Fixed { - async fn update(&mut self, old_limit: usize, _sample: Sample) -> usize { + fn update(&self, old_limit: usize, _sample: Sample) -> usize { old_limit } } -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)] pub struct RateLimiterConfig { - pub disable: bool, - pub algorithm: RateLimitAlgorithm, - pub timeout: Duration, - pub initial_limit: usize, - pub aimd_config: Option, + #[serde(flatten)] + pub(crate) algorithm: RateLimitAlgorithm, + pub(crate) initial_limit: usize, } impl RateLimiterConfig { - pub fn create_rate_limit_algorithm(self) -> Box { + pub(crate) fn create_rate_limit_algorithm(self) -> Box { match self.algorithm { RateLimitAlgorithm::Fixed => Box::new(Fixed), - RateLimitAlgorithm::Aimd => Box::new(Aimd::new(self.aimd_config.unwrap())), // For aimd algorithm config is mandatory. + RateLimitAlgorithm::Aimd { conf } => Box::new(conf), } } } -impl Default for RateLimiterConfig { - fn default() -> Self { +pub(crate) struct LimiterInner { + alg: Box, + available: usize, + limit: usize, + in_flight: usize, +} + +impl LimiterInner { + fn update_limit(&mut self, latency: Duration, outcome: Option) { + if let Some(outcome) = outcome { + let sample = Sample { + latency, + in_flight: self.in_flight, + outcome, + }; + self.limit = self.alg.update(self.limit, sample); + } + } + + fn take(&mut self, ready: &Notify) -> Option<()> { + if self.available >= 1 { + self.available -= 1; + self.in_flight += 1; + + // tell the next in the queue that there is a permit ready + if self.available >= 1 { + ready.notify_one(); + } + Some(()) + } else { + None + } + } +} + +/// Limits the number of concurrent jobs. +/// +/// Concurrency is limited through the use of [`Token`]s. Acquire a token to run a job, and release the +/// token once the job is finished. +/// +/// The limit will be automatically adjusted based on observed latency (delay) and/or failures +/// caused by overload (loss). +pub(crate) struct DynamicLimiter { + config: RateLimiterConfig, + inner: Mutex, + // to notify when a token is available + ready: Notify, +} + +/// A concurrency token, required to run a job. +/// +/// Release the token back to the [`DynamicLimiter`] after the job is complete. +pub(crate) struct Token { + start: Instant, + limiter: Option>, +} + +/// A snapshot of the state of the [`DynamicLimiter`]. +/// +/// Not guaranteed to be consistent under high concurrency. +#[derive(Debug, Clone, Copy)] +#[cfg(test)] +struct LimiterState { + limit: usize, +} + +impl DynamicLimiter { + /// Create a limiter with a given limit control algorithm. + pub(crate) fn new(config: RateLimiterConfig) -> Arc { + let ready = Notify::new(); + ready.notify_one(); + + Arc::new(Self { + inner: Mutex::new(LimiterInner { + alg: config.create_rate_limit_algorithm(), + available: config.initial_limit, + limit: config.initial_limit, + in_flight: 0, + }), + ready, + config, + }) + } + + /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available. 
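+ /// Returns `Err(Elapsed)` if no token becomes available within `duration`.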
+    pub(crate) async fn acquire_timeout(
+        self: &Arc<Self>,
+        duration: Duration,
+    ) -> Result<Token, Elapsed> {
+        tokio::time::timeout(duration, self.acquire()).await?
+    }
+
+    /// Try to acquire a concurrency [Token].
+    async fn acquire(self: &Arc<Self>) -> Result<Token, Elapsed> {
+        if self.config.initial_limit == 0 {
+            // If the rate limiter is disabled, we can always acquire a token.
+            Ok(Token::disabled())
+        } else {
+            let mut notified = pin!(self.ready.notified());
+            let mut ready = notified.as_mut().enable();
+            loop {
+                if ready {
+                    let mut inner = self.inner.lock();
+                    if inner.take(&self.ready).is_some() {
+                        break Ok(Token::new(self.clone()));
+                    }
+                    notified.set(self.ready.notified());
+                }
+                notified.as_mut().await;
+                ready = true;
+            }
+        }
+    }
+
+    /// Return the concurrency [Token], along with the outcome of the job.
+    ///
+    /// The [Outcome] of the job, and the time taken to perform it, may be used
+    /// to update the concurrency limit.
+    ///
+    /// Set the outcome to `None` to ignore the job.
+    fn release_inner(&self, start: Instant, outcome: Option<Outcome>) {
+        tracing::info!("outcome is {:?}", outcome);
+        if self.config.initial_limit == 0 {
+            return;
+        }
+
+        let mut inner = self.inner.lock();
+
+        inner.update_limit(start.elapsed(), outcome);
+
+        inner.in_flight -= 1;
+        if inner.in_flight < inner.limit {
+            inner.available = inner.limit - inner.in_flight;
+            // At least 1 permit is now available
+            self.ready.notify_one();
+        }
+    }
+
+    /// The current state of the limiter.
+    #[cfg(test)]
+    fn state(&self) -> LimiterState {
+        let inner = self.inner.lock();
+        LimiterState { limit: inner.limit }
+    }
+}
+
+impl Token {
+    fn new(limiter: Arc<DynamicLimiter>) -> Self {
         Self {
-            disable: true,
-            algorithm: RateLimitAlgorithm::Aimd,
-            timeout: Duration::from_secs(1),
-            initial_limit: 100,
-            aimd_config: Some(AimdConfig::default()),
+            start: Instant::now(),
+            limiter: Some(limiter),
         }
     }
-}
-
-#[derive(clap::Parser, Clone, Copy, Debug)]
-pub struct AimdConfig {
-    /// Minimum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`.
-    #[clap(long, default_value_t = 1)]
-    pub aimd_min_limit: usize,
-    /// Maximum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`.
-    #[clap(long, default_value_t = 1500)]
-    pub aimd_max_limit: usize,
-    /// Increase AIMD increase by value in case of success. Makes sense only if `rate_limit_algorithm` is `Aimd`.
-    #[clap(long, default_value_t = 10)]
-    pub aimd_increase_by: usize,
-    /// Decrease AIMD decrease by value in case of timout/429. Makes sense only if `rate_limit_algorithm` is `Aimd`.
-    #[clap(long, default_value_t = 0.9)]
-    pub aimd_decrease_factor: f32,
-    /// A threshold below which the limit won't be increased. Makes sense only if `rate_limit_algorithm` is `Aimd`.
-    #[clap(long, default_value_t = 0.8)]
-    pub aimd_min_utilisation_threshold: f32,
-}
-
-impl Default for AimdConfig {
-    fn default() -> Self {
+    pub(crate) fn disabled() -> Self {
         Self {
-            aimd_min_limit: 1,
-            aimd_max_limit: 1500,
-            aimd_increase_by: 10,
-            aimd_decrease_factor: 0.9,
-            aimd_min_utilisation_threshold: 0.8,
+            start: Instant::now(),
+            limiter: None,
+        }
+    }
+
+    pub(crate) fn is_disabled(&self) -> bool {
+        self.limiter.is_none()
+    }
+
+    pub(crate) fn release(mut self, outcome: Outcome) {
+        self.release_mut(Some(outcome));
+    }
+
+    pub(crate) fn release_mut(&mut self, outcome: Option<Outcome>) {
+        if let Some(limiter) = self.limiter.take() {
+            limiter.release_inner(self.start, outcome);
         }
     }
 }
+
+impl Drop for Token {
+    fn drop(&mut self) {
+        self.release_mut(None);
+    }
+}
+
+#[cfg(test)]
+impl LimiterState {
+    /// The current concurrency limit.
+    fn limit(self) -> usize {
+        self.limit
+    }
+}
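Taken together, the acquire/release flow above is: wait for a permit (or time out), run the job, then feed the outcome back into the limit algorithm. A minimal, hypothetical caller sketch against the API defined in this file — not part of the patch; `run_job` and the 100ms budget are invented for illustration:

    // Sketch only: exercises DynamicLimiter/Token as defined above.
    async fn guarded(limiter: &Arc<DynamicLimiter>) -> anyhow::Result<()> {
        // Err(Elapsed) here means we queued for 100ms without getting a permit.
        let token = limiter.acquire_timeout(Duration::from_millis(100)).await?;
        match run_job().await {
            Ok(_) => token.release(Outcome::Success),   // may additively raise the limit
            Err(_) => token.release(Outcome::Overload), // multiplicatively lowers the limit
        }
        // A token dropped without an explicit release reports `None`: the
        // in-flight count still decrements, but the limit is left unchanged.
        Ok(())
    }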
diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs
new file mode 100644
index 0000000000..86b56e38fb
--- /dev/null
+++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs
@@ -0,0 +1,265 @@
+use super::{LimitAlgorithm, Outcome, Sample};
+
+/// Loss-based congestion avoidance.
+///
+/// Additive-increase, multiplicative decrease.
+///
+/// Adds available concurrency when:
+/// 1. no load-based errors are observed, and
+/// 2. the utilisation of the current limit is high.
+///
+/// Reduces available concurrency by a factor when load-based errors are detected.
+#[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)]
+pub(crate) struct Aimd {
+    /// Minimum limit for AIMD algorithm.
+    pub(crate) min: usize,
+    /// Maximum limit for AIMD algorithm.
+    pub(crate) max: usize,
+    /// Multiplicative factor applied to the limit in case of error.
+    pub(crate) dec: f32,
+    /// Additive increment applied to the limit in case of success.
+    pub(crate) inc: usize,
+    /// A threshold below which the limit won't be increased.
+    pub(crate) utilisation: f32,
+}
+
+impl LimitAlgorithm for Aimd {
+    fn update(&self, old_limit: usize, sample: Sample) -> usize {
+        match sample.outcome {
+            Outcome::Success => {
+                let utilisation = sample.in_flight as f32 / old_limit as f32;
+
+                if utilisation > self.utilisation {
+                    let limit = old_limit + self.inc;
+                    let increased_limit = limit.clamp(self.min, self.max);
+                    if increased_limit > old_limit {
+                        tracing::info!(increased_limit, "limit increased");
+                    }
+
+                    increased_limit
+                } else {
+                    old_limit
+                }
+            }
+            Outcome::Overload => {
+                let limit = old_limit as f32 * self.dec;
+
+                // Floor instead of round, so the limit reduces even with small numbers.
+                // E.g.
round(2 * 0.9) = 2, but floor(2 * 0.9) = 1 + let limit = limit.floor() as usize; + + let limit = limit.clamp(self.min, self.max); + tracing::info!(limit, "limit decreased"); + limit + } + } + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use crate::rate_limiter::limit_algorithm::{ + DynamicLimiter, RateLimitAlgorithm, RateLimiterConfig, + }; + + use super::*; + + #[tokio::test(start_paused = true)] + async fn increase_decrease() { + let config = RateLimiterConfig { + initial_limit: 1, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 2, + inc: 10, + dec: 0.5, + utilisation: 0.8, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + token.release(Outcome::Success); + + assert_eq!(limiter.state().limit(), 2); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + token.release(Outcome::Success); + assert_eq!(limiter.state().limit(), 2); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + token.release(Outcome::Overload); + assert_eq!(limiter.state().limit(), 1); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + token.release(Outcome::Overload); + assert_eq!(limiter.state().limit(), 1); + } + + #[tokio::test(start_paused = true)] + async fn should_decrease_limit_on_overload() { + let config = RateLimiterConfig { + initial_limit: 10, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 1500, + inc: 10, + dec: 0.5, + utilisation: 0.8, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(100)) + .await + .unwrap(); + token.release(Outcome::Overload); + + assert_eq!(limiter.state().limit(), 5, "overload: decrease"); + } + + #[tokio::test(start_paused = true)] + async fn acquire_timeout_times_out() { + let config = RateLimiterConfig { + initial_limit: 1, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 2, + inc: 10, + dec: 0.5, + utilisation: 0.8, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + let now = tokio::time::Instant::now(); + limiter + .acquire_timeout(Duration::from_secs(1)) + .await + .err() + .unwrap(); + + assert!(now.elapsed() >= Duration::from_secs(1)); + + token.release(Outcome::Success); + + assert_eq!(limiter.state().limit(), 2); + } + + #[tokio::test(start_paused = true)] + async fn should_increase_limit_on_success_when_using_gt_util_threshold() { + let config = RateLimiterConfig { + initial_limit: 4, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 1500, + inc: 1, + dec: 0.5, + utilisation: 0.5, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + let _token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + let _token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + + token.release(Outcome::Success); + assert_eq!(limiter.state().limit(), 5, "success: increase"); + } + + #[tokio::test(start_paused = true)] + async fn should_not_change_limit_on_success_when_using_lt_util_threshold() { + let config = RateLimiterConfig { + initial_limit: 4, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 1500, + inc: 
10,
+                    dec: 0.5,
+                    utilisation: 0.5,
+                },
+            },
+        };
+
+        let limiter = DynamicLimiter::new(config);
+
+        let token = limiter
+            .acquire_timeout(Duration::from_millis(1))
+            .await
+            .unwrap();
+
+        token.release(Outcome::Success);
+        assert_eq!(
+            limiter.state().limit(),
+            4,
+            "success: ignore when < half limit"
+        );
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn should_not_change_limit_when_no_outcome() {
+        let config = RateLimiterConfig {
+            initial_limit: 10,
+            algorithm: RateLimitAlgorithm::Aimd {
+                conf: Aimd {
+                    min: 1,
+                    max: 1500,
+                    inc: 10,
+                    dec: 0.5,
+                    utilisation: 0.5,
+                },
+            },
+        };
+
+        let limiter = DynamicLimiter::new(config);
+
+        let token = limiter
+            .acquire_timeout(Duration::from_millis(1))
+            .await
+            .unwrap();
+        drop(token);
+        assert_eq!(limiter.state().limit(), 10, "ignore");
+    }
+}
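Before moving on to the bucket limiter, the AIMD arithmetic above is easy to sanity-check numerically. A self-contained sketch (plain Rust, none of this crate's types; it also omits the utilisation gate the real `update` applies on success):

    // Standalone model of the AIMD step above.
    fn aimd_update(limit: usize, inc: usize, dec: f32, min: usize, max: usize, overloaded: bool) -> usize {
        if overloaded {
            // floor, so e.g. 2 * 0.9 still drops the limit to 1
            (((limit as f32) * dec).floor() as usize).clamp(min, max)
        } else {
            (limit + inc).clamp(min, max)
        }
    }

    fn main() {
        let (inc, dec, min, max) = (10, 0.5, 1, 1500);
        let mut limit = 100;
        limit = aimd_update(limit, inc, dec, min, max, false); // 110: additive increase
        limit = aimd_update(limit, inc, dec, min, max, true);  // 55: multiplicative decrease
        limit = aimd_update(limit, inc, dec, min, max, true);  // floor(27.5) = 27
        assert_eq!(limit, 27);
        println!("final limit: {limit}");
    }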
diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs
index cbae72711c..be529f174d 100644
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -1,9 +1,10 @@
 use std::{
+    borrow::Cow,
     collections::hash_map::RandomState,
-    hash::BuildHasher,
+    hash::{BuildHasher, Hash},
     sync::{
         atomic::{AtomicUsize, Ordering},
-        Arc, Mutex,
+        Mutex,
     },
 };
@@ -11,16 +12,48 @@ use anyhow::bail;
 use dashmap::DashMap;
 use itertools::Itertools;
 use rand::{rngs::StdRng, Rng, SeedableRng};
-use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
-use tokio::time::{timeout, Duration, Instant};
+use tokio::time::{Duration, Instant};
 use tracing::info;

-use crate::EndpointId;
+use crate::intern::EndpointIdInt;

-use super::{
-    limit_algorithm::{LimitAlgorithm, Sample},
-    RateLimiterConfig,
-};
+pub(crate) struct GlobalRateLimiter {
+    data: Vec<RateBucket>,
+    info: Vec<RateBucketInfo>,
+}
+
+impl GlobalRateLimiter {
+    pub(crate) fn new(info: Vec<RateBucketInfo>) -> Self {
+        Self {
+            data: vec![
+                RateBucket {
+                    start: Instant::now(),
+                    count: 0,
+                };
+                info.len()
+            ],
+            info,
+        }
+    }
+
+    /// Check that number of connections is below `max_rps` rps.
+    pub(crate) fn check(&mut self) -> bool {
+        let now = Instant::now();
+
+        let should_allow_request = self
+            .data
+            .iter_mut()
+            .zip(&self.info)
+            .all(|(bucket, info)| bucket.should_allow_request(info, now, 1));
+
+        if should_allow_request {
+            // only increment the bucket counts if the request will actually be accepted
+            self.data.iter_mut().for_each(|b| b.inc(1));
+        }
+
+        should_allow_request
+    }
+}

 // Simple per-endpoint rate limiter.
 //
@@ -28,14 +61,11 @@ use super::{
 // Purposefully ignore user name and database name as clients can reconnect
 // with different names, so we'll end up sending some http requests to
 // the control plane.
-//
-// We also may save quite a lot of CPU (I think) by bailing out right after we
-// saw SNI, before doing TLS handshake. User-side error messages in that case
-// does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now
-// I went with a more expensive way that yields user-friendlier error messages.
-pub struct EndpointRateLimiter<Rand = StdRng, Hasher = RandomState> {
-    map: DashMap<EndpointId, Vec<RateBucket>, Hasher>,
-    info: &'static [RateBucketInfo],
+pub type WakeComputeRateLimiter = BucketRateLimiter<EndpointIdInt, StdRng, RandomState>;
+
+pub struct BucketRateLimiter<Key, Rand = StdRng, Hasher = RandomState> {
+    map: DashMap<Key, Vec<RateBucket>, Hasher>,
+    info: Cow<'static, [RateBucketInfo]>,
     access_count: AtomicUsize,
     rand: Mutex<Rand>,
 }
@@ -47,9 +77,9 @@ struct RateBucket {
 }

 impl RateBucket {
-    fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant) -> bool {
+    fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant, n: u32) -> bool {
         if now - self.start < info.interval {
-            self.count < info.max_rpi
+            self.count + n <= info.max_rpi
         } else {
             // bucket expired, reset
             self.count = 0;
@@ -59,21 +89,21 @@ impl RateBucket {
         }
     }

-    fn inc(&mut self) {
-        self.count += 1;
+    fn inc(&mut self, n: u32) {
+        self.count += n;
     }
 }

 #[derive(Clone, Copy, PartialEq)]
 pub struct RateBucketInfo {
-    pub interval: Duration,
+    pub(crate) interval: Duration,
     // requests per interval
-    pub max_rpi: u32,
+    pub(crate) max_rpi: u32,
 }

 impl std::fmt::Display for RateBucketInfo {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let rps = self.max_rpi * 1000 / self.interval.as_millis() as u32;
+        let rps = self.rps().floor() as u64;
         write!(f, "{rps}@{}", humantime::format_duration(self.interval))
     }
 }
@@ -104,6 +134,16 @@ impl RateBucketInfo {
         Self::new(100, Duration::from_secs(600)),
     ];

+    pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [
+        Self::new(500, Duration::from_secs(1)),
+        Self::new(300, Duration::from_secs(60)),
+        Self::new(200, Duration::from_secs(600)),
+    ];
+
+    pub fn rps(&self) -> f64 {
+        (self.max_rpi as f64) / self.interval.as_secs_f64()
+    }
+
     pub fn validate(info: &mut [Self]) -> anyhow::Result<()> {
         info.sort_unstable_by_key(|info| info.interval);
         let invalid = info
@@ -112,7 +152,7 @@ impl RateBucketInfo {
             .find(|(a, b)| a.max_rpi > b.max_rpi);
         if let Some((a, b)) = invalid {
             bail!(
-                "invalid endpoint RPS limits. {b} allows fewer requests per bucket than {a} ({} vs {})",
+                "invalid bucket RPS limits. {b} allows fewer requests per bucket than {a} ({} vs {})",
                 b.max_rpi,
                 a.max_rpi,
             );
@@ -124,19 +164,24 @@ impl RateBucketInfo {
     pub const fn new(max_rps: u32, interval: Duration) -> Self {
         Self {
             interval,
-            max_rpi: max_rps * interval.as_millis() as u32 / 1000,
+            max_rpi: ((max_rps as u64) * (interval.as_millis() as u64) / 1000) as u32,
         }
     }
 }

-impl EndpointRateLimiter {
-    pub fn new(info: &'static [RateBucketInfo]) -> Self {
+impl<K: Hash + Eq> BucketRateLimiter<K> {
+    pub fn new(info: impl Into<Cow<'static, [RateBucketInfo]>>) -> Self {
         Self::new_with_rand_and_hasher(info, StdRng::from_entropy(), RandomState::new())
     }
 }

-impl<R: Rng, S: BuildHasher + Clone> EndpointRateLimiter<R, S> {
-    fn new_with_rand_and_hasher(info: &'static [RateBucketInfo], rand: R, hasher: S) -> Self {
+impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
+    fn new_with_rand_and_hasher(
+        info: impl Into<Cow<'static, [RateBucketInfo]>>,
+        rand: R,
+        hasher: S,
+    ) -> Self {
+        let info = info.into();
         info!(buckets = ?info, "endpoint rate limiter");
         Self {
             info,
@@ -147,7 +192,7 @@ impl EndpointRateLimiter {
     }

     /// Check that number of connections to the endpoint is below `max_rps` rps.
-    pub fn check(&self, endpoint: EndpointId) -> bool {
+    pub(crate) fn check(&self, key: K, n: u32) -> bool {
         // do a partial GC every 2k requests. This cleans up ~ 1/64th of the map.
        // worst case memory usage is about:
        //    = 2 * 2048 * 64 * (48B + 72B)
@@ -157,7 +202,7 @@
         }

         let now = Instant::now();
-        let mut entry = self.map.entry(endpoint).or_insert_with(|| {
+        let mut entry = self.map.entry(key).or_insert_with(|| {
             vec![
                 RateBucket {
                     start: now,
@@ -169,12 +214,12 @@

         let should_allow_request = entry
             .iter_mut()
-            .zip(self.info)
-            .all(|(bucket, info)| bucket.should_allow_request(info, now));
+            .zip(&*self.info)
+            .all(|(bucket, info)| bucket.should_allow_request(info, now, n));

         if should_allow_request {
             // only increment the bucket counts if the request will actually be accepted
-            entry.iter_mut().for_each(RateBucket::inc);
+            entry.iter_mut().for_each(|b| b.inc(n));
         }

         should_allow_request
@@ -183,9 +228,9 @@
     /// Clean the map. Simple strategy: remove all entries in a random shard.
     /// At worst, we'll double the effective max_rps during the cleanup.
     /// But that way deletion does not acquire a mutex on each entry access.
-    pub fn do_gc(&self) {
+    pub(crate) fn do_gc(&self) {
         info!(
-            "cleaning up endpoint rate limiter, current size = {}",
+            "cleaning up bucket rate limiter, current size = {}",
             self.map.len()
         );
         let n = self.map.shards().len();
@@ -196,419 +241,16 @@
     }
 }

-/// Limits the number of concurrent jobs.
-///
-/// Concurrency is limited through the use of [Token]s. Acquire a token to run a job, and release the
-/// token once the job is finished.
-///
-/// The limit will be automatically adjusted based on observed latency (delay) and/or failures
-/// caused by overload (loss).
-pub struct Limiter {
-    limit_algo: AsyncMutex<Box<dyn LimitAlgorithm>>,
-    semaphore: std::sync::Arc<Semaphore>,
-    config: RateLimiterConfig,
-
-    // ONLY WRITE WHEN LIMIT_ALGO IS LOCKED
-    limits: AtomicUsize,
-
-    // ONLY USE ATOMIC ADD/SUB
-    in_flight: Arc<AtomicUsize>,
-
-    #[cfg(test)]
-    notifier: Option<std::sync::Arc<tokio::sync::Notify>>,
-}
-
-/// A concurrency token, required to run a job.
-///
-/// Release the token back to the [Limiter] after the job is complete.
-#[derive(Debug)]
-pub struct Token<'t> {
-    permit: Option<SemaphorePermit<'t>>,
-    start: Instant,
-    in_flight: Arc<AtomicUsize>,
-}
-
-/// A snapshot of the state of the [Limiter].
-///
-/// Not guaranteed to be consistent under high concurrency.
-#[derive(Debug, Clone, Copy)]
-pub struct LimiterState {
-    limit: usize,
-    in_flight: usize,
-}
-
-/// Whether a job succeeded or failed as a result of congestion/overload.
-///
-/// Errors not considered to be caused by overload should be ignored.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum Outcome {
-    /// The job succeeded, or failed in a way unrelated to overload.
-    Success,
-    /// The job failed because of overload, e.g. it timed out or an explicit backpressure signal
-    /// was observed.
-    Overload,
-}
-
-impl Outcome {
-    fn from_reqwest_error(error: &reqwest_middleware::Error) -> Self {
-        match error {
-            reqwest_middleware::Error::Middleware(_) => Outcome::Success,
-            reqwest_middleware::Error::Reqwest(e) => {
-                if let Some(status) = e.status() {
-                    if status.is_server_error()
-                        || reqwest::StatusCode::TOO_MANY_REQUESTS.as_u16() == status
-                    {
-                        Outcome::Overload
-                    } else {
-                        Outcome::Success
-                    }
-                } else {
-                    Outcome::Success
-                }
-            }
-        }
-    }
-    fn from_reqwest_response(response: &reqwest::Response) -> Self {
-        if response.status().is_server_error()
-            || response.status() == reqwest::StatusCode::TOO_MANY_REQUESTS
-        {
-            Outcome::Overload
-        } else {
-            Outcome::Success
-        }
-    }
-}
-
-impl Limiter {
-    /// Create a limiter with a given limit control algorithm.
-    pub fn new(config: RateLimiterConfig) -> Self {
-        assert!(config.initial_limit > 0);
-        Self {
-            limit_algo: AsyncMutex::new(config.create_rate_limit_algorithm()),
-            semaphore: Arc::new(Semaphore::new(config.initial_limit)),
-            config,
-            limits: AtomicUsize::new(config.initial_limit),
-            in_flight: Arc::new(AtomicUsize::new(0)),
-            #[cfg(test)]
-            notifier: None,
-        }
-    }
-    // pub fn new(limit_algorithm: T, timeout: Duration, initial_limit: usize) -> Self {
-    //     assert!(initial_limit > 0);
-
-    //     Self {
-    //         limit_algo: AsyncMutex::new(limit_algorithm),
-    //         semaphore: Arc::new(Semaphore::new(initial_limit)),
-    //         timeout,
-    //         limits: AtomicUsize::new(initial_limit),
-    //         in_flight: Arc::new(AtomicUsize::new(0)),
-    //         #[cfg(test)]
-    //         notifier: None,
-    //     }
-    // }
-
-    /// In some cases [Token]s are acquired asynchronously when updating the limit.
-    #[cfg(test)]
-    pub fn with_release_notifier(mut self, n: std::sync::Arc<tokio::sync::Notify>) -> Self {
-        self.notifier = Some(n);
-        self
-    }
-
-    /// Try to immediately acquire a concurrency [Token].
-    ///
-    /// Returns `None` if there are none available.
-    pub fn try_acquire(&self) -> Option<Token> {
-        let result = if self.config.disable {
-            // If the rate limiter is disabled, we can always acquire a token.
-            Some(Token::new(None, self.in_flight.clone()))
-        } else {
-            self.semaphore
-                .try_acquire()
-                .map(|permit| Token::new(Some(permit), self.in_flight.clone()))
-                .ok()
-        };
-        if result.is_some() {
-            self.in_flight.fetch_add(1, Ordering::AcqRel);
-        }
-        result
-    }
-
-    /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available.
-    ///
-    /// Returns `None` if there are none available after `duration`.
-    pub async fn acquire_timeout(&self, duration: Duration) -> Option<Token<'_>> {
-        info!("acquiring token: {:?}", self.semaphore.available_permits());
-        let result = if self.config.disable {
-            // If the rate limiter is disabled, we can always acquire a token.
-            Some(Token::new(None, self.in_flight.clone()))
-        } else {
-            match timeout(duration, self.semaphore.acquire()).await {
-                Ok(maybe_permit) => maybe_permit
-                    .map(|permit| Token::new(Some(permit), self.in_flight.clone()))
-                    .ok(),
-                Err(_) => None,
-            }
-        };
-        if result.is_some() {
-            self.in_flight.fetch_add(1, Ordering::AcqRel);
-        }
-        result
-    }
-
-    /// Return the concurrency [Token], along with the outcome of the job.
-    ///
-    /// The [Outcome] of the job, and the time taken to perform it, may be used
-    /// to update the concurrency limit.
-    ///
-    /// Set the outcome to `None` to ignore the job.
-    pub async fn release(&self, mut token: Token<'_>, outcome: Option<Outcome>) {
-        tracing::info!("outcome is {:?}", outcome);
-        let in_flight = self.in_flight.load(Ordering::Acquire);
-        let old_limit = self.limits.load(Ordering::Acquire);
-        let available = if self.config.disable {
-            0 // This is not used in the algorithm and can be anything. If the config disable it makes sense to set it to 0.
-        } else {
-            self.semaphore.available_permits()
-        };
-        let total = in_flight + available;
-
-        let mut algo = self.limit_algo.lock().await;
-
-        let new_limit = if let Some(outcome) = outcome {
-            let sample = Sample {
-                latency: token.start.elapsed(),
-                in_flight,
-                outcome,
-            };
-            algo.update(old_limit, sample).await
-        } else {
-            old_limit
-        };
-        tracing::info!("new limit is {}", new_limit);
-        let actual_limit = if new_limit < total {
-            token.forget();
-            total.saturating_sub(1)
-        } else {
-            if !self.config.disable {
-                self.semaphore.add_permits(new_limit.saturating_sub(total));
-            }
-            new_limit
-        };
-        crate::metrics::RATE_LIMITER_LIMIT
-            .with_label_values(&["expected"])
-            .set(new_limit as i64);
-        crate::metrics::RATE_LIMITER_LIMIT
-            .with_label_values(&["actual"])
-            .set(actual_limit as i64);
-        self.limits.store(new_limit, Ordering::Release);
-        #[cfg(test)]
-        if let Some(n) = &self.notifier {
-            n.notify_one();
-        }
-    }
-
-    /// The current state of the limiter.
-    pub fn state(&self) -> LimiterState {
-        let limit = self.limits.load(Ordering::Relaxed);
-        let in_flight = self.in_flight.load(Ordering::Relaxed);
-        LimiterState { limit, in_flight }
-    }
-}
-
-impl<'t> Token<'t> {
-    fn new(permit: Option<SemaphorePermit<'t>>, in_flight: Arc<AtomicUsize>) -> Self {
-        Self {
-            permit,
-            start: Instant::now(),
-            in_flight,
-        }
-    }
-
-    pub fn forget(&mut self) {
-        if let Some(permit) = self.permit.take() {
-            permit.forget();
-        }
-    }
-}
-
-impl Drop for Token<'_> {
-    fn drop(&mut self) {
-        self.in_flight.fetch_sub(1, Ordering::AcqRel);
-    }
-}
-
-impl LimiterState {
-    /// The current concurrency limit.
-    pub fn limit(&self) -> usize {
-        self.limit
-    }
-    /// The number of jobs in flight.
-    pub fn in_flight(&self) -> usize {
-        self.in_flight
-    }
-}
-
-#[async_trait::async_trait]
-impl reqwest_middleware::Middleware for Limiter {
-    async fn handle(
-        &self,
-        req: reqwest::Request,
-        extensions: &mut task_local_extensions::Extensions,
-        next: reqwest_middleware::Next<'_>,
-    ) -> reqwest_middleware::Result<reqwest::Response> {
-        let start = Instant::now();
-        let token = self
-            .acquire_timeout(self.config.timeout)
-            .await
-            .ok_or_else(|| {
-                reqwest_middleware::Error::Middleware(
-                    // TODO: Should we map it into user facing errors?
- crate::console::errors::ApiError::Console { - status: crate::http::StatusCode::TOO_MANY_REQUESTS, - text: "Too many requests".into(), - } - .into(), - ) - })?; - info!(duration = ?start.elapsed(), "waiting for token to connect to the control plane"); - crate::metrics::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64()); - match next.run(req, extensions).await { - Ok(response) => { - self.release(token, Some(Outcome::from_reqwest_response(&response))) - .await; - Ok(response) - } - Err(e) => { - self.release(token, Some(Outcome::from_reqwest_error(&e))) - .await; - Err(e) - } - } - } -} - #[cfg(test)] mod tests { - use std::{hash::BuildHasherDefault, pin::pin, task::Context, time::Duration}; + use std::{hash::BuildHasherDefault, time::Duration}; - use futures::{task::noop_waker_ref, Future}; use rand::SeedableRng; use rustc_hash::FxHasher; use tokio::time; - use super::{EndpointRateLimiter, Limiter, Outcome}; - use crate::{ - rate_limiter::{RateBucketInfo, RateLimitAlgorithm}, - EndpointId, - }; - - #[tokio::test] - async fn it_works() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 10, - disable: false, - ..Default::default() - }; - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - - assert_eq!(limiter.state().limit(), 10); - } - - #[tokio::test] - async fn is_fair() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 1, - disable: false, - ..Default::default() - }; - let limiter = Limiter::new(config); - - // === TOKEN 1 === - let token1 = limiter.try_acquire().unwrap(); - - let mut token2_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1))); - assert!( - token2_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token1" - ); - - let mut token3_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1))); - assert!( - token3_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token1" - ); - - limiter.release(token1, Some(Outcome::Success)).await; - // === END TOKEN 1 === - - // === TOKEN 2 === - assert!( - limiter.try_acquire().is_none(), - "token is acquired by token2" - ); - - assert!( - token3_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token2" - ); - - let token2 = token2_fut.await.unwrap(); - - limiter.release(token2, Some(Outcome::Success)).await; - // === END TOKEN 2 === - - // === TOKEN 3 === - assert!( - limiter.try_acquire().is_none(), - "token is acquired by token3" - ); - - let token3 = token3_fut.await.unwrap(); - limiter.release(token3, Some(Outcome::Success)).await; - // === END TOKEN 3 === - - // === TOKEN 4 === - let token4 = limiter.try_acquire().unwrap(); - limiter.release(token4, Some(Outcome::Success)).await; - } - - #[tokio::test] - async fn disable() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 1, - disable: true, - ..Default::default() - }; - let limiter = Limiter::new(config); - - // === TOKEN 1 === - let token1 = limiter.try_acquire().unwrap(); - let token2 = limiter.try_acquire().unwrap(); - let state = limiter.state(); - assert_eq!(state.limit(), 1); - assert_eq!(state.in_flight(), 2); // For disabled limiter, 
it's expected.
-        limiter.release(token1, None).await;
-        limiter.release(token2, None).await;
-    }
+    use super::{BucketRateLimiter, WakeComputeRateLimiter};
+    use crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId};

     #[test]
     fn rate_bucket_rpi() {
@@ -639,7 +281,7 @@ mod tests {
     }

     #[test]
-    #[should_panic = "invalid endpoint RPS limits. 10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"]
+    #[should_panic = "invalid bucket RPS limits. 10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"]
     fn rate_buckets_validate() {
         let mut rates: Vec<RateBucketInfo> = ["300@1s", "10@10s"]
             .into_iter()
@@ -655,42 +297,43 @@ mod tests {
             .map(|s| s.parse().unwrap())
             .collect();
         RateBucketInfo::validate(&mut rates).unwrap();
-        let limiter = EndpointRateLimiter::new(Vec::leak(rates));
+        let limiter = WakeComputeRateLimiter::new(rates);
         let endpoint = EndpointId::from("ep-my-endpoint-1234");
+        let endpoint = EndpointIdInt::from(endpoint);

         time::pause();

         for _ in 0..100 {
-            assert!(limiter.check(endpoint.clone()));
+            assert!(limiter.check(endpoint, 1));
         }
         // more connections fail
-        assert!(!limiter.check(endpoint.clone()));
+        assert!(!limiter.check(endpoint, 1));

         // fail even after 500ms as it's in the same bucket
         time::advance(time::Duration::from_millis(500)).await;
-        assert!(!limiter.check(endpoint.clone()));
+        assert!(!limiter.check(endpoint, 1));

         // after a full 1s, 100 requests are allowed again
         time::advance(time::Duration::from_millis(500)).await;
         for _ in 1..6 {
-            for _ in 0..100 {
-                assert!(limiter.check(endpoint.clone()));
+            for _ in 0..50 {
+                assert!(limiter.check(endpoint, 2));
             }
             time::advance(time::Duration::from_millis(1000)).await;
         }

         // more connections after 600 will exceed the 20rps@30s limit
-        assert!(!limiter.check(endpoint.clone()));
+        assert!(!limiter.check(endpoint, 1));

         // will still fail before the 30 second limit
         time::advance(time::Duration::from_millis(30_000 - 6_000 - 1)).await;
-        assert!(!limiter.check(endpoint.clone()));
+        assert!(!limiter.check(endpoint, 1));

         // after the full 30 seconds, 100 requests are allowed again
         time::advance(time::Duration::from_millis(1)).await;
         for _ in 0..100 {
-            assert!(limiter.check(endpoint.clone()));
+            assert!(limiter.check(endpoint, 1));
         }
     }

@@ -700,13 +343,10 @@ mod tests {
         let rand = rand::rngs::StdRng::from_seed([1; 32]);
         let hasher = BuildHasherDefault::<FxHasher>::default();

-        let limiter = EndpointRateLimiter::new_with_rand_and_hasher(
-            &RateBucketInfo::DEFAULT_SET,
-            rand,
-            hasher,
-        );
+        let limiter =
+            BucketRateLimiter::new_with_rand_and_hasher(&RateBucketInfo::DEFAULT_SET, rand, hasher);
         for i in 0..1_000_000 {
-            limiter.check(format!("{i}").into());
+            limiter.check(i, 1);
         }
         assert!(limiter.map.len() < 150_000);
     }
diff --git a/proxy/src/redis.rs b/proxy/src/redis.rs
index c2a91bed97..a322f0368c 100644
--- a/proxy/src/redis.rs
+++ b/proxy/src/redis.rs
@@ -1 +1,4 @@
+pub mod cancellation_publisher;
+pub mod connection_with_credentials_provider;
+pub mod elasticache;
 pub mod notifications;
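The `max_rpi` arithmetic behind the "500@1s"-style limits above (a per-second rate becomes a per-bucket quota) is easy to check standalone. A sketch mirroring that one line of math (std only, no proxy types):

    use std::time::Duration;

    // Standalone mirror of RateBucketInfo::new()'s arithmetic.
    fn max_rpi(max_rps: u32, interval: Duration) -> u32 {
        ((max_rps as u64) * (interval.as_millis() as u64) / 1000) as u32
    }

    fn main() {
        // "500@1s": 500 requests per 1-second bucket.
        assert_eq!(max_rpi(500, Duration::from_secs(1)), 500);
        // "300@60s": 300 rps sustained over a minute = 18_000 per bucket.
        assert_eq!(max_rpi(300, Duration::from_secs(60)), 18_000);
        // The u64 widening in the new code guards larger products:
        // e.g. 10_000 rps over 600 s would overflow u32 millisecond math.
        assert_eq!(max_rpi(200, Duration::from_secs(600)), 120_000);
    }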
diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs
new file mode 100644
index 0000000000..95bdfc0965
--- /dev/null
+++ b/proxy/src/redis/cancellation_publisher.rs
@@ -0,0 +1,161 @@
+use std::sync::Arc;
+
+use pq_proto::CancelKeyData;
+use redis::AsyncCommands;
+use tokio::sync::Mutex;
+use uuid::Uuid;
+
+use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo};
+
+use super::{
+    connection_with_credentials_provider::ConnectionWithCredentialsProvider,
+    notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME},
+};
+
+pub trait CancellationPublisherMut: Send + Sync + 'static {
+    #[allow(async_fn_in_trait)]
+    async fn try_publish(
+        &mut self,
+        cancel_key_data: CancelKeyData,
+        session_id: Uuid,
+    ) -> anyhow::Result<()>;
+}
+
+pub trait CancellationPublisher: Send + Sync + 'static {
+    #[allow(async_fn_in_trait)]
+    async fn try_publish(
+        &self,
+        cancel_key_data: CancelKeyData,
+        session_id: Uuid,
+    ) -> anyhow::Result<()>;
+}
+
+impl CancellationPublisher for () {
+    async fn try_publish(
+        &self,
+        _cancel_key_data: CancelKeyData,
+        _session_id: Uuid,
+    ) -> anyhow::Result<()> {
+        Ok(())
+    }
+}
+
+impl<P: CancellationPublisher> CancellationPublisherMut for P {
+    async fn try_publish(
+        &mut self,
+        cancel_key_data: CancelKeyData,
+        session_id: Uuid,
+    ) -> anyhow::Result<()> {
+        <P as CancellationPublisher>::try_publish(self, cancel_key_data, session_id).await
+    }
+}
+
+impl<P: CancellationPublisher> CancellationPublisher for Option<P> {
+    async fn try_publish(
+        &self,
+        cancel_key_data: CancelKeyData,
+        session_id: Uuid,
+    ) -> anyhow::Result<()> {
+        if let Some(p) = self {
+            p.try_publish(cancel_key_data, session_id).await
+        } else {
+            Ok(())
+        }
+    }
+}
+
+impl<P: CancellationPublisherMut> CancellationPublisher for Arc<Mutex<P>> {
+    async fn try_publish(
+        &self,
+        cancel_key_data: CancelKeyData,
+        session_id: Uuid,
+    ) -> anyhow::Result<()> {
+        self.lock()
+            .await
+            .try_publish(cancel_key_data, session_id)
+            .await
+    }
+}
+
+pub struct RedisPublisherClient {
+    client: ConnectionWithCredentialsProvider,
+    region_id: String,
+    limiter: GlobalRateLimiter,
+}
+
+impl RedisPublisherClient {
+    pub fn new(
+        client: ConnectionWithCredentialsProvider,
+        region_id: String,
+        info: &'static [RateBucketInfo],
+    ) -> anyhow::Result<Self> {
+        Ok(Self {
+            client,
+            region_id,
+            limiter: GlobalRateLimiter::new(info.into()),
+        })
+    }
+
+    async fn publish(
+        &mut self,
+        cancel_key_data: CancelKeyData,
+        session_id: Uuid,
+    ) -> anyhow::Result<()> {
+        let payload = serde_json::to_string(&Notification::Cancel(CancelSession {
+            region_id: Some(self.region_id.clone()),
+            cancel_key_data,
+            session_id,
+        }))?;
+        let _: () = self.client.publish(PROXY_CHANNEL_NAME, payload).await?;
+        Ok(())
+    }
+    pub(crate) async fn try_connect(&mut self) -> anyhow::Result<()> {
+        match self.client.connect().await {
+            Ok(()) => {}
+            Err(e) => {
+                tracing::error!("failed to connect to redis: {e}");
+                return Err(e);
+            }
+        }
+        Ok(())
+    }
+    async fn try_publish_internal(
+        &mut self,
+        cancel_key_data: CancelKeyData,
+        session_id: Uuid,
+    ) -> anyhow::Result<()> {
+        if !self.limiter.check() {
+            tracing::info!("Rate limit exceeded. Skipping cancellation message");
+            return Err(anyhow::anyhow!("Rate limit exceeded"));
+        }
+        match self.publish(cancel_key_data, session_id).await {
+            Ok(()) => return Ok(()),
+            Err(e) => {
+                tracing::error!("failed to publish a message: {e}");
+            }
+        }
+        tracing::info!("Publisher is disconnected. Reconnecting...");
+        self.try_connect().await?;
+        self.publish(cancel_key_data, session_id).await
+    }
+}
+
+impl CancellationPublisherMut for RedisPublisherClient {
+    async fn try_publish(
+        &mut self,
+        cancel_key_data: CancelKeyData,
+        session_id: Uuid,
+    ) -> anyhow::Result<()> {
+        tracing::info!("publishing cancellation key to Redis");
+        match self.try_publish_internal(cancel_key_data, session_id).await {
+            Ok(()) => {
+                tracing::info!("cancellation key successfully published to Redis");
+                Ok(())
+            }
+            Err(e) => {
+                tracing::error!("failed to publish a message: {e}");
+                Err(e)
+            }
+        }
+    }
+}
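The trait plumbing at the top of cancellation_publisher.rs — `()` as a no-op publisher, `Option<P>` forwarding, `Arc<Mutex<P>>` for shared mutable publishers — is a generic composition pattern. A reduced, synchronous sketch of the same idea (the simplified `Publish` trait is invented for illustration):

    // Simplified model of the forwarding impls above (sync, no redis/uuid).
    trait Publish {
        fn publish(&self, msg: &str) -> Result<(), String>;
    }

    // No-op publisher, like `impl CancellationPublisher for ()`.
    impl Publish for () {
        fn publish(&self, _msg: &str) -> Result<(), String> {
            Ok(())
        }
    }

    // Optional publisher: silently succeed when absent.
    impl<P: Publish> Publish for Option<P> {
        fn publish(&self, msg: &str) -> Result<(), String> {
            match self {
                Some(p) => p.publish(msg),
                None => Ok(()),
            }
        }
    }

    struct Stdout;
    impl Publish for Stdout {
        fn publish(&self, msg: &str) -> Result<(), String> {
            println!("{msg}");
            Ok(())
        }
    }

    fn main() {
        let maybe: Option<Stdout> = Some(Stdout);
        maybe.publish("cancel key 42").unwrap();
        ().publish("dropped").unwrap(); // no-op path
    }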
Reconnectiong..."); + self.try_connect().await?; + self.publish(cancel_key_data, session_id).await + } +} + +impl CancellationPublisherMut for RedisPublisherClient { + async fn try_publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + tracing::info!("publishing cancellation key to Redis"); + match self.try_publish_internal(cancel_key_data, session_id).await { + Ok(()) => { + tracing::info!("cancellation key successfuly published to Redis"); + Ok(()) + } + Err(e) => { + tracing::error!("failed to publish a message: {e}"); + Err(e) + } + } + } +} diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs new file mode 100644 index 0000000000..7d222e2dec --- /dev/null +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -0,0 +1,240 @@ +use std::{sync::Arc, time::Duration}; + +use futures::FutureExt; +use redis::{ + aio::{ConnectionLike, MultiplexedConnection}, + ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult, +}; +use tokio::task::JoinHandle; +use tracing::{error, info}; + +use super::elasticache::CredentialsProvider; + +enum Credentials { + Static(ConnectionInfo), + Dynamic(Arc, redis::ConnectionAddr), +} + +impl Clone for Credentials { + fn clone(&self) -> Self { + match self { + Credentials::Static(info) => Credentials::Static(info.clone()), + Credentials::Dynamic(provider, addr) => { + Credentials::Dynamic(Arc::clone(provider), addr.clone()) + } + } + } +} + +/// A wrapper around `redis::MultiplexedConnection` that automatically refreshes the token. +/// Provides PubSub connection without credentials refresh. +pub struct ConnectionWithCredentialsProvider { + credentials: Credentials, + con: Option, + refresh_token_task: Option>, + mutex: tokio::sync::Mutex<()>, +} + +impl Clone for ConnectionWithCredentialsProvider { + fn clone(&self) -> Self { + Self { + credentials: self.credentials.clone(), + con: None, + refresh_token_task: None, + mutex: tokio::sync::Mutex::new(()), + } + } +} + +impl ConnectionWithCredentialsProvider { + pub fn new_with_credentials_provider( + host: String, + port: u16, + credentials_provider: Arc, + ) -> Self { + Self { + credentials: Credentials::Dynamic( + credentials_provider, + redis::ConnectionAddr::TcpTls { + host, + port, + insecure: false, + tls_params: None, + }, + ), + con: None, + refresh_token_task: None, + mutex: tokio::sync::Mutex::new(()), + } + } + + pub fn new_with_static_credentials(params: T) -> Self { + Self { + credentials: Credentials::Static(params.into_connection_info().unwrap()), + con: None, + refresh_token_task: None, + mutex: tokio::sync::Mutex::new(()), + } + } + + async fn ping(con: &mut MultiplexedConnection) -> RedisResult<()> { + redis::cmd("PING").query_async(con).await + } + + pub(crate) async fn connect(&mut self) -> anyhow::Result<()> { + let _guard = self.mutex.lock().await; + if let Some(con) = self.con.as_mut() { + match Self::ping(con).await { + Ok(()) => { + return Ok(()); + } + Err(e) => { + error!("Error during PING: {e:?}"); + } + } + } else { + info!("Connection is not established"); + } + info!("Establishing a new connection..."); + self.con = None; + if let Some(f) = self.refresh_token_task.take() { + f.abort(); + } + let mut con = self + .get_client() + .await? 
+            .get_multiplexed_tokio_connection()
+            .await?;
+        if let Credentials::Dynamic(credentials_provider, _) = &self.credentials {
+            let credentials_provider = credentials_provider.clone();
+            let con2 = con.clone();
+            let f = tokio::spawn(async move {
+                let _ = Self::keep_connection(con2, credentials_provider).await;
+            });
+            self.refresh_token_task = Some(f);
+        }
+        match Self::ping(&mut con).await {
+            Ok(()) => {
+                info!("Connection successfully established");
+            }
+            Err(e) => {
+                error!("Connection is broken. Error during PING: {e:?}");
+            }
+        }
+        self.con = Some(con);
+        Ok(())
+    }
+
+    async fn get_connection_info(&self) -> anyhow::Result<ConnectionInfo> {
+        match &self.credentials {
+            Credentials::Static(info) => Ok(info.clone()),
+            Credentials::Dynamic(provider, addr) => {
+                let (username, password) = provider.provide_credentials().await?;
+                Ok(ConnectionInfo {
+                    addr: addr.clone(),
+                    redis: RedisConnectionInfo {
+                        db: 0,
+                        username: Some(username),
+                        password: Some(password.clone()),
+                    },
+                })
+            }
+        }
+    }
+
+    async fn get_client(&self) -> anyhow::Result<redis::Client> {
+        let client = redis::Client::open(self.get_connection_info().await?)?;
+        Ok(client)
+    }
+
+    // PubSub does not support credentials refresh.
+    // Requires manual reconnection every 12h.
+    pub(crate) async fn get_async_pubsub(&self) -> anyhow::Result<redis::aio::PubSub> {
+        Ok(self.get_client().await?.get_async_pubsub().await?)
+    }
+
+    // The connection lives for 12h.
+    // It can be prolonged with sending `AUTH` commands with the refreshed token.
+    // https://docs.aws.amazon.com/AmazonElastiCache/latest/red-ug/auth-iam.html#auth-iam-limits
+    async fn keep_connection(
+        mut con: MultiplexedConnection,
+        credentials_provider: Arc<CredentialsProvider>,
+    ) -> anyhow::Result<()> {
+        loop {
+            // The connection lives for 12h, for the sanity check we refresh it every hour.
+            tokio::time::sleep(Duration::from_secs(60 * 60)).await;
+            match Self::refresh_token(&mut con, credentials_provider.clone()).await {
+                Ok(()) => {
+                    info!("Token refreshed");
+                }
+                Err(e) => {
+                    error!("Error during token refresh: {e:?}");
+                }
+            }
+        }
+    }
+    async fn refresh_token(
+        con: &mut MultiplexedConnection,
+        credentials_provider: Arc<CredentialsProvider>,
+    ) -> anyhow::Result<()> {
+        let (user, password) = credentials_provider.provide_credentials().await?;
+        let _: () = redis::cmd("AUTH")
+            .arg(user)
+            .arg(password)
+            .query_async(con)
+            .await?;
+        Ok(())
+    }
+    /// Sends an already encoded (packed) command into the TCP socket and
+    /// reads the single response from it.
+    pub(crate) async fn send_packed_command(
+        &mut self,
+        cmd: &redis::Cmd,
+    ) -> RedisResult<redis::Value> {
+        // Clone connection to avoid having to lock the ArcSwap in write mode
+        let con = self.con.as_mut().ok_or(redis::RedisError::from((
+            redis::ErrorKind::IoError,
+            "Connection not established",
+        )))?;
+        con.send_packed_command(cmd).await
+    }
+
+    /// Sends multiple already encoded (packed) commands into the TCP socket
+    /// and reads `count` responses from it. This is used to implement
+    /// pipelining.
+    pub(crate) async fn send_packed_commands(
+        &mut self,
+        cmd: &redis::Pipeline,
+        offset: usize,
+        count: usize,
+    ) -> RedisResult<Vec<redis::Value>> {
+        // Clone shared connection future to avoid having to lock the ArcSwap in write mode
+        let con = self.con.as_mut().ok_or(redis::RedisError::from((
+            redis::ErrorKind::IoError,
+            "Connection not established",
+        )))?;
+        con.send_packed_commands(cmd, offset, count).await
+    }
+}
+
+impl ConnectionLike for ConnectionWithCredentialsProvider {
+    fn req_packed_command<'a>(
+        &'a mut self,
+        cmd: &'a redis::Cmd,
+    ) -> redis::RedisFuture<'a, redis::Value> {
+        (async move { self.send_packed_command(cmd).await }).boxed()
+    }
+
+    fn req_packed_commands<'a>(
+        &'a mut self,
+        cmd: &'a redis::Pipeline,
+        offset: usize,
+        count: usize,
+    ) -> redis::RedisFuture<'a, Vec<redis::Value>> {
+        (async move { self.send_packed_commands(cmd, offset, count).await }).boxed()
+    }
+
+    fn get_db(&self) -> i64 {
+        0
+    }
+}
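keep_connection/refresh_token above amount to "rerun AUTH with fresh credentials on a timer, well inside the 12h connection lifetime". A standalone skeleton of that shape (tokio only; `fetch_token` is an invented stand-in for the credentials provider):

    use std::time::Duration;

    // Invented stand-in for CredentialsProvider::provide_credentials().
    async fn fetch_token() -> Result<String, &'static str> {
        Ok("fresh-token".to_string())
    }

    async fn keep_alive() {
        // Refresh well inside the credential lifetime, as the code above does hourly.
        let mut interval = tokio::time::interval(Duration::from_secs(60 * 60));
        loop {
            interval.tick().await;
            match fetch_token().await {
                // A real implementation would send `AUTH <user> <token>` here.
                Ok(token) => println!("token refreshed ({} bytes)", token.len()),
                Err(e) => eprintln!("refresh failed: {e}"), // keep looping; next tick retries
            }
        }
    }

    #[tokio::main]
    async fn main() {
        // Bounded here so the example terminates; real code runs this for the
        // lifetime of the connection (see refresh_token_task above).
        let _ = tokio::time::timeout(Duration::from_millis(10), keep_alive()).await;
    }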
diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs
new file mode 100644
index 0000000000..d118c8f412
--- /dev/null
+++ b/proxy/src/redis/elasticache.rs
@@ -0,0 +1,110 @@
+use std::time::{Duration, SystemTime};
+
+use aws_config::meta::credentials::CredentialsProviderChain;
+use aws_sdk_iam::config::ProvideCredentials;
+use aws_sigv4::http_request::{
+    self, SignableBody, SignableRequest, SignatureLocation, SigningSettings,
+};
+use tracing::info;
+
+#[derive(Debug)]
+pub struct AWSIRSAConfig {
+    region: String,
+    service_name: String,
+    cluster_name: String,
+    user_id: String,
+    token_ttl: Duration,
+    action: String,
+}
+
+impl AWSIRSAConfig {
+    pub fn new(region: String, cluster_name: Option<String>, user_id: Option<String>) -> Self {
+        AWSIRSAConfig {
+            region,
+            service_name: "elasticache".to_string(),
+            cluster_name: cluster_name.unwrap_or_default(),
+            user_id: user_id.unwrap_or_default(),
+            // "The IAM authentication token is valid for 15 minutes"
+            // https://docs.aws.amazon.com/memorydb/latest/devguide/auth-iam.html#auth-iam-limits
+            token_ttl: Duration::from_secs(15 * 60),
+            action: "connect".to_string(),
+        }
+    }
+}
+
+/// Credentials provider for AWS elasticache authentication.
+///
+/// Official documentation:
+///
+///
+/// Useful resources:
+///
+pub struct CredentialsProvider {
+    config: AWSIRSAConfig,
+    credentials_provider: CredentialsProviderChain,
+}
+
+impl CredentialsProvider {
+    pub fn new(config: AWSIRSAConfig, credentials_provider: CredentialsProviderChain) -> Self {
+        CredentialsProvider {
+            config,
+            credentials_provider,
+        }
+    }
+    pub(crate) async fn provide_credentials(&self) -> anyhow::Result<(String, String)> {
+        let aws_credentials = self
+            .credentials_provider
+            .provide_credentials()
+            .await?
+            .into();
+        info!("AWS credentials successfully obtained");
+        info!("Connecting to Redis with configuration: {:?}", self.config);
+        let mut settings = SigningSettings::default();
+        settings.signature_location = SignatureLocation::QueryParams;
+        settings.expires_in = Some(self.config.token_ttl);
+        let signing_params = aws_sigv4::sign::v4::SigningParams::builder()
+            .identity(&aws_credentials)
+            .region(&self.config.region)
+            .name(&self.config.service_name)
+            .time(SystemTime::now())
+            .settings(settings)
+            .build()?
+            .into();
+        let auth_params = [
+            ("Action", &self.config.action),
+            ("User", &self.config.user_id),
+        ];
+        let auth_params = url::form_urlencoded::Serializer::new(String::new())
+            .extend_pairs(auth_params)
+            .finish();
+        let auth_uri = http::Uri::builder()
+            .scheme("http")
+            .authority(self.config.cluster_name.as_bytes())
+            .path_and_query(format!("/?{auth_params}"))
+            .build()?;
+        info!("{}", auth_uri);
+
+        // Convert the HTTP request into a signable request
+        let signable_request = SignableRequest::new(
+            "GET",
+            auth_uri.to_string(),
+            std::iter::empty(),
+            SignableBody::Bytes(&[]),
+        )?;
+
+        // Sign and then apply the signature to the request
+        let (si, _) = http_request::sign(signable_request, &signing_params)?.into_parts();
+        let mut signable_request = http::Request::builder()
+            .method("GET")
+            .uri(auth_uri)
+            .body(())?;
+        si.apply_to_request_http1x(&mut signable_request);
+        Ok((
+            self.config.user_id.clone(),
+            signable_request
+                .uri()
+                .to_string()
+                .replacen("http://", "", 1),
+        ))
+    }
+}
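provide_credentials() above returns `(user, token)` where the token is just a presigned URL with the scheme stripped. A toy, std-only illustration of that shape — the `X-Amz-*` values below are fabricated placeholders; real signing requires aws_sigv4 exactly as in the code above:

    fn main() {
        let cluster = "my-cluster";  // placeholder cluster name
        let user = "my-redis-user";  // placeholder user id
        // The request to be signed: GET http://<cluster>/?Action=connect&User=<user>
        let unsigned = format!("http://{cluster}/?Action=connect&User={user}");
        // aws_sigv4 appends query-string auth params; fabricated here for shape only.
        let signed = format!("{unsigned}&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=900");
        // ElastiCache expects the token without the scheme, as in the code above.
        let token = signed.replacen("http://", "", 1);
        println!("AUTH {user} {token}");
    }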
diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs
index 9cd70b109b..36a3443603 100644
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -1,35 +1,37 @@
 use std::{convert::Infallible, sync::Arc};

 use futures::StreamExt;
+use pq_proto::CancelKeyData;
 use redis::aio::PubSub;
-use serde::Deserialize;
+use serde::{Deserialize, Serialize};
+use tokio_util::sync::CancellationToken;
+use uuid::Uuid;

-use crate::{cache::project_info::ProjectInfoCache, ProjectId, RoleName};
+use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
+use crate::{
+    cache::project_info::ProjectInfoCache,
+    cancellation::{CancelMap, CancellationHandler},
+    intern::{ProjectIdInt, RoleNameInt},
+    metrics::{Metrics, RedisErrors, RedisEventsCount},
+};

-const CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
+const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
+pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates";
 const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20);
 const INVALIDATION_LAG: std::time::Duration = std::time::Duration::from_secs(20);

-struct ConsoleRedisClient {
-    client: redis::Client,
+async fn try_connect(client: &ConnectionWithCredentialsProvider) -> anyhow::Result<PubSub> {
+    let mut conn = client.get_async_pubsub().await?;
+    tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`");
+    conn.subscribe(CPLANE_CHANNEL_NAME).await?;
+    tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`");
+    conn.subscribe(PROXY_CHANNEL_NAME).await?;
+    Ok(conn)
 }

-impl ConsoleRedisClient {
-    pub fn new(url: &str) -> anyhow::Result<Self> {
-        let client = redis::Client::open(url)?;
-        Ok(Self { client })
-    }
-    async fn try_connect(&self) -> anyhow::Result<PubSub> {
-        let mut conn = self.client.get_async_connection().await?.into_pubsub();
-        tracing::info!("subscribing to a channel `{CHANNEL_NAME}`");
-        conn.subscribe(CHANNEL_NAME).await?;
-        Ok(conn)
-    }
-}
-
-#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
 #[serde(tag = "topic", content = "data")]
-enum Notification {
+pub(crate) enum Notification {
     #[serde(
         rename = "/allowed_ips_updated",
         deserialize_with = "deserialize_json_string"
     )]
     AllowedIpsUpdate { allowed_ips_update: AllowedIpsUpdate },
@@ -42,16 +44,25 @@ enum Notification {
         deserialize_with = "deserialize_json_string"
     )]
     PasswordUpdate { password_update: PasswordUpdate },
+    #[serde(rename = "/cancel_session")]
+    Cancel(CancelSession),
 }

-#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
-struct AllowedIpsUpdate {
-    project_id: ProjectId,
+#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
+pub(crate) struct AllowedIpsUpdate {
+    project_id: ProjectIdInt,
 }

-#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
-struct PasswordUpdate {
-    project_id: ProjectId,
-    role_name: RoleName,
+#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
+pub(crate) struct PasswordUpdate {
+    project_id: ProjectIdInt,
+    role_name: RoleNameInt,
 }
+#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
+pub(crate) struct CancelSession {
+    pub(crate) region_id: Option<String>,
+    pub(crate) cancel_key_data: CancelKeyData,
+    pub(crate) session_id: Uuid,
+}
+
 fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result<T, D::Error>
 where
     T: for<'de2> serde::Deserialize<'de2>,
@@ -61,92 +72,211 @@ where
     serde_json::from_str(&s).map_err(<D::Error as serde::de::Error>::custom)
 }

-fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
-    use Notification::*;
-    match msg {
-        AllowedIpsUpdate { allowed_ips_update } => {
-            cache.invalidate_allowed_ips_for_project(&allowed_ips_update.project_id)
+struct MessageHandler<C: ProjectInfoCache + Send + Sync + 'static> {
+    cache: Arc<C>,
+    cancellation_handler: Arc<CancellationHandler<()>>,
+    region_id: String,
+}
+
+impl<C: ProjectInfoCache + Send + Sync + 'static> Clone for MessageHandler<C> {
+    fn clone(&self) -> Self {
+        Self {
+            cache: self.cache.clone(),
+            cancellation_handler: self.cancellation_handler.clone(),
+            region_id: self.region_id.clone(),
         }
-        PasswordUpdate { password_update } => cache.invalidate_role_secret_for_project(
-            &password_update.project_id,
-            &password_update.role_name,
-        ),
     }
 }

-#[tracing::instrument(skip(cache))]
-fn handle_message<C>(msg: redis::Msg, cache: Arc<C>) -> anyhow::Result<()>
-where
-    C: ProjectInfoCache + Send + Sync + 'static,
-{
-    let payload: String = msg.get_payload()?;
-    tracing::debug!(?payload, "received a message payload");
-
-    let msg: Notification = match serde_json::from_str(&payload) {
-        Ok(msg) => msg,
-        Err(e) => {
-            tracing::error!("broken message: {e}");
-            return Ok(());
+impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
+    pub(crate) fn new(
+        cache: Arc<C>,
+        cancellation_handler: Arc<CancellationHandler<()>>,
+        region_id: String,
+    ) -> Self {
+        Self {
+            cache,
+            cancellation_handler,
+            region_id,
         }
-    };
-    tracing::debug!(?msg, "received a message");
-    invalidate_cache(cache.clone(), msg.clone());
-    // It might happen that the invalid entry is on the way to be cached.
-    // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds.
-    // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message.
- tokio::spawn(async move { - tokio::time::sleep(INVALIDATION_LAG).await; - invalidate_cache(cache, msg.clone()); - }); + } + pub(crate) async fn increment_active_listeners(&self) { + self.cache.increment_active_listeners().await; + } + pub(crate) async fn decrement_active_listeners(&self) { + self.cache.decrement_active_listeners().await; + } + #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))] + async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> { + let payload: String = msg.get_payload()?; + tracing::debug!(?payload, "received a message payload"); - Ok(()) + let msg: Notification = match serde_json::from_str(&payload) { + Ok(msg) => msg, + Err(e) => { + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: msg.get_channel_name(), + }); + tracing::error!("broken message: {e}"); + return Ok(()); + } + }; + tracing::debug!(?msg, "received a message"); + match msg { + Notification::Cancel(cancel_session) => { + tracing::Span::current().record( + "session_id", + tracing::field::display(cancel_session.session_id), + ); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::CancelSession); + if let Some(cancel_region) = cancel_session.region_id { + // If the message is not for this region, ignore it. + if cancel_region != self.region_id { + return Ok(()); + } + } + // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message. + match self + .cancellation_handler + .cancel_session(cancel_session.cancel_key_data, uuid::Uuid::nil()) + .await + { + Ok(()) => {} + Err(e) => { + tracing::error!("failed to cancel session: {e}"); + } + } + } + Notification::AllowedIpsUpdate { .. } | Notification::PasswordUpdate { .. } => { + invalidate_cache(self.cache.clone(), msg.clone()); + if matches!(msg, Notification::AllowedIpsUpdate { .. }) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::AllowedIpsUpdate); + } else if matches!(msg, Notification::PasswordUpdate { .. }) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::PasswordUpdate); + } + // It might happen that the invalid entry is on the way to be cached. + // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. + // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. + let cache = self.cache.clone(); + tokio::spawn(async move { + tokio::time::sleep(INVALIDATION_LAG).await; + invalidate_cache(cache, msg); + }); + } + } + + Ok(()) + } } -/// Handle console's invalidation messages. 
-#[tracing::instrument(name = "console_notifications", skip_all)] -pub async fn task_main(url: String, cache: Arc) -> anyhow::Result -where - C: ProjectInfoCache + Send + Sync + 'static, -{ - cache.enable_ttl(); +fn invalidate_cache(cache: Arc, msg: Notification) { + match msg { + Notification::AllowedIpsUpdate { allowed_ips_update } => { + cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id); + } + Notification::PasswordUpdate { password_update } => cache + .invalidate_role_secret_for_project( + password_update.project_id, + password_update.role_name, + ), + Notification::Cancel(_) => unreachable!("cancel message should be handled separately"), + } +} +async fn handle_messages( + handler: MessageHandler, + redis: ConnectionWithCredentialsProvider, + cancellation_token: CancellationToken, +) -> anyhow::Result<()> { loop { - let redis = ConsoleRedisClient::new(&url)?; - let conn = match redis.try_connect().await { + if cancellation_token.is_cancelled() { + return Ok(()); + } + let mut conn = match try_connect(&redis).await { Ok(conn) => { - cache.disable_ttl(); + handler.increment_active_listeners().await; conn } Err(e) => { tracing::error!( - "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" - ); + "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" + ); tokio::time::sleep(RECONNECT_TIMEOUT).await; continue; } }; - let mut stream = conn.into_on_message(); + let mut stream = conn.on_message(); while let Some(msg) = stream.next().await { - match handle_message(msg, cache.clone()) { + match handler.handle_message(msg).await { Ok(()) => {} Err(e) => { tracing::error!("failed to handle message: {e}, will try to reconnect"); break; } } + if cancellation_token.is_cancelled() { + handler.decrement_active_listeners().await; + return Ok(()); + } } - cache.enable_ttl(); + handler.decrement_active_listeners().await; + } +} + +/// Handle console's invalidation messages. +#[tracing::instrument(name = "redis_notifications", skip_all)] +pub async fn task_main( + redis: ConnectionWithCredentialsProvider, + cache: Arc, + cancel_map: CancelMap, + region_id: String, +) -> anyhow::Result +where + C: ProjectInfoCache + Send + Sync + 'static, +{ + let cancellation_handler = Arc::new(CancellationHandler::<()>::new( + cancel_map, + crate::metrics::CancellationSource::FromRedis, + )); + let handler = MessageHandler::new(cache, cancellation_handler, region_id); + // 6h - 1m. + // There will be 1 minute overlap between two tasks. But at least we can be sure that no message is lost. + let mut interval = tokio::time::interval(std::time::Duration::from_secs(6 * 60 * 60 - 60)); + loop { + let cancellation_token = CancellationToken::new(); + interval.tick().await; + + tokio::spawn(handle_messages( + handler.clone(), + redis.clone(), + cancellation_token.clone(), + )); + tokio::spawn(async move { + tokio::time::sleep(std::time::Duration::from_secs(6 * 60 * 60)).await; // 6h. 
+ cancellation_token.cancel(); + }); } } #[cfg(test)] mod tests { + use crate::{ProjectId, RoleName}; + use super::*; use serde_json::json; #[test] fn parse_allowed_ips() -> anyhow::Result<()> { - let project_id = "new_project".to_string(); + let project_id: ProjectId = "new_project".into(); let data = format!("{{\"project_id\": \"{project_id}\"}}"); let text = json!({ "type": "message", @@ -161,7 +291,7 @@ mod tests { result, Notification::AllowedIpsUpdate { allowed_ips_update: AllowedIpsUpdate { - project_id: project_id.into() + project_id: (&project_id).into() } } ); @@ -171,8 +301,8 @@ mod tests { #[test] fn parse_password_updated() -> anyhow::Result<()> { - let project_id = "new_project".to_string(); - let role_name = "new_role".to_string(); + let project_id: ProjectId = "new_project".into(); + let role_name: RoleName = "new_role".into(); let data = format!("{{\"project_id\": \"{project_id}\", \"role_name\": \"{role_name}\"}}"); let text = json!({ "type": "message", @@ -187,12 +317,39 @@ mod tests { result, Notification::PasswordUpdate { password_update: PasswordUpdate { - project_id: project_id.into(), - role_name: role_name.into() + project_id: (&project_id).into(), + role_name: (&role_name).into(), } } ); + Ok(()) + } + #[test] + fn parse_cancel_session() -> anyhow::Result<()> { + let cancel_key_data = CancelKeyData { + backend_pid: 42, + cancel_key: 41, + }; + let uuid = uuid::Uuid::new_v4(); + let msg = Notification::Cancel(CancelSession { + cancel_key_data, + region_id: None, + session_id: uuid, + }); + let text = serde_json::to_string(&msg)?; + let result: Notification = serde_json::from_str(&text)?; + assert_eq!(msg, result); + + let msg = Notification::Cancel(CancelSession { + cancel_key_data, + region_id: Some("region".to_string()), + session_id: uuid, + }); + let text = serde_json::to_string(&msg)?; + let result: Notification = serde_json::from_str(&text)?; + assert_eq!(msg, result,); + Ok(()) } } diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs index da1cf21c6a..0a36694359 100644 --- a/proxy/src/sasl.rs +++ b/proxy/src/sasl.rs @@ -10,17 +10,17 @@ mod channel_binding; mod messages; mod stream; -use crate::error::UserFacingError; +use crate::error::{ReportableError, UserFacingError}; use std::io; use thiserror::Error; -pub use channel_binding::ChannelBinding; -pub use messages::FirstMessage; -pub use stream::{Outcome, SaslStream}; +pub(crate) use channel_binding::ChannelBinding; +pub(crate) use messages::FirstMessage; +pub(crate) use stream::{Outcome, SaslStream}; /// Fine-grained auth errors help in writing tests. 
#[derive(Error, Debug)] -pub enum Error { +pub(crate) enum Error { #[error("Channel binding failed: {0}")] ChannelBindingFailed(&'static str), @@ -33,27 +33,42 @@ pub enum Error { #[error("Internal error: missing digest")] MissingBinding, + #[error("could not decode salt: {0}")] + Base64(#[from] base64::DecodeError), + #[error(transparent)] Io(#[from] io::Error), } impl UserFacingError for Error { fn to_string_client(&self) -> String { - use Error::*; match self { - ChannelBindingFailed(m) => m.to_string(), - ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"), + Self::ChannelBindingFailed(m) => (*m).to_string(), + Self::ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"), _ => "authentication protocol violation".to_string(), } } } +impl ReportableError for Error { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + Error::ChannelBindingFailed(_) => crate::error::ErrorKind::User, + Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User, + Error::BadClientMessage(_) => crate::error::ErrorKind::User, + Error::MissingBinding => crate::error::ErrorKind::Service, + Error::Base64(_) => crate::error::ErrorKind::ControlPlane, + Error::Io(_) => crate::error::ErrorKind::ClientDisconnect, + } + } +} + /// A convenient result type for SASL exchange. -pub type Result = std::result::Result; +pub(crate) type Result = std::result::Result; /// A result of one SASL exchange. #[must_use] -pub enum Step { +pub(crate) enum Step { /// We should continue exchanging messages. Continue(T, String), /// The client has been authenticated successfully. @@ -63,7 +78,7 @@ pub enum Step { } /// Every SASL mechanism (e.g. [SCRAM](crate::scram)) is expected to implement this trait. -pub trait Mechanism: Sized { +pub(crate) trait Mechanism: Sized { /// What's produced as a result of successful authentication. type Output; diff --git a/proxy/src/sasl/channel_binding.rs b/proxy/src/sasl/channel_binding.rs index 13d681de6d..fdd011448e 100644 --- a/proxy/src/sasl/channel_binding.rs +++ b/proxy/src/sasl/channel_binding.rs @@ -2,7 +2,7 @@ /// Channel binding flag (possibly with params). #[derive(Debug, PartialEq, Eq)] -pub enum ChannelBinding { +pub(crate) enum ChannelBinding { /// Client doesn't support channel binding. NotSupportedClient, /// Client thinks server doesn't support channel binding. @@ -12,45 +12,45 @@ pub enum ChannelBinding { } impl ChannelBinding { - pub fn and_then(self, f: impl FnOnce(T) -> Result) -> Result, E> { - use ChannelBinding::*; + pub(crate) fn and_then( + self, + f: impl FnOnce(T) -> Result, + ) -> Result, E> { Ok(match self { - NotSupportedClient => NotSupportedClient, - NotSupportedServer => NotSupportedServer, - Required(x) => Required(f(x)?), + Self::NotSupportedClient => ChannelBinding::NotSupportedClient, + Self::NotSupportedServer => ChannelBinding::NotSupportedServer, + Self::Required(x) => ChannelBinding::Required(f(x)?), }) } } impl<'a> ChannelBinding<&'a str> { // NB: FromStr doesn't work with lifetimes - pub fn parse(input: &'a str) -> Option { - use ChannelBinding::*; + pub(crate) fn parse(input: &'a str) -> Option { Some(match input { - "n" => NotSupportedClient, - "y" => NotSupportedServer, - other => Required(other.strip_prefix("p=")?), + "n" => Self::NotSupportedClient, + "y" => Self::NotSupportedServer, + other => Self::Required(other.strip_prefix("p=")?), }) } } impl ChannelBinding { /// Encode channel binding data as base64 for subsequent checks. 
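    /// (Sanity check for the precomputed constants below: base64 of the gs2
    /// header `"n,,"` is `"biws"`, and base64 of `"y,,"` is `"eSws"`.)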
- pub fn encode<'a, E>( + pub(crate) fn encode<'a, E>( &self, get_cbind_data: impl FnOnce(&T) -> Result<&'a [u8], E>, ) -> Result, E> { - use ChannelBinding::*; Ok(match self { - NotSupportedClient => { + Self::NotSupportedClient => { // base64::encode("n,,") "biws".into() } - NotSupportedServer => { + Self::NotSupportedServer => { // base64::encode("y,,") "eSws".into() } - Required(mode) => { + Self::Required(mode) => { use std::io::Write; let mut cbind_input = vec![]; write!(&mut cbind_input, "p={mode},,",).unwrap(); diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index b9208f6f1f..6c9a42b2db 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -5,16 +5,16 @@ use pq_proto::{BeAuthenticationSaslMessage, BeMessage}; /// SASL-specific payload of [`PasswordMessage`](pq_proto::FeMessage::PasswordMessage). #[derive(Debug)] -pub struct FirstMessage<'a> { +pub(crate) struct FirstMessage<'a> { /// Authentication method, e.g. `"SCRAM-SHA-256"`. - pub method: &'a str, + pub(crate) method: &'a str, /// Initial client message. - pub message: &'a str, + pub(crate) message: &'a str, } impl<'a> FirstMessage<'a> { // NB: FromStr doesn't work with lifetimes - pub fn parse(bytes: &'a [u8]) -> Option { + pub(crate) fn parse(bytes: &'a [u8]) -> Option { let (method_cstr, tail) = split_cstr(bytes)?; let method = method_cstr.to_str().ok()?; @@ -42,10 +42,9 @@ pub(super) enum ServerMessage { impl<'a> ServerMessage<&'a str> { pub(super) fn to_reply(&self) -> BeMessage<'a> { - use BeAuthenticationSaslMessage::*; BeMessage::AuthenticationSasl(match self { - ServerMessage::Continue(s) => Continue(s.as_bytes()), - ServerMessage::Final(s) => Final(s.as_bytes()), + ServerMessage::Continue(s) => BeAuthenticationSaslMessage::Continue(s.as_bytes()), + ServerMessage::Final(s) => BeAuthenticationSaslMessage::Final(s.as_bytes()), }) } } diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index 9115b0f61a..b6becd28e1 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -7,7 +7,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; /// Abstracts away all peculiarities of the libpq's protocol. -pub struct SaslStream<'a, S> { +pub(crate) struct SaslStream<'a, S> { /// The underlying stream. stream: &'a mut PqStream, /// Current password message we received from client. @@ -17,7 +17,7 @@ pub struct SaslStream<'a, S> { } impl<'a, S> SaslStream<'a, S> { - pub fn new(stream: &'a mut PqStream, first: &'a str) -> Self { + pub(crate) fn new(stream: &'a mut PqStream, first: &'a str) -> Self { Self { stream, current: bytes::Bytes::new(), @@ -53,7 +53,7 @@ impl SaslStream<'_, S> { /// It's much easier to match on those two variants /// than to peek into a noisy protocol error type. #[must_use = "caller must explicitly check for success"] -pub enum Outcome { +pub(crate) enum Outcome { /// Authentication succeeded and produced some value. Success(R), /// Authentication failed (reason attached). @@ -63,7 +63,7 @@ pub enum Outcome { impl SaslStream<'_, S> { /// Perform SASL message exchange according to the underlying algorithm /// until user is either authenticated or denied access. - pub async fn authenticate( + pub(crate) async fn authenticate( mut self, mut mechanism: M, ) -> super::Result> { diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index 49a7a13043..d058f1c3f8 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -6,18 +6,18 @@ //! * //! 
* +mod countmin; mod exchange; mod key; mod messages; +mod pbkdf2; mod secret; mod signature; +pub mod threadpool; -#[cfg(any(test, doc))] -mod password; - -pub use exchange::{exchange, Exchange}; -pub use key::ScramKey; -pub use secret::ServerSecret; +pub(crate) use exchange::{exchange, Exchange}; +pub(crate) use key::ScramKey; +pub(crate) use secret::ServerSecret; use hmac::{Hmac, Mac}; use sha2::{Digest, Sha256}; @@ -26,8 +26,8 @@ const SCRAM_SHA_256: &str = "SCRAM-SHA-256"; const SCRAM_SHA_256_PLUS: &str = "SCRAM-SHA-256-PLUS"; /// A list of supported SCRAM methods. -pub const METHODS: &[&str] = &[SCRAM_SHA_256_PLUS, SCRAM_SHA_256]; -pub const METHODS_WITHOUT_PLUS: &[&str] = &[SCRAM_SHA_256]; +pub(crate) const METHODS: &[&str] = &[SCRAM_SHA_256_PLUS, SCRAM_SHA_256]; +pub(crate) const METHODS_WITHOUT_PLUS: &[&str] = &[SCRAM_SHA_256]; /// Decode base64 into array without any heap allocations fn base64_decode_array(input: impl AsRef<[u8]>) -> Option<[u8; N]> { @@ -59,27 +59,23 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { #[cfg(test)] mod tests { - use crate::sasl::{Mechanism, Step}; + use crate::{ + intern::EndpointIdInt, + sasl::{Mechanism, Step}, + EndpointId, + }; - use super::{password::SaltedPassword, Exchange, ServerSecret}; + use super::{threadpool::ThreadPool, Exchange, ServerSecret}; #[test] - fn happy_path() { + fn snapshot() { let iterations = 4096; - let salt_base64 = "QSXCR+Q6sek8bf92"; - let pw = SaltedPassword::new( - b"pencil", - base64::decode(salt_base64).unwrap().as_slice(), - iterations, - ); + let salt = "QSXCR+Q6sek8bf92"; + let stored_key = "FO+9jBb3MUukt6jJnzjPZOWc5ow/Pu6JtPyju0aqaE8="; + let server_key = "qxJ1SbmSAi5EcS0J5Ck/cKAm/+Ixa+Kwp63f4OHDgzo="; + let secret = format!("SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}",); + let secret = ServerSecret::parse(&secret).unwrap(); - let secret = ServerSecret { - iterations, - salt_base64: salt_base64.to_owned(), - stored_key: pw.client_key().sha256(), - server_key: pw.server_key(), - doomed: false, - }; const NONCE: [u8; 18] = [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ]; @@ -121,4 +117,32 @@ mod tests { ] ); } + + async fn run_round_trip_test(server_password: &str, client_password: &str) { + let pool = ThreadPool::new(1); + + let ep = EndpointId::from("foo"); + let ep = EndpointIdInt::from(ep); + + let scram_secret = ServerSecret::build(server_password).await.unwrap(); + let outcome = super::exchange(&pool, ep, &scram_secret, client_password.as_bytes()) + .await + .unwrap(); + + match outcome { + crate::sasl::Outcome::Success(_) => {} + crate::sasl::Outcome::Failure(r) => panic!("{r}"), + } + } + + #[tokio::test] + async fn round_trip() { + run_round_trip_test("pencil", "pencil").await; + } + + #[tokio::test] + #[should_panic(expected = "password doesn't match")] + async fn failure() { + run_round_trip_test("pencil", "eraser").await; + } } diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs new file mode 100644 index 0000000000..64ee0135e1 --- /dev/null +++ b/proxy/src/scram/countmin.rs @@ -0,0 +1,165 @@ +use std::hash::Hash; + +/// estimator of hash jobs per second. 
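+/// A count-min sketch keeps `depth` rows of `width` counters; an update
+/// hashes the key once per row, and the estimate is the minimum over the
+/// touched counters, so hash collisions can inflate a count but never
+/// deflate it. (Illustrative sizing, not from the original: ε = 0.01 and
+/// δ = 0.01 give width = ceil(e/0.01) = 272 and depth = ceil(ln 100) = 5.)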
+///
+pub(crate) struct CountMinSketch {
+    // one for each depth
+    hashers: Vec<ahash::RandomState>,
+    width: usize,
+    depth: usize,
+    // buckets, width*depth
+    buckets: Vec<u32>,
+}
+
+impl CountMinSketch {
+    /// Given parameters (ε, δ),
+    /// set width = ceil(e/ε)
+    /// set depth = ceil(ln(1/δ))
+    ///
+    /// guarantees:
+    /// actual <= estimate
+    /// estimate <= actual + ε * N with probability 1 - δ
+    /// where N is the cardinality of the stream
+    pub(crate) fn with_params(epsilon: f64, delta: f64) -> Self {
+        CountMinSketch::new(
+            (std::f64::consts::E / epsilon).ceil() as usize,
+            (1.0_f64 / delta).ln().ceil() as usize,
+        )
+    }
+
+    fn new(width: usize, depth: usize) -> Self {
+        Self {
+            #[cfg(test)]
+            hashers: (0..depth)
+                .map(|i| {
+                    // digits of pi for good randomness
+                    ahash::RandomState::with_seeds(
+                        314159265358979323,
+                        84626433832795028,
+                        84197169399375105,
+                        82097494459230781 + i as u64,
+                    )
+                })
+                .collect(),
+            #[cfg(not(test))]
+            hashers: (0..depth).map(|_| ahash::RandomState::new()).collect(),
+            width,
+            depth,
+            buckets: vec![0; width * depth],
+        }
+    }
+
+    pub(crate) fn inc_and_return<T: Hash>(&mut self, t: &T, x: u32) -> u32 {
+        let mut min = u32::MAX;
+        for row in 0..self.depth {
+            let col = (self.hashers[row].hash_one(t) as usize) % self.width;
+
+            let row = &mut self.buckets[row * self.width..][..self.width];
+            row[col] = row[col].saturating_add(x);
+            min = std::cmp::min(min, row[col]);
+        }
+        min
+    }
+
+    pub(crate) fn reset(&mut self) {
+        self.buckets.clear();
+        self.buckets.resize(self.width * self.depth, 0);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng};
+
+    use super::CountMinSketch;
+
+    fn eval_precision(n: usize, p: f64, q: f64) -> usize {
+        // fixed value of phi for consistent test
+        let mut rng = StdRng::seed_from_u64(16180339887498948482);
+
+        #[allow(non_snake_case)]
+        let mut N = 0;
+
+        let mut ids = vec![];
+
+        for _ in 0..n {
+            // number to insert at once
+            let n = rng.gen_range(1..4096);
+            // number of insert operations
+            let m = rng.gen_range(1..100);
+
+            let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid();
+            ids.push((id, n, m));
+
+            // N = sum(actual)
+            N += n * m;
+        }
+
+        // q% of counts will be within p of the actual value
+        let mut sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);
+
+        // insert a bunch of entries in a random order
+        let mut ids2 = ids.clone();
+        while !ids2.is_empty() {
+            ids2.shuffle(&mut rng);
+            ids2.retain_mut(|id| {
+                sketch.inc_and_return(&id.0, id.1);
+                id.2 -= 1;
+                id.2 > 0
+            });
+        }
+
+        let mut within_p = 0;
+        for (id, n, m) in ids {
+            let actual = n * m;
+            let estimate = sketch.inc_and_return(&id, 0);
+
+            // This estimate has the guarantee that actual <= estimate
+            assert!(actual <= estimate);
+
+            // This estimate has the guarantee that estimate <= actual + εN with probability 1 - δ.
+            // ε = p / N, δ = 1 - q;
+            // therefore, estimate <= actual + p with probability q.
+            if estimate as f64 <= actual as f64 + p {
+                within_p += 1;
+            }
+        }
+        within_p
+    }
+
+    #[test]
+    fn precision() {
+        assert_eq!(eval_precision(100, 100.0, 0.99), 100);
+        assert_eq!(eval_precision(1000, 100.0, 0.99), 1000);
+        assert_eq!(eval_precision(100, 4096.0, 0.99), 100);
+        assert_eq!(eval_precision(1000, 4096.0, 0.99), 1000);
+
+        // seems to be more precise than the literature indicates?
+        // probably numbers are too small to truly represent the probabilities.
+ assert_eq!(eval_precision(100, 4096.0, 0.90), 100); + assert_eq!(eval_precision(1000, 4096.0, 0.90), 1000); + assert_eq!(eval_precision(100, 4096.0, 0.1), 96); + assert_eq!(eval_precision(1000, 4096.0, 0.1), 988); + } + + // returns memory usage in bytes, and the time complexity per insert. + fn eval_cost(p: f64, q: f64) -> (usize, usize) { + #[allow(non_snake_case)] + // N = sum(actual) + // Let's assume 1021 samples, all of 4096 + let N = 1021 * 4096; + let sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q); + + let memory = size_of::() * sketch.buckets.len(); + let time = sketch.depth; + (memory, time) + } + + #[test] + fn memory_usage() { + assert_eq!(eval_cost(100.0, 0.99), (2273580, 5)); + assert_eq!(eval_cost(4096.0, 0.99), (55520, 5)); + assert_eq!(eval_cost(4096.0, 0.90), (33312, 3)); + assert_eq!(eval_cost(4096.0, 0.1), (11104, 1)); + } +} diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 9af7db5201..786cbcaa19 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -2,14 +2,19 @@ use std::convert::Infallible; -use postgres_protocol::authentication::sasl::ScramSha256; +use hmac::{Hmac, Mac}; +use sha2::Sha256; use super::messages::{ ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN, }; +use super::pbkdf2::Pbkdf2; use super::secret::ServerSecret; use super::signature::SignatureBuilder; +use super::threadpool::ThreadPool; +use super::ScramKey; use crate::config; +use crate::intern::EndpointIdInt; use crate::sasl::{self, ChannelBinding, Error as SaslError}; /// The only channel binding mode we currently support. @@ -51,14 +56,14 @@ enum ExchangeState { } /// Server's side of SCRAM auth algorithm. -pub struct Exchange<'a> { +pub(crate) struct Exchange<'a> { state: ExchangeState, secret: &'a ServerSecret, tls_server_end_point: config::TlsServerEndPoint, } impl<'a> Exchange<'a> { - pub fn new( + pub(crate) fn new( secret: &'a ServerSecret, nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN], tls_server_end_point: config::TlsServerEndPoint, @@ -71,40 +76,44 @@ impl<'a> Exchange<'a> { } } -pub fn exchange( +// copied from +async fn derive_client_key( + pool: &ThreadPool, + endpoint: EndpointIdInt, + password: &[u8], + salt: &[u8], + iterations: u32, +) -> ScramKey { + let salted_password = pool + .spawn_job(endpoint, Pbkdf2::start(password, salt, iterations)) + .await; + + let make_key = |name| { + let key = Hmac::::new_from_slice(&salted_password) + .expect("HMAC is able to accept all key sizes") + .chain_update(name) + .finalize(); + + <[u8; 32]>::from(key.into_bytes()) + }; + + make_key(b"Client Key").into() +} + +pub(crate) async fn exchange( + pool: &ThreadPool, + endpoint: EndpointIdInt, secret: &ServerSecret, - mut client: ScramSha256, - tls_server_end_point: config::TlsServerEndPoint, + password: &[u8], ) -> sasl::Result> { - use sasl::Step::*; + let salt = base64::decode(&secret.salt_base64)?; + let client_key = derive_client_key(pool, endpoint, password, &salt, secret.iterations).await; - let init = SaslInitial { - nonce: rand::random, - }; - - let client_first = std::str::from_utf8(client.message()) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; - let sent = match init.transition(secret, &tls_server_end_point, client_first)? 
{ - Continue(sent, server_first) => { - client.update(server_first.as_bytes())?; - sent - } - Success(x, _) => match x {}, - Failure(msg) => return Ok(sasl::Outcome::Failure(msg)), - }; - - let client_final = std::str::from_utf8(client.message()) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; - let keys = match sent.transition(secret, &tls_server_end_point, client_final)? { - Success(keys, server_final) => { - client.finish(server_final.as_bytes())?; - keys - } - Continue(x, _) => match x {}, - Failure(msg) => return Ok(sasl::Outcome::Failure(msg)), - }; - - Ok(sasl::Outcome::Success(keys)) + if secret.is_password_invalid(&client_key).into() { + Ok(sasl::Outcome::Failure("password doesn't match")) + } else { + Ok(sasl::Outcome::Success(client_key)) + } } impl SaslInitial { @@ -185,7 +194,7 @@ impl SaslSentInner { .derive_client_key(&client_final_message.proof); // Auth fails either if keys don't match or it's pre-determined to fail. - if client_key.sha256() != secret.stored_key || secret.doomed { + if secret.is_password_invalid(&client_key).into() { return Ok(sasl::Step::Failure("password doesn't match")); } @@ -200,23 +209,23 @@ impl sasl::Mechanism for Exchange<'_> { type Output = super::ScramKey; fn exchange(mut self, input: &str) -> sasl::Result> { - use {sasl::Step::*, ExchangeState::*}; + use {sasl::Step, ExchangeState}; match &self.state { - Initial(init) => { + ExchangeState::Initial(init) => { match init.transition(self.secret, &self.tls_server_end_point, input)? { - Continue(sent, msg) => { - self.state = SaltSent(sent); - Ok(Continue(self, msg)) + Step::Continue(sent, msg) => { + self.state = ExchangeState::SaltSent(sent); + Ok(Step::Continue(self, msg)) } - Success(x, _) => match x {}, - Failure(msg) => Ok(Failure(msg)), + Step::Success(x, _) => match x {}, + Step::Failure(msg) => Ok(Step::Failure(msg)), } } - SaltSent(sent) => { + ExchangeState::SaltSent(sent) => { match sent.transition(self.secret, &self.tls_server_end_point, input)? { - Success(keys, msg) => Ok(Success(keys, msg)), - Continue(x, _) => match x {}, - Failure(msg) => Ok(Failure(msg)), + Step::Success(keys, msg) => Ok(Step::Success(keys, msg)), + Step::Continue(x, _) => match x {}, + Step::Failure(msg) => Ok(Step::Failure(msg)), } } } diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs index 66c2c6b207..fe55ff493b 100644 --- a/proxy/src/scram/key.rs +++ b/proxy/src/scram/key.rs @@ -1,23 +1,37 @@ //! Tools for client/server/stored key management. -/// Faithfully taken from PostgreSQL. -pub const SCRAM_KEY_LEN: usize = 32; +use subtle::ConstantTimeEq; -/// One of the keys derived from the [password](super::password::SaltedPassword). +/// Faithfully taken from PostgreSQL. +pub(crate) const SCRAM_KEY_LEN: usize = 32; + +/// One of the keys derived from the user's password. /// We use the same structure for all keys, i.e. /// `ClientKey`, `StoredKey`, and `ServerKey`. 
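///
/// Derivation chain (per RFC 5802, informally):
/// `SaltedPassword = PBKDF2(password, salt, i)`,
/// `ClientKey = HMAC(SaltedPassword, "Client Key")`,
/// `StoredKey = SHA-256(ClientKey)`,
/// `ServerKey = HMAC(SaltedPassword, "Server Key")`.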
-#[derive(Clone, Default, PartialEq, Eq, Debug)] +#[derive(Clone, Default, Eq, Debug)] #[repr(transparent)] -pub struct ScramKey { +pub(crate) struct ScramKey { bytes: [u8; SCRAM_KEY_LEN], } +impl PartialEq for ScramKey { + fn eq(&self, other: &Self) -> bool { + self.ct_eq(other).into() + } +} + +impl ConstantTimeEq for ScramKey { + fn ct_eq(&self, other: &Self) -> subtle::Choice { + self.bytes.ct_eq(&other.bytes) + } +} + impl ScramKey { - pub fn sha256(&self) -> Self { + pub(crate) fn sha256(&self) -> Self { super::sha256([self.as_ref()]).into() } - pub fn as_bytes(&self) -> [u8; SCRAM_KEY_LEN] { + pub(crate) fn as_bytes(&self) -> [u8; SCRAM_KEY_LEN] { self.bytes } } diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index b59baec508..fd9e77764c 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -8,7 +8,7 @@ use std::fmt; use std::ops::Range; /// Faithfully taken from PostgreSQL. -pub const SCRAM_RAW_NONCE_LEN: usize = 18; +pub(crate) const SCRAM_RAW_NONCE_LEN: usize = 18; /// Although we ignore all extensions, we still have to validate the message. fn validate_sasl_extensions<'a>(parts: impl Iterator) -> Option<()> { @@ -27,20 +27,18 @@ fn validate_sasl_extensions<'a>(parts: impl Iterator) -> Option< } #[derive(Debug)] -pub struct ClientFirstMessage<'a> { +pub(crate) struct ClientFirstMessage<'a> { /// `client-first-message-bare`. - pub bare: &'a str, + pub(crate) bare: &'a str, /// Channel binding mode. - pub cbind_flag: ChannelBinding<&'a str>, - /// (Client username)[]. - pub username: &'a str, + pub(crate) cbind_flag: ChannelBinding<&'a str>, /// Client nonce. - pub nonce: &'a str, + pub(crate) nonce: &'a str, } impl<'a> ClientFirstMessage<'a> { // NB: FromStr doesn't work with lifetimes - pub fn parse(input: &'a str) -> Option { + pub(crate) fn parse(input: &'a str) -> Option { let mut parts = input.split(','); let cbind_flag = ChannelBinding::parse(parts.next()?)?; @@ -58,6 +56,14 @@ impl<'a> ClientFirstMessage<'a> { // In theory, these might be preceded by "reserved-mext" (i.e. "m=") let username = parts.next()?.strip_prefix("n=")?; + + // https://github.com/postgres/postgres/blob/f83908798f78c4cafda217ca875602c88ea2ae28/src/backend/libpq/auth-scram.c#L13-L14 + if !username.is_empty() { + tracing::warn!(username, "scram username provided, but is not expected"); + // TODO(conrad): + // return None; + } + let nonce = parts.next()?.strip_prefix("r=")?; // Validate but ignore auth extensions @@ -66,13 +72,12 @@ impl<'a> ClientFirstMessage<'a> { Some(Self { bare, cbind_flag, - username, nonce, }) } /// Build a response to [`ClientFirstMessage`]. - pub fn build_server_first_message( + pub(crate) fn build_server_first_message( &self, nonce: &[u8; SCRAM_RAW_NONCE_LEN], salt_base64: &str, @@ -84,7 +89,7 @@ impl<'a> ClientFirstMessage<'a> { write!(&mut message, "r={}", self.nonce).unwrap(); base64::encode_config_buf(nonce, base64::STANDARD, &mut message); let combined_nonce = 2..message.len(); - write!(&mut message, ",s={},i={}", salt_base64, iterations).unwrap(); + write!(&mut message, ",s={salt_base64},i={iterations}").unwrap(); // This design guarantees that it's impossible to create a // server-first-message without receiving a client-first-message @@ -96,20 +101,20 @@ impl<'a> ClientFirstMessage<'a> { } #[derive(Debug)] -pub struct ClientFinalMessage<'a> { +pub(crate) struct ClientFinalMessage<'a> { /// `client-final-message-without-proof`. 
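    /// (Illustrative shape: in `c=biws,r=<combined-nonce>,p=<proof>` this is
    /// the `c=biws,r=<combined-nonce>` prefix, i.e. everything before the
    /// final `,p=` part.)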
- pub without_proof: &'a str, + pub(crate) without_proof: &'a str, /// Channel binding data (base64). - pub channel_binding: &'a str, + pub(crate) channel_binding: &'a str, /// Combined client & server nonce. - pub nonce: &'a str, + pub(crate) nonce: &'a str, /// Client auth proof. - pub proof: [u8; SCRAM_KEY_LEN], + pub(crate) proof: [u8; SCRAM_KEY_LEN], } impl<'a> ClientFinalMessage<'a> { // NB: FromStr doesn't work with lifetimes - pub fn parse(input: &'a str) -> Option { + pub(crate) fn parse(input: &'a str) -> Option { let (without_proof, proof) = input.rsplit_once(',')?; let mut parts = without_proof.split(','); @@ -130,9 +135,9 @@ impl<'a> ClientFinalMessage<'a> { } /// Build a response to [`ClientFinalMessage`]. - pub fn build_server_final_message( + pub(crate) fn build_server_final_message( &self, - signature_builder: SignatureBuilder, + signature_builder: SignatureBuilder<'_>, server_key: &ScramKey, ) -> String { let mut buf = String::from("v="); @@ -148,7 +153,7 @@ impl<'a> ClientFinalMessage<'a> { /// We need to keep a convenient representation of this /// message for the next authentication step. -pub struct OwnedServerFirstMessage { +pub(crate) struct OwnedServerFirstMessage { /// Owned `server-first-message`. message: String, /// Slice into `message`. @@ -158,13 +163,13 @@ pub struct OwnedServerFirstMessage { impl OwnedServerFirstMessage { /// Extract combined nonce from the message. #[inline(always)] - pub fn nonce(&self) -> &str { + pub(crate) fn nonce(&self) -> &str { &self.message[self.nonce.clone()] } /// Get reference to a text representation of the message. #[inline(always)] - pub fn as_str(&self) -> &str { + pub(crate) fn as_str(&self) -> &str { &self.message } } @@ -188,24 +193,44 @@ mod tests { // (Almost) real strings captured during debug sessions let cases = [ - (NotSupportedClient, "n,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju"), - (NotSupportedServer, "y,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju"), + (NotSupportedClient, "n,,n=,r=t8JwklwKecDLwSsA72rHmVju"), + (NotSupportedServer, "y,,n=,r=t8JwklwKecDLwSsA72rHmVju"), ( Required("tls-server-end-point"), - "p=tls-server-end-point,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju", + "p=tls-server-end-point,,n=,r=t8JwklwKecDLwSsA72rHmVju", ), ]; for (cb, input) in cases { let msg = ClientFirstMessage::parse(input).unwrap(); - assert_eq!(msg.bare, "n=pepe,r=t8JwklwKecDLwSsA72rHmVju"); - assert_eq!(msg.username, "pepe"); + assert_eq!(msg.bare, "n=,r=t8JwklwKecDLwSsA72rHmVju"); assert_eq!(msg.nonce, "t8JwklwKecDLwSsA72rHmVju"); assert_eq!(msg.cbind_flag, cb); } } + #[test] + fn parse_client_first_message_with_invalid_gs2_authz() { + assert!(ClientFirstMessage::parse("n,authzid,n=,r=nonce").is_none()); + } + + #[test] + fn parse_client_first_message_with_extra_params() { + let msg = ClientFirstMessage::parse("n,,n=,r=nonce,a=foo,b=bar,c=baz").unwrap(); + assert_eq!(msg.bare, "n=,r=nonce,a=foo,b=bar,c=baz"); + assert_eq!(msg.nonce, "nonce"); + assert_eq!(msg.cbind_flag, ChannelBinding::NotSupportedClient); + } + + #[test] + fn parse_client_first_message_with_extra_params_invalid() { + // must be of the form `=<...>` + assert!(ClientFirstMessage::parse("n,,n=,r=nonce,abc=foo").is_none()); + assert!(ClientFirstMessage::parse("n,,n=,r=nonce,1=foo").is_none()); + assert!(ClientFirstMessage::parse("n,,n=,r=nonce,a").is_none()); + } + #[test] fn parse_client_final_message() { let input = [ diff --git a/proxy/src/scram/password.rs b/proxy/src/scram/password.rs deleted file mode 100644 index 022f2842dd..0000000000 --- a/proxy/src/scram/password.rs +++ 
/dev/null @@ -1,74 +0,0 @@ -//! Password hashing routines. - -use super::key::ScramKey; - -pub const SALTED_PASSWORD_LEN: usize = 32; - -/// Salted hashed password is essential for [key](super::key) derivation. -#[repr(transparent)] -pub struct SaltedPassword { - bytes: [u8; SALTED_PASSWORD_LEN], -} - -impl SaltedPassword { - /// See `scram-common.c : scram_SaltedPassword` for details. - /// Further reading: (see `PBKDF2`). - pub fn new(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword { - pbkdf2::pbkdf2_hmac_array::(password, salt, iterations).into() - } - - /// Derive `ClientKey` from a salted hashed password. - pub fn client_key(&self) -> ScramKey { - super::hmac_sha256(&self.bytes, [b"Client Key".as_ref()]).into() - } - - /// Derive `ServerKey` from a salted hashed password. - pub fn server_key(&self) -> ScramKey { - super::hmac_sha256(&self.bytes, [b"Server Key".as_ref()]).into() - } -} - -impl From<[u8; SALTED_PASSWORD_LEN]> for SaltedPassword { - #[inline(always)] - fn from(bytes: [u8; SALTED_PASSWORD_LEN]) -> Self { - Self { bytes } - } -} - -#[cfg(test)] -mod tests { - use super::SaltedPassword; - - fn legacy_pbkdf2_impl(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword { - let one = 1_u32.to_be_bytes(); // magic - - let mut current = super::super::hmac_sha256(password, [salt, &one]); - let mut result = current; - for _ in 1..iterations { - current = super::super::hmac_sha256(password, [current.as_ref()]); - // TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094 - for (i, x) in current.iter().enumerate() { - result[i] ^= x; - } - } - - result.into() - } - - #[test] - fn pbkdf2() { - let password = "a-very-secure-password"; - let salt = "such-a-random-salt"; - let iterations = 4096; - let output = [ - 203, 18, 206, 81, 4, 154, 193, 100, 147, 41, 211, 217, 177, 203, 69, 210, 194, 211, - 101, 1, 248, 156, 96, 0, 8, 223, 30, 87, 158, 41, 20, 42, - ]; - - let actual = SaltedPassword::new(password.as_bytes(), salt.as_bytes(), iterations); - let expected = legacy_pbkdf2_impl(password.as_bytes(), salt.as_bytes(), iterations); - - assert_eq!(actual.bytes, output); - assert_eq!(actual.bytes, expected.bytes); - } -} diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs new file mode 100644 index 0000000000..4cf76c8452 --- /dev/null +++ b/proxy/src/scram/pbkdf2.rs @@ -0,0 +1,89 @@ +use hmac::{ + digest::{consts::U32, generic_array::GenericArray}, + Hmac, Mac, +}; +use sha2::Sha256; + +pub(crate) struct Pbkdf2 { + hmac: Hmac, + prev: GenericArray, + hi: GenericArray, + iterations: u32, +} + +// inspired from +impl Pbkdf2 { + pub(crate) fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self { + let hmac = + Hmac::::new_from_slice(str).expect("HMAC is able to accept all key sizes"); + + let prev = hmac + .clone() + .chain_update(salt) + .chain_update(1u32.to_be_bytes()) + .finalize() + .into_bytes(); + + Self { + hmac, + // one consumed for the hash above + iterations: iterations - 1, + hi: prev, + prev, + } + } + + pub(crate) fn cost(&self) -> u32 { + (self.iterations).clamp(0, 4096) + } + + pub(crate) fn turn(&mut self) -> std::task::Poll<[u8; 32]> { + let Self { + hmac, + prev, + hi, + iterations, + } = self; + + // only do 4096 iterations per turn before sharing the thread for fairness + let n = (*iterations).clamp(0, 4096); + for _ in 0..n { + *prev = hmac.clone().chain_update(*prev).finalize().into_bytes(); + + for (hi, prev) in hi.iter_mut().zip(*prev) { + *hi ^= prev; + } + } + + *iterations -= n; + if *iterations == 0 { + 
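+            // all requested rounds are done: `hi` now holds the XOR of every
+            // U_i block, i.e. the finished 32-byte PBKDF2-HMAC-SHA256 output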
std::task::Poll::Ready((*hi).into()) + } else { + std::task::Poll::Pending + } + } +} + +#[cfg(test)] +mod tests { + use super::Pbkdf2; + use pbkdf2::pbkdf2_hmac_array; + use sha2::Sha256; + + #[test] + fn works() { + let salt = b"sodium chloride"; + let pass = b"Ne0n_!5_50_C007"; + + let mut job = Pbkdf2::start(pass, salt, 60000); + let hash = loop { + let std::task::Poll::Ready(hash) = job.turn() else { + continue; + }; + break hash; + }; + + let expected = pbkdf2_hmac_array::(pass, salt, 60000); + assert_eq!(hash, expected); + } +} diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 041548014a..8c6a08d432 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -1,27 +1,29 @@ //! Tools for SCRAM server secret management. +use subtle::{Choice, ConstantTimeEq}; + use super::base64_decode_array; use super::key::ScramKey; -/// Server secret is produced from [password](super::password::SaltedPassword) +/// Server secret is produced from user's password, /// and is used throughout the authentication process. #[derive(Clone, Eq, PartialEq, Debug)] -pub struct ServerSecret { +pub(crate) struct ServerSecret { /// Number of iterations for `PBKDF2` function. - pub iterations: u32, + pub(crate) iterations: u32, /// Salt used to hash user's password. - pub salt_base64: String, + pub(crate) salt_base64: String, /// Hashed `ClientKey`. - pub stored_key: ScramKey, + pub(crate) stored_key: ScramKey, /// Used by client to verify server's signature. - pub server_key: ScramKey, + pub(crate) server_key: ScramKey, /// Should auth fail no matter what? /// This is exactly the case for mocked secrets. - pub doomed: bool, + pub(crate) doomed: bool, } impl ServerSecret { - pub fn parse(input: &str) -> Option { + pub(crate) fn parse(input: &str) -> Option { // SCRAM-SHA-256$:$: let s = input.strip_prefix("SCRAM-SHA-256$")?; let (params, keys) = s.split_once('$')?; @@ -40,16 +42,21 @@ impl ServerSecret { Some(secret) } + pub(crate) fn is_password_invalid(&self, client_key: &ScramKey) -> Choice { + // constant time to not leak partial key match + client_key.sha256().ct_ne(&self.stored_key) | Choice::from(self.doomed as u8) + } + /// To avoid revealing information to an attacker, we use a /// mocked server secret even if the user doesn't exist. /// See `auth-scram.c : mock_scram_secret` for details. - pub fn mock(user: &str, nonce: [u8; 32]) -> Self { - // Refer to `auth-scram.c : scram_mock_salt`. - let mocked_salt = super::sha256([user.as_bytes(), &nonce]); - + pub(crate) fn mock(nonce: [u8; 32]) -> Self { Self { - iterations: 4096, - salt_base64: base64::encode(mocked_salt), + // this doesn't reveal much information as we're going to use + // iteration count 1 for our generated passwords going forward. + // PG16 users can set iteration count=1 already today. + iterations: 1, + salt_base64: base64::encode(nonce), stored_key: ScramKey::default(), server_key: ScramKey::default(), doomed: true, @@ -59,21 +66,8 @@ impl ServerSecret { /// Build a new server secret from the prerequisites. /// XXX: We only use this function in tests. 
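    /// (It defers to `postgres_protocol::password::scram_sha_256`, which
    /// chooses the salt and iteration count itself.)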
#[cfg(test)] - pub fn build(password: &str, salt: &[u8], iterations: u32) -> Option { - // TODO: implement proper password normalization required by the RFC - if !password.is_ascii() { - return None; - } - - let password = super::password::SaltedPassword::new(password.as_bytes(), salt, iterations); - - Some(Self { - iterations, - salt_base64: base64::encode(salt), - stored_key: password.client_key().sha256(), - server_key: password.server_key(), - doomed: false, - }) + pub(crate) async fn build(password: &str) -> Option { + Self::parse(&postgres_protocol::password::scram_sha_256(password.as_bytes()).await) } } @@ -88,13 +82,7 @@ mod tests { let stored_key = "D5h6KTMBlUvDJk2Y8ELfC1Sjtc6k9YHjRyuRZyBNJns="; let server_key = "Pi3QHbcluX//NDfVkKlFl88GGzlJ5LkyPwcdlN/QBvI="; - let secret = format!( - "SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}", - iterations = iterations, - salt = salt, - stored_key = stored_key, - server_key = server_key, - ); + let secret = format!("SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}"); let parsed = ServerSecret::parse(&secret).unwrap(); assert_eq!(parsed.iterations, iterations); @@ -103,20 +91,4 @@ mod tests { assert_eq!(base64::encode(parsed.stored_key), stored_key); assert_eq!(base64::encode(parsed.server_key), server_key); } - - #[test] - fn build_scram_secret() { - let salt = b"salt"; - let secret = ServerSecret::build("password", salt, 4096).unwrap(); - assert_eq!(secret.iterations, 4096); - assert_eq!(secret.salt_base64, base64::encode(salt)); - assert_eq!( - base64::encode(secret.stored_key.as_ref()), - "lF4cRm/Jky763CN4HtxdHnjV4Q8AWTNlKvGmEFFU8IQ=" - ); - assert_eq!( - base64::encode(secret.server_key.as_ref()), - "ub8OgRsftnk2ccDMOt7ffHXNcikRkQkq1lh4xaAqrSw=" - ); - } } diff --git a/proxy/src/scram/signature.rs b/proxy/src/scram/signature.rs index 1c2811d757..d3255cf2ca 100644 --- a/proxy/src/scram/signature.rs +++ b/proxy/src/scram/signature.rs @@ -4,14 +4,14 @@ use super::key::{ScramKey, SCRAM_KEY_LEN}; /// A collection of message parts needed to derive the client's signature. #[derive(Debug)] -pub struct SignatureBuilder<'a> { - pub client_first_message_bare: &'a str, - pub server_first_message: &'a str, - pub client_final_message_without_proof: &'a str, +pub(crate) struct SignatureBuilder<'a> { + pub(crate) client_first_message_bare: &'a str, + pub(crate) server_first_message: &'a str, + pub(crate) client_final_message_without_proof: &'a str, } impl SignatureBuilder<'_> { - pub fn build(&self, key: &ScramKey) -> Signature { + pub(crate) fn build(&self, key: &ScramKey) -> Signature { let parts = [ self.client_first_message_bare.as_bytes(), b",", @@ -28,13 +28,13 @@ impl SignatureBuilder<'_> { /// produces `ClientKey` that we need for authentication. #[derive(Debug)] #[repr(transparent)] -pub struct Signature { +pub(crate) struct Signature { bytes: [u8; SCRAM_KEY_LEN], } impl Signature { /// Derive `ClientKey` from client's signature and proof. - pub fn derive_client_key(&self, proof: &[u8; SCRAM_KEY_LEN]) -> ScramKey { + pub(crate) fn derive_client_key(&self, proof: &[u8; SCRAM_KEY_LEN]) -> ScramKey { // This is how the proof is calculated: // // 1. sha256(ClientKey) -> StoredKey diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs new file mode 100644 index 0000000000..2702aeebfe --- /dev/null +++ b/proxy/src/scram/threadpool.rs @@ -0,0 +1,213 @@ +//! Custom threadpool implementation for password hashing. +//! +//! Requirements: +//! 1. Fairness per endpoint. +//! 2. 
Yield support for high iteration counts. + +use std::{ + cell::RefCell, + future::Future, + pin::Pin, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, Weak, + }, + task::{Context, Poll}, +}; + +use futures::FutureExt; +use rand::Rng; +use rand::{rngs::SmallRng, SeedableRng}; + +use crate::{ + intern::EndpointIdInt, + metrics::{ThreadPoolMetrics, ThreadPoolWorkerId}, + scram::countmin::CountMinSketch, +}; + +use super::pbkdf2::Pbkdf2; + +pub struct ThreadPool { + runtime: Option, + pub metrics: Arc, +} + +/// How often to reset the sketch values +const SKETCH_RESET_INTERVAL: u64 = 1021; + +thread_local! { + static STATE: RefCell> = const { RefCell::new(None) }; +} + +impl ThreadPool { + pub fn new(n_workers: u8) -> Arc { + // rayon would be nice here, but yielding in rayon does not work well afaict. + + Arc::new_cyclic(|pool| { + let pool = pool.clone(); + let worker_id = AtomicUsize::new(0); + + let runtime = tokio::runtime::Builder::new_multi_thread() + .worker_threads(n_workers as usize) + .on_thread_start(move || { + STATE.with_borrow_mut(|state| { + *state = Some(ThreadRt { + pool: pool.clone(), + id: ThreadPoolWorkerId(worker_id.fetch_add(1, Ordering::Relaxed)), + rng: SmallRng::from_entropy(), + // used to determine whether we should temporarily skip tasks for fairness. + // 99% of estimates will overcount by no more than 4096 samples + countmin: CountMinSketch::with_params( + 1.0 / (SKETCH_RESET_INTERVAL as f64), + 0.01, + ), + tick: 0, + }); + }); + }) + .build() + .unwrap(); + + Self { + runtime: Some(runtime), + metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)), + } + }) + } + + pub(crate) fn spawn_job(&self, endpoint: EndpointIdInt, pbkdf2: Pbkdf2) -> JobHandle { + JobHandle( + self.runtime + .as_ref() + .unwrap() + .spawn(JobSpec { pbkdf2, endpoint }), + ) + } +} + +impl Drop for ThreadPool { + fn drop(&mut self) { + self.runtime.take().unwrap().shutdown_background(); + } +} + +struct ThreadRt { + pool: Weak, + id: ThreadPoolWorkerId, + rng: SmallRng, + countmin: CountMinSketch, + tick: u64, +} + +impl ThreadRt { + fn should_run(&mut self, job: &JobSpec) -> bool { + let rate = self + .countmin + .inc_and_return(&job.endpoint, job.pbkdf2.cost()); + + const P: f64 = 2000.0; + // probability decreases as rate increases. + // lower probability, higher chance of being skipped + // + // estimates (rate in terms of 4096 rounds): + // rate = 0 => probability = 100% + // rate = 10 => probability = 71.3% + // rate = 50 => probability = 62.1% + // rate = 500 => probability = 52.3% + // rate = 1021 => probability = 49.8% + // + // My expectation is that the pool queue will only begin backing up at ~1000rps + // in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above + // are in requests per second. 
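+        //
+        // Worked check of the table above (the sketch counts raw pbkdf2
+        // rounds, so a table rate r corresponds to r * 4096 inside the
+        // sketch): rate = 10 gives ln(2000) / ln(2000 + 10 * 4096)
+        // ≈ 7.60 / 10.67 ≈ 71.3%, and rate = 1021 gives
+        // ln(2000) / ln(2000 + 1021 * 4096) ≈ 7.60 / 15.25 ≈ 49.8%.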
+ let probability = P.ln() / (P + rate as f64).ln(); + self.rng.gen_bool(probability) + } +} + +struct JobSpec { + pbkdf2: Pbkdf2, + endpoint: EndpointIdInt, +} + +impl Future for JobSpec { + type Output = [u8; 32]; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + STATE.with_borrow_mut(|state| { + let state = state.as_mut().expect("should be set on thread startup"); + + state.tick = state.tick.wrapping_add(1); + if state.tick % SKETCH_RESET_INTERVAL == 0 { + state.countmin.reset(); + } + + if state.should_run(&self) { + if let Some(pool) = state.pool.upgrade() { + pool.metrics.worker_task_turns_total.inc(state.id); + } + + match self.pbkdf2.turn() { + Poll::Ready(result) => Poll::Ready(result), + // more to do, we shall requeue + Poll::Pending => { + cx.waker().wake_by_ref(); + Poll::Pending + } + } + } else { + if let Some(pool) = state.pool.upgrade() { + pool.metrics.worker_task_skips_total.inc(state.id); + } + + cx.waker().wake_by_ref(); + Poll::Pending + } + }) + } +} + +pub(crate) struct JobHandle(tokio::task::JoinHandle<[u8; 32]>); + +impl Future for JobHandle { + type Output = [u8; 32]; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + match self.0.poll_unpin(cx) { + Poll::Ready(Ok(ok)) => Poll::Ready(ok), + Poll::Ready(Err(err)) => std::panic::resume_unwind(err.into_panic()), + Poll::Pending => Poll::Pending, + } + } +} + +impl Drop for JobHandle { + fn drop(&mut self) { + self.0.abort(); + } +} + +#[cfg(test)] +mod tests { + use crate::EndpointId; + + use super::*; + + #[tokio::test] + async fn hash_is_correct() { + let pool = ThreadPool::new(1); + + let ep = EndpointId::from("foo"); + let ep = EndpointIdInt::from(ep); + + let salt = [0x55; 32]; + let actual = pool + .spawn_job(ep, Pbkdf2::start(b"password", &salt, 4096)) + .await; + + let expected = [ + 10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242, + 178, 43, 95, 8, 225, 182, 122, 40, 219, 21, 89, 147, 64, 140, + ]; + assert_eq!(actual, expected); + } +} diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 8af008394a..84f98cb8ad 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -2,61 +2,72 @@ //! //! Handles both SQL over HTTP and SQL over Websockets. 
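//!
//! Rough lifecycle of one connection, as implemented by `connection_startup`
//! and `connection_handler` below: read the PROXY protocol V2 header, run the
//! (optional) TLS handshake with ALPN preferring h2 over http/1.1, then serve
//! HTTP with websocket-upgrade support until shutdown or cancellation.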
+mod backend; +pub mod cancel_set; mod conn_pool; +mod http_util; +mod json; mod sql_over_http; mod websocket; +use async_trait::async_trait; +use atomic_take::AtomicTake; +use bytes::Bytes; pub use conn_pool::GlobalConnPoolOptions; -use anyhow::bail; -use hyper::StatusCode; -use metrics::IntCounterPairGuard; +use anyhow::Context; +use futures::future::{select, Either}; +use futures::TryFutureExt; +use http::{Method, Response, StatusCode}; +use http_body_util::Full; +use hyper1::body::Incoming; +use hyper_util::rt::TokioExecutor; +use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; use rand::SeedableRng; -pub use reqwest_middleware::{ClientWithMiddleware, Error}; -pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::time::timeout; +use tokio_rustls::TlsAcceptor; use tokio_util::task::TaskTracker; -use crate::config::TlsConfig; +use crate::cancellation::CancellationHandlerMain; +use crate::config::ProxyConfig; use crate::context::RequestMonitoring; -use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; -use crate::protocol2::{ProxyProtocolAccept, WithClientIp}; +use crate::metrics::Metrics; +use crate::protocol2::{read_proxy_protocol, ChainRW}; +use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; -use crate::{cancellation::CancelMap, config::ProxyConfig}; -use futures::StreamExt; -use hyper::{ - server::{ - accept, - conn::{AddrIncoming, AddrStream}, - }, - Body, Method, Request, Response, -}; +use crate::serverless::backend::PoolingBackend; +use crate::serverless::http_util::{api_error_into_response, json_response}; -use std::net::IpAddr; -use std::task::Poll; -use std::{future::ready, sync::Arc}; -use tls_listener::TlsListener; -use tokio::net::TcpListener; +use std::net::{IpAddr, SocketAddr}; +use std::pin::{pin, Pin}; +use std::sync::Arc; +use tokio::net::{TcpListener, TcpStream}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, warn, Instrument}; -use utils::http::{error::ApiError, json::json_response}; +use tracing::{error, info, warn, Instrument}; +use utils::http::error::ApiError; + +pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; pub async fn task_main( config: &'static ProxyConfig, ws_listener: TcpListener, cancellation_token: CancellationToken, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ info!("websocket server has shut down"); } - let conn_pool = conn_pool::GlobalConnPool::new(config); - - let conn_pool2 = Arc::clone(&conn_pool); - tokio::spawn(async move { - conn_pool2.gc_worker(StdRng::from_entropy()).await; - }); + let conn_pool = conn_pool::GlobalConnPool::new(&config.http_config); + { + let conn_pool = Arc::clone(&conn_pool); + tokio::spawn(async move { + conn_pool.gc_worker(StdRng::from_entropy()).await; + }); + } // shutdown the connection pool tokio::spawn({ @@ -70,142 +81,268 @@ pub async fn task_main( } }); - let tls_config = match config.tls_config.as_ref() { - Some(config) => config, - None => { - warn!("TLS config is missing, WebSocket Secure server will not be started"); - return Ok(()); - } - }; - let tls_acceptor: tokio_rustls::TlsAcceptor = tls_config.to_server_config().into(); - - let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?; - let _ = addr_incoming.set_nodelay(true); - let addr_incoming = ProxyProtocolAccept { - incoming: addr_incoming, - }; - - let ws_connections = tokio_util::task::task_tracker::TaskTracker::new(); - ws_connections.close(); // allows `ws_connections.wait to complete` - - let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| { - if let Err(err) = conn { - error!("failed to accept TLS connection for websockets: {err:?}"); - ready(false) - } else { - ready(true) - } + let backend = Arc::new(PoolingBackend { + pool: Arc::clone(&conn_pool), + config, + endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter), }); + let tls_acceptor: Arc = match config.tls_config.as_ref() { + Some(config) => { + let mut tls_server_config = rustls::ServerConfig::clone(&config.to_server_config()); + // prefer http2, but support http/1.1 + tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; + Arc::new(tls_server_config) + } + None => { + warn!("TLS config is missing"); + Arc::new(NoTls) + } + }; - let make_svc = hyper::service::make_service_fn( - |stream: &tokio_rustls::server::TlsStream>| { - let (io, tls) = stream.get_ref(); - let client_addr = io.client_addr(); - let remote_addr = io.inner.remote_addr(); - let sni_name = tls.server_name().map(|s| s.to_string()); - let conn_pool = conn_pool.clone(); - let ws_connections = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + connections.close(); // allows `connections.wait to complete` - async move { - let peer_addr = match client_addr { - Some(addr) => addr, - None if config.require_client_ip => bail!("missing required client ip"), - None => remote_addr, - }; - Ok(MetricService::new(hyper::service::service_fn( - move |req: Request| { - let sni_name = sni_name.clone(); - let conn_pool = conn_pool.clone(); - let ws_connections = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await { + let (conn, peer_addr) = res.context("could not accept TCP stream")?; + if let Err(e) = conn.set_nodelay(true) { + tracing::error!("could not set nodelay: {e}"); + continue; + } + let conn_id = uuid::Uuid::new_v4(); + let http_conn_span = tracing::info_span!("http_conn", ?conn_id); - async move { - let cancel_map = Arc::new(CancelMap::default()); - let session_id = uuid::Uuid::new_v4(); - - request_handler( - req, - config, - tls_config, - conn_pool, - ws_connections, - cancel_map, - session_id, - sni_name, - peer_addr.ip(), - 
endpoint_rate_limiter, - ) - .instrument(info_span!( - "serverless", - session = %session_id, - %peer_addr, - )) - .await - } - }, - ))) + let n_connections = Metrics::get() + .proxy + .client_connections + .sample(crate::metrics::Protocol::Http); + tracing::trace!(?n_connections, threshold = ?config.http_config.client_conn_threshold, "check"); + if n_connections > config.http_config.client_conn_threshold { + tracing::trace!("attempting to cancel a random connection"); + if let Some(token) = config.http_config.cancel_set.take() { + tracing::debug!("cancelling a random connection"); + token.cancel(); } - }, - ); + } - hyper::Server::builder(accept::from_stream(tls_listener)) - .serve(make_svc) - .with_graceful_shutdown(cancellation_token.cancelled()) - .await?; + let conn_token = cancellation_token.child_token(); + let tls_acceptor = tls_acceptor.clone(); + let backend = backend.clone(); + let connections2 = connections.clone(); + let cancellation_handler = cancellation_handler.clone(); + let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + connections.spawn( + async move { + let conn_token2 = conn_token.clone(); + let _cancel_guard = config.http_config.cancel_set.insert(conn_id, conn_token2); - // await websocket connections - ws_connections.wait().await; + let session_id = uuid::Uuid::new_v4(); + + let _gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Http); + + let startup_result = Box::pin(connection_startup( + config, + tls_acceptor, + session_id, + conn, + peer_addr, + )) + .await; + let Some((conn, peer_addr)) = startup_result else { + return; + }; + + Box::pin(connection_handler( + config, + backend, + connections2, + cancellation_handler, + endpoint_rate_limiter, + conn_token, + conn, + peer_addr, + session_id, + )) + .await; + } + .instrument(http_conn_span), + ); + } + + connections.wait().await; Ok(()) } -struct MetricService { - inner: S, - _gauge: IntCounterPairGuard, +pub(crate) trait AsyncReadWrite: AsyncRead + AsyncWrite + Send + 'static {} +impl AsyncReadWrite for T {} +pub(crate) type AsyncRW = Pin>; + +#[async_trait] +trait MaybeTlsAcceptor: Send + Sync + 'static { + async fn accept(self: Arc, conn: ChainRW) -> std::io::Result; } -impl MetricService { - fn new(inner: S) -> MetricService { - MetricService { - inner, - _gauge: NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["http"]) - .guard(), +#[async_trait] +impl MaybeTlsAcceptor for rustls::ServerConfig { + async fn accept(self: Arc, conn: ChainRW) -> std::io::Result { + Ok(Box::pin(TlsAcceptor::from(self).accept(conn).await?)) + } +} + +struct NoTls; + +#[async_trait] +impl MaybeTlsAcceptor for NoTls { + async fn accept(self: Arc, conn: ChainRW) -> std::io::Result { + Ok(Box::pin(conn)) + } +} + +/// Handles the TCP startup lifecycle. +/// 1. Parses PROXY protocol V2 +/// 2. 
Handles TLS handshake
+async fn connection_startup(
+    config: &ProxyConfig,
+    tls_acceptor: Arc<dyn MaybeTlsAcceptor>,
+    session_id: uuid::Uuid,
+    conn: TcpStream,
+    peer_addr: SocketAddr,
+) -> Option<(AsyncRW, IpAddr)> {
+    // handle PROXY protocol
+    let (conn, peer) = match read_proxy_protocol(conn).await {
+        Ok(c) => c,
+        Err(e) => {
+            tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}");
+            return None;
+        }
+    };
+
+    let peer_addr = peer.unwrap_or(peer_addr).ip();
+    let has_private_peer_addr = match peer_addr {
+        IpAddr::V4(ip) => ip.is_private(),
+        IpAddr::V6(_) => false,
+    };
+    info!(?session_id, %peer_addr, "accepted new TCP connection");
+
+    // try upgrade to TLS, but with a timeout.
+    let conn = match timeout(config.handshake_timeout, tls_acceptor.accept(conn)).await {
+        Ok(Ok(conn)) => {
+            info!(?session_id, %peer_addr, "accepted new TLS connection");
+            conn
+        }
+        // The handshake failed
+        Ok(Err(e)) => {
+            if !has_private_peer_addr {
+                Metrics::get().proxy.tls_handshake_failures.inc();
+            }
+            warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}");
+            return None;
+        }
+        // The handshake timed out
+        Err(e) => {
+            if !has_private_peer_addr {
+                Metrics::get().proxy.tls_handshake_failures.inc();
+            }
+            warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}");
+            return None;
+        }
+    };
+
+    Some((conn, peer_addr))
+}

-impl<S> hyper::service::Service<Request<Body>> for MetricService<S>
-where
-    S: hyper::service::Service<Request<Body>>,
-{
-    type Response = S::Response;
-    type Error = S::Error;
-    type Future = S::Future;
+/// Handles HTTP connection
+/// 1. With graceful shutdowns
+/// 2. With graceful request cancellation with connection failure
+/// 3. With websocket upgrade support.
+#[allow(clippy::too_many_arguments)]
+async fn connection_handler(
+    config: &'static ProxyConfig,
+    backend: Arc<PoolingBackend>,
+    connections: TaskTracker,
+    cancellation_handler: Arc<CancellationHandlerMain>,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
+    cancellation_token: CancellationToken,
+    conn: AsyncRW,
+    peer_addr: IpAddr,
+    session_id: uuid::Uuid,
+) {
+    let session_id = AtomicTake::new(session_id);

-    fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll<Result<(), Self::Error>> {
-        self.inner.poll_ready(cx)
-    }
+    // Cancel all current inflight HTTP requests if the HTTP connection is closed.
+    let http_cancellation_token = CancellationToken::new();
+    let _cancel_connection = http_cancellation_token.clone().drop_guard();

-    fn call(&mut self, req: Request<Body>) -> Self::Future {
-        self.inner.call(req)
+    let server = Builder::new(TokioExecutor::new());
+    let conn = server.serve_connection_with_upgrades(
+        hyper_util::rt::TokioIo::new(conn),
+        hyper1::service::service_fn(move |req: hyper1::Request<Incoming>| {
+            // First HTTP request shares the same session ID
+            let session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4);
+
+            // Cancel the current inflight HTTP request if the request stream is closed.
+            // This is slightly different from `_cancel_connection` in that
+            // h2 can cancel individual requests with a `RST_STREAM`.
+            let http_request_token = http_cancellation_token.child_token();
+            let cancel_request = http_request_token.clone().drop_guard();
+
+            // `request_handler` is not cancel safe. It expects to be cancelled only at specific times.
+            // By spawning the future, we ensure it never gets cancelled until it decides to.
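+            // (`cancel_request` is a DropGuard: if this service future is
+            // dropped before the spawned handler finishes, the guard fires
+            // and cancels `http_request_token`; on the happy path it is
+            // disarmed only after `handler.await` completes below.)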
+ let handler = connections.spawn( + request_handler( + req, + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + session_id, + peer_addr, + http_request_token, + endpoint_rate_limiter.clone(), + ) + .in_current_span() + .map_ok_or_else(api_error_into_response, |r| r), + ); + async move { + let res = handler.await; + cancel_request.disarm(); + res + } + }), + ); + + // On cancellation, trigger the HTTP connection handler to shut down. + let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await { + Either::Left((_cancelled, mut conn)) => { + tracing::debug!(%peer_addr, "cancelling connection"); + conn.as_mut().graceful_shutdown(); + conn.await + } + Either::Right((res, _)) => res, + }; + + match res { + Ok(()) => tracing::info!(%peer_addr, "HTTP connection closed"), + Err(e) => tracing::warn!(%peer_addr, "HTTP connection error {e}"), } } #[allow(clippy::too_many_arguments)] async fn request_handler( - mut request: Request, + mut request: hyper1::Request, config: &'static ProxyConfig, - tls: &'static TlsConfig, - conn_pool: Arc, + backend: Arc, ws_connections: TaskTracker, - cancel_map: Arc, + cancellation_handler: Arc, session_id: uuid::Uuid, - sni_hostname: Option, peer_addr: IpAddr, + // used to cancel in-flight HTTP requests. not used to cancel websockets + http_cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, -) -> Result, ApiError> { +) -> Result>, ApiError> { let host = request .headers() .get("host") @@ -214,57 +351,65 @@ async fn request_handler( .map(|s| s.to_string()); // Check if the request is a websocket upgrade request. - if hyper_tungstenite::is_upgrade_request(&request) { - info!(session_id = ?session_id, "performing websocket upgrade"); + if config.http_config.accept_websockets + && framed_websockets::upgrade::is_upgrade_request(&request) + { + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Ws, + &config.region, + ); - let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None) + let span = ctx.span(); + info!(parent: &span, "performing websocket upgrade"); + + let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request) .map_err(|e| ApiError::BadRequest(e.into()))?; ws_connections.spawn( async move { - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region); - if let Err(e) = websocket::serve_websocket( config, - &mut ctx, + ctx, websocket, - &cancel_map, - host, + cancellation_handler, endpoint_rate_limiter, + host, ) .await { - error!(session_id = ?session_id, "error in websocket connection: {e:#}"); + error!("error in websocket connection: {e:#}"); } } - .in_current_span(), + .instrument(span), ); // Return the response so the spawned future can continue. 
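+            // (handle() drives the whole SQL-over-HTTP request; the request
+            // token passed above lets a per-request cancellation, e.g. an h2
+            // stream reset, abort it mid-flight.)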
- Ok(response) - } else if request.uri().path() == "/sql" && request.method() == Method::POST { - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); + Ok(response.map(|_: http_body_util::Empty| Full::new(Bytes::new()))) + } else if request.uri().path() == "/sql" && *request.method() == Method::POST { + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Http, + &config.region, + ); + let span = ctx.span(); - sql_over_http::handle( - tls, - &config.http_config, - &mut ctx, - request, - sni_hostname, - conn_pool, - ) - .await - } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { + sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) + .instrument(span) + .await + } else if request.uri().path() == "/sql" && *request.method() == Method::OPTIONS { Response::builder() .header("Allow", "OPTIONS, POST") .header("Access-Control-Allow-Origin", "*") .header( "Access-Control-Allow-Headers", - "Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In, Neon-Batch-Read-Only, Neon-Batch-Isolation-Level", + "Authorization, Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In, Neon-Batch-Read-Only, Neon-Batch-Isolation-Level", ) .header("Access-Control-Max-Age", "86400" /* 24 hours */) .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code - .body(Body::empty()) + .body(Full::new(Bytes::new())) .map_err(|e| ApiError::InternalServerError(e.into())) } else { json_response(StatusCode::BAD_REQUEST, "query is not supported") diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs new file mode 100644 index 0000000000..f24e0478be --- /dev/null +++ b/proxy/src/serverless/backend.rs @@ -0,0 +1,300 @@ +use std::{sync::Arc, time::Duration}; + +use async_trait::async_trait; +use tracing::{field::display, info}; + +use crate::{ + auth::{ + backend::{local::StaticAuthRules, ComputeCredentials, ComputeUserInfo}, + check_peer_addr_is_in_list, AuthError, + }, + compute, + config::{AuthenticationConfig, ProxyConfig}, + console::{ + errors::{GetAuthInfoError, WakeComputeError}, + locks::ApiLocks, + provider::ApiLockError, + CachedNodeInfo, + }, + context::RequestMonitoring, + error::{ErrorKind, ReportableError, UserFacingError}, + intern::EndpointIdInt, + proxy::{ + connect_compute::ConnectMechanism, + retry::{CouldRetry, ShouldRetryWakeCompute}, + }, + rate_limiter::EndpointRateLimiter, + Host, +}; + +use super::conn_pool::{poll_client, AuthData, Client, ConnInfo, GlobalConnPool}; + +pub(crate) struct PoolingBackend { + pub(crate) pool: Arc>, + pub(crate) config: &'static ProxyConfig, + pub(crate) endpoint_rate_limiter: Arc, +} + +impl PoolingBackend { + pub(crate) async fn authenticate_with_password( + &self, + ctx: &RequestMonitoring, + config: &AuthenticationConfig, + user_info: &ComputeUserInfo, + password: &[u8], + ) -> Result { + let user_info = user_info.clone(); + let backend = self + .config + .auth_backend + .as_ref() + .map(|()| user_info.clone()); + let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + return Err(AuthError::ip_address_not_allowed(ctx.peer_addr())); + } + if !self + .endpoint_rate_limiter + .check(user_info.endpoint.clone().into(), 1) + { + return Err(AuthError::too_many_connections()); + } + let cached_secret = match 
maybe_secret { + Some(secret) => secret, + None => backend.get_role_secret(ctx).await?, + }; + + let secret = match cached_secret.value.clone() { + Some(secret) => self.config.authentication_config.check_rate_limit( + ctx, + config, + secret, + &user_info.endpoint, + true, + )?, + None => { + // If we don't have an authentication secret, for the http flow we can just return an error. + info!("authentication info not found"); + return Err(AuthError::auth_failed(&*user_info.user)); + } + }; + let ep = EndpointIdInt::from(&user_info.endpoint); + let auth_outcome = + crate::auth::validate_password_and_exchange(&config.thread_pool, ep, password, secret) + .await?; + let res = match auth_outcome { + crate::sasl::Outcome::Success(key) => { + info!("user successfully authenticated"); + Ok(key) + } + crate::sasl::Outcome::Failure(reason) => { + info!("auth backend failed with an error: {reason}"); + Err(AuthError::auth_failed(&*user_info.user)) + } + }; + res.map(|key| ComputeCredentials { + info: user_info, + keys: key, + }) + } + + pub(crate) async fn authenticate_with_jwt( + &self, + ctx: &RequestMonitoring, + user_info: &ComputeUserInfo, + jwt: &str, + ) -> Result { + match &self.config.auth_backend { + crate::auth::Backend::Console(_, ()) => { + Err(AuthError::auth_failed("JWT login is not yet supported")) + } + crate::auth::Backend::Web(_, ()) => Err(AuthError::auth_failed( + "JWT login over web auth proxy is not supported", + )), + crate::auth::Backend::Local(cache) => { + cache + .jwks_cache + .check_jwt( + ctx, + user_info.endpoint.clone(), + user_info.user.clone(), + &StaticAuthRules, + jwt, + ) + .await + .map_err(|e| AuthError::auth_failed(e.to_string()))?; + Ok(ComputeCredentials { + info: user_info.clone(), + keys: crate::auth::backend::ComputeCredentialKeys::None, + }) + } + } + } + + // Wake up the destination if needed. Code here is a bit involved because + // we reuse the code from the usual proxy and we need to prepare a few structures + // that this code expects. + #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + pub(crate) async fn connect_to_compute( + &self, + ctx: &RequestMonitoring, + conn_info: ConnInfo, + keys: ComputeCredentials, + force_new: bool, + ) -> Result, HttpConnError> { + let maybe_client = if force_new { + info!("pool: pool is disabled"); + None + } else { + info!("pool: looking for an existing connection"); + self.pool.get(ctx, &conn_info)?
+ }; + + if let Some(client) = maybe_client { + return Ok(client); + } + let conn_id = uuid::Uuid::new_v4(); + tracing::Span::current().record("conn_id", display(conn_id)); + info!(%conn_id, "pool: opening a new connection '{conn_info}'"); + let backend = self.config.auth_backend.as_ref().map(|()| keys); + crate::proxy::connect_compute::connect_to_compute( + ctx, + &TokioMechanism { + conn_id, + conn_info, + pool: self.pool.clone(), + locks: &self.config.connect_compute_locks, + }, + &backend, + false, // do not allow self signed compute for http flow + self.config.wake_compute_retry_config, + self.config.connect_to_compute_retry_config, + ) + .await + } +} + +#[derive(Debug, thiserror::Error)] +pub(crate) enum HttpConnError { + #[error("pooled connection closed in an inconsistent state")] + ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError), + #[error("could not connect to compute")] + ConnectionError(#[from] tokio_postgres::Error), + + #[error("could not get auth info")] + GetAuthInfo(#[from] GetAuthInfoError), + #[error("user not authenticated")] + AuthError(#[from] AuthError), + #[error("wake_compute returned error")] + WakeCompute(#[from] WakeComputeError), + #[error("error acquiring resource permit: {0}")] + TooManyConnectionAttempts(#[from] ApiLockError), +} + +impl ReportableError for HttpConnError { + fn get_error_kind(&self) -> ErrorKind { + match self { + HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute, + HttpConnError::ConnectionError(p) => p.get_error_kind(), + HttpConnError::GetAuthInfo(a) => a.get_error_kind(), + HttpConnError::AuthError(a) => a.get_error_kind(), + HttpConnError::WakeCompute(w) => w.get_error_kind(), + HttpConnError::TooManyConnectionAttempts(w) => w.get_error_kind(), + } + } +} + +impl UserFacingError for HttpConnError { + fn to_string_client(&self) -> String { + match self { + HttpConnError::ConnectionClosedAbruptly(_) => self.to_string(), + HttpConnError::ConnectionError(p) => p.to_string(), + HttpConnError::GetAuthInfo(c) => c.to_string_client(), + HttpConnError::AuthError(c) => c.to_string_client(), + HttpConnError::WakeCompute(c) => c.to_string_client(), + HttpConnError::TooManyConnectionAttempts(_) => { + "Failed to acquire permit to connect to the database.
Too many database connection attempts are currently ongoing.".to_owned() + } + } + } +} + +impl CouldRetry for HttpConnError { + fn could_retry(&self) -> bool { + match self { + HttpConnError::ConnectionError(e) => e.could_retry(), + HttpConnError::ConnectionClosedAbruptly(_) => false, + HttpConnError::GetAuthInfo(_) => false, + HttpConnError::AuthError(_) => false, + HttpConnError::WakeCompute(_) => false, + HttpConnError::TooManyConnectionAttempts(_) => false, + } + } +} +impl ShouldRetryWakeCompute for HttpConnError { + fn should_retry_wake_compute(&self) -> bool { + match self { + HttpConnError::ConnectionError(e) => e.should_retry_wake_compute(), + // we never checked cache validity + HttpConnError::TooManyConnectionAttempts(_) => false, + _ => true, + } + } +} + +struct TokioMechanism { + pool: Arc>, + conn_info: ConnInfo, + conn_id: uuid::Uuid, + + /// connect_to_compute concurrency lock + locks: &'static ApiLocks, +} + +#[async_trait] +impl ConnectMechanism for TokioMechanism { + type Connection = Client; + type ConnectError = HttpConnError; + type Error = HttpConnError; + + async fn connect_once( + &self, + ctx: &RequestMonitoring, + node_info: &CachedNodeInfo, + timeout: Duration, + ) -> Result { + let host = node_info.config.get_host()?; + let permit = self.locks.get_permit(&host).await?; + + let mut config = (*node_info.config).clone(); + let config = config + .user(&self.conn_info.user_info.user) + .dbname(&self.conn_info.dbname) + .connect_timeout(timeout); + + match &self.conn_info.auth { + AuthData::Jwt(_) => {} + AuthData::Password(pw) => { + config.password(pw); + } + } + + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); + let res = config.connect(tokio_postgres::NoTls).await; + drop(pause); + let (client, connection) = permit.release_result(res)?; + + tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); + Ok(poll_client( + self.pool.clone(), + ctx, + self.conn_info.clone(), + client, + connection, + self.conn_id, + node_info.aux.clone(), + )) + } + + fn update_connect_config(&self, _config: &mut compute::ConnCfg) {} +} diff --git a/proxy/src/serverless/cancel_set.rs b/proxy/src/serverless/cancel_set.rs new file mode 100644 index 0000000000..7659745473 --- /dev/null +++ b/proxy/src/serverless/cancel_set.rs @@ -0,0 +1,102 @@ +//! 
A set for cancelling random http connections + +use std::{ + hash::{BuildHasher, BuildHasherDefault}, + num::NonZeroUsize, + time::Duration, +}; + +use indexmap::IndexMap; +use parking_lot::Mutex; +use rand::{thread_rng, Rng}; +use rustc_hash::FxHasher; +use tokio::time::Instant; +use tokio_util::sync::CancellationToken; +use uuid::Uuid; + +type Hasher = BuildHasherDefault; + +pub struct CancelSet { + shards: Box<[Mutex]>, + // keyed by random uuid, fxhasher is fine + hasher: Hasher, +} + +pub(crate) struct CancelShard { + tokens: IndexMap, +} + +impl CancelSet { + pub fn new(shards: usize) -> Self { + CancelSet { + shards: (0..shards) + .map(|_| { + Mutex::new(CancelShard { + tokens: IndexMap::with_hasher(Hasher::default()), + }) + }) + .collect(), + hasher: Hasher::default(), + } + } + + pub(crate) fn take(&self) -> Option { + for _ in 0..4 { + if let Some(token) = self.take_raw(thread_rng().gen()) { + return Some(token); + } + tracing::trace!("failed to get cancel token"); + } + None + } + + pub(crate) fn take_raw(&self, rng: usize) -> Option { + NonZeroUsize::new(self.shards.len()) + .and_then(|len| self.shards[rng % len].lock().take(rng / len)) + } + + pub(crate) fn insert(&self, id: uuid::Uuid, token: CancellationToken) -> CancelGuard<'_> { + let shard = NonZeroUsize::new(self.shards.len()).map(|len| { + let hash = self.hasher.hash_one(id) as usize; + let shard = &self.shards[hash % len]; + shard.lock().insert(id, token); + shard + }); + CancelGuard { shard, id } + } +} + +impl CancelShard { + fn take(&mut self, rng: usize) -> Option { + NonZeroUsize::new(self.tokens.len()).and_then(|len| { + // 10 second grace period so we don't cancel new connections + if self.tokens.get_index(rng % len)?.1 .0.elapsed() < Duration::from_secs(10) { + return None; + } + + let (_key, (_insert, token)) = self.tokens.swap_remove_index(rng % len)?; + Some(token) + }) + } + + fn remove(&mut self, id: uuid::Uuid) { + self.tokens.swap_remove(&id); + } + + fn insert(&mut self, id: uuid::Uuid, token: CancellationToken) { + self.tokens.insert(id, (Instant::now(), token)); + } +} + +pub(crate) struct CancelGuard<'a> { + shard: Option<&'a Mutex>, + id: Uuid, +} + +impl Drop for CancelGuard<'_> { + fn drop(&mut self) { + if let Some(shard) = self.shard { + shard.lock().remove(self.id); + } + } +} diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 5a7279ae63..bea599e9b9 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,17 +1,8 @@ -use anyhow::Context; -use async_trait::async_trait; use dashmap::DashMap; use futures::{future::poll_fn, Future}; -use metrics::{register_int_counter_pair, IntCounterPair, IntCounterPairGuard}; -use once_cell::sync::Lazy; use parking_lot::RwLock; -use pbkdf2::{ - password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString}, - Params, Pbkdf2, -}; -use prometheus::{exponential_buckets, register_histogram, Histogram}; use rand::Rng; -use smol_str::SmolStr; +use smallvec::SmallVec; use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; use std::{ fmt, @@ -21,40 +12,49 @@ use std::{ ops::Deref, sync::atomic::{self, AtomicUsize}, }; -use tokio::time::{self, Instant}; -use tokio_postgres::{AsyncMessage, ReadyForQueryStatus}; +use tokio::time::Instant; +use tokio_postgres::tls::NoTlsStream; +use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; +use tokio_util::sync::CancellationToken; +use crate::console::messages::{ColdStartInfo, MetricsAuxInfo}; +use 
crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{ - auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list}, - console::{self, messages::MetricsAuxInfo}, - context::RequestMonitoring, - metrics::NUM_DB_CONNECTIONS_GAUGE, - proxy::connect_compute::ConnectMechanism, - usage_metrics::{Ids, MetricCounter, USAGE_METRICS}, - DbName, EndpointCacheKey, RoleName, + auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, }; -use crate::{compute, config}; use tracing::{debug, error, warn, Span}; use tracing::{info, info_span, Instrument}; -pub const APP_NAME: SmolStr = SmolStr::new_inline("/sql_over_http"); +use super::backend::HttpConnError; #[derive(Debug, Clone)] -pub struct ConnInfo { - pub user_info: ComputeUserInfo, - pub dbname: DbName, - pub password: SmolStr, +pub(crate) struct ConnInfo { + pub(crate) user_info: ComputeUserInfo, + pub(crate) dbname: DbName, + pub(crate) auth: AuthData, +} + +#[derive(Debug, Clone)] +pub(crate) enum AuthData { + Password(SmallVec<[u8; 16]>), + Jwt(String), } impl ConnInfo { // hm, change to hasher to avoid cloning? - pub fn db_and_user(&self) -> (DbName, RoleName) { + pub(crate) fn db_and_user(&self) -> (DbName, RoleName) { (self.dbname.clone(), self.user_info.user.clone()) } - pub fn endpoint_cache_key(&self) -> EndpointCacheKey { - self.user_info.endpoint_cache_key() + pub(crate) fn endpoint_cache_key(&self) -> Option { + // We don't want to cache http connections for ephemeral endpoints. + if self.user_info.options.is_ephemeral() { + None + } else { + Some(self.user_info.endpoint_cache_key()) + } } } @@ -72,39 +72,55 @@ impl fmt::Display for ConnInfo { } } -struct ConnPoolEntry { - conn: ClientInner, +struct ConnPoolEntry { + conn: ClientInner, _last_access: std::time::Instant, } // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool // Number of open connections is limited by the `max_conns_per_endpoint`. -pub struct EndpointConnPool { - pools: HashMap<(DbName, RoleName), DbUserConnPool>, +pub(crate) struct EndpointConnPool { + pools: HashMap<(DbName, RoleName), DbUserConnPool>, total_conns: usize, max_conns: usize, - _guard: IntCounterPairGuard, + _guard: HttpEndpointPoolsGuard<'static>, + global_connections_count: Arc, + global_pool_size_max_conns: usize, } -impl EndpointConnPool { - fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option { +impl EndpointConnPool { + fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option> { let Self { - pools, total_conns, .. + pools, + total_conns, + global_connections_count, + .. } = self; - pools - .get_mut(&db_user) - .and_then(|pool_entries| pool_entries.get_conn_entry(total_conns)) + pools.get_mut(&db_user).and_then(|pool_entries| { + pool_entries.get_conn_entry(total_conns, global_connections_count.clone()) + }) } fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool { let Self { - pools, total_conns, .. + pools, + total_conns, + global_connections_count, + .. 
} = self; if let Some(pool) = pools.get_mut(&db_user) { let old_len = pool.conns.len(); pool.conns.retain(|conn| conn.conn.conn_id != conn_id); let new_len = pool.conns.len(); let removed = old_len - new_len; + if removed > 0 { + global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); + } *total_conns -= removed; removed > 0 } else { @@ -112,12 +128,22 @@ impl EndpointConnPool { } } - fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> { + fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInner) { let conn_id = client.conn_id; - if client.inner.is_closed() { + if client.is_closed() { info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); - return Ok(()); + return; + } + let global_max_conn = pool.read().global_pool_size_max_conns; + if pool + .read() + .global_connections_count + .load(atomic::Ordering::Relaxed) + >= global_max_conn + { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full"); + return; } // return connection to the pool @@ -127,18 +153,23 @@ impl EndpointConnPool { let mut pool = pool.write(); if pool.total_conns < pool.max_conns { - // we create this db-user entry in get, so it should not be None - if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) { - pool_entries.conns.push(ConnPoolEntry { - conn: client, - _last_access: std::time::Instant::now(), - }); + let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default(); + pool_entries.conns.push(ConnPoolEntry { + conn: client, + _last_access: std::time::Instant::now(), + }); - returned = true; - per_db_size = pool_entries.conns.len(); + returned = true; + per_db_size = pool_entries.conns.len(); - pool.total_conns += 1; - } + pool.total_conns += 1; + pool.global_connections_count + .fetch_add(1, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .inc(); } pool.total_conns @@ -150,54 +181,72 @@ impl EndpointConnPool { } else { info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); } - - Ok(()) } } -/// 4096 is the number of rounds that SCRAM-SHA-256 recommends. -/// It's not the 600,000 that OWASP recommends... but our passwords are high entropy anyway. -/// -/// Still takes 1.4ms to hash on my hardware. 
-/// We don't want to ruin the latency improvements of using the pool by making password verification take too long -const PARAMS: Params = Params { - rounds: 4096, - output_length: 32, -}; - -#[derive(Default)] -pub struct DbUserConnPool { - conns: Vec, - password_hash: Option, +impl Drop for EndpointConnPool { + fn drop(&mut self) { + if self.total_conns > 0 { + self.global_connections_count + .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(self.total_conns as i64); + } + } } -impl DbUserConnPool { - fn clear_closed_clients(&mut self, conns: &mut usize) { +pub(crate) struct DbUserConnPool { + conns: Vec>, +} + +impl Default for DbUserConnPool { + fn default() -> Self { + Self { conns: Vec::new() } + } +} + +impl DbUserConnPool { + fn clear_closed_clients(&mut self, conns: &mut usize) -> usize { let old_len = self.conns.len(); - self.conns.retain(|conn| !conn.conn.inner.is_closed()); + self.conns.retain(|conn| !conn.conn.is_closed()); let new_len = self.conns.len(); let removed = old_len - new_len; *conns -= removed; + removed } - fn get_conn_entry(&mut self, conns: &mut usize) -> Option { - self.clear_closed_clients(conns); + fn get_conn_entry( + &mut self, + conns: &mut usize, + global_connections_count: Arc, + ) -> Option> { + let mut removed = self.clear_closed_clients(conns); let conn = self.conns.pop(); if conn.is_some() { *conns -= 1; + removed += 1; } + global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); conn } } -pub struct GlobalConnPool { +pub(crate) struct GlobalConnPool { // endpoint -> per-endpoint connection pool // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. - global_pool: DashMap>>, + global_pool: DashMap>>>, /// Number of endpoint-connection pools /// @@ -206,7 +255,10 @@ pub struct GlobalConnPool { /// It's only used for diagnostics. global_pool_size: AtomicUsize, - proxy_config: &'static crate::config::ProxyConfig, + /// Total number of connections in the pool + global_connections_count: Arc, + + config: &'static crate::config::HttpConfig, } #[derive(Debug, Clone, Copy)] @@ -224,45 +276,39 @@ pub struct GlobalConnPoolOptions { pub idle_timeout: Duration, pub opt_in: bool, + + // Total number of connections in the pool. 
+ pub max_total_conns: usize, } -pub static GC_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_http_pool_reclaimation_lag_seconds", - "Time it takes to reclaim unused connection pools", - // 1us -> 65ms - exponential_buckets(1e-6, 2.0, 16).unwrap(), - ) - .unwrap() -}); - -pub static ENDPOINT_POOLS: Lazy = Lazy::new(|| { - register_int_counter_pair!( - "proxy_http_pool_endpoints_registered_total", - "Number of endpoints we have registered pools for", - "proxy_http_pool_endpoints_unregistered_total", - "Number of endpoints we have unregistered pools for", - ) - .unwrap() -}); - -impl GlobalConnPool { - pub fn new(config: &'static crate::config::ProxyConfig) -> Arc { - let shards = config.http_config.pool_options.pool_shards; +impl GlobalConnPool { + pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { + let shards = config.pool_options.pool_shards; Arc::new(Self { global_pool: DashMap::with_shard_amount(shards), global_pool_size: AtomicUsize::new(0), - proxy_config: config, + config, + global_connections_count: Arc::new(AtomicUsize::new(0)), }) } - pub fn shutdown(&self) { + #[cfg(test)] + pub(crate) fn get_global_connections_count(&self) -> usize { + self.global_connections_count + .load(atomic::Ordering::Relaxed) + } + + pub(crate) fn get_idle_timeout(&self) -> Duration { + self.config.pool_options.idle_timeout + } + + pub(crate) fn shutdown(&self) { // drops all strong references to endpoint-pools self.global_pool.clear(); } - pub async fn gc_worker(&self, mut rng: impl Rng) { - let epoch = self.proxy_config.http_config.pool_options.gc_epoch; + pub(crate) async fn gc_worker(&self, mut rng: impl Rng) { + let epoch = self.config.pool_options.gc_epoch; let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); loop { interval.tick().await; @@ -278,8 +324,12 @@ impl GlobalConnPool { // acquire a random shard lock let mut shard = self.global_pool.shards()[shard].write(); - let timer = GC_LATENCY.start_timer(); + let timer = Metrics::get() + .proxy + .http_pool_reclaimation_lag_seconds + .start_timer(); let current_len = shard.len(); + let mut clients_removed = 0; shard.retain(|endpoint, x| { // if the current endpoint pool is unique (no other strong or weak references) // then it is currently not in use by any connections. @@ -289,9 +339,9 @@ impl GlobalConnPool { } = pool.get_mut(); // ensure that closed clients are removed - pools - .iter_mut() - .for_each(|(_, db_pool)| db_pool.clear_closed_clients(total_conns)); + for db_pool in pools.values_mut() { + clients_removed += db_pool.clear_closed_clients(total_conns); + } // we only remove this pool if it has no active connections if *total_conns == 0 { @@ -302,10 +352,24 @@ impl GlobalConnPool { true }); + let new_len = shard.len(); drop(shard); - timer.observe_duration(); + timer.observe(); + // Do logging outside of the lock. + if clients_removed > 0 { + let size = self + .global_connections_count + .fetch_sub(clients_removed, atomic::Ordering::Relaxed) + - clients_removed; + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(clients_removed as i64); + info!("pool: performed global pool gc. 
removed {clients_removed} clients, total number of clients in pool is {size}"); + } let removed = current_len - new_len; if removed > 0 { @@ -317,134 +381,52 @@ impl GlobalConnPool { } } - pub async fn get( + pub(crate) fn get( self: &Arc, - ctx: &mut RequestMonitoring, - conn_info: ConnInfo, - force_new: bool, - ) -> anyhow::Result { - let mut client: Option = None; + ctx: &RequestMonitoring, + conn_info: &ConnInfo, + ) -> Result>, HttpConnError> { + let mut client: Option> = None; + let Some(endpoint) = conn_info.endpoint_cache_key() else { + return Ok(None); + }; - let mut hash_valid = false; - let mut endpoint_pool = Weak::new(); - if !force_new { - let pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()); - endpoint_pool = Arc::downgrade(&pool); - let mut hash = None; - - // find a pool entry by (dbname, username) if exists - { - let pool = pool.read(); - if let Some(pool_entries) = pool.pools.get(&conn_info.db_and_user()) { - if !pool_entries.conns.is_empty() { - hash = pool_entries.password_hash.clone(); - } - } - } - - // a connection exists in the pool, verify the password hash - if let Some(hash) = hash { - let pw = conn_info.password.clone(); - let validate = tokio::task::spawn_blocking(move || { - Pbkdf2.verify_password(pw.as_bytes(), &hash.password_hash()) - }) - .await?; - - // if the hash is invalid, don't error - // we will continue with the regular connection flow - if validate.is_ok() { - hash_valid = true; - if let Some(entry) = pool.write().get_conn_entry(conn_info.db_and_user()) { - client = Some(entry.conn) - } - } - } + let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); + if let Some(entry) = endpoint_pool + .write() + .get_conn_entry(conn_info.db_and_user()) + { + client = Some(entry.conn); } + let endpoint_pool = Arc::downgrade(&endpoint_pool); // ok return cached connection if found and establish a new one otherwise - let new_client = if let Some(client) = client { - ctx.set_project(client.aux.clone()); - if client.inner.is_closed() { - let conn_id = uuid::Uuid::new_v4(); - info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one"); - connect_to_compute( - self.proxy_config, - ctx, - &conn_info, - conn_id, - endpoint_pool.clone(), - ) - .await - } else { - info!("pool: reusing connection '{conn_info}'"); - client.session.send(ctx.session_id)?; - tracing::Span::current().record( - "pid", - &tracing::field::display(client.inner.get_process_id()), - ); - ctx.latency_timer.pool_hit(); - ctx.latency_timer.success(); - return Ok(Client::new(client, conn_info, endpoint_pool).await); + if let Some(client) = client { + if client.is_closed() { + info!("pool: cached connection '{conn_info}' is closed, opening a new one"); + return Ok(None); } - } else { - let conn_id = uuid::Uuid::new_v4(); - info!(%conn_id, "pool: opening a new connection '{conn_info}'"); - connect_to_compute( - self.proxy_config, - ctx, - &conn_info, - conn_id, - endpoint_pool.clone(), - ) - .await - }; - if let Ok(client) = &new_client { + tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); tracing::Span::current().record( "pid", - &tracing::field::display(client.inner.get_process_id()), + tracing::field::display(client.inner.get_process_id()), ); + info!( + cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), + "pool: reusing connection '{conn_info}'" + ); + client.session.send(ctx.session_id())?; + ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); + ctx.success(); + return Ok(Some(Client::new(client, 
conn_info.clone(), endpoint_pool))); } - - match &new_client { - // clear the hash. it's no longer valid - // TODO: update tokio-postgres fork to allow access to this error kind directly - Err(err) - if hash_valid && err.to_string().contains("password authentication failed") => - { - let pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()); - let mut pool = pool.write(); - if let Some(entry) = pool.pools.get_mut(&conn_info.db_and_user()) { - entry.password_hash = None; - } - } - // new password is valid and we should insert/update it - Ok(_) if !force_new && !hash_valid => { - let pw = conn_info.password.clone(); - let new_hash = tokio::task::spawn_blocking(move || { - let salt = SaltString::generate(rand::rngs::OsRng); - Pbkdf2 - .hash_password_customized(pw.as_bytes(), None, None, PARAMS, &salt) - .map(|s| s.serialize()) - }) - .await??; - - let pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()); - let mut pool = pool.write(); - pool.pools - .entry(conn_info.db_and_user()) - .or_default() - .password_hash = Some(new_hash); - } - _ => {} - } - let new_client = new_client?; - Ok(Client::new(new_client, conn_info, endpoint_pool).await) + Ok(None) } fn get_or_create_endpoint_pool( - &self, + self: &Arc, endpoint: &EndpointCacheKey, - ) -> Arc> { + ) -> Arc>> { // fast path if let Some(pool) = self.global_pool.get(endpoint) { return pool.clone(); @@ -454,12 +436,10 @@ impl GlobalConnPool { let new_pool = Arc::new(RwLock::new(EndpointConnPool { pools: HashMap::new(), total_conns: 0, - max_conns: self - .proxy_config - .http_config - .pool_options - .max_conns_per_endpoint, - _guard: ENDPOINT_POOLS.guard(), + max_conns: self.config.pool_options.max_conns_per_endpoint, + _guard: Metrics::get().proxy.http_endpoint_pools.guard(), + global_connections_count: self.global_connections_count.clone(), + global_pool_size_max_conns: self.config.pool_options.max_total_conns, })); // find or create a pool for this endpoint @@ -488,279 +468,218 @@ impl GlobalConnPool { } } -struct TokioMechanism<'a> { - pool: Weak>, - conn_info: &'a ConnInfo, +pub(crate) fn poll_client( + global_pool: Arc>, + ctx: &RequestMonitoring, + conn_info: ConnInfo, + client: C, + mut connection: tokio_postgres::Connection, conn_id: uuid::Uuid, - idle: Duration, -} - -#[async_trait] -impl ConnectMechanism for TokioMechanism<'_> { - type Connection = ClientInner; - type ConnectError = tokio_postgres::Error; - type Error = anyhow::Error; - - async fn connect_once( - &self, - ctx: &mut RequestMonitoring, - node_info: &console::CachedNodeInfo, - timeout: time::Duration, - ) -> Result { - connect_to_compute_once( - ctx, - node_info, - self.conn_info, - timeout, - self.conn_id, - self.pool.clone(), - self.idle, - ) - .await - } - - fn update_connect_config(&self, _config: &mut compute::ConnCfg) {} -} - -// Wake up the destination if needed. Code here is a bit involved because -// we reuse the code from the usual proxy and we need to prepare few structures -// that this code expects. 
-#[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] -async fn connect_to_compute( - config: &config::ProxyConfig, - ctx: &mut RequestMonitoring, - conn_info: &ConnInfo, - conn_id: uuid::Uuid, - pool: Weak>, -) -> anyhow::Result { - ctx.set_application(Some(APP_NAME)); - let backend = config - .auth_backend - .as_ref() - .map(|_| conn_info.user_info.clone()); - - if !config.disable_ip_check_for_http { - let allowed_ips = backend.get_allowed_ips(ctx).await?; - if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { - return Err(auth::AuthError::ip_address_not_allowed().into()); - } - } - let node_info = backend - .wake_compute(ctx) - .await? - .context("missing cache entry from wake_compute")?; - - ctx.set_project(node_info.aux.clone()); - - crate::proxy::connect_compute::connect_to_compute( - ctx, - &TokioMechanism { - conn_id, - conn_info, - pool, - idle: config.http_config.pool_options.idle_timeout, - }, - node_info, - &backend, - ) - .await -} - -async fn connect_to_compute_once( - ctx: &mut RequestMonitoring, - node_info: &console::CachedNodeInfo, - conn_info: &ConnInfo, - timeout: time::Duration, - conn_id: uuid::Uuid, - pool: Weak>, - idle: Duration, -) -> Result { - let mut config = (*node_info.config).clone(); - let mut session = ctx.session_id; - - let (client, mut connection) = config - .user(&conn_info.user_info.user) - .password(&*conn_info.password) - .dbname(&conn_info.dbname) - .connect_timeout(timeout) - .connect(tokio_postgres::NoTls) - .await?; - - let conn_gauge = NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(); - - tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); - - let (tx, mut rx) = tokio::sync::watch::channel(session); + aux: MetricsAuxInfo, +) -> Client { + let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol()); + let mut session_id = ctx.session_id(); + let (tx, mut rx) = tokio::sync::watch::channel(session_id); let span = info_span!(parent: None, "connection", %conn_id); + let cold_start_info = ctx.cold_start_info(); span.in_scope(|| { - info!(%conn_info, %session, "new connection"); + info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection"); }); + let pool = match conn_info.endpoint_cache_key() { + Some(endpoint) => Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)), + None => Weak::new(), + }; + let pool_clone = pool.clone(); let db_user = conn_info.db_and_user(); + let idle = global_pool.get_idle_timeout(); + let cancel = CancellationToken::new(); + let cancelled = cancel.clone().cancelled_owned(); + tokio::spawn( - async move { - let _conn_gauge = conn_gauge; - let mut idle_timeout = pin!(tokio::time::sleep(idle)); - poll_fn(move |cx| { - if matches!(rx.has_changed(), Ok(true)) { - session = *rx.borrow_and_update(); - info!(%session, "changed session"); + async move { + let _conn_gauge = conn_gauge; + let mut idle_timeout = pin!(tokio::time::sleep(idle)); + let mut cancelled = pin!(cancelled); + + poll_fn(move |cx| { + if cancelled.as_mut().poll(cx).is_ready() { + info!("connection dropped"); + return Poll::Ready(()) + } + + match rx.has_changed() { + Ok(true) => { + session_id = *rx.borrow_and_update(); + info!(%session_id, "changed session"); idle_timeout.as_mut().reset(Instant::now() + idle); } - - // 5 minute idle connection timeout - if idle_timeout.as_mut().poll(cx).is_ready() { - idle_timeout.as_mut().reset(Instant::now() + idle); - info!("connection idle"); - if let Some(pool) = 
pool.clone().upgrade() { - // remove client from pool - should close the connection if it's idle. - // does nothing if the client is currently checked-out and in-use - if pool.write().remove_client(db_user.clone(), conn_id) { - info!("idle connection removed"); - } - } + Err(_) => { + info!("connection dropped"); + return Poll::Ready(()) } + _ => {} + } - loop { - let message = ready!(connection.poll_message(cx)); - - match message { - Some(Ok(AsyncMessage::Notice(notice))) => { - info!(%session, "notice: {}", notice); - } - Some(Ok(AsyncMessage::Notification(notif))) => { - warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received"); - } - Some(Ok(_)) => { - warn!(%session, "unknown message"); - } - Some(Err(e)) => { - error!(%session, "connection error: {}", e); - break - } - None => { - info!("connection closed"); - break - } - } - } - - // remove from connection pool + // 5 minute idle connection timeout + if idle_timeout.as_mut().poll(cx).is_ready() { + idle_timeout.as_mut().reset(Instant::now() + idle); + info!("connection idle"); if let Some(pool) = pool.clone().upgrade() { + // remove client from pool - should close the connection if it's idle. + // does nothing if the client is currently checked-out and in-use if pool.write().remove_client(db_user.clone(), conn_id) { - info!("closed connection removed"); + info!("idle connection removed"); } } + } - Poll::Ready(()) - }).await; + loop { + let message = ready!(connection.poll_message(cx)); - } - .instrument(span) - ); + match message { + Some(Ok(AsyncMessage::Notice(notice))) => { + info!(%session_id, "notice: {}", notice); + } + Some(Ok(AsyncMessage::Notification(notif))) => { + warn!(%session_id, pid = notif.process_id(), channel = notif.channel(), "notification received"); + } + Some(Ok(_)) => { + warn!(%session_id, "unknown message"); + } + Some(Err(e)) => { + error!(%session_id, "connection error: {}", e); + break + } + None => { + info!("connection closed"); + break + } + } + } - Ok(ClientInner { + // remove from connection pool + if let Some(pool) = pool.clone().upgrade() { + if pool.write().remove_client(db_user.clone(), conn_id) { + info!("closed connection removed"); + } + } + + Poll::Ready(()) + }).await; + + } + .instrument(span)); + let inner = ClientInner { inner: client, session: tx, - aux: node_info.aux.clone(), + cancel, + aux, conn_id, - }) + }; + Client::new(inner, conn_info, pool_clone) } -struct ClientInner { - inner: tokio_postgres::Client, +struct ClientInner { + inner: C, session: tokio::sync::watch::Sender, + cancel: CancellationToken, aux: MetricsAuxInfo, conn_id: uuid::Uuid, } -impl Client { - pub fn metrics(&self) -> Arc { +impl Drop for ClientInner { + fn drop(&mut self) { + // on client drop, tell the conn to shut down + self.cancel.cancel(); + } +} + +pub(crate) trait ClientInnerExt: Sync + Send + 'static { + fn is_closed(&self) -> bool; + fn get_process_id(&self) -> i32; +} + +impl ClientInnerExt for tokio_postgres::Client { + fn is_closed(&self) -> bool { + self.is_closed() + } + fn get_process_id(&self) -> i32 { + self.get_process_id() + } +} + +impl ClientInner { + pub(crate) fn is_closed(&self) -> bool { + self.inner.is_closed() + } +} + +impl Client { + pub(crate) fn metrics(&self) -> Arc { let aux = &self.inner.as_ref().unwrap().aux; USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id.clone(), - branch_id: aux.branch_id.clone(), + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, }) } } -pub struct Client { - conn_id: uuid::Uuid, +pub(crate) 
struct Client { span: Span, - inner: Option, + inner: Option>, conn_info: ConnInfo, - pool: Weak>, + pool: Weak>>, } -pub struct Discard<'a> { - conn_id: uuid::Uuid, +pub(crate) struct Discard<'a, C: ClientInnerExt> { conn_info: &'a ConnInfo, - pool: &'a mut Weak>, + pool: &'a mut Weak>>, } -impl Client { - pub(self) async fn new( - inner: ClientInner, +impl Client { + pub(self) fn new( + inner: ClientInner, conn_info: ConnInfo, - pool: Weak>, + pool: Weak>>, ) -> Self { Self { - conn_id: inner.conn_id, inner: Some(inner), span: Span::current(), conn_info, pool, } } - pub fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) { + pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) { let Self { inner, pool, - conn_id, conn_info, span: _, } = self; - ( - &mut inner - .as_mut() - .expect("client inner should not be removed") - .inner, - Discard { - pool, - conn_info, - conn_id: *conn_id, - }, - ) - } - - pub fn check_idle(&mut self, status: ReadyForQueryStatus) { - self.inner().1.check_idle(status) - } - pub fn discard(&mut self) { - self.inner().1.discard() + let inner = inner.as_mut().expect("client inner should not be removed"); + (&mut inner.inner, Discard { conn_info, pool }) } } -impl Discard<'_> { - pub fn check_idle(&mut self, status: ReadyForQueryStatus) { +impl Discard<'_, C> { + pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { let conn_info = &self.conn_info; if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { - info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle") + info!("pool: throwing away connection '{conn_info}' because connection is not idle"); } } - pub fn discard(&mut self) { + pub(crate) fn discard(&mut self) { let conn_info = &self.conn_info; if std::mem::take(self.pool).strong_count() > 0 { - info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state") + info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); } } } -impl Deref for Client { - type Target = tokio_postgres::Client; +impl Deref for Client { + type Target = C; fn deref(&self) -> &Self::Target { &self @@ -771,8 +690,8 @@ impl Deref for Client { } } -impl Drop for Client { - fn drop(&mut self) { +impl Client { + fn do_drop(&mut self) -> Option { let conn_info = self.conn_info.clone(); let client = self .inner @@ -781,10 +700,174 @@ impl Drop for Client { if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { let current_span = self.span.clone(); // return connection to the pool - tokio::task::spawn_blocking(move || { + return Some(move || { let _span = current_span.enter(); - let _ = EndpointConnPool::put(&conn_pool, &conn_info, client); + EndpointConnPool::put(&conn_pool, &conn_info, client); }); } + None + } +} + +impl Drop for Client { + fn drop(&mut self) { + if let Some(drop) = self.do_drop() { + tokio::task::spawn_blocking(drop); + } + } +} + +#[cfg(test)] +mod tests { + use std::{mem, sync::atomic::AtomicBool}; + + use crate::{ + proxy::NeonOptions, serverless::cancel_set::CancelSet, BranchId, EndpointId, ProjectId, + }; + + use super::*; + + struct MockClient(Arc); + impl MockClient { + fn new(is_closed: bool) -> Self { + MockClient(Arc::new(is_closed.into())) + } + } + impl ClientInnerExt for MockClient { + fn is_closed(&self) -> bool { + self.0.load(atomic::Ordering::Relaxed) + } + fn get_process_id(&self) -> i32 { + 0 + } + } + + fn 
create_inner() -> ClientInner { + create_inner_with(MockClient::new(false)) + } + + fn create_inner_with(client: MockClient) -> ClientInner { + ClientInner { + inner: client, + session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), + cancel: CancellationToken::new(), + aux: MetricsAuxInfo { + endpoint_id: (&EndpointId::from("endpoint")).into(), + project_id: (&ProjectId::from("project")).into(), + branch_id: (&BranchId::from("branch")).into(), + cold_start_info: crate::console::messages::ColdStartInfo::Warm, + }, + conn_id: uuid::Uuid::new_v4(), + } + } + + #[tokio::test] + async fn test_pool() { + let _ = env_logger::try_init(); + let config = Box::leak(Box::new(crate::config::HttpConfig { + accept_websockets: false, + pool_options: GlobalConnPoolOptions { + max_conns_per_endpoint: 2, + gc_epoch: Duration::from_secs(1), + pool_shards: 2, + idle_timeout: Duration::from_secs(1), + opt_in: false, + max_total_conns: 3, + }, + cancel_set: CancelSet::new(0), + client_conn_threshold: u64::MAX, + })); + let pool = GlobalConnPool::new(config); + let conn_info = ConnInfo { + user_info: ComputeUserInfo { + user: "user".into(), + endpoint: "endpoint".into(), + options: NeonOptions::default(), + }, + dbname: "dbname".into(), + auth: AuthData::Password("password".as_bytes().into()), + }; + let ep_pool = Arc::downgrade( + &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), + ); + { + let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + assert_eq!(0, pool.get_global_connections_count()); + client.inner().1.discard(); + // Discard should not return the connection to the pool. + assert_eq!(0, pool.get_global_connections_count()); + } + { + let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + assert_eq!(1, pool.get_global_connections_count()); + } + { + let mut closed_client = Client::new( + create_inner_with(MockClient::new(true)), + conn_info.clone(), + ep_pool.clone(), + ); + closed_client.do_drop().unwrap()(); + mem::forget(closed_client); // drop the client + // The closed client shouldn't be added to the pool. + assert_eq!(1, pool.get_global_connections_count()); + } + let is_closed: Arc = Arc::new(false.into()); + { + let mut client = Client::new( + create_inner_with(MockClient(is_closed.clone())), + conn_info.clone(), + ep_pool.clone(), + ); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + + // The client should be added to the pool. + assert_eq!(2, pool.get_global_connections_count()); + } + { + let mut client = Client::new(create_inner(), conn_info, ep_pool); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + + // The client shouldn't be added to the pool. Because the ep-pool is full.
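+ // (Editor's note: max_conns_per_endpoint is 2 in the test config above, and two connections for "endpoint" are already pooled, so this client is dropped rather than returned to the pool.)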
+ assert_eq!(2, pool.get_global_connections_count()); + } + + let conn_info = ConnInfo { + user_info: ComputeUserInfo { + user: "user".into(), + endpoint: "endpoint-2".into(), + options: NeonOptions::default(), + }, + dbname: "dbname".into(), + auth: AuthData::Password("password".as_bytes().into()), + }; + let ep_pool = Arc::downgrade( + &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), + ); + { + let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + assert_eq!(3, pool.get_global_connections_count()); + } + { + let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + + // The client shouldn't be added to the pool. Because the global pool is full. + assert_eq!(3, pool.get_global_connections_count()); + } + + is_closed.store(true, atomic::Ordering::Relaxed); + // Do gc for all shards. + pool.gc(0); + pool.gc(1); + // Closed client should be removed from the pool. + assert_eq!(2, pool.get_global_connections_count()); } } diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs new file mode 100644 index 0000000000..abf0ffe290 --- /dev/null +++ b/proxy/src/serverless/http_util.rs @@ -0,0 +1,96 @@ +//! Things stolen from `libs/utils/src/http` to add hyper 1.0 compatibility +//! Will merge back in at some point in the future. + +use bytes::Bytes; + +use anyhow::Context; +use http::{Response, StatusCode}; +use http_body_util::Full; + +use serde::Serialize; +use utils::http::error::ApiError; + +/// Like [`ApiError::into_response`] +pub(crate) fn api_error_into_response(this: ApiError) -> Response> { + match this { + ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( + format!("{err:#?}"), // use debug printing so that we give the cause + StatusCode::BAD_REQUEST, + ), + ApiError::Forbidden(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::FORBIDDEN) + } + ApiError::Unauthorized(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::UNAUTHORIZED) + } + ApiError::NotFound(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::NOT_FOUND) + } + ApiError::Conflict(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::CONFLICT) + } + ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status( + this.to_string(), + StatusCode::PRECONDITION_FAILED, + ), + ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status( + "Shutting down".to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::REQUEST_TIMEOUT, + ), + ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status( + this.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ), + ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ), + } +} + +/// Same as [`utils::http::error::HttpErrorBody`] +#[derive(Serialize)] +struct HttpErrorBody { + pub(crate) msg: String, +} + +impl HttpErrorBody { + /// Same as [`utils::http::error::HttpErrorBody::response_from_msg_and_status`] + fn 
response_from_msg_and_status(msg: String, status: StatusCode) -> Response> { + HttpErrorBody { msg }.to_response(status) + } + + /// Same as [`utils::http::error::HttpErrorBody::to_response`] + fn to_response(&self, status: StatusCode) -> Response> { + Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + // we do not have nested maps with non-string keys so serialization shouldn't fail + .body(Full::new(Bytes::from(serde_json::to_string(self).unwrap()))) + .unwrap() + } +} + +/// Same as [`utils::http::json::json_response`] +pub(crate) fn json_response( + status: StatusCode, + data: T, +) -> Result>, ApiError> { + let json = serde_json::to_string(&data) + .context("Failed to serialize JSON response") + .map_err(ApiError::InternalServerError)?; + let response = Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + .body(Full::new(Bytes::from(json))) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + Ok(response) +} diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs new file mode 100644 index 0000000000..9f328a0e1d --- /dev/null +++ b/proxy/src/serverless/json.rs @@ -0,0 +1,462 @@ +use serde_json::Map; +use serde_json::Value; +use tokio_postgres::types::Kind; +use tokio_postgres::types::Type; +use tokio_postgres::Row; + +// +// Convert json non-string types to strings, so that they can be passed to Postgres +// as parameters. +// +pub(crate) fn json_to_pg_text(json: Vec) -> Vec> { + json.iter().map(json_value_to_pg_text).collect() +} + +fn json_value_to_pg_text(value: &Value) -> Option { + match value { + // special care for nulls + Value::Null => None, + + // convert to text with escaping + v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()), + + // avoid escaping here, as we pass this as a parameter + Value::String(s) => Some(s.to_string()), + + // special care for arrays + Value::Array(_) => json_array_to_pg_array(value), + } +} + +// +// Serialize a JSON array to a Postgres array. Unlike top-level string parameters, +// strings inside an array need to be escaped. Postgres is okay with arrays of the form +// '{1,"2",3}'::int[], so we don't check that the array holds values of the same type, leaving +// it for Postgres to check.
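+// Editor's sketch of the resulting encoding (hypothetical input, consistent with the unit tests below): +//     json_array_to_pg_array(&json!([["foo", null], [42]])) +//         == Some(r#"{{"foo",NULL},{42}}"#.to_string())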
+// +// Example of the same escaping in node-postgres: packages/pg/lib/utils.js +// +fn json_array_to_pg_array(value: &Value) -> Option { + match value { + // special care for nulls + Value::Null => None, + + // convert to text with escaping + // here string needs to be escaped, as it is part of the array + v @ (Value::Bool(_) | Value::Number(_) | Value::String(_)) => Some(v.to_string()), + v @ Value::Object(_) => json_array_to_pg_array(&Value::String(v.to_string())), + + // recurse into array + Value::Array(arr) => { + let vals = arr + .iter() + .map(json_array_to_pg_array) + .map(|v| v.unwrap_or_else(|| "NULL".to_string())) + .collect::>() + .join(","); + + Some(format!("{{{vals}}}")) + } + } +} + +#[derive(Debug, thiserror::Error)] +pub(crate) enum JsonConversionError { + #[error("internal error: compute returned invalid data: {0}")] + AsTextError(tokio_postgres::Error), + #[error("parse int error: {0}")] + ParseIntError(#[from] std::num::ParseIntError), + #[error("parse float error: {0}")] + ParseFloatError(#[from] std::num::ParseFloatError), + #[error("parse json error: {0}")] + ParseJsonError(#[from] serde_json::Error), + #[error("unbalanced array")] + UnbalancedArray, +} + +// +// Convert postgres row with text-encoded values to JSON object +// +pub(crate) fn pg_text_row_to_json( + row: &Row, + columns: &[Type], + raw_output: bool, + array_mode: bool, +) -> Result { + let iter = row + .columns() + .iter() + .zip(columns) + .enumerate() + .map(|(i, (column, typ))| { + let name = column.name(); + let pg_value = row.as_text(i).map_err(JsonConversionError::AsTextError)?; + let json_value = if raw_output { + match pg_value { + Some(v) => Value::String(v.to_string()), + None => Value::Null, + } + } else { + pg_text_to_json(pg_value, typ)? + }; + Ok((name.to_string(), json_value)) + }); + + if array_mode { + // drop keys and aggregate into array + let arr = iter + .map(|r| r.map(|(_key, val)| val)) + .collect::, JsonConversionError>>()?; + Ok(Value::Array(arr)) + } else { + let obj = iter.collect::, JsonConversionError>>()?; + Ok(Value::Object(obj)) + } +} + +// +// Convert postgres text-encoded value to JSON value +// +fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result { + if let Some(val) = pg_value { + if let Kind::Array(elem_type) = pg_type.kind() { + return pg_array_parse(val, elem_type); + } + + match *pg_type { + Type::BOOL => Ok(Value::Bool(val == "t")), + Type::INT2 | Type::INT4 => { + let val = val.parse::()?; + Ok(Value::Number(serde_json::Number::from(val))) + } + Type::FLOAT4 | Type::FLOAT8 => { + let fval = val.parse::()?; + let num = serde_json::Number::from_f64(fval); + if let Some(num) = num { + Ok(Value::Number(num)) + } else { + // Pass NaN, Inf, -Inf as strings + // JS JSON.stringify() converts them to null, but we + // want to preserve them, so we pass them as strings + Ok(Value::String(val.to_string())) + } + } + Type::JSON | Type::JSONB => Ok(serde_json::from_str(val)?), + _ => Ok(Value::String(val.to_string())), + } + } else { + Ok(Value::Null) + } +} + +// +// Parse postgres array into JSON array. +// +// This is a bit involved because we need to handle nested arrays and quoted +// values. Unlike postgres we don't check that all nested arrays have the same +// dimensions, we just return them as is.
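+// Editor's sketch (hypothetical input, matching the behaviour exercised in the tests below): +// with element type BOOL, the Postgres text '{{t,NULL},{f,t}}' parses to the JSON value +// [[true, null], [false, true]].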
+//
+fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result<Value, JsonConversionError> {
+    _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v)
+}
+
+fn _pg_array_parse(
+    pg_array: &str,
+    elem_type: &Type,
+    nested: bool,
+) -> Result<(Value, usize), JsonConversionError> {
+    let mut pg_array_chr = pg_array.char_indices();
+    let mut level = 0;
+    let mut quote = false;
+    let mut entries: Vec<Value> = Vec::new();
+    let mut entry = String::new();
+
+    // skip bounds decoration
+    if let Some('[') = pg_array.chars().next() {
+        for (_, c) in pg_array_chr.by_ref() {
+            if c == '=' {
+                break;
+            }
+        }
+    }
+
+    fn push_checked(
+        entry: &mut String,
+        entries: &mut Vec<Value>,
+        elem_type: &Type,
+    ) -> Result<(), JsonConversionError> {
+        if !entry.is_empty() {
+            // While in usual postgres response we get nulls as None and everything else
+            // as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while
+            // string with value 'NULL' will be represented by '"NULL"'). So catch NULLs
+            // here while we have quotation info and convert them to None.
+            if entry == "NULL" {
+                entries.push(pg_text_to_json(None, elem_type)?);
+            } else {
+                entries.push(pg_text_to_json(Some(entry), elem_type)?);
+            }
+            entry.clear();
+        }
+
+        Ok(())
+    }
+
+    while let Some((mut i, mut c)) = pg_array_chr.next() {
+        let mut escaped = false;
+
+        if c == '\\' {
+            escaped = true;
+            (i, c) = pg_array_chr.next().unwrap();
+        }
+
+        match c {
+            '{' if !quote => {
+                level += 1;
+                if level > 1 {
+                    let (res, off) = _pg_array_parse(&pg_array[i..], elem_type, true)?;
+                    entries.push(res);
+                    for _ in 0..off - 1 {
+                        pg_array_chr.next();
+                    }
+                }
+            }
+            '}' if !quote => {
+                level -= 1;
+                if level == 0 {
+                    push_checked(&mut entry, &mut entries, elem_type)?;
+                    if nested {
+                        return Ok((Value::Array(entries), i));
+                    }
+                }
+            }
+            '"' if !escaped => {
+                if quote {
+                    // end of quoted string, so push it manually without any checks
+                    // for emptiness or nulls
+                    entries.push(pg_text_to_json(Some(&entry), elem_type)?);
+                    entry.clear();
+                }
+                quote = !quote;
+            }
+            ',' if !quote => {
+                push_checked(&mut entry, &mut entries, elem_type)?;
+            }
+            _ => {
+                entry.push(c);
+            }
+        }
+    }
+
+    if level != 0 {
+        return Err(JsonConversionError::UnbalancedArray);
+    }
+
+    Ok((Value::Array(entries), 0))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde_json::json;
+
+    #[test]
+    fn test_atomic_types_to_pg_params() {
+        let json = vec![Value::Bool(true), Value::Bool(false)];
+        let pg_params = json_to_pg_text(json);
+        assert_eq!(
+            pg_params,
+            vec![Some("true".to_owned()), Some("false".to_owned())]
+        );
+
+        let json = vec![Value::Number(serde_json::Number::from(42))];
+        let pg_params = json_to_pg_text(json);
+        assert_eq!(pg_params, vec![Some("42".to_owned())]);
+
+        let json = vec![Value::String("foo\"".to_string())];
+        let pg_params = json_to_pg_text(json);
+        assert_eq!(pg_params, vec![Some("foo\"".to_owned())]);
+
+        let json = vec![Value::Null];
+        let pg_params = json_to_pg_text(json);
+        assert_eq!(pg_params, vec![None]);
+    }
+
+    #[test]
+    fn test_json_array_to_pg_array() {
+        // atoms and escaping
+        let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]";
+        let json: Value = serde_json::from_str(json).unwrap();
+        let pg_params = json_to_pg_text(vec![json]);
+        assert_eq!(
+            pg_params,
+            vec![Some(
+                "{true,false,NULL,\"NULL\",42,\"foo\",\"bar\\\"-\\\\\"}".to_owned()
+            )]
+        );
+
+        // nested arrays
+        let json = "[[true, false], [null, 42], [\"foo\", \"bar\\\"-\\\\\"]]";
+        let json: Value = serde_json::from_str(json).unwrap();
+        let pg_params = json_to_pg_text(vec![json]);
+        assert_eq!(
+            pg_params,
+            vec![Some(
+                "{{true,false},{NULL,42},{\"foo\",\"bar\\\"-\\\\\"}}".to_owned()
+            )]
+        );
+        // array of objects
+        let json = r#"[{"foo": 1},{"bar": 2}]"#;
+        let json: Value = serde_json::from_str(json).unwrap();
+        let pg_params = json_to_pg_text(vec![json]);
+        assert_eq!(
+            pg_params,
+            vec![Some(r#"{"{\"foo\":1}","{\"bar\":2}"}"#.to_owned())]
+        );
+    }
+
+    #[test]
+    fn test_atomic_types_parse() {
+        assert_eq!(
+            pg_text_to_json(Some("foo"), &Type::TEXT).unwrap(),
+            json!("foo")
+        );
+        assert_eq!(pg_text_to_json(None, &Type::TEXT).unwrap(), json!(null));
+        assert_eq!(pg_text_to_json(Some("42"), &Type::INT4).unwrap(), json!(42));
+        assert_eq!(pg_text_to_json(Some("42"), &Type::INT2).unwrap(), json!(42));
+        assert_eq!(
+            pg_text_to_json(Some("42"), &Type::INT8).unwrap(),
+            json!("42")
+        );
+        assert_eq!(
+            pg_text_to_json(Some("42.42"), &Type::FLOAT8).unwrap(),
+            json!(42.42)
+        );
+        assert_eq!(
+            pg_text_to_json(Some("42.42"), &Type::FLOAT4).unwrap(),
+            json!(42.42)
+        );
+        assert_eq!(
+            pg_text_to_json(Some("NaN"), &Type::FLOAT4).unwrap(),
+            json!("NaN")
+        );
+        assert_eq!(
+            pg_text_to_json(Some("Infinity"), &Type::FLOAT4).unwrap(),
+            json!("Infinity")
+        );
+        assert_eq!(
+            pg_text_to_json(Some("-Infinity"), &Type::FLOAT4).unwrap(),
+            json!("-Infinity")
+        );
+
+        let json: Value =
+            serde_json::from_str("{\"s\":\"str\",\"n\":42,\"f\":4.2,\"a\":[null,3,\"a\"]}")
+                .unwrap();
+        assert_eq!(
+            pg_text_to_json(
+                Some(r#"{"s":"str","n":42,"f":4.2,"a":[null,3,"a"]}"#),
+                &Type::JSONB
+            )
+            .unwrap(),
+            json
+        );
+    }
+
+    #[test]
+    fn test_pg_array_parse_text() {
+        fn pt(pg_arr: &str) -> Value {
+            pg_array_parse(pg_arr, &Type::TEXT).unwrap()
+        }
+        assert_eq!(
+            pt(r#"{"aa\"\\\,a",cha,"bbbb"}"#),
+            json!(["aa\"\\,a", "cha", "bbbb"])
+        );
+        assert_eq!(
+            pt(r#"{{"foo","bar"},{"bee","bop"}}"#),
+            json!([["foo", "bar"], ["bee", "bop"]])
+        );
+        assert_eq!(
+            pt(r#"{{{{"foo",NULL,"bop",bup}}}}"#),
+            json!([[[["foo", null, "bop", "bup"]]]])
+        );
+        assert_eq!(
+            pt(r#"{{"1",2,3},{4,NULL,6},{NULL,NULL,NULL}}"#),
+            json!([["1", "2", "3"], ["4", null, "6"], [null, null, null]])
+        );
+    }
+
+    #[test]
+    fn test_pg_array_parse_bool() {
+        fn pb(pg_arr: &str) -> Value {
+            pg_array_parse(pg_arr, &Type::BOOL).unwrap()
+        }
+        assert_eq!(pb(r#"{t,f,t}"#), json!([true, false, true]));
+        assert_eq!(pb(r#"{{t,f,t}}"#), json!([[true, false, true]]));
+        assert_eq!(
+            pb(r#"{{t,f},{f,t}}"#),
+            json!([[true, false], [false, true]])
+        );
+        assert_eq!(
+            pb(r#"{{t,NULL},{NULL,f}}"#),
+            json!([[true, null], [null, false]])
+        );
+    }
+
+    #[test]
+    fn test_pg_array_parse_numbers() {
+        fn pn(pg_arr: &str, ty: &Type) -> Value {
+            pg_array_parse(pg_arr, ty).unwrap()
+        }
+        assert_eq!(pn(r#"{1,2,3}"#, &Type::INT4), json!([1, 2, 3]));
+        assert_eq!(pn(r#"{1,2,3}"#, &Type::INT2), json!([1, 2, 3]));
+        assert_eq!(pn(r#"{1,2,3}"#, &Type::INT8), json!(["1", "2", "3"]));
+        assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT4), json!([1.0, 2.0, 3.0]));
+        assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT8), json!([1.0, 2.0, 3.0]));
+        assert_eq!(
+            pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT4),
+            json!([1.1, 2.2, 3.3])
+        );
+        assert_eq!(
+            pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT8),
+            json!([1.1, 2.2, 3.3])
+        );
+        assert_eq!(
+            pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT4),
+            json!(["NaN", "Infinity", "-Infinity"])
+        );
+        assert_eq!(
+            pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT8),
+            json!(["NaN", "Infinity", "-Infinity"])
+        );
+    }
+
+    #[test]
+    fn test_pg_array_with_decoration() {
+        fn p(pg_arr: &str) -> Value {
+            pg_array_parse(pg_arr, &Type::INT2).unwrap()
+        }
+        assert_eq!(
+            p(r#"[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}"#),
+            json!([[[1, 2, 3], [4, 5, 6]]])
+        );
+    }
+
+    #[test]
+    fn test_pg_array_parse_json() {
+        fn pt(pg_arr: &str) -> Value {
+            pg_array_parse(pg_arr, &Type::JSONB).unwrap()
+        }
+        assert_eq!(pt(r#"{"{}"}"#), json!([{}]));
+        assert_eq!(
+            pt(r#"{"{\"foo\": 1, \"bar\": 2}"}"#),
+            json!([{"foo": 1, "bar": 2}])
+        );
+        assert_eq!(
+            pt(r#"{"{\"foo\": 1}", "{\"bar\": 2}"}"#),
+            json!([{"foo": 1}, {"bar": 2}])
+        );
+        assert_eq!(
+            pt(r#"{{"{\"foo\": 1}", "{\"bar\": 2}"}}"#),
+            json!([[{"foo": 1}, {"bar": 2}]])
+        );
+    }
+}
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index f108ab34ab..5b36f5e91d 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -1,50 +1,78 @@
+use std::pin::pin;
 use std::sync::Arc;
 
-use anyhow::bail;
-use futures::pin_mut;
+use bytes::Bytes;
+use futures::future::select;
+use futures::future::try_join;
+use futures::future::Either;
 use futures::StreamExt;
-use hyper::body::HttpBody;
-use hyper::header;
-use hyper::http::HeaderName;
-use hyper::http::HeaderValue;
-use hyper::Response;
-use hyper::StatusCode;
-use hyper::{Body, HeaderMap, Request};
-use serde_json::json;
-use serde_json::Map;
+use futures::TryFutureExt;
+use http::header::AUTHORIZATION;
+use http_body_util::BodyExt;
+use http_body_util::Full;
+use hyper1::body::Body;
+use hyper1::body::Incoming;
+use hyper1::header;
+use hyper1::http::HeaderName;
+use hyper1::http::HeaderValue;
+use hyper1::Response;
+use hyper1::StatusCode;
+use hyper1::{HeaderMap, Request};
+use pq_proto::StartupMessageParamsBuilder;
+use serde::Serialize;
 use serde_json::Value;
+use tokio::time;
 use tokio_postgres::error::DbError;
 use tokio_postgres::error::ErrorPosition;
-use tokio_postgres::types::Kind;
-use tokio_postgres::types::Type;
+use tokio_postgres::error::SqlState;
 use tokio_postgres::GenericClient;
 use tokio_postgres::IsolationLevel;
+use tokio_postgres::NoTls;
 use tokio_postgres::ReadyForQueryStatus;
-use tokio_postgres::Row;
 use tokio_postgres::Transaction;
+use tokio_util::sync::CancellationToken;
 use tracing::error;
-use tracing::instrument;
+use tracing::info;
+use typed_json::json;
 use url::Url;
+use urlencoding;
 use utils::http::error::ApiError;
-use utils::http::json::json_response;
 
 use crate::auth::backend::ComputeUserInfo;
 use crate::auth::endpoint_sni;
-use crate::config::HttpConfig;
+use crate::auth::ComputeUserInfoParseError;
+use crate::config::ProxyConfig;
 use crate::config::TlsConfig;
 use crate::context::RequestMonitoring;
-use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
+use crate::error::ErrorKind;
+use crate::error::ReportableError;
+use crate::error::UserFacingError;
+use crate::metrics::HttpDirection;
+use crate::metrics::Metrics;
+use crate::proxy::run_until_cancelled;
 use crate::proxy::NeonOptions;
-use crate::EndpointId;
+use crate::serverless::backend::HttpConnError;
+use crate::usage_metrics::MetricCounterRecorder;
+use crate::DbName;
 use crate::RoleName;
 
+use super::backend::PoolingBackend;
+use super::conn_pool::AuthData;
+use super::conn_pool::Client;
 use super::conn_pool::ConnInfo;
-use super::conn_pool::GlobalConnPool;
+use super::http_util::json_response;
+use super::json::json_to_pg_text;
+use super::json::pg_text_row_to_json;
+use super::json::JsonConversionError;
 
 #[derive(serde::Deserialize)]
+#[serde(rename_all = "camelCase")]
 struct QueryData {
     query: String,
-    params: Vec<Value>,
+    #[serde(deserialize_with = "bytes_to_pg_text")]
+    params: Vec<Option<String>>,
+    #[serde(default)]
+    array_mode: Option<bool>,
 }
 
 #[derive(serde::Deserialize)]
@@ -61,8 +89,8 @@ enum Payload {
 
 const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB
 const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB
 
-const SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART: &str = "api";
+static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string");
 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
 static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in");
@@ -72,135 +100,135 @@ static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrab
 
 static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");
 
-//
-// Convert json non-string types to strings, so that they can be passed to Postgres
-// as parameters.
-//
-fn json_to_pg_text(json: Vec<Value>) -> Vec<Option<String>> {
-    json.iter()
-        .map(|value| {
-            match value {
-                // special care for nulls
-                Value::Null => None,
-
-                // convert to text with escaping
-                v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()),
-
-                // avoid escaping here, as we pass this as a parameter
-                Value::String(s) => Some(s.to_string()),
-
-                // special care for arrays
-                Value::Array(_) => json_array_to_pg_array(value),
-            }
-        })
-        .collect()
+fn bytes_to_pg_text<'de, D>(deserializer: D) -> Result<Vec<Option<String>>, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+{
+    // TODO: consider avoiding the allocation here.
+    let json: Vec<Value> = serde::de::Deserialize::deserialize(deserializer)?;
+    Ok(json_to_pg_text(json))
 }
 
-//
-// Serialize a JSON array to a Postgres array. Contrary to the strings in the params
-// in the array we need to escape the strings. Postgres is okay with arrays of form
-// '{1,"2",3}'::int[], so we don't check that array holds values of the same type, leaving
-// it for Postgres to check.
-//
-// Example of the same escaping in node-postgres: packages/pg/lib/utils.js
-//
-fn json_array_to_pg_array(value: &Value) -> Option<String> {
-    match value {
-        // special care for nulls
-        Value::Null => None,
+#[derive(Debug, thiserror::Error)]
+pub(crate) enum ConnInfoError {
+    #[error("invalid header: {0}")]
+    InvalidHeader(&'static HeaderName),
+    #[error("invalid connection string: {0}")]
+    UrlParseError(#[from] url::ParseError),
+    #[error("incorrect scheme")]
+    IncorrectScheme,
+    #[error("missing database name")]
+    MissingDbName,
+    #[error("invalid database name")]
+    InvalidDbName,
+    #[error("missing username")]
+    MissingUsername,
+    #[error("invalid username: {0}")]
+    InvalidUsername(#[from] std::string::FromUtf8Error),
+    #[error("missing password")]
+    MissingPassword,
+    #[error("missing hostname")]
+    MissingHostname,
+    #[error("invalid hostname: {0}")]
+    InvalidEndpoint(#[from] ComputeUserInfoParseError),
+    #[error("malformed endpoint")]
+    MalformedEndpoint,
+}
 
-        // convert to text with escaping
-        // here string needs to be escaped, as it is part of the array
-        v @ (Value::Bool(_) | Value::Number(_) | Value::String(_)) => Some(v.to_string()),
-        v @ Value::Object(_) => json_array_to_pg_array(&Value::String(v.to_string())),
+impl ReportableError for ConnInfoError {
+    fn get_error_kind(&self) -> ErrorKind {
+        ErrorKind::User
+    }
+}
 
-        // recurse into array
-        Value::Array(arr) => {
-            let vals = arr
-                .iter()
-                .map(json_array_to_pg_array)
-                .map(|v| v.unwrap_or_else(|| "NULL".to_string()))
-                .collect::<Vec<_>>()
-                .join(",");
-
-            Some(format!("{{{}}}", vals))
-        }
+impl UserFacingError for ConnInfoError {
+    fn to_string_client(&self) -> String {
+        self.to_string()
     }
 }
 
 fn get_conn_info(
-    ctx: &mut RequestMonitoring,
+    ctx: &RequestMonitoring,
     headers: &HeaderMap,
-    sni_hostname: Option<String>,
-    tls: &TlsConfig,
-) -> Result<ConnInfo, anyhow::Error> {
+    tls: Option<&TlsConfig>,
+) -> Result<ConnInfo, ConnInfoError> {
+    // HTTP only uses cleartext (for now and likely always)
+    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
+
     let connection_string = headers
-        .get("Neon-Connection-String")
-        .ok_or(anyhow::anyhow!("missing connection string"))?
-        .to_str()?;
+        .get(&CONN_STRING)
+        .ok_or(ConnInfoError::InvalidHeader(&CONN_STRING))?
+ .to_str() + .map_err(|_| ConnInfoError::InvalidHeader(&CONN_STRING))?; let connection_url = Url::parse(connection_string)?; let protocol = connection_url.scheme(); if protocol != "postgres" && protocol != "postgresql" { - return Err(anyhow::anyhow!( - "connection string must start with postgres: or postgresql:" - )); + return Err(ConnInfoError::IncorrectScheme); } let mut url_path = connection_url .path_segments() - .ok_or(anyhow::anyhow!("missing database name"))?; + .ok_or(ConnInfoError::MissingDbName)?; - let dbname = url_path - .next() - .ok_or(anyhow::anyhow!("invalid database name"))?; + let dbname: DbName = + urlencoding::decode(url_path.next().ok_or(ConnInfoError::InvalidDbName)?)?.into(); + ctx.set_dbname(dbname.clone()); - let username = RoleName::from(connection_url.username()); + let username = RoleName::from(urlencoding::decode(connection_url.username())?); if username.is_empty() { - return Err(anyhow::anyhow!("missing username")); + return Err(ConnInfoError::MissingUsername); } ctx.set_user(username.clone()); - let password = connection_url - .password() - .ok_or(anyhow::anyhow!("no password"))?; + let auth = if let Some(auth) = headers.get(&AUTHORIZATION) { + let auth = auth + .to_str() + .map_err(|_| ConnInfoError::InvalidHeader(&AUTHORIZATION))?; + AuthData::Jwt( + auth.strip_prefix("Bearer ") + .ok_or(ConnInfoError::MissingPassword)? + .into(), + ) + } else if let Some(pass) = connection_url.password() { + AuthData::Password(match urlencoding::decode_binary(pass.as_bytes()) { + std::borrow::Cow::Borrowed(b) => b.into(), + std::borrow::Cow::Owned(b) => b.into(), + }) + } else { + return Err(ConnInfoError::MissingPassword); + }; - // TLS certificate selector now based on SNI hostname, so if we are running here - // we are sure that SNI hostname is set to one of the configured domain names. - let sni_hostname = sni_hostname.ok_or(anyhow::anyhow!("no SNI hostname set"))?; - - let hostname = connection_url - .host_str() - .ok_or(anyhow::anyhow!("no host"))?; - - let host_header = headers - .get("host") - .and_then(|h| h.to_str().ok()) - .and_then(|h| h.split(':').next()); - - // sni_hostname has to be either the same as hostname or the one used in serverless driver. - if !check_matches(&sni_hostname, hostname)? { - return Err(anyhow::anyhow!("mismatched SNI hostname and hostname")); - } else if let Some(h) = host_header { - if h != sni_hostname { - return Err(anyhow::anyhow!("mismatched host header and hostname")); + let endpoint = match connection_url.host() { + Some(url::Host::Domain(hostname)) => { + if let Some(tls) = tls { + endpoint_sni(hostname, &tls.common_names)? + .ok_or(ConnInfoError::MalformedEndpoint)? 
+            } else {
+                hostname
+                    .split_once('.')
+                    .map_or(hostname, |(prefix, _)| prefix)
+                    .into()
+            }
         }
-    }
-
-    let endpoint = endpoint_sni(hostname, &tls.common_names)?;
-
-    let endpoint: EndpointId = endpoint.into();
-    ctx.set_endpoint_id(Some(endpoint.clone()));
+        Some(url::Host::Ipv4(_) | url::Host::Ipv6(_)) | None => {
+            return Err(ConnInfoError::MissingHostname)
+        }
+    };
+    ctx.set_endpoint_id(endpoint.clone());
 
     let pairs = connection_url.query_pairs();
 
     let mut options = Option::None;
 
+    let mut params = StartupMessageParamsBuilder::default();
+    params.insert("user", &username);
+    params.insert("database", &dbname);
     for (key, value) in pairs {
+        params.insert(&key, &value);
         if key == "options" {
            options = Some(NeonOptions::parse_options_raw(&value));
-            break;
         }
     }
 
@@ -212,301 +240,404 @@ fn get_conn_info(
 
     Ok(ConnInfo {
         user_info,
-        dbname: dbname.into(),
-        password: password.into(),
+        dbname,
+        auth,
     })
 }
 
-fn check_matches(sni_hostname: &str, hostname: &str) -> Result<bool, anyhow::Error> {
-    if sni_hostname == hostname {
-        return Ok(true);
-    }
-    let (sni_hostname_first, sni_hostname_rest) = sni_hostname
-        .split_once('.')
-        .ok_or_else(|| anyhow::anyhow!("Unexpected sni format."))?;
-    let (_, hostname_rest) = hostname
-        .split_once('.')
-        .ok_or_else(|| anyhow::anyhow!("Unexpected hostname format."))?;
-    Ok(sni_hostname_rest == hostname_rest
-        && sni_hostname_first == SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART)
-}
-
 // TODO: return different http error codes
-pub async fn handle(
-    tls: &'static TlsConfig,
-    config: &'static HttpConfig,
-    ctx: &mut RequestMonitoring,
-    request: Request<Body>,
-    sni_hostname: Option<String>,
-    conn_pool: Arc<GlobalConnPool>,
-) -> Result<Response<Body>, ApiError> {
-    let result = tokio::time::timeout(
-        config.request_timeout,
-        handle_inner(tls, config, ctx, request, sni_hostname, conn_pool),
-    )
-    .await;
+pub(crate) async fn handle(
+    config: &'static ProxyConfig,
+    ctx: RequestMonitoring,
+    request: Request<Incoming>,
+    backend: Arc<PoolingBackend>,
+    cancel: CancellationToken,
+) -> Result<Response<Full<Bytes>>, ApiError> {
+    let result = handle_inner(cancel, config, &ctx, request, backend).await;
+
     let mut response = match result {
-        Ok(r) => match r {
-            Ok(r) => r,
-            Err(e) => {
-                let mut message = format!("{:?}", e);
-                let db_error = e
-                    .downcast_ref::<tokio_postgres::Error>()
-                    .and_then(|e| e.as_db_error());
-                fn get<'a, T: serde::Serialize>(
-                    db: Option<&'a DbError>,
-                    x: impl FnOnce(&'a DbError) -> T,
-                ) -> Value {
-                    db.map(x)
-                        .and_then(|t| serde_json::to_value(t).ok())
-                        .unwrap_or_default()
-                }
+        Ok(r) => {
+            ctx.set_success();
+            r
+        }
+        Err(e @ SqlOverHttpError::Cancelled(_)) => {
+            let error_kind = e.get_error_kind();
+            ctx.set_error_kind(error_kind);
 
-                if let Some(db_error) = db_error {
-                    db_error.message().clone_into(&mut message);
-                }
+            let message = "Query cancelled, connection was terminated";
 
-                let position = db_error.and_then(|db| db.position());
-                let (position, internal_position, internal_query) = match position {
-                    Some(ErrorPosition::Original(position)) => (
-                        Value::String(position.to_string()),
-                        Value::Null,
-                        Value::Null,
-                    ),
-                    Some(ErrorPosition::Internal { position, query }) => (
-                        Value::Null,
-                        Value::String(position.to_string()),
-                        Value::String(query.clone()),
-                    ),
-                    None => (Value::Null, Value::Null, Value::Null),
-                };
-
-                let code = get(db_error, |db| db.code().code());
-                let severity = get(db_error, |db| db.severity());
-                let detail = get(db_error, |db| db.detail());
-                let hint = get(db_error, |db| db.hint());
-                let where_ = get(db_error, |db| db.where_());
-                let table = get(db_error, |db| db.table());
-                let column = get(db_error, |db| db.column());
-                let schema = get(db_error, |db| db.schema());
-                let datatype = get(db_error, |db| db.datatype());
-                let constraint = get(db_error, |db| db.constraint());
-                let file = get(db_error, |db| db.file());
-                let line = get(db_error, |db| db.line().map(|l| l.to_string()));
-                let routine = get(db_error, |db| db.routine());
-
-                error!(
-                    ?code,
-                    "sql-over-http per-client task finished with an error: {e:#}"
-                );
-                // TODO: this shouldn't always be bad request.
-                json_response(
-                    StatusCode::BAD_REQUEST,
-                    json!({
-                        "message": message,
-                        "code": code,
-                        "detail": detail,
-                        "hint": hint,
-                        "position": position,
-                        "internalPosition": internal_position,
-                        "internalQuery": internal_query,
-                        "severity": severity,
-                        "where": where_,
-                        "table": table,
-                        "column": column,
-                        "schema": schema,
-                        "dataType": datatype,
-                        "constraint": constraint,
-                        "file": file,
-                        "line": line,
-                        "routine": routine,
-                    }),
-                )?
-            }
-        },
-        Err(_) => {
-            let message = format!(
-                "HTTP-Connection timed out, execution time exeeded {} seconds",
-                config.request_timeout.as_secs()
+            tracing::info!(
+                kind=error_kind.to_metric_label(),
+                error=%e,
+                msg=message,
+                "forwarding error to user"
             );
-            error!(message);
+
             json_response(
-                StatusCode::GATEWAY_TIMEOUT,
-                json!({ "message": message, "code": StatusCode::GATEWAY_TIMEOUT.as_u16() }),
+                StatusCode::BAD_REQUEST,
+                json!({ "message": message, "code": SqlState::PROTOCOL_VIOLATION.code() }),
+            )?
+        }
+        Err(e) => {
+            let error_kind = e.get_error_kind();
+            ctx.set_error_kind(error_kind);
+
+            let mut message = e.to_string_client();
+            let db_error = match &e {
+                SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
+                | SqlOverHttpError::Postgres(e) => e.as_db_error(),
+                _ => None,
+            };
+            fn get<'a, T: Default>(db: Option<&'a DbError>, x: impl FnOnce(&'a DbError) -> T) -> T {
+                db.map(x).unwrap_or_default()
+            }
+
+            if let Some(db_error) = db_error {
+                db_error.message().clone_into(&mut message);
+            }
+
+            let position = db_error.and_then(|db| db.position());
+            let (position, internal_position, internal_query) = match position {
+                Some(ErrorPosition::Original(position)) => (Some(position.to_string()), None, None),
+                Some(ErrorPosition::Internal { position, query }) => {
+                    (None, Some(position.to_string()), Some(query.clone()))
+                }
+                None => (None, None, None),
+            };
+
+            let code = get(db_error, |db| db.code().code());
+            let severity = get(db_error, |db| db.severity());
+            let detail = get(db_error, |db| db.detail());
+            let hint = get(db_error, |db| db.hint());
+            let where_ = get(db_error, |db| db.where_());
+            let table = get(db_error, |db| db.table());
+            let column = get(db_error, |db| db.column());
+            let schema = get(db_error, |db| db.schema());
+            let datatype = get(db_error, |db| db.datatype());
+            let constraint = get(db_error, |db| db.constraint());
+            let file = get(db_error, |db| db.file());
+            let line = get(db_error, |db| db.line().map(|l| l.to_string()));
+            let routine = get(db_error, |db| db.routine());
+
+            tracing::info!(
+                kind=error_kind.to_metric_label(),
+                error=%e,
+                msg=message,
+                "forwarding error to user"
+            );
+
+            // TODO: this shouldn't always be bad request.
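+            // The body below mirrors the error fields that node-postgres
+            // exposes on its client errors, so drivers can surface them unchanged.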
+            json_response(
+                StatusCode::BAD_REQUEST,
+                json!({
+                    "message": message,
+                    "code": code,
+                    "detail": detail,
+                    "hint": hint,
+                    "position": position,
+                    "internalPosition": internal_position,
+                    "internalQuery": internal_query,
+                    "severity": severity,
+                    "where": where_,
+                    "table": table,
+                    "column": column,
+                    "schema": schema,
+                    "dataType": datatype,
+                    "constraint": constraint,
+                    "file": file,
+                    "line": line,
+                    "routine": routine,
+                }),
            )?
        }
    };
-    response.headers_mut().insert(
-        "Access-Control-Allow-Origin",
-        hyper::http::HeaderValue::from_static("*"),
-    );
+
+    response
+        .headers_mut()
+        .insert("Access-Control-Allow-Origin", HeaderValue::from_static("*"));
 
     Ok(response)
 }
 
-#[instrument(name = "sql-over-http", fields(pid = tracing::field::Empty), skip_all)]
+#[derive(Debug, thiserror::Error)]
+pub(crate) enum SqlOverHttpError {
+    #[error("{0}")]
+    ReadPayload(#[from] ReadPayloadError),
+    #[error("{0}")]
+    ConnectCompute(#[from] HttpConnError),
+    #[error("{0}")]
+    ConnInfo(#[from] ConnInfoError),
+    #[error("request is too large (max is {MAX_REQUEST_SIZE} bytes)")]
+    RequestTooLarge,
+    #[error("response is too large (max is {MAX_RESPONSE_SIZE} bytes)")]
+    ResponseTooLarge,
+    #[error("invalid isolation level")]
+    InvalidIsolationLevel,
+    #[error("{0}")]
+    Postgres(#[from] tokio_postgres::Error),
+    #[error("{0}")]
+    JsonConversion(#[from] JsonConversionError),
+    #[error("{0}")]
+    Cancelled(SqlOverHttpCancel),
+}
+
+impl ReportableError for SqlOverHttpError {
+    fn get_error_kind(&self) -> ErrorKind {
+        match self {
+            SqlOverHttpError::ReadPayload(e) => e.get_error_kind(),
+            SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(),
+            SqlOverHttpError::ConnInfo(e) => e.get_error_kind(),
+            SqlOverHttpError::RequestTooLarge => ErrorKind::User,
+            SqlOverHttpError::ResponseTooLarge => ErrorKind::User,
+            SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User,
+            SqlOverHttpError::Postgres(p) => p.get_error_kind(),
+            SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres,
+            SqlOverHttpError::Cancelled(c) => c.get_error_kind(),
+        }
+    }
+}
+
+impl UserFacingError for SqlOverHttpError {
+    fn to_string_client(&self) -> String {
+        match self {
+            SqlOverHttpError::ReadPayload(p) => p.to_string(),
+            SqlOverHttpError::ConnectCompute(c) => c.to_string_client(),
+            SqlOverHttpError::ConnInfo(c) => c.to_string_client(),
+            SqlOverHttpError::RequestTooLarge => self.to_string(),
+            SqlOverHttpError::ResponseTooLarge => self.to_string(),
+            SqlOverHttpError::InvalidIsolationLevel => self.to_string(),
+            SqlOverHttpError::Postgres(p) => p.to_string(),
+            SqlOverHttpError::JsonConversion(_) => "could not parse postgres response".to_string(),
+            SqlOverHttpError::Cancelled(_) => self.to_string(),
+        }
+    }
+}
+
+#[derive(Debug, thiserror::Error)]
+pub(crate) enum ReadPayloadError {
+    #[error("could not read the HTTP request body: {0}")]
+    Read(#[from] hyper1::Error),
+    #[error("could not parse the HTTP request body: {0}")]
+    Parse(#[from] serde_json::Error),
+}
+
+impl ReportableError for ReadPayloadError {
+    fn get_error_kind(&self) -> ErrorKind {
+        match self {
+            ReadPayloadError::Read(_) => ErrorKind::ClientDisconnect,
+            ReadPayloadError::Parse(_) => ErrorKind::User,
+        }
+    }
+}
+
+#[derive(Debug, thiserror::Error)]
+pub(crate) enum SqlOverHttpCancel {
+    #[error("query was cancelled")]
+    Postgres,
+    #[error("query was cancelled while stuck trying to connect to the database")]
+    Connect,
+}
+
+impl ReportableError for SqlOverHttpCancel {
+    fn get_error_kind(&self) -> ErrorKind {
+        match self {
+            SqlOverHttpCancel::Postgres => ErrorKind::ClientDisconnect,
+            SqlOverHttpCancel::Connect => ErrorKind::ClientDisconnect,
+        }
+    }
+}
+
+#[derive(Clone, Copy, Debug)]
+struct HttpHeaders {
+    raw_output: bool,
+    default_array_mode: bool,
+    txn_isolation_level: Option<IsolationLevel>,
+    txn_read_only: bool,
+    txn_deferrable: bool,
+}
+
+impl HttpHeaders {
+    fn try_parse(headers: &hyper1::http::HeaderMap) -> Result<Self, SqlOverHttpError> {
+        // Determine the output options. Default behaviour is 'false'. Anything that is not
+        // strictly 'true' assumed to be false.
+        let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
+        let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
+
+        // isolation level, read only and deferrable
+        let txn_isolation_level = match headers.get(&TXN_ISOLATION_LEVEL) {
+            Some(x) => Some(
+                map_header_to_isolation_level(x).ok_or(SqlOverHttpError::InvalidIsolationLevel)?,
+            ),
+            None => None,
+        };
+
+        let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
+        let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);
+
+        Ok(Self {
+            raw_output,
+            default_array_mode,
+            txn_isolation_level,
+            txn_read_only,
+            txn_deferrable,
+        })
+    }
+}
+
+fn map_header_to_isolation_level(level: &HeaderValue) -> Option<IsolationLevel> {
+    match level.as_bytes() {
+        b"Serializable" => Some(IsolationLevel::Serializable),
+        b"ReadUncommitted" => Some(IsolationLevel::ReadUncommitted),
+        b"ReadCommitted" => Some(IsolationLevel::ReadCommitted),
+        b"RepeatableRead" => Some(IsolationLevel::RepeatableRead),
+        _ => None,
+    }
+}
+
+fn map_isolation_level_to_headers(level: IsolationLevel) -> Option<HeaderValue> {
+    match level {
+        IsolationLevel::ReadUncommitted => Some(HeaderValue::from_static("ReadUncommitted")),
+        IsolationLevel::ReadCommitted => Some(HeaderValue::from_static("ReadCommitted")),
+        IsolationLevel::RepeatableRead => Some(HeaderValue::from_static("RepeatableRead")),
+        IsolationLevel::Serializable => Some(HeaderValue::from_static("Serializable")),
+        _ => None,
+    }
+}
+
 async fn handle_inner(
-    tls: &'static TlsConfig,
-    config: &'static HttpConfig,
-    ctx: &mut RequestMonitoring,
-    request: Request<Body>,
-    sni_hostname: Option<String>,
-    conn_pool: Arc<GlobalConnPool>,
-) -> anyhow::Result<Response<Body>> {
-    let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
-        .with_label_values(&["http"])
-        .guard();
+    cancel: CancellationToken,
+    config: &'static ProxyConfig,
+    ctx: &RequestMonitoring,
+    request: Request<Incoming>,
+    backend: Arc<PoolingBackend>,
+) -> Result<Response<Full<Bytes>>, SqlOverHttpError> {
+    let _request_gauge = Metrics::get()
+        .proxy
+        .connection_requests
+        .guard(ctx.protocol());
+    info!(
+        protocol = %ctx.protocol(),
+        "handling interactive connection from client"
+    );
 
     //
     // Determine the destination and connection params
     //
     let headers = request.headers();
-    let conn_info = get_conn_info(ctx, headers, sni_hostname, tls)?;
 
-    // Determine the output options. Default behaviour is 'false'. Anything that is not
-    // strictly 'true' assumed to be false.
-    let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
-    let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
+    // TLS config should be there.
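+    // The connection string is taken from the Neon-Connection-String header,
+    // e.g. postgres://user:password@ep-example-123456.us-east-2.aws.neon.tech/neondb
+    // (illustrative endpoint, not a real one).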
+    let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref())?;
+    info!(user = conn_info.user_info.user.as_str(), "credentials");
 
     // Allow connection pooling only if explicitly requested
     // or if we have decided that http pool is no longer opt-in
-    let allow_pool =
-        !config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
+    let allow_pool = !config.http_config.pool_options.opt_in
+        || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
 
-    // isolation level, read only and deferrable
+    let parsed_headers = HttpHeaders::try_parse(headers)?;
 
-    let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned();
-    let txn_isolation_level = match txn_isolation_level_raw {
-        Some(ref x) => Some(match x.as_bytes() {
-            b"Serializable" => IsolationLevel::Serializable,
-            b"ReadUncommitted" => IsolationLevel::ReadUncommitted,
-            b"ReadCommitted" => IsolationLevel::ReadCommitted,
-            b"RepeatableRead" => IsolationLevel::RepeatableRead,
-            _ => bail!("invalid isolation level"),
-        }),
-        None => None,
-    };
-
-    let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
-    let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);
-
-    let paused = ctx.latency_timer.pause();
     let request_content_length = match request.body().size_hint().upper() {
         Some(v) => v,
         None => MAX_REQUEST_SIZE + 1,
     };
-    drop(paused);
+    info!(request_content_length, "request size in bytes");
+    Metrics::get()
+        .proxy
+        .http_conn_content_length_bytes
+        .observe(HttpDirection::Request, request_content_length as f64);
 
     // we don't have a streaming request support yet so this is to prevent OOM
     // from a malicious user sending an extremely large request body
     if request_content_length > MAX_REQUEST_SIZE {
-        return Err(anyhow::anyhow!(
-            "request is too large (max is {MAX_REQUEST_SIZE} bytes)"
-        ));
+        return Err(SqlOverHttpError::RequestTooLarge);
     }
 
-    //
-    // Read the query and query params from the request body
-    //
-    let body = hyper::body::to_bytes(request.into_body()).await?;
-    let payload: Payload = serde_json::from_slice(&body)?;
+    let fetch_and_process_request = Box::pin(
+        async {
+            let body = request.into_body().collect().await?.to_bytes();
+            info!(length = body.len(), "request payload read");
+            let payload: Payload = serde_json::from_slice(&body)?;
+            Ok::<Payload, ReadPayloadError>(payload) // Adjust error type accordingly
+        }
+        .map_err(SqlOverHttpError::from),
+    );
 
-    let mut client = conn_pool.get(ctx, conn_info, !allow_pool).await?;
+    let authenticate_and_connect = Box::pin(
+        async {
+            let keys = match &conn_info.auth {
+                AuthData::Password(pw) => {
+                    backend
+                        .authenticate_with_password(
+                            ctx,
+                            &config.authentication_config,
+                            &conn_info.user_info,
+                            pw,
+                        )
+                        .await?
+                }
+                AuthData::Jwt(jwt) => {
+                    backend
+                        .authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
+                        .await?
+ } + }; + + let client = backend + .connect_to_compute(ctx, conn_info, keys, !allow_pool) + .await?; + // not strictly necessary to mark success here, + // but it's just insurance for if we forget it somewhere else + ctx.success(); + Ok::<_, HttpConnError>(client) + } + .map_err(SqlOverHttpError::from), + ); + + let (payload, mut client) = match run_until_cancelled( + // Run both operations in parallel + try_join( + pin!(fetch_and_process_request), + pin!(authenticate_and_connect), + ), + &cancel, + ) + .await + { + Some(result) => result?, + None => return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect)), + }; let mut response = Response::builder() .status(StatusCode::OK) .header(header::CONTENT_TYPE, "application/json"); - // - // Now execute the query and return the result - // - let mut size = 0; - let result = - match payload { - Payload::Single(stmt) => { - let (status, results) = - query_to_json(&*client, stmt, &mut 0, raw_output, array_mode) - .await - .map_err(|e| { - client.discard(); - e - })?; - client.check_idle(status); - results + // Now execute the query and return the result. + let json_output = match payload { + Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?, + Payload::Batch(statements) => { + if parsed_headers.txn_read_only { + response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE); } - Payload::Batch(statements) => { - let (inner, mut discard) = client.inner(); - let mut builder = inner.build_transaction(); - if let Some(isolation_level) = txn_isolation_level { - builder = builder.isolation_level(isolation_level); - } - if txn_read_only { - builder = builder.read_only(true); - } - if txn_deferrable { - builder = builder.deferrable(true); - } - - let transaction = builder.start().await.map_err(|e| { - // if we cannot start a transaction, we should return immediately - // and not return to the pool. connection is clearly broken - discard.discard(); - e - })?; - - let results = - match query_batch(&transaction, statements, &mut size, raw_output, array_mode) - .await - { - Ok(results) => { - let status = transaction.commit().await.map_err(|e| { - // if we cannot commit - for now don't return connection to pool - // TODO: get a query status from the error - discard.discard(); - e - })?; - discard.check_idle(status); - results - } - Err(err) => { - let status = transaction.rollback().await.map_err(|e| { - // if we cannot rollback - for now don't return connection to pool - // TODO: get a query status from the error - discard.discard(); - e - })?; - discard.check_idle(status); - return Err(err); - } - }; - - if txn_read_only { - response = response.header( - TXN_READ_ONLY.clone(), - HeaderValue::try_from(txn_read_only.to_string())?, - ); - } - if txn_deferrable { - response = response.header( - TXN_DEFERRABLE.clone(), - HeaderValue::try_from(txn_deferrable.to_string())?, - ); - } - if let Some(txn_isolation_level) = txn_isolation_level_raw { - response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); - } - json!({ "results": results }) + if parsed_headers.txn_deferrable { + response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE); + } + if let Some(txn_isolation_level) = parsed_headers + .txn_isolation_level + .and_then(map_isolation_level_to_headers) + { + response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); } - }; - ctx.set_success(); - ctx.log(); + statements + .process(cancel, &mut client, parsed_headers) + .await? 
+        }
+    };
     let metrics = client.metrics();
 
-    // how could this possibly fail
-    let body = serde_json::to_string(&result).expect("json serialization should not fail");
-    let len = body.len();
+    let len = json_output.len();
     let response = response
-        .body(Body::from(body))
+        .body(Full::new(Bytes::from(json_output)))
         // only fails if invalid status code or invalid header/values are given.
        // these are not user configurable so it cannot fail dynamically
         .expect("building response payload should not fail");
@@ -514,43 +645,203 @@ async fn handle_inner(
     // count the egress bytes - we miss the TLS and header overhead but oh well...
     // moving this later in the stack is going to be a lot of effort and ehhhh
     metrics.record_egress(len as u64);
+    Metrics::get()
+        .proxy
+        .http_conn_content_length_bytes
+        .observe(HttpDirection::Response, len as f64);
 
     Ok(response)
 }
 
+impl QueryData {
+    async fn process(
+        self,
+        cancel: CancellationToken,
+        client: &mut Client<tokio_postgres::Client>,
+        parsed_headers: HttpHeaders,
+    ) -> Result<String, SqlOverHttpError> {
+        let (inner, mut discard) = client.inner();
+        let cancel_token = inner.cancel_token();
+
+        let res = match select(
+            pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)),
+            pin!(cancel.cancelled()),
+        )
+        .await
+        {
+            // The query successfully completed.
+            Either::Left((Ok((status, results)), __not_yet_cancelled)) => {
+                discard.check_idle(status);
+
+                let json_output =
+                    serde_json::to_string(&results).expect("json serialization should not fail");
+                Ok(json_output)
+            }
+            // The query failed with an error
+            Either::Left((Err(e), __not_yet_cancelled)) => {
+                discard.discard();
+                return Err(e);
+            }
+            // The query was cancelled.
+            Either::Right((_cancelled, query)) => {
+                tracing::info!("cancelling query");
+                if let Err(err) = cancel_token.cancel_query(NoTls).await {
+                    tracing::error!(?err, "could not cancel query");
+                }
+                // wait for the query cancellation
+                match time::timeout(time::Duration::from_millis(100), query).await {
+                    // query succeeded before it was cancelled.
+                    Ok(Ok((status, results))) => {
+                        discard.check_idle(status);
+
+                        let json_output = serde_json::to_string(&results)
+                            .expect("json serialization should not fail");
+                        Ok(json_output)
+                    }
+                    // query failed or was cancelled.
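+                    // (Only a QUERY_CANCELED SqlState proves the server aborted
+                    // the statement; any other error leaves the connection in an
+                    // unknown state, so it is discarded below.)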
+                    Ok(Err(error)) => {
+                        let db_error = match &error {
+                            SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
+                            | SqlOverHttpError::Postgres(e) => e.as_db_error(),
+                            _ => None,
+                        };
+
+                        // if errored for some other reason, it might not be safe to return
+                        if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) {
+                            discard.discard();
+                        }
+
+                        Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres))
+                    }
+                    Err(_timeout) => {
+                        discard.discard();
+                        Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres))
+                    }
+                }
+            }
+        };
+        res
+    }
+}
+
+impl BatchQueryData {
+    async fn process(
+        self,
+        cancel: CancellationToken,
+        client: &mut Client<tokio_postgres::Client>,
+        parsed_headers: HttpHeaders,
+    ) -> Result<String, SqlOverHttpError> {
+        info!("starting transaction");
+        let (inner, mut discard) = client.inner();
+        let cancel_token = inner.cancel_token();
+        let mut builder = inner.build_transaction();
+        if let Some(isolation_level) = parsed_headers.txn_isolation_level {
+            builder = builder.isolation_level(isolation_level);
+        }
+        if parsed_headers.txn_read_only {
+            builder = builder.read_only(true);
+        }
+        if parsed_headers.txn_deferrable {
+            builder = builder.deferrable(true);
+        }
+
+        let transaction = builder.start().await.map_err(|e| {
+            // if we cannot start a transaction, we should return immediately
+            // and not return to the pool. connection is clearly broken
+            discard.discard();
+            e
+        })?;
+
+        let json_output =
+            match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
+                Ok(json_output) => {
+                    info!("commit");
+                    let status = transaction.commit().await.map_err(|e| {
+                        // if we cannot commit - for now don't return connection to pool
+                        // TODO: get a query status from the error
+                        discard.discard();
+                        e
+                    })?;
+                    discard.check_idle(status);
+                    json_output
+                }
+                Err(SqlOverHttpError::Cancelled(_)) => {
+                    if let Err(err) = cancel_token.cancel_query(NoTls).await {
+                        tracing::error!(?err, "could not cancel query");
+                    }
+                    // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe.
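+                    // Until then, conservatively drop the connection rather than
+                    // return a possibly-dirty one to the pool.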
+                    discard.discard();
+
+                    return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
+                }
+                Err(err) => {
+                    info!("rollback");
+                    let status = transaction.rollback().await.map_err(|e| {
+                        // if we cannot rollback - for now don't return connection to pool
+                        // TODO: get a query status from the error
+                        discard.discard();
+                        e
+                    })?;
+                    discard.check_idle(status);
+                    return Err(err);
+                }
+            };
+
+        Ok(json_output)
+    }
+}
+
 async fn query_batch(
+    cancel: CancellationToken,
     transaction: &Transaction<'_>,
     queries: BatchQueryData,
-    total_size: &mut usize,
-    raw_output: bool,
-    array_mode: bool,
-) -> anyhow::Result<Vec<Value>> {
+    parsed_headers: HttpHeaders,
+) -> Result<String, SqlOverHttpError> {
     let mut results = Vec::with_capacity(queries.queries.len());
     let mut current_size = 0;
     for stmt in queries.queries {
-        // TODO: maybe we should check that the transaction bit is set here
-        let (_, values) =
-            query_to_json(transaction, stmt, &mut current_size, raw_output, array_mode).await?;
-        results.push(values);
+        let query = pin!(query_to_json(
+            transaction,
+            stmt,
+            &mut current_size,
+            parsed_headers,
+        ));
+        let cancelled = pin!(cancel.cancelled());
+        let res = select(query, cancelled).await;
+        match res {
+            // TODO: maybe we should check that the transaction bit is set here
+            Either::Left((Ok((_, values)), _cancelled)) => {
+                results.push(values);
+            }
+            Either::Left((Err(e), _cancelled)) => {
+                return Err(e);
+            }
+            Either::Right((_cancelled, _)) => {
+                return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
+            }
+        }
     }
-    *total_size += current_size;
-    Ok(results)
+
+    let results = json!({ "results": results });
+    let json_output = serde_json::to_string(&results).expect("json serialization should not fail");
+
+    Ok(json_output)
 }
 
 async fn query_to_json<T: GenericClient>(
     client: &T,
     data: QueryData,
     current_size: &mut usize,
-    raw_output: bool,
-    array_mode: bool,
-) -> anyhow::Result<(ReadyForQueryStatus, Value)> {
-    let query_params = json_to_pg_text(data.params);
-    let row_stream = client.query_raw_txt(&data.query, query_params).await?;
+    parsed_headers: HttpHeaders,
+) -> Result<(ReadyForQueryStatus, impl Serialize), SqlOverHttpError> {
+    info!("executing query");
+    let query_params = data.params;
+    let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?);
+    info!("finished executing query");
 
     // Manually drain the stream into a vector to leave row_stream hanging
     // around to get a command tag. Also check that the response is not too
     // big.
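+    // (Rows are buffered in memory; current_size accumulates their text size so
+    // the MAX_RESPONSE_SIZE check below can reject oversized results.)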
-    pin_mut!(row_stream);
     let mut rows: Vec<tokio_postgres::Row> = Vec::new();
     while let Some(row) = row_stream.next().await {
         let row = row?;
@@ -559,9 +850,7 @@ async fn query_to_json<T: GenericClient>(
         // we don't have a streaming response support yet so this is to prevent OOM
         // from a malicious query (eg a cross join)
         if *current_size > MAX_RESPONSE_SIZE {
-            return Err(anyhow::anyhow!(
-                "response is too large (max is {MAX_RESPONSE_SIZE} bytes)"
-            ));
+            return Err(SqlOverHttpError::ResponseTooLarge);
         }
     }
 
@@ -580,13 +869,21 @@ async fn query_to_json<T: GenericClient>(
     }
     .and_then(|s| s.parse::<i64>().ok());
 
-    let mut fields = vec![];
-    let mut columns = vec![];
+    info!(
+        rows = rows.len(),
+        ?ready,
+        command_tag,
+        "finished reading rows"
+    );
+
+    let columns_len = row_stream.columns().len();
+    let mut fields = Vec::with_capacity(columns_len);
+    let mut columns = Vec::with_capacity(columns_len);
 
     for c in row_stream.columns() {
         fields.push(json!({
-            "name": Value::String(c.name().to_owned()),
-            "dataTypeID": Value::Number(c.type_().oid().into()),
+            "name": c.name().to_owned(),
+            "dataTypeID": c.type_().oid(),
             "tableID": c.table_oid(),
             "columnID": c.column_id(),
             "dataTypeSize": c.type_size(),
@@ -596,407 +893,22 @@ async fn query_to_json<T: GenericClient>(
         columns.push(client.get_type(c.type_oid()).await?);
     }
 
+    let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode);
+
     // convert rows to JSON
     let rows = rows
         .iter()
-        .map(|row| pg_text_row_to_json(row, &columns, raw_output, array_mode))
+        .map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode))
         .collect::<Result<Vec<_>, _>>()?;
 
-    // resulting JSON format is based on the format of node-postgres result
-    Ok((
-        ready,
-        json!({
-            "command": command_tag_name,
-            "rowCount": command_tag_count,
-            "rows": rows,
-            "fields": fields,
-            "rowAsArray": array_mode,
-        }),
-    ))
-}
-
-//
-// Convert postgres row with text-encoded values to JSON object
-//
-pub fn pg_text_row_to_json(
-    row: &Row,
-    columns: &[Type],
-    raw_output: bool,
-    array_mode: bool,
-) -> Result<Value, anyhow::Error> {
-    let iter = row
-        .columns()
-        .iter()
-        .zip(columns)
-        .enumerate()
-        .map(|(i, (column, typ))| {
-            let name = column.name();
-            let pg_value = row.as_text(i)?;
-            let json_value = if raw_output {
-                match pg_value {
-                    Some(v) => Value::String(v.to_string()),
-                    None => Value::Null,
-                }
-            } else {
-                pg_text_to_json(pg_value, typ)?
-            };
-            Ok((name.to_string(), json_value))
-        });
-
-    if array_mode {
-        // drop keys and aggregate into array
-        let arr = iter
-            .map(|r| r.map(|(_key, val)| val))
-            .collect::<Result<Vec<Value>, anyhow::Error>>()?;
-        Ok(Value::Array(arr))
-    } else {
-        let obj = iter.collect::<Result<Map<String, Value>, anyhow::Error>>()?;
-        Ok(Value::Object(obj))
-    }
-}
-
-//
-// Convert postgres text-encoded value to JSON value
-//
-pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, anyhow::Error> {
-    if let Some(val) = pg_value {
-        if let Kind::Array(elem_type) = pg_type.kind() {
-            return pg_array_parse(val, elem_type);
-        }
-
-        match *pg_type {
-            Type::BOOL => Ok(Value::Bool(val == "t")),
-            Type::INT2 | Type::INT4 => {
-                let val = val.parse::<i32>()?;
-                Ok(Value::Number(serde_json::Number::from(val)))
-            }
-            Type::FLOAT4 | Type::FLOAT8 => {
-                let fval = val.parse::<f64>()?;
-                let num = serde_json::Number::from_f64(fval);
-                if let Some(num) = num {
-                    Ok(Value::Number(num))
-                } else {
-                    // Pass Nan, Inf, -Inf as strings
-                    // JS JSON.stringify() does converts them to null, but we
-                    // want to preserve them, so we pass them as strings
-                    Ok(Value::String(val.to_string()))
-                }
-            }
-            Type::JSON | Type::JSONB => Ok(serde_json::from_str(val)?),
-            _ => Ok(Value::String(val.to_string())),
-        }
-    } else {
-        Ok(Value::Null)
-    }
-}
-
-//
-// Parse postgres array into JSON array.
-//
-// This is a bit involved because we need to handle nested arrays and quoted
-// values. Unlike postgres we don't check that all nested arrays have the same
-// dimensions, we just return them as is.
-//
-fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result<Value, anyhow::Error> {
-    _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v)
-}
-
-fn _pg_array_parse(
-    pg_array: &str,
-    elem_type: &Type,
-    nested: bool,
-) -> Result<(Value, usize), anyhow::Error> {
-    let mut pg_array_chr = pg_array.char_indices();
-    let mut level = 0;
-    let mut quote = false;
-    let mut entries: Vec<Value> = Vec::new();
-    let mut entry = String::new();
-
-    // skip bounds decoration
-    if let Some('[') = pg_array.chars().next() {
-        for (_, c) in pg_array_chr.by_ref() {
-            if c == '=' {
-                break;
-            }
-        }
-    }
-
-    fn push_checked(
-        entry: &mut String,
-        entries: &mut Vec<Value>,
-        elem_type: &Type,
-    ) -> Result<(), anyhow::Error> {
-        if !entry.is_empty() {
-            // While in usual postgres response we get nulls as None and everything else
-            // as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while
-            // string with value 'NULL' will be represented by '"NULL"'). So catch NULLs
-            // here while we have quotation info and convert them to None.
- if entry == "NULL" { - entries.push(pg_text_to_json(None, elem_type)?); - } else { - entries.push(pg_text_to_json(Some(entry), elem_type)?); - } - entry.clear(); - } - - Ok(()) - } - - while let Some((mut i, mut c)) = pg_array_chr.next() { - let mut escaped = false; - - if c == '\\' { - escaped = true; - (i, c) = pg_array_chr.next().unwrap(); - } - - match c { - '{' if !quote => { - level += 1; - if level > 1 { - let (res, off) = _pg_array_parse(&pg_array[i..], elem_type, true)?; - entries.push(res); - for _ in 0..off - 1 { - pg_array_chr.next(); - } - } - } - '}' if !quote => { - level -= 1; - if level == 0 { - push_checked(&mut entry, &mut entries, elem_type)?; - if nested { - return Ok((Value::Array(entries), i)); - } - } - } - '"' if !escaped => { - if quote { - // end of quoted string, so push it manually without any checks - // for emptiness or nulls - entries.push(pg_text_to_json(Some(&entry), elem_type)?); - entry.clear(); - } - quote = !quote; - } - ',' if !quote => { - push_checked(&mut entry, &mut entries, elem_type)?; - } - _ => { - entry.push(c); - } - } - } - - if level != 0 { - return Err(anyhow::anyhow!("unbalanced array")); - } - - Ok((Value::Array(entries), 0)) -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn test_atomic_types_to_pg_params() { - let json = vec![Value::Bool(true), Value::Bool(false)]; - let pg_params = json_to_pg_text(json); - assert_eq!( - pg_params, - vec![Some("true".to_owned()), Some("false".to_owned())] - ); - - let json = vec![Value::Number(serde_json::Number::from(42))]; - let pg_params = json_to_pg_text(json); - assert_eq!(pg_params, vec![Some("42".to_owned())]); - - let json = vec![Value::String("foo\"".to_string())]; - let pg_params = json_to_pg_text(json); - assert_eq!(pg_params, vec![Some("foo\"".to_owned())]); - - let json = vec![Value::Null]; - let pg_params = json_to_pg_text(json); - assert_eq!(pg_params, vec![None]); - } - - #[test] - fn test_json_array_to_pg_array() { - // atoms and escaping - let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]"; - let json: Value = serde_json::from_str(json).unwrap(); - let pg_params = json_to_pg_text(vec![json]); - assert_eq!( - pg_params, - vec![Some( - "{true,false,NULL,\"NULL\",42,\"foo\",\"bar\\\"-\\\\\"}".to_owned() - )] - ); - - // nested arrays - let json = "[[true, false], [null, 42], [\"foo\", \"bar\\\"-\\\\\"]]"; - let json: Value = serde_json::from_str(json).unwrap(); - let pg_params = json_to_pg_text(vec![json]); - assert_eq!( - pg_params, - vec![Some( - "{{true,false},{NULL,42},{\"foo\",\"bar\\\"-\\\\\"}}".to_owned() - )] - ); - // array of objects - let json = r#"[{"foo": 1},{"bar": 2}]"#; - let json: Value = serde_json::from_str(json).unwrap(); - let pg_params = json_to_pg_text(vec![json]); - assert_eq!( - pg_params, - vec![Some(r#"{"{\"foo\":1}","{\"bar\":2}"}"#.to_owned())] - ); - } - - #[test] - fn test_atomic_types_parse() { - assert_eq!( - pg_text_to_json(Some("foo"), &Type::TEXT).unwrap(), - json!("foo") - ); - assert_eq!(pg_text_to_json(None, &Type::TEXT).unwrap(), json!(null)); - assert_eq!(pg_text_to_json(Some("42"), &Type::INT4).unwrap(), json!(42)); - assert_eq!(pg_text_to_json(Some("42"), &Type::INT2).unwrap(), json!(42)); - assert_eq!( - pg_text_to_json(Some("42"), &Type::INT8).unwrap(), - json!("42") - ); - assert_eq!( - pg_text_to_json(Some("42.42"), &Type::FLOAT8).unwrap(), - json!(42.42) - ); - assert_eq!( - pg_text_to_json(Some("42.42"), &Type::FLOAT4).unwrap(), - json!(42.42) - ); - assert_eq!( - 
pg_text_to_json(Some("NaN"), &Type::FLOAT4).unwrap(), - json!("NaN") - ); - assert_eq!( - pg_text_to_json(Some("Infinity"), &Type::FLOAT4).unwrap(), - json!("Infinity") - ); - assert_eq!( - pg_text_to_json(Some("-Infinity"), &Type::FLOAT4).unwrap(), - json!("-Infinity") - ); - - let json: Value = - serde_json::from_str("{\"s\":\"str\",\"n\":42,\"f\":4.2,\"a\":[null,3,\"a\"]}") - .unwrap(); - assert_eq!( - pg_text_to_json( - Some(r#"{"s":"str","n":42,"f":4.2,"a":[null,3,"a"]}"#), - &Type::JSONB - ) - .unwrap(), - json - ); - } - - #[test] - fn test_pg_array_parse_text() { - fn pt(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::TEXT).unwrap() - } - assert_eq!( - pt(r#"{"aa\"\\\,a",cha,"bbbb"}"#), - json!(["aa\"\\,a", "cha", "bbbb"]) - ); - assert_eq!( - pt(r#"{{"foo","bar"},{"bee","bop"}}"#), - json!([["foo", "bar"], ["bee", "bop"]]) - ); - assert_eq!( - pt(r#"{{{{"foo",NULL,"bop",bup}}}}"#), - json!([[[["foo", null, "bop", "bup"]]]]) - ); - assert_eq!( - pt(r#"{{"1",2,3},{4,NULL,6},{NULL,NULL,NULL}}"#), - json!([["1", "2", "3"], ["4", null, "6"], [null, null, null]]) - ); - } - - #[test] - fn test_pg_array_parse_bool() { - fn pb(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::BOOL).unwrap() - } - assert_eq!(pb(r#"{t,f,t}"#), json!([true, false, true])); - assert_eq!(pb(r#"{{t,f,t}}"#), json!([[true, false, true]])); - assert_eq!( - pb(r#"{{t,f},{f,t}}"#), - json!([[true, false], [false, true]]) - ); - assert_eq!( - pb(r#"{{t,NULL},{NULL,f}}"#), - json!([[true, null], [null, false]]) - ); - } - - #[test] - fn test_pg_array_parse_numbers() { - fn pn(pg_arr: &str, ty: &Type) -> Value { - pg_array_parse(pg_arr, ty).unwrap() - } - assert_eq!(pn(r#"{1,2,3}"#, &Type::INT4), json!([1, 2, 3])); - assert_eq!(pn(r#"{1,2,3}"#, &Type::INT2), json!([1, 2, 3])); - assert_eq!(pn(r#"{1,2,3}"#, &Type::INT8), json!(["1", "2", "3"])); - assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT4), json!([1.0, 2.0, 3.0])); - assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT8), json!([1.0, 2.0, 3.0])); - assert_eq!( - pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT4), - json!([1.1, 2.2, 3.3]) - ); - assert_eq!( - pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT8), - json!([1.1, 2.2, 3.3]) - ); - assert_eq!( - pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT4), - json!(["NaN", "Infinity", "-Infinity"]) - ); - assert_eq!( - pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT8), - json!(["NaN", "Infinity", "-Infinity"]) - ); - } - - #[test] - fn test_pg_array_with_decoration() { - fn p(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::INT2).unwrap() - } - assert_eq!( - p(r#"[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}"#), - json!([[[1, 2, 3], [4, 5, 6]]]) - ); - } - #[test] - fn test_pg_array_parse_json() { - fn pt(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::JSONB).unwrap() - } - assert_eq!(pt(r#"{"{}"}"#), json!([{}])); - assert_eq!( - pt(r#"{"{\"foo\": 1, \"bar\": 2}"}"#), - json!([{"foo": 1, "bar": 2}]) - ); - assert_eq!( - pt(r#"{"{\"foo\": 1}", "{\"bar\": 2}"}"#), - json!([{"foo": 1}, {"bar": 2}]) - ); - assert_eq!( - pt(r#"{{"{\"foo\": 1}", "{\"bar\": 2}"}}"#), - json!([[{"foo": 1}, {"bar": 2}]]) - ); - } + // Resulting JSON format is based on the format of node-postgres result. 
+ let results = json!({ + "command": command_tag_name.to_string(), + "rowCount": command_tag_count, + "rows": rows, + "fields": fields, + "rowAsArray": array_mode, + }); + + Ok((ready, results)) } diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index a6529c920a..3d257223b8 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,15 +1,19 @@ +use crate::proxy::ErrorSource; use crate::{ - cancellation::CancelMap, + cancellation::CancellationHandlerMain, config::ProxyConfig, context::RequestMonitoring, - error::io_error, + error::{io_error, ReportableError}, + metrics::Metrics, proxy::{handle_client, ClientMode}, rate_limiter::EndpointRateLimiter, }; -use bytes::{Buf, Bytes}; +use anyhow::Context as _; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +use framed_websockets::{Frame, OpCode, WebSocketServer}; use futures::{Sink, Stream}; -use hyper::upgrade::Upgraded; -use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream}; +use hyper1::upgrade::OnUpgrade; +use hyper_util::rt::TokioIo; use pin_project_lite::pin_project; use std::{ @@ -20,25 +24,23 @@ use std::{ use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; use tracing::warn; -// TODO: use `std::sync::Exclusive` once it's stabilized. -// Tracking issue: https://github.com/rust-lang/rust/issues/98407. -use sync_wrapper::SyncWrapper; - pin_project! { /// This is a wrapper around a [`WebSocketStream`] that /// implements [`AsyncRead`] and [`AsyncWrite`]. - pub struct WebSocketRw { + pub(crate) struct WebSocketRw { #[pin] - stream: SyncWrapper>, - bytes: Bytes, + stream: WebSocketServer, + recv: Bytes, + send: BytesMut, } } impl WebSocketRw { - pub fn new(stream: WebSocketStream) -> Self { + pub(crate) fn new(stream: WebSocketServer) -> Self { Self { - stream: stream.into(), - bytes: Bytes::new(), + stream, + recv: Bytes::new(), + send: BytesMut::new(), } } } @@ -49,22 +51,25 @@ impl AsyncWrite for WebSocketRw { cx: &mut Context<'_>, buf: &[u8], ) -> Poll> { - let mut stream = self.project().stream.get_pin_mut(); + let this = self.project(); + let mut stream = this.stream; ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?; - match stream.as_mut().start_send(Message::Binary(buf.into())) { + + this.send.put(buf); + match stream.as_mut().start_send(Frame::binary(this.send.split())) { Ok(()) => Poll::Ready(Ok(buf.len())), Err(e) => Poll::Ready(Err(io_error(e))), } } fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let stream = self.project().stream.get_pin_mut(); + let stream = self.project().stream; stream.poll_flush(cx).map_err(io_error) } fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let stream = self.project().stream.get_pin_mut(); + let stream = self.project().stream; stream.poll_close(cx).map_err(io_error) } } @@ -75,13 +80,10 @@ impl AsyncRead for WebSocketRw { cx: &mut Context<'_>, buf: &mut ReadBuf<'_>, ) -> Poll> { - if buf.remaining() > 0 { - let bytes = ready!(self.as_mut().poll_fill_buf(cx))?; - let len = std::cmp::min(bytes.len(), buf.remaining()); - buf.put_slice(&bytes[..len]); - self.consume(len); - } - + let bytes = ready!(self.as_mut().poll_fill_buf(cx))?; + let len = std::cmp::min(bytes.len(), buf.remaining()); + buf.put_slice(&bytes[..len]); + self.consume(len); Poll::Ready(Ok(())) } } @@ -93,31 +95,27 @@ impl AsyncBufRead for WebSocketRw { let mut this = self.project(); loop { - if !this.bytes.chunk().is_empty() { - let chunk = (*this.bytes).chunk(); + if 
!this.recv.chunk().is_empty() { + let chunk = (*this.recv).chunk(); return Poll::Ready(Ok(chunk)); } - let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx)); + let res = ready!(this.stream.as_mut().poll_next(cx)); match res.transpose().map_err(io_error)? { - Some(message) => match message { - Message::Ping(_) => {} - Message::Pong(_) => {} - Message::Text(text) => { + Some(message) => match message.opcode { + OpCode::Ping => {} + OpCode::Pong => {} + OpCode::Text => { // We expect to see only binary messages. let error = "unexpected text message in the websocket"; - warn!(length = text.len(), error); + warn!(length = message.payload.len(), error); return Poll::Ready(Err(io_error(error))); } - Message::Frame(_) => { - // This case is impossible according to Frame's doc. - panic!("unexpected raw frame in the websocket"); + OpCode::Binary | OpCode::Continuation => { + debug_assert!(this.recv.is_empty()); + *this.recv = message.payload.freeze(); } - Message::Binary(chunk) => { - assert!(this.bytes.is_empty()); - *this.bytes = Bytes::from(chunk); - } - Message::Close(_) => return EOF, + OpCode::Close => return EOF, }, None => return EOF, } @@ -125,44 +123,73 @@ impl AsyncBufRead for WebSocketRw { } fn consume(self: Pin<&mut Self>, amount: usize) { - self.project().bytes.advance(amount); + self.project().recv.advance(amount); } } -pub async fn serve_websocket( +pub(crate) async fn serve_websocket( config: &'static ProxyConfig, - ctx: &mut RequestMonitoring, - websocket: HyperWebsocket, - cancel_map: &CancelMap, - hostname: Option, + ctx: RequestMonitoring, + websocket: OnUpgrade, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, + hostname: Option, ) -> anyhow::Result<()> { let websocket = websocket.await?; - handle_client( + let websocket = WebSocketServer::after_handshake(TokioIo::new(websocket)); + + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Ws); + + let res = Box::pin(handle_client( config, - ctx, - cancel_map, + &ctx, + cancellation_handler, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, endpoint_rate_limiter, - ) - .await?; - Ok(()) + conn_gauge, + )) + .await; + + match res { + Err(e) => { + // todo: log and push to ctx the error kind + ctx.set_error_kind(e.get_error_kind()); + Err(e.into()) + } + Ok(None) => { + ctx.set_success(); + Ok(()) + } + Ok(Some(p)) => { + ctx.set_success(); + ctx.log_connect(); + match p.proxy_pass().await { + Ok(()) => Ok(()), + Err(ErrorSource::Client(err)) => Err(err).context("client"), + Err(ErrorSource::Compute(err)) => Err(err).context("compute"), + } + } + } } #[cfg(test)] mod tests { use std::pin::pin; + use framed_websockets::WebSocketServer; use futures::{SinkExt, StreamExt}; - use hyper_tungstenite::{ - tungstenite::{protocol::Role, Message}, - WebSocketStream, - }; use tokio::{ io::{duplex, AsyncReadExt, AsyncWriteExt}, task::JoinSet, }; + use tokio_tungstenite::{ + tungstenite::{protocol::Role, Message}, + WebSocketStream, + }; use super::WebSocketRw; @@ -187,9 +214,7 @@ mod tests { }); js.spawn(async move { - let mut rw = pin!(WebSocketRw::new( - WebSocketStream::from_raw_socket(stream2, Role::Server, None).await - )); + let mut rw = pin!(WebSocketRw::new(WebSocketServer::after_handshake(stream2))); let mut buf = vec![0; 1024]; let n = rw.read(&mut buf).await.unwrap(); diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index f48b3fe39f..c14dd18afe 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,6 +1,6 @@ use 
crate::config::TlsServerEndPoint;
-use crate::error::UserFacingError;
-use anyhow::bail;
+use crate::error::{ErrorKind, ReportableError, UserFacingError};
+use crate::metrics::Metrics;
 use bytes::BytesMut;
 use pq_proto::framed::{ConnectionError, Framed};
@@ -14,6 +14,7 @@
 use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
 use tokio_rustls::server::TlsStream;
 
 /// Stream wrapper which implements libpq's protocol.
+///
 /// NOTE: This object deliberately doesn't implement [`AsyncRead`]
 /// or [`AsyncWrite`] to prevent subtle errors (e.g. trying
 /// to pass random malformed bytes through the connection).
@@ -35,7 +36,7 @@ impl<S> PqStream<S> {
     }
 
     /// Get a shared reference to the underlying stream.
-    pub fn get_ref(&self) -> &S {
+    pub(crate) fn get_ref(&self) -> &S {
         self.framed.get_ref()
     }
 }
@@ -62,20 +63,47 @@ impl<S: AsyncRead + Unpin> PqStream<S> {
             .ok_or_else(err_connection)
     }
 
-    pub async fn read_password_message(&mut self) -> io::Result<bytes::Bytes> {
+    pub(crate) async fn read_password_message(&mut self) -> io::Result<bytes::Bytes> {
         match self.read_message().await? {
             FeMessage::PasswordMessage(msg) => Ok(msg),
             bad => Err(io::Error::new(
                 io::ErrorKind::InvalidData,
-                format!("unexpected message type: {:?}", bad),
+                format!("unexpected message type: {bad:?}"),
             )),
         }
     }
 }
 
+#[derive(Debug)]
+pub struct ReportedError {
+    source: anyhow::Error,
+    error_kind: ErrorKind,
+}
+
+impl std::fmt::Display for ReportedError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.source.fmt(f)
+    }
+}
+
+impl std::error::Error for ReportedError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        self.source.source()
+    }
+}
+
+impl ReportableError for ReportedError {
+    fn get_error_kind(&self) -> ErrorKind {
+        self.error_kind
+    }
+}
+
 impl<S: AsyncWrite + Unpin> PqStream<S> {
     /// Write the message into an internal buffer, but don't flush the underlying stream.
-    pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> {
+    pub(crate) fn write_message_noflush(
+        &mut self,
+        message: &BeMessage<'_>,
+    ) -> io::Result<&mut Self> {
         self.framed
             .write_message(message)
             .map_err(ProtocolError::into_io_error)?;
@@ -90,7 +118,7 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
     }
 
     /// Flush the output buffer into the underlying stream.
-    pub async fn flush(&mut self) -> io::Result<&mut Self> {
+    pub(crate) async fn flush(&mut self) -> io::Result<&mut Self> {
         self.framed.flush().await?;
         Ok(self)
     }
@@ -98,24 +126,52 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
     /// Write the error message using [`Self::write_message`], then re-throw it.
     /// Allowing string literals is safe under the assumption they might not contain any runtime info.
     /// This method exists due to `&str` not implementing `Into<anyhow::Error>`.
-    pub async fn throw_error_str<T>(&mut self, error: &'static str) -> anyhow::Result<T> {
-        tracing::info!("forwarding error to user: {error}");
-        self.write_message(&BeMessage::ErrorResponse(error, None))
-            .await?;
-        bail!(error)
+    pub async fn throw_error_str<T>(
+        &mut self,
+        msg: &'static str,
+        error_kind: ErrorKind,
+    ) -> Result<T, ReportedError> {
+        tracing::info!(
+            kind = error_kind.to_metric_label(),
+            msg,
+            "forwarding error to user"
+        );
+
+        // already error case, ignore client IO error
+        let _: Result<_, std::io::Error> = self
+            .write_message(&BeMessage::ErrorResponse(msg, None))
+            .await;
+
+        Err(ReportedError {
+            source: anyhow::anyhow!(msg),
+            error_kind,
+        })
     }
 
     /// Write the error message using [`Self::write_message`], then re-throw it.
     /// Trait [`UserFacingError`] acts as an allowlist for error types.
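The `ReportedError` introduced above is the core of this change: one error object that displays like the wrapped `anyhow::Error` while also carrying an `ErrorKind` for metric labels. A minimal standalone condensation of that wrap-and-classify pattern follows; the `ErrorKind` enum and `classify` helper here are stand-ins for illustration, not the real items from `proxy/src/error.rs`, and only the `anyhow` crate is assumed:

```rust
use std::fmt;

/// Stand-in for the proxy's `ErrorKind` (hypothetical, simplified).
#[derive(Debug, Clone, Copy)]
enum ErrorKind {
    User,
    Compute,
}

impl ErrorKind {
    fn to_metric_label(self) -> &'static str {
        match self {
            ErrorKind::User => "user",
            ErrorKind::Compute => "compute",
        }
    }
}

/// An error that has already been classified (and logged) for metrics.
#[derive(Debug)]
struct ReportedError {
    source: anyhow::Error,
    error_kind: ErrorKind,
}

impl fmt::Display for ReportedError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Display exactly like the wrapped error.
        fmt::Display::fmt(&self.source, f)
    }
}

impl std::error::Error for ReportedError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        // Forward to the wrapped error so cause chains stay walkable.
        use std::error::Error as _;
        self.source.source()
    }
}

fn classify(err: anyhow::Error, error_kind: ErrorKind) -> ReportedError {
    // Log once at the point of classification; callers just propagate.
    eprintln!("kind={} error={err:#}", error_kind.to_metric_label());
    ReportedError { source: err, error_kind }
}

fn main() {
    let e = classify(
        anyhow::anyhow!("password authentication failed"),
        ErrorKind::User,
    );
    println!("classified as: {:?}", e.error_kind);
}
```

Because the wrapper implements `std::error::Error` and forwards `source()`, code upstream of the classification point can still inspect the original cause chain.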
-    pub async fn throw_error<T, E>(&mut self, error: E) -> anyhow::Result<T>
+    pub(crate) async fn throw_error<T, E>(&mut self, error: E) -> Result<T, ReportedError>
     where
         E: UserFacingError + Into<anyhow::Error>,
     {
+        let error_kind = error.get_error_kind();
         let msg = error.to_string_client();
-        tracing::info!("forwarding error to user: {msg}");
-        self.write_message(&BeMessage::ErrorResponse(&msg, None))
-            .await?;
-        bail!(error)
+        tracing::info!(
+            kind=error_kind.to_metric_label(),
+            error=%error,
+            msg,
+            "forwarding error to user"
+        );
+
+        // already error case, ignore client IO error
+        let _: Result<_, std::io::Error> = self
+            .write_message(&BeMessage::ErrorResponse(&msg, None))
+            .await;
+
+        Err(ReportedError {
+            source: anyhow::anyhow!(error),
+            error_kind,
+        })
     }
 }
 
@@ -148,7 +204,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Stream<S> {
         }
     }
 
-    pub fn tls_server_end_point(&self) -> TlsServerEndPoint {
+    pub(crate) fn tls_server_end_point(&self) -> TlsServerEndPoint {
         match self {
             Stream::Raw { .. } => TlsServerEndPoint::Undefined,
             Stream::Tls {
@@ -171,9 +227,20 @@ pub enum StreamUpgradeError {
 
 impl<S: AsyncRead + AsyncWrite + Unpin> Stream<S> {
     /// If possible, upgrade raw stream into a secure TLS-based stream.
-    pub async fn upgrade(self, cfg: Arc<rustls::ServerConfig>) -> Result<TlsStream<S>, StreamUpgradeError> {
+    pub async fn upgrade(
+        self,
+        cfg: Arc<rustls::ServerConfig>,
+        record_handshake_error: bool,
+    ) -> Result<TlsStream<S>, StreamUpgradeError> {
         match self {
-            Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg).accept(raw).await?),
+            Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg)
+                .accept(raw)
+                .await
+                .inspect_err(|_| {
+                    if record_handshake_error {
+                        Metrics::get().proxy.tls_handshake_failures.inc();
+                    }
+                })?),
             Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls),
         }
     }
diff --git a/proxy/src/url.rs b/proxy/src/url.rs
index 92c64bb8ad..28ac7efdfc 100644
--- a/proxy/src/url.rs
+++ b/proxy/src/url.rs
@@ -7,12 +7,12 @@ pub struct ApiUrl(url::Url);
 
 impl ApiUrl {
     /// Consume the wrapper and return inner [url](url::Url).
-    pub fn into_inner(self) -> url::Url {
+    pub(crate) fn into_inner(self) -> url::Url {
         self.0
     }
 
     /// See [`url::Url::path_segments_mut`].
-    pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut {
+    pub(crate) fn path_segments_mut(&mut self) -> url::PathSegmentsMut<'_> {
         // We've already verified that it works during construction.
         self.0.path_segments_mut().expect("bad API url")
     }
diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs
index d75aedf89b..fd8599bcb3 100644
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -1,24 +1,40 @@
 //! Periodically collect proxy consumption metrics
 //! and push them to a HTTP endpoint.
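For orientation before the hunks below: this module keeps per-endpoint atomic counters, drains them on a timer, and POSTs the drained values as JSON event chunks. A stripped-down sketch of that collect-and-push loop, with hypothetical stand-in types (`Chunk` is not the real `consumption_metrics` type), assuming `tokio`, `serde` with derive, and `reqwest` with the `json` feature:

```rust
use std::time::Duration;

use serde::Serialize;

// Hypothetical stand-in for one reporting interval's payload; the real
// module attaches per-endpoint `Ids` to each event.
#[derive(Serialize)]
struct Chunk {
    events: Vec<u64>,
}

// Stand-in for draining the per-endpoint atomic counters.
fn drain_counters() -> Chunk {
    Chunk { events: vec![42] }
}

#[tokio::main]
async fn main() {
    let client = reqwest::Client::new();
    let endpoint = "http://localhost:9999/usage"; // made-up endpoint
    let mut ticker = tokio::time::interval(Duration::from_secs(60));
    loop {
        ticker.tick().await;
        // Counters are drained before the POST, so a failed request drops
        // this interval's data rather than double-reporting it later.
        let chunk = drain_counters();
        match client.post(endpoint).json(&chunk).send().await {
            Ok(res) if !res.status().is_success() => {
                eprintln!("metrics endpoint refused metrics: {}", res.status());
            }
            Ok(_) => {}
            Err(e) => eprintln!("failed to send metrics: {e}"),
        }
    }
}
```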
-use crate::{config::MetricCollectionConfig, http, BranchId, EndpointId}; -use chrono::{DateTime, Utc}; +use crate::{ + config::{MetricBackupCollectionConfig, MetricCollectionConfig}, + context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, + http, + intern::{BranchIdInt, EndpointIdInt}, +}; +use anyhow::Context; +use async_compression::tokio::write::GzipEncoder; +use bytes::Bytes; +use chrono::{DateTime, Datelike, Timelike, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use dashmap::{mapref::entry::Entry, DashMap}; +use futures::future::select; use once_cell::sync::Lazy; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; use std::{ convert::Infallible, + pin::pin, sync::{ atomic::{AtomicU64, AtomicUsize, Ordering}, Arc, }, time::Duration, }; +use tokio::io::AsyncWriteExt; +use tokio_util::sync::CancellationToken; use tracing::{error, info, instrument, trace}; +use utils::backoff; +use uuid::{NoContext, Timestamp}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; -const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); +const HTTP_REPORTING_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); +const HTTP_REPORTING_RETRY_DURATION: Duration = Duration::from_secs(60); /// Key that uniquely identifies the object, this metric describes. /// Currently, endpoint_id is enough, but this may change later, @@ -28,24 +44,98 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); /// so while the project-id is unique across regions the whole pipeline will work correctly /// because we enrich the event with project_id in the control-plane endpoint. #[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)] -pub struct Ids { - pub endpoint_id: EndpointId, - pub branch_id: BranchId, +pub(crate) struct Ids { + pub(crate) endpoint_id: EndpointIdInt, + pub(crate) branch_id: BranchIdInt, +} + +pub(crate) trait MetricCounterRecorder { + /// Record that some bytes were sent from the proxy to the client + fn record_egress(&self, bytes: u64); + /// Record that some connections were opened + fn record_connection(&self, count: usize); +} + +trait MetricCounterReporter { + fn get_metrics(&mut self) -> (u64, usize); + fn move_metrics(&self) -> (u64, usize); } #[derive(Debug)] -pub struct MetricCounter { +struct MetricBackupCounter { transmitted: AtomicU64, opened_connections: AtomicUsize, } -impl MetricCounter { - /// Record that some bytes were sent from the proxy to the client - pub fn record_egress(&self, bytes: u64) { +impl MetricCounterRecorder for MetricBackupCounter { + fn record_egress(&self, bytes: u64) { self.transmitted.fetch_add(bytes, Ordering::AcqRel); } + fn record_connection(&self, count: usize) { + self.opened_connections.fetch_add(count, Ordering::AcqRel); + } +} + +impl MetricCounterReporter for MetricBackupCounter { + fn get_metrics(&mut self) -> (u64, usize) { + ( + *self.transmitted.get_mut(), + *self.opened_connections.get_mut(), + ) + } + fn move_metrics(&self) -> (u64, usize) { + ( + self.transmitted.swap(0, Ordering::AcqRel), + self.opened_connections.swap(0, Ordering::AcqRel), + ) + } +} + +#[derive(Debug)] +pub(crate) struct MetricCounter { + transmitted: AtomicU64, + opened_connections: AtomicUsize, + backup: Arc, +} + +impl MetricCounterRecorder for MetricCounter { + /// Record that some bytes were sent from the proxy to the client + fn record_egress(&self, bytes: u64) { + 
self.transmitted.fetch_add(bytes, Ordering::AcqRel);
+        self.backup.record_egress(bytes);
+    }
+
+    /// Record that some connections were opened
+    fn record_connection(&self, count: usize) {
+        self.opened_connections.fetch_add(count, Ordering::AcqRel);
+        self.backup.record_connection(count);
+    }
+}
+
+impl MetricCounterReporter for MetricCounter {
+    fn get_metrics(&mut self) -> (u64, usize) {
+        (
+            *self.transmitted.get_mut(),
+            *self.opened_connections.get_mut(),
+        )
+    }
+    fn move_metrics(&self) -> (u64, usize) {
+        (
+            self.transmitted.swap(0, Ordering::AcqRel),
+            self.opened_connections.swap(0, Ordering::AcqRel),
+        )
+    }
+}
+
+trait Clearable {
+    /// extract the value that should be reported
+    fn should_report(self: &Arc<Self>) -> Option<u64>;
+    /// Determine whether the counter should be cleared from the global map.
+    fn should_clear(self: &mut Arc<Self>) -> bool;
+}
+
+impl<C: MetricCounterReporter> Clearable for C {
     fn should_report(self: &Arc<Self>) -> Option<u64> {
         // heuristic to see if the branch is still open
         // if a clone happens while we are observing, the heuristic will be incorrect.
@@ -54,13 +144,12 @@ impl MetricCounter {
         // However, for the strong count to be 1 it must have occurred that at one instant
         // all the endpoints were closed, so missing a report because the endpoints are closed is valid.
         let is_open = Arc::strong_count(self) > 1;
-        let opened = self.opened_connections.swap(0, Ordering::AcqRel);
 
         // update cached metrics eagerly, even if they can't get sent
         // (to avoid sending the same metrics twice)
         // see the relevant discussion on why to do so even if the status is not success:
         // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
-        let value = self.transmitted.swap(0, Ordering::AcqRel);
+        let (value, opened) = self.move_metrics();
 
         // Our only requirement is that we report in every interval if there was an open connection
         // if there were no opened connections since, then we don't need to report
@@ -70,15 +159,12 @@ impl MetricCounter {
             Some(value)
         }
     }
-
-    /// Determine whether the counter should be cleared from the global map.
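The `should_report` logic above leans on `Arc::strong_count` as a cheap liveness probe: the registry always holds one reference, so a strong count above one means some live connection still holds the counter, and draining with `swap(0, ...)` guarantees each byte is reported at most once. A self-contained sketch of just that heuristic (simplified; the real code sits behind the `Clearable` trait):

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

struct Counter {
    transmitted: AtomicU64,
}

/// Drain the counter; skip reporting only if nothing was sent *and*
/// nobody besides the registry still holds the counter.
fn should_report(counter: &Arc<Counter>) -> Option<u64> {
    // One strong reference is the registry itself; more means a live connection.
    let is_open = Arc::strong_count(counter) > 1;
    // Drain eagerly so the same bytes can never be reported twice.
    let value = counter.transmitted.swap(0, Ordering::AcqRel);
    if value == 0 && !is_open {
        None
    } else {
        Some(value)
    }
}

fn main() {
    let registry = Arc::new(Counter { transmitted: AtomicU64::new(0) });
    let connection = Arc::clone(&registry); // a "live connection"
    connection.transmitted.fetch_add(100, Ordering::AcqRel);
    assert_eq!(should_report(&registry), Some(100));
    drop(connection);
    assert_eq!(should_report(&registry), None);
    println!("heuristic behaves as expected");
}
```

As the original comment notes, the heuristic can misfire if a clone races with the observation, but an open connection with zero egress still reports `Some(0)`, which is the property the reporting pipeline actually needs.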
fn should_clear(self: &mut Arc) -> bool { // we can't clear this entry if it's acquired elsewhere let Some(counter) = Arc::get_mut(self) else { return false; }; - let opened = *counter.opened_connections.get_mut(); - let value = *counter.transmitted.get_mut(); + let (opened, value) = counter.get_metrics(); // clear if there's no data to report value == 0 && opened == 0 } @@ -88,13 +174,28 @@ impl MetricCounter { type FastHasher = std::hash::BuildHasherDefault; #[derive(Default)] -pub struct Metrics { +pub(crate) struct Metrics { endpoints: DashMap, FastHasher>, + backup_endpoints: DashMap, FastHasher>, } impl Metrics { /// Register a new byte metrics counter for this endpoint - pub fn register(&self, ids: Ids) -> Arc { + pub(crate) fn register(&self, ids: Ids) -> Arc { + let backup = if let Some(entry) = self.backup_endpoints.get(&ids) { + entry.clone() + } else { + self.backup_endpoints + .entry(ids.clone()) + .or_insert_with(|| { + Arc::new(MetricBackupCounter { + transmitted: AtomicU64::new(0), + opened_connections: AtomicUsize::new(0), + }) + }) + .clone() + }; + let entry = if let Some(entry) = self.endpoints.get(&ids) { entry.clone() } else { @@ -104,17 +205,18 @@ impl Metrics { Arc::new(MetricCounter { transmitted: AtomicU64::new(0), opened_connections: AtomicUsize::new(0), + backup: backup.clone(), }) }) .clone() }; - entry.opened_connections.fetch_add(1, Ordering::AcqRel); + entry.record_connection(1); entry } } -pub static USAGE_METRICS: Lazy = Lazy::new(Metrics::default); +pub(crate) static USAGE_METRICS: Lazy = Lazy::new(Metrics::default); pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result { info!("metrics collector config: {config:?}"); @@ -122,7 +224,10 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result anyhow::Result anyhow::Result, - now: DateTime, -) { - info!( - "starting collect_metrics_iteration. metric_collection_endpoint: {}", - metric_collection_endpoint - ); - +fn collect_and_clear_metrics( + endpoints: &DashMap, FastHasher>, +) -> Vec<(Ids, u64)> { let mut metrics_to_clear = Vec::new(); - let metrics_to_send: Vec<(Ids, u64)> = metrics - .endpoints + let metrics_to_send: Vec<(Ids, u64)> = endpoints .iter() .filter_map(|counter| { let key = counter.key().clone(); @@ -173,33 +266,71 @@ async fn collect_metrics_iteration( }) .collect(); + for metric in metrics_to_clear { + match endpoints.entry(metric) { + Entry::Occupied(mut counter) => { + if counter.get_mut().should_clear() { + counter.remove_entry(); + } + } + Entry::Vacant(_) => {} + } + } + metrics_to_send +} + +fn create_event_chunks<'a>( + metrics_to_send: &'a [(Ids, u64)], + hostname: &'a str, + prev: DateTime, + now: DateTime, + chunk_size: usize, +) -> impl Iterator>> + 'a { + // Split into chunks of 1000 metrics to avoid exceeding the max request size + metrics_to_send + .chunks(chunk_size) + .map(move |chunk| EventChunk { + events: chunk + .iter() + .map(|(ids, value)| Event { + kind: EventType::Incremental { + start_time: prev, + stop_time: now, + }, + metric: PROXY_IO_BYTES_PER_CLIENT, + idempotency_key: idempotency_key(hostname), + value: *value, + extra: ids.clone(), + }) + .collect(), + }) +} + +#[instrument(skip_all)] +async fn collect_metrics_iteration( + endpoints: &DashMap, FastHasher>, + client: &http::ClientWithMiddleware, + metric_collection_endpoint: &reqwest::Url, + hostname: &str, + prev: DateTime, + now: DateTime, +) { + info!( + "starting collect_metrics_iteration. 
metric_collection_endpoint: {}", + metric_collection_endpoint + ); + + let metrics_to_send = collect_and_clear_metrics(endpoints); + if metrics_to_send.is_empty() { trace!("no new metrics to send"); } // Send metrics. - // Split into chunks of 1000 metrics to avoid exceeding the max request size - for chunk in metrics_to_send.chunks(CHUNK_SIZE) { - let events = chunk - .iter() - .map(|(ids, value)| Event { - kind: EventType::Incremental { - start_time: prev, - stop_time: now, - }, - metric: PROXY_IO_BYTES_PER_CLIENT, - idempotency_key: idempotency_key(hostname), - value: *value, - extra: Ids { - endpoint_id: ids.endpoint_id.clone(), - branch_id: ids.branch_id.clone(), - }, - }) - .collect(); - + for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, CHUNK_SIZE) { let res = client .post(metric_collection_endpoint.clone()) - .json(&EventChunk { events }) + .json(&chunk) .send() .await; @@ -213,23 +344,143 @@ async fn collect_metrics_iteration( if !res.status().is_success() { error!("metrics endpoint refused the sent metrics: {:?}", res); - for metric in chunk.iter().filter(|(_, value)| *value > (1u64 << 40)) { + for metric in chunk.events.iter().filter(|e| e.value > (1u64 << 40)) { // Report if the metric value is suspiciously large error!("potentially abnormal metric value: {:?}", metric); } } } +} - for metric in metrics_to_clear { - match metrics.endpoints.entry(metric) { - Entry::Occupied(mut counter) => { - if counter.get_mut().should_clear() { - counter.remove_entry(); - } - } - Entry::Vacant(_) => {} +pub async fn task_backup( + backup_config: &MetricBackupCollectionConfig, + cancellation_token: CancellationToken, +) -> anyhow::Result<()> { + info!("metrics backup config: {backup_config:?}"); + scopeguard::defer! { + info!("metrics backup has shut down"); + } + // Even if the remote storage is not configured, we still want to clear the metrics. + let storage = if let Some(config) = backup_config.remote_storage_config.as_ref() { + Some( + GenericRemoteStorage::from_config(config) + .await + .context("remote storage init")?, + ) + } else { + None + }; + let mut ticker = tokio::time::interval(backup_config.interval); + let mut prev = Utc::now(); + let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); + loop { + select(pin!(ticker.tick()), pin!(cancellation_token.cancelled())).await; + let now = Utc::now(); + collect_metrics_backup_iteration( + &USAGE_METRICS.backup_endpoints, + &storage, + &hostname, + prev, + now, + backup_config.chunk_size, + ) + .await; + + prev = now; + if cancellation_token.is_cancelled() { + info!("metrics backup has been cancelled"); + break; } } + Ok(()) +} + +#[instrument(skip_all)] +async fn collect_metrics_backup_iteration( + endpoints: &DashMap, FastHasher>, + storage: &Option, + hostname: &str, + prev: DateTime, + now: DateTime, + chunk_size: usize, +) { + let year = now.year(); + let month = now.month(); + let day = now.day(); + let hour = now.hour(); + let minute = now.minute(); + let second = now.second(); + let cancel = CancellationToken::new(); + + info!("starting collect_metrics_backup_iteration"); + + let metrics_to_send = collect_and_clear_metrics(endpoints); + + if metrics_to_send.is_empty() { + trace!("no new metrics to send"); + } + + // Send metrics. 
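The hunk that follows uploads each chunk as gzipped JSON under a Hive-partitioned key (`year=.../month=.../day=...`) with a UUIDv7 suffix, so objects within a partition sort by creation time. A sketch of just the key construction, assuming the `chrono` and `uuid` (with the `v7` feature) crates; note that `uuid::Timestamp::from_unix` takes whole seconds since the epoch plus subsecond nanoseconds:

```rust
use chrono::{Datelike, Timelike, Utc};
use uuid::{NoContext, Timestamp};

fn main() {
    let now = Utc::now();
    // UUIDv7 embeds the timestamp, so object names sort by creation time.
    let id = uuid::Uuid::new_v7(Timestamp::from_unix(
        NoContext,
        now.timestamp() as u64, // seconds since the Unix epoch
        now.nanosecond(),
    ));
    let path = format!(
        "year={:04}/month={:02}/day={:02}/{:02}:{:02}:{:02}Z_{id}.json.gz",
        now.year(),
        now.month(),
        now.day(),
        now.hour(),
        now.minute(),
        now.second(),
    );
    println!("{path}");
}
```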
+    for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, chunk_size) {
+        let real_now = Utc::now();
+        let id = uuid::Uuid::new_v7(Timestamp::from_unix(
+            NoContext,
+            real_now.second().into(),
+            real_now.nanosecond(),
+        ));
+        let path = format!("year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z_{id}.json.gz");
+        let remote_path = match RemotePath::from_string(&path) {
+            Ok(remote_path) => remote_path,
+            Err(e) => {
+                error!("failed to create remote path from str {path}: {:?}", e);
+                continue;
+            }
+        };
+
+        let res = upload_events_chunk(storage, chunk, &remote_path, &cancel).await;
+
+        if let Err(e) = res {
+            error!(
+                "failed to upload consumption events to remote storage: {:?}",
+                e
+            );
+        }
+    }
+}
+
+async fn upload_events_chunk(
+    storage: &Option<GenericRemoteStorage>,
+    chunk: EventChunk<'_, Event>,
+    remote_path: &RemotePath,
+    cancel: &CancellationToken,
+) -> anyhow::Result<()> {
+    let Some(storage) = storage else {
+        error!("no remote storage configured");
+        return Ok(());
+    };
+    let data = serde_json::to_vec(&chunk).context("serialize metrics")?;
+    let mut encoder = GzipEncoder::new(Vec::new());
+    encoder.write_all(&data).await.context("compress metrics")?;
+    encoder.shutdown().await.context("compress metrics")?;
+    let compressed_data: Bytes = encoder.get_ref().clone().into();
+    backoff::retry(
+        || async {
+            let stream = futures::stream::once(futures::future::ready(Ok(compressed_data.clone())));
+            storage
+                .upload(stream, compressed_data.len(), remote_path, None, cancel)
+                .await
+        },
+        TimeoutOrCancel::caused_by_cancel,
+        FAILED_UPLOAD_WARN_THRESHOLD,
+        FAILED_UPLOAD_MAX_RETRIES,
+        "request_data_upload",
+        cancel,
+    )
+    .await
+    .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
+    .and_then(|x| x)
+    .context("request_data_upload")?;
+    Ok(())
+}
 
 #[cfg(test)]
@@ -248,8 +499,8 @@ mod tests {
     };
     use url::Url;
 
-    use super::{collect_metrics_iteration, Ids, Metrics};
-    use crate::{http, rate_limiter::RateLimiterConfig};
+    use super::*;
+    use crate::{http, BranchId, EndpointId};
 
     #[tokio::test]
     async fn metrics() {
@@ -279,23 +530,24 @@ mod tests {
         tokio::spawn(server);
 
         let metrics = Metrics::default();
-        let client = http::new_client(RateLimiterConfig::default());
+        let client = http::new_client();
         let endpoint = Url::parse(&format!("http://{addr}")).unwrap();
 
         let now = Utc::now();
 
         // no counters have been registered
-        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
         let r = std::mem::take(&mut *reports2.lock().unwrap());
         assert!(r.is_empty());
 
         // register a new counter
+
         let counter = metrics.register(Ids {
-            endpoint_id: "e1".into(),
-            branch_id: "b1".into(),
+            endpoint_id: (&EndpointId::from("e1")).into(),
+            branch_id: (&BranchId::from("b1")).into(),
         });
 
         // the counter should be observed despite 0 egress
-        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
         let r = std::mem::take(&mut *reports2.lock().unwrap());
         assert_eq!(r.len(), 1);
         assert_eq!(r[0].events.len(), 1);
@@ -305,7 +557,7 @@ mod tests {
         counter.record_egress(1);
 
         // egress should be observed
-        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
         let r = std::mem::take(&mut *reports2.lock().unwrap());
         assert_eq!(r.len(), 1);
assert_eq!(r[0].events.len(), 1); @@ -315,11 +567,19 @@ mod tests { drop(counter); // we do not observe the counter - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert!(r.is_empty()); // counter is unregistered assert!(metrics.endpoints.is_empty()); + + collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + .await; + assert!(!metrics.backup_endpoints.is_empty()); + collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + .await; + // backup counter is unregistered after the second iteration + assert!(metrics.backup_endpoints.is_empty()); } } diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index bba5494cfe..86d0f9e8b2 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -7,13 +7,13 @@ use thiserror::Error; use tokio::sync::oneshot; #[derive(Debug, Error)] -pub enum RegisterError { +pub(crate) enum RegisterError { #[error("Waiter `{0}` already registered")] Occupied(String), } #[derive(Debug, Error)] -pub enum NotifyError { +pub(crate) enum NotifyError { #[error("Notify failed: waiter `{0}` not registered")] NotFound(String), @@ -22,21 +22,21 @@ pub enum NotifyError { } #[derive(Debug, Error)] -pub enum WaitError { +pub(crate) enum WaitError { #[error("Wait failed: channel hangup")] Hangup, } -pub struct Waiters(pub(self) Mutex>>); +pub(crate) struct Waiters(pub(self) Mutex>>); impl Default for Waiters { fn default() -> Self { - Waiters(Default::default()) + Waiters(Mutex::default()) } } impl Waiters { - pub fn register(&self, key: String) -> Result, RegisterError> { + pub(crate) fn register(&self, key: String) -> Result, RegisterError> { let (tx, rx) = oneshot::channel(); self.0 @@ -53,7 +53,7 @@ impl Waiters { }) } - pub fn notify(&self, key: &str, value: T) -> Result<(), NotifyError> + pub(crate) fn notify(&self, key: &str, value: T) -> Result<(), NotifyError> where T: Send + Sync, { @@ -79,7 +79,7 @@ impl<'a, T> Drop for DropKey<'a, T> { } pin_project! 
{ - pub struct Waiter<'a, T> { + pub(crate) struct Waiter<'a, T> { #[pin] receiver: oneshot::Receiver, guard: DropKey<'a, T>, @@ -111,7 +111,7 @@ mod tests { let waiters = Arc::clone(&waiters); let notifier = tokio::spawn(async move { - waiters.notify(key, Default::default())?; + waiters.notify(key, ())?; Ok(()) }); diff --git a/pyproject.toml b/pyproject.toml index 24e075b489..ad3961ef55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,7 @@ [tool.poetry] -name = "neon" -version = "0.1.0" description = "" authors = [] +package-mode = false [tool.poetry.dependencies] python = "^3.9" @@ -10,21 +9,21 @@ pytest = "^7.4.4" psycopg2-binary = "^2.9.6" typing-extensions = "^4.6.1" PyJWT = {version = "^2.1.0", extras = ["crypto"]} -requests = "^2.31.0" +requests = "^2.32.0" pytest-xdist = "^3.3.1" asyncpg = "^0.29.0" aiopg = "^1.4.0" -Jinja2 = "^3.1.3" +Jinja2 = "^3.1.4" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.10" boto3 = "^1.34.11" boto3-stubs = {extras = ["s3"], version = "^1.26.16"} -moto = {extras = ["server"], version = "^4.1.2"} +moto = {extras = ["server"], version = "^5.0.6"} backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" pytest-timeout = "^2.1.0" -Werkzeug = "^3.0.1" +Werkzeug = "^3.0.3" pytest-order = "^1.1.0" allure-pytest = "^2.13.2" pytest-asyncio = "^0.21.0" @@ -33,22 +32,31 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" -aiohttp = "3.9.0" +aiohttp = "3.10.2" pytest-rerunfailures = "^13.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" zstandard = "^0.21.0" +httpx = {extras = ["http2"], version = "^0.26.0"} +pytest-repeat = "^0.9.3" +websockets = "^12.0" +clickhouse-connect = "^0.7.16" +kafka-python = "^2.0.2" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" -ruff = "^0.1.11" +ruff = "^0.2.2" [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" [tool.mypy] -exclude = "^vendor/" +exclude = [ + "^vendor/", + "^target/", + "test_runner/performance/pgvector/loaddata.py", +] check_untyped_defs = true # Help mypy find imports when running against list of individual files. # Without this line it would behave differently when executed on the entire project. @@ -67,12 +75,19 @@ module = [ "allure.*", "allure_commons.*", "allure_pytest.*", + "kafka.*", ] ignore_missing_imports = true [tool.ruff] target-version = "py39" -extend-exclude = ["vendor/"] +extend-exclude = [ + "vendor/", + "target/", +] +line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter + +[tool.ruff.lint] ignore = [ "E501", # Line too long, we don't want to be too strict about it ] @@ -82,5 +97,5 @@ select = [ "I", # isort "W", # pycodestyle "B", # bugbear + "UP032", # f-string ] -line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 9b5a965f7d..e78c4d6790 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.75.0" +channel = "1.81.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
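Stepping back to the `proxy/src/waiters.rs` changes above: the visibility tightening aside, the structure itself is a keyed map of `oneshot` senders, letting one task park on a string key until another task completes it. A simplified standalone sketch of that rendezvous pattern (the real type also rejects duplicate keys and deregisters via a pinned drop guard, both elided here):

```rust
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

use tokio::sync::oneshot;

/// Keyed one-shot rendezvous: one task waits on a key, another completes it.
struct Waiters<T>(Mutex<HashMap<String, oneshot::Sender<T>>>);

impl<T> Default for Waiters<T> {
    fn default() -> Self {
        // Manual impl avoids the unnecessary `T: Default` bound a derive adds.
        Waiters(Mutex::default())
    }
}

impl<T> Waiters<T> {
    fn register(&self, key: String) -> oneshot::Receiver<T> {
        let (tx, rx) = oneshot::channel();
        self.0.lock().unwrap().insert(key, tx);
        rx
    }

    fn notify(&self, key: &str, value: T) -> Result<(), &'static str> {
        let tx = self.0.lock().unwrap().remove(key).ok_or("waiter not registered")?;
        tx.send(value).map_err(|_| "waiter hung up")
    }
}

#[tokio::main]
async fn main() {
    let waiters = Arc::new(Waiters::<u32>::default());
    let rx = waiters.register("conn-1".to_string());
    let w = Arc::clone(&waiters);
    tokio::spawn(async move {
        w.notify("conn-1", 42).unwrap();
    });
    assert_eq!(rx.await.unwrap(), 42);
}
```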
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs deleted file mode 100644 index 7b9f96dce3..0000000000 --- a/s3_scrubber/src/checks.rs +++ /dev/null @@ -1,400 +0,0 @@ -use std::collections::{HashMap, HashSet}; - -use anyhow::Context; -use aws_sdk_s3::{types::ObjectIdentifier, Client}; -use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; -use pageserver_api::shard::ShardIndex; -use tracing::{error, info, warn}; -use utils::generation::Generation; -use utils::id::TimelineId; - -use crate::cloud_admin_api::BranchData; -use crate::metadata_stream::stream_listing; -use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; -use futures_util::{pin_mut, StreamExt}; -use pageserver::tenant::remote_timeline_client::parse_remote_index_path; -use pageserver::tenant::storage_layer::LayerFileName; -use pageserver::tenant::IndexPart; -use remote_storage::RemotePath; - -pub(crate) struct TimelineAnalysis { - /// Anomalies detected - pub(crate) errors: Vec, - - /// Healthy-but-noteworthy, like old-versioned structures that are readable but - /// worth reporting for awareness that we must not remove that old version decoding - /// yet. - pub(crate) warnings: Vec, - - /// Keys not referenced in metadata: candidates for removal, but NOT NECESSARILY: beware - /// of races between reading the metadata and reading the objects. - pub(crate) garbage_keys: Vec, -} - -impl TimelineAnalysis { - fn new() -> Self { - Self { - errors: Vec::new(), - warnings: Vec::new(), - garbage_keys: Vec::new(), - } - } -} - -pub(crate) fn branch_cleanup_and_check_errors( - id: &TenantShardTimelineId, - tenant_objects: &mut TenantObjectListing, - s3_active_branch: Option<&BranchData>, - console_branch: Option, - s3_data: Option, -) -> TimelineAnalysis { - let mut result = TimelineAnalysis::new(); - - info!("Checking timeline {id}"); - - if let Some(s3_active_branch) = s3_active_branch { - info!( - "Checking console status for timeline for branch {:?}/{:?}", - s3_active_branch.project_id, s3_active_branch.id - ); - match console_branch { - Some(_) => {result.errors.push(format!("Timeline has deleted branch data in the console (id = {:?}, project_id = {:?}), recheck whether it got removed during the check", - s3_active_branch.id, s3_active_branch.project_id)) - }, - None => { - result.errors.push(format!("Timeline has no branch data in the console (id = {:?}, project_id = {:?}), recheck whether it got removed during the check", - s3_active_branch.id, s3_active_branch.project_id)) - } - }; - } - - match s3_data { - Some(s3_data) => { - result.garbage_keys.extend(s3_data.keys_to_remove); - - match s3_data.blob_data { - BlobDataParseResult::Parsed { - index_part, - index_part_generation: _index_part_generation, - s3_layers: _s3_layers, - } => { - if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) { - result.errors.push(format!( - "index_part.json version: {}", - index_part.get_version() - )) - } - - if &index_part.get_version() != IndexPart::KNOWN_VERSIONS.last().unwrap() { - result.warnings.push(format!( - "index_part.json version is not latest: {}", - index_part.get_version() - )) - } - - if index_part.metadata.disk_consistent_lsn() - != index_part.get_disk_consistent_lsn() - { - result.errors.push(format!( - "Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})", - index_part.metadata.disk_consistent_lsn(), - index_part.get_disk_consistent_lsn(), - )) - } - - if 
index_part.layer_metadata.is_empty() { - // not an error, can happen for branches with zero writes, but notice that - info!("index_part.json has no layers"); - } - - for (layer, metadata) in index_part.layer_metadata { - if metadata.file_size == 0 { - result.errors.push(format!( - "index_part.json contains a layer {} that has 0 size in its layer metadata", layer.file_name(), - )) - } - - if !tenant_objects.check_ref(id.timeline_id, &layer, &metadata) { - // FIXME: this will emit false positives if an index was - // uploaded concurrently with our scan. To make this check - // correct, we need to try sending a HEAD request for the - // layer we think is missing. - result.errors.push(format!( - "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage", - layer.file_name(), - metadata.generation.get_suffix(), - metadata.shard - )) - } - } - } - BlobDataParseResult::Relic => {} - BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend( - parse_errors - .into_iter() - .map(|error| format!("parse error: {error}")), - ), - } - } - None => result - .errors - .push("Timeline has no data on S3 at all".to_string()), - } - - if result.errors.is_empty() { - info!("No check errors found"); - } else { - warn!("Timeline metadata errors: {0:?}", result.errors); - } - - if !result.warnings.is_empty() { - warn!("Timeline metadata warnings: {0:?}", result.warnings); - } - - if !result.garbage_keys.is_empty() { - error!( - "The following keys should be removed from S3: {0:?}", - result.garbage_keys - ) - } - - result -} - -#[derive(Default)] -pub(crate) struct LayerRef { - ref_count: usize, -} - -/// Top-level index of objects in a tenant. This may be used by any shard-timeline within -/// the tenant to query whether an object exists. -#[derive(Default)] -pub(crate) struct TenantObjectListing { - shard_timelines: - HashMap<(ShardIndex, TimelineId), HashMap<(LayerFileName, Generation), LayerRef>>, -} - -impl TenantObjectListing { - /// Having done an S3 listing of the keys within a timeline prefix, merge them into the overall - /// list of layer keys for the Tenant. - pub(crate) fn push( - &mut self, - ttid: TenantShardTimelineId, - layers: HashSet<(LayerFileName, Generation)>, - ) { - let shard_index = ShardIndex::new( - ttid.tenant_shard_id.shard_number, - ttid.tenant_shard_id.shard_count, - ); - let replaced = self.shard_timelines.insert( - (shard_index, ttid.timeline_id), - layers - .into_iter() - .map(|l| (l, LayerRef::default())) - .collect(), - ); - - assert!( - replaced.is_none(), - "Built from an S3 object listing, which should never repeat a key" - ); - } - - /// Having loaded a timeline index, check if a layer referenced by the index exists. If it does, - /// the layer's refcount will be incremented. Later, after calling this for all references in all indices - /// in a tenant, orphan layers may be detected by their zero refcounts. 
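The `check_ref`/`get_orphans` pair described here amounts to a mark phase over the S3 listing: every listed layer starts with refcount zero, each index reference bumps it, and layers still at zero afterwards are orphan candidates. A compact sketch of that bookkeeping, with plain strings standing in for layer names and generations:

```rust
use std::collections::HashMap;

/// Layer keys seen in the S3 listing, with how many indices reference each.
#[derive(Default)]
struct ObjectListing {
    ref_counts: HashMap<String, usize>,
}

impl ObjectListing {
    /// Record a layer discovered by the listing (refcount starts at zero).
    fn push(&mut self, layer: &str) {
        self.ref_counts.insert(layer.to_string(), 0);
    }

    /// Returns true if the referenced layer exists; bumps its refcount.
    fn check_ref(&mut self, layer: &str) -> bool {
        match self.ref_counts.get_mut(layer) {
            Some(count) => {
                *count += 1;
                true
            }
            // The index references a layer the listing never saw.
            None => false,
        }
    }

    /// Layers no index referenced: candidates for removal.
    fn orphans(&self) -> Vec<&str> {
        self.ref_counts
            .iter()
            .filter(|(_, refs)| **refs == 0)
            .map(|(layer, _)| layer.as_str())
            .collect()
    }
}

fn main() {
    let mut listing = ObjectListing::default();
    listing.push("layer-a-00000001");
    listing.push("layer-b-00000001");
    assert!(listing.check_ref("layer-a-00000001"));
    assert!(!listing.check_ref("layer-never-uploaded"));
    assert_eq!(listing.orphans(), vec!["layer-b-00000001"]);
}
```

As the FIXME above admits, a zero refcount is only a candidate signal: a concurrently uploaded index can reference a layer the scan never saw, so deletion needs a confirming HEAD request.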
- /// - /// Returns true if the layer exists - pub(crate) fn check_ref( - &mut self, - timeline_id: TimelineId, - layer_file: &LayerFileName, - metadata: &IndexLayerMetadata, - ) -> bool { - let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else { - return false; - }; - - let Some(layer_ref) = shard_tl.get_mut(&(layer_file.clone(), metadata.generation)) else { - return false; - }; - - layer_ref.ref_count += 1; - - true - } - - pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerFileName, Generation)> { - let mut result = Vec::new(); - for ((shard_index, timeline_id), layers) in &self.shard_timelines { - for ((layer_file, generation), layer_ref) in layers { - if layer_ref.ref_count == 0 { - result.push((*shard_index, *timeline_id, layer_file.clone(), *generation)) - } - } - } - - result - } -} - -#[derive(Debug)] -pub(crate) struct S3TimelineBlobData { - pub(crate) blob_data: BlobDataParseResult, - pub(crate) keys_to_remove: Vec, -} - -#[derive(Debug)] -pub(crate) enum BlobDataParseResult { - Parsed { - index_part: IndexPart, - index_part_generation: Generation, - s3_layers: HashSet<(LayerFileName, Generation)>, - }, - /// The remains of a deleted Timeline (i.e. an initdb archive only) - Relic, - Incorrect(Vec), -} - -fn parse_layer_object_name(name: &str) -> Result<(LayerFileName, Generation), String> { - match name.rsplit_once('-') { - // FIXME: this is gross, just use a regex? - Some((layer_filename, gen)) if gen.len() == 8 => { - let layer = layer_filename.parse::()?; - let gen = - Generation::parse_suffix(gen).ok_or("Malformed generation suffix".to_string())?; - Ok((layer, gen)) - } - _ => Ok((name.parse::()?, Generation::none())), - } -} - -pub(crate) async fn list_timeline_blobs( - s3_client: &Client, - id: TenantShardTimelineId, - s3_root: &RootTarget, -) -> anyhow::Result { - let mut s3_layers = HashSet::new(); - - let mut errors = Vec::new(); - let mut keys_to_remove = Vec::new(); - - let mut timeline_dir_target = s3_root.timeline_root(&id); - timeline_dir_target.delimiter = String::new(); - - let mut index_parts: Vec = Vec::new(); - let mut initdb_archive: bool = false; - - let stream = stream_listing(s3_client, &timeline_dir_target); - pin_mut!(stream); - while let Some(obj) = stream.next().await { - let obj = obj?; - let key = obj.key(); - - let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket); - match blob_name { - Some(name) if name.starts_with("index_part.json") => { - tracing::info!("Index key {key}"); - index_parts.push(obj) - } - Some("initdb.tar.zst") => { - tracing::info!("initdb archive {key}"); - initdb_archive = true; - } - Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) { - Ok((new_layer, gen)) => { - tracing::info!("Parsed layer key: {} {:?}", new_layer, gen); - s3_layers.insert((new_layer, gen)); - } - Err(e) => { - tracing::info!("Error parsing key {maybe_layer_name}"); - errors.push( - format!("S3 list response got an object with key {key} that is not a layer name: {e}"), - ); - keys_to_remove.push(key.to_string()); - } - }, - None => { - tracing::info!("Peculiar key {}", key); - errors.push(format!("S3 list response got an object with odd key {key}")); - keys_to_remove.push(key.to_string()); - } - } - } - - if index_parts.is_empty() && s3_layers.is_empty() && initdb_archive { - tracing::info!( - "Timeline is empty apart from initdb archive: expected post-deletion state." 
- ); - return Ok(S3TimelineBlobData { - blob_data: BlobDataParseResult::Relic, - keys_to_remove: Vec::new(), - }); - } - - // Choose the index_part with the highest generation - let (index_part_object, index_part_generation) = match index_parts - .iter() - .filter_map(|k| { - let key = k.key(); - // Stripping the index key to the last part, because RemotePath doesn't - // like absolute paths, and depending on prefix_in_bucket it's possible - // for the keys we read back to start with a slash. - let basename = key.rsplit_once('/').unwrap().1; - parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (k, g)) - }) - .max_by_key(|i| i.1) - .map(|(k, g)| (k.clone(), g)) - { - Some((key, gen)) => (Some(key), gen), - None => { - // Legacy/missing case: one or zero index parts, which did not have a generation - (index_parts.pop(), Generation::none()) - } - }; - - if index_part_object.is_none() { - errors.push("S3 list response got no index_part.json file".to_string()); - } - - if let Some(index_part_object_key) = index_part_object.as_ref().map(|object| object.key()) { - let index_part_bytes = download_object_with_retries( - s3_client, - &timeline_dir_target.bucket_name, - index_part_object_key, - ) - .await - .context("index_part.json download")?; - - match serde_json::from_slice(&index_part_bytes) { - Ok(index_part) => { - return Ok(S3TimelineBlobData { - blob_data: BlobDataParseResult::Parsed { - index_part, - index_part_generation, - s3_layers, - }, - keys_to_remove, - }) - } - Err(index_parse_error) => errors.push(format!( - "index_part.json body parsing error: {index_parse_error}" - )), - } - } else { - errors.push(format!( - "Index part object {index_part_object:?} has no key" - )); - } - - if errors.is_empty() { - errors.push( - "Unexpected: no errors did not lead to a successfully parsed blob return".to_string(), - ); - } - - Ok(S3TimelineBlobData { - blob_data: BlobDataParseResult::Incorrect(errors), - keys_to_remove, - }) -} diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs deleted file mode 100644 index d2842877d0..0000000000 --- a/s3_scrubber/src/lib.rs +++ /dev/null @@ -1,398 +0,0 @@ -#![deny(unsafe_code)] -#![deny(clippy::undocumented_unsafe_blocks)] -pub mod checks; -pub mod cloud_admin_api; -pub mod garbage; -pub mod metadata_stream; -pub mod scan_metadata; - -use std::env; -use std::fmt::Display; -use std::sync::Arc; -use std::time::Duration; - -use anyhow::Context; -use aws_config::environment::EnvironmentVariableCredentialsProvider; -use aws_config::imds::credentials::ImdsCredentialsProvider; -use aws_config::meta::credentials::CredentialsProviderChain; -use aws_config::profile::ProfileFileCredentialsProvider; -use aws_config::retry::RetryConfig; -use aws_config::sso::SsoCredentialsProvider; -use aws_config::BehaviorVersion; -use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep}; -use aws_sdk_s3::{Client, Config}; -use aws_smithy_async::rt::sleep::TokioSleep; - -use clap::ValueEnum; -use pageserver::tenant::TENANTS_SEGMENT_NAME; -use pageserver_api::shard::TenantShardId; -use reqwest::Url; -use serde::{Deserialize, Serialize}; -use std::io::IsTerminal; -use tokio::io::AsyncReadExt; -use tracing::error; -use tracing_appender::non_blocking::WorkerGuard; -use tracing_subscriber::{fmt, prelude::*, EnvFilter}; -use utils::id::TimelineId; - -const MAX_RETRIES: usize = 20; -const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN"; - -#[derive(Debug, Clone)] -pub struct S3Target { - pub bucket_name: String, - /// This `prefix_in_bucket` 
is only equal to the PS/SK config of the same - /// name for the RootTarget: other instances of S3Target will have prefix_in_bucket - /// with extra parts. - pub prefix_in_bucket: String, - pub delimiter: String, -} - -/// Convenience for referring to timelines within a particular shard: more ergonomic -/// than using a 2-tuple. -/// -/// This is the shard-aware equivalent of TenantTimelineId. It's defined here rather -/// than somewhere more broadly exposed, because this kind of thing is rarely needed -/// in the pageserver, as all timeline objects existing in the scope of a particular -/// tenant: the scrubber is different in that it handles collections of data referring to many -/// TenantShardTimelineIds in on place. -#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq)] -pub struct TenantShardTimelineId { - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, -} - -impl TenantShardTimelineId { - fn new(tenant_shard_id: TenantShardId, timeline_id: TimelineId) -> Self { - Self { - tenant_shard_id, - timeline_id, - } - } -} - -impl Display for TenantShardTimelineId { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}/{}", self.tenant_shard_id, self.timeline_id) - } -} - -#[derive(clap::ValueEnum, Debug, Clone, Copy, PartialEq, Eq)] -pub enum TraversingDepth { - Tenant, - Timeline, -} - -impl Display for TraversingDepth { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_str(match self { - Self::Tenant => "tenant", - Self::Timeline => "timeline", - }) - } -} - -#[derive(ValueEnum, Clone, Copy, Eq, PartialEq, Debug, Serialize, Deserialize)] -pub enum NodeKind { - Safekeeper, - Pageserver, -} - -impl NodeKind { - fn as_str(&self) -> &'static str { - match self { - Self::Safekeeper => "safekeeper", - Self::Pageserver => "pageserver", - } - } -} - -impl Display for NodeKind { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_str(self.as_str()) - } -} - -impl S3Target { - pub fn with_sub_segment(&self, new_segment: &str) -> Self { - let mut new_self = self.clone(); - if new_self.prefix_in_bucket.is_empty() { - new_self.prefix_in_bucket = format!("/{}/", new_segment); - } else { - if new_self.prefix_in_bucket.ends_with('/') { - new_self.prefix_in_bucket.pop(); - } - new_self.prefix_in_bucket = - [&new_self.prefix_in_bucket, new_segment, ""].join(&new_self.delimiter); - } - new_self - } -} - -#[derive(Clone)] -pub enum RootTarget { - Pageserver(S3Target), - Safekeeper(S3Target), -} - -impl RootTarget { - pub fn tenants_root(&self) -> S3Target { - match self { - Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME), - Self::Safekeeper(root) => root.with_sub_segment("wal"), - } - } - - pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target { - self.tenants_root().with_sub_segment(&tenant_id.to_string()) - } - - pub fn timelines_root(&self, tenant_id: &TenantShardId) -> S3Target { - match self { - Self::Pageserver(_) => self.tenant_root(tenant_id).with_sub_segment("timelines"), - Self::Safekeeper(_) => self.tenant_root(tenant_id), - } - } - - pub fn timeline_root(&self, id: &TenantShardTimelineId) -> S3Target { - self.timelines_root(&id.tenant_shard_id) - .with_sub_segment(&id.timeline_id.to_string()) - } - - pub fn bucket_name(&self) -> &str { - match self { - Self::Pageserver(root) => &root.bucket_name, - Self::Safekeeper(root) => &root.bucket_name, - } - } - - pub fn delimiter(&self) -> &str { - match self { - Self::Pageserver(root) => 
&root.delimiter, - Self::Safekeeper(root) => &root.delimiter, - } - } -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct BucketConfig { - pub region: String, - pub bucket: String, - pub prefix_in_bucket: Option, - - /// Use SSO if this is set, else rely on AWS_* environment vars - pub sso_account_id: Option, -} - -impl Display for BucketConfig { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{}/{}/{}", - self.sso_account_id.as_deref().unwrap_or(""), - self.region, - self.bucket - ) - } -} - -impl BucketConfig { - pub fn from_env() -> anyhow::Result { - let sso_account_id = env::var("SSO_ACCOUNT_ID").ok(); - let region = env::var("REGION").context("'REGION' param retrieval")?; - let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?; - let prefix_in_bucket = env::var("BUCKET_PREFIX").ok(); - - Ok(Self { - region, - bucket, - prefix_in_bucket, - sso_account_id, - }) - } -} - -pub struct ConsoleConfig { - pub token: String, - pub base_url: Url, -} - -impl ConsoleConfig { - pub fn from_env() -> anyhow::Result { - let base_url: Url = env::var("CLOUD_ADMIN_API_URL") - .context("'CLOUD_ADMIN_API_URL' param retrieval")? - .parse() - .context("'CLOUD_ADMIN_API_URL' param parsing")?; - - let token = env::var(CLOUD_ADMIN_API_TOKEN_ENV_VAR) - .context("'CLOUD_ADMIN_API_TOKEN' environment variable fetch")?; - - Ok(Self { base_url, token }) - } -} - -pub fn init_logging(file_name: &str) -> WorkerGuard { - let (file_writer, guard) = - tracing_appender::non_blocking(tracing_appender::rolling::never("./logs/", file_name)); - - let file_logs = fmt::Layer::new() - .with_target(false) - .with_ansi(false) - .with_writer(file_writer); - let stderr_logs = fmt::Layer::new() - .with_ansi(std::io::stderr().is_terminal()) - .with_target(false) - .with_writer(std::io::stderr); - tracing_subscriber::registry() - .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) - .with(file_logs) - .with(stderr_logs) - .init(); - - guard -} - -pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Client { - let credentials_provider = { - // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - let chain = CredentialsProviderChain::first_try( - "env", - EnvironmentVariableCredentialsProvider::new(), - ) - // uses "AWS_PROFILE" / `aws sso login --profile ` - .or_else( - "profile-sso", - ProfileFileCredentialsProvider::builder().build(), - ); - - // Use SSO if we were given an account ID - match account_id { - Some(sso_account) => chain.or_else( - "sso", - SsoCredentialsProvider::builder() - .account_id(sso_account) - .role_name("PowerUserAccess") - .start_url("https://neondb.awsapps.com/start") - .region(bucket_region.clone()) - .build(), - ), - None => chain, - } - .or_else( - // Finally try IMDS - "imds", - ImdsCredentialsProvider::builder().build(), - ) - }; - - let sleep_impl: Arc = Arc::new(TokioSleep::new()); - - let mut builder = Config::builder() - .behavior_version(BehaviorVersion::v2023_11_09()) - .region(bucket_region) - .retry_config(RetryConfig::adaptive().with_max_attempts(3)) - .sleep_impl(SharedAsyncSleep::from(sleep_impl)) - .credentials_provider(credentials_provider); - - if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") { - builder = builder.endpoint_url(endpoint) - } - - Client::from_conf(builder.build()) -} - -fn init_remote( - bucket_config: BucketConfig, - node_kind: NodeKind, -) -> anyhow::Result<(Arc, RootTarget)> { - let bucket_region = Region::new(bucket_config.region); - let delimiter = 
"/".to_string(); - let s3_client = Arc::new(init_s3_client(bucket_config.sso_account_id, bucket_region)); - - let s3_root = match node_kind { - NodeKind::Pageserver => RootTarget::Pageserver(S3Target { - bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config - .prefix_in_bucket - .unwrap_or("pageserver/v1".to_string()), - delimiter, - }), - NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target { - bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config - .prefix_in_bucket - .unwrap_or("safekeeper/v1".to_string()), - delimiter, - }), - }; - - Ok((s3_client, s3_root)) -} - -async fn list_objects_with_retries( - s3_client: &Client, - s3_target: &S3Target, - continuation_token: Option, -) -> anyhow::Result { - for _ in 0..MAX_RETRIES { - match s3_client - .list_objects_v2() - .bucket(&s3_target.bucket_name) - .prefix(&s3_target.prefix_in_bucket) - .delimiter(&s3_target.delimiter) - .set_continuation_token(continuation_token.clone()) - .send() - .await - { - Ok(response) => return Ok(response), - Err(e) => { - error!("list_objects_v2 query failed: {e}"); - tokio::time::sleep(Duration::from_secs(1)).await; - } - } - } - - anyhow::bail!("Failed to list objects {MAX_RETRIES} times") -} - -async fn download_object_with_retries( - s3_client: &Client, - bucket_name: &str, - key: &str, -) -> anyhow::Result> { - for _ in 0..MAX_RETRIES { - let mut body_buf = Vec::new(); - let response_stream = match s3_client - .get_object() - .bucket(bucket_name) - .key(key) - .send() - .await - { - Ok(response) => response, - Err(e) => { - error!("Failed to download object for key {key}: {e}"); - tokio::time::sleep(Duration::from_secs(1)).await; - continue; - } - }; - - match response_stream - .body - .into_async_read() - .read_to_end(&mut body_buf) - .await - { - Ok(bytes_read) => { - tracing::info!("Downloaded {bytes_read} bytes for object object with key {key}"); - return Ok(body_buf); - } - Err(e) => { - error!("Failed to stream object body for key {key}: {e}"); - tokio::time::sleep(Duration::from_secs(1)).await; - } - } - } - - anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times") -} diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs deleted file mode 100644 index 957213856b..0000000000 --- a/s3_scrubber/src/main.rs +++ /dev/null @@ -1,106 +0,0 @@ -use pageserver_api::shard::TenantShardId; -use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; -use s3_scrubber::scan_metadata::scan_metadata; -use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth}; - -use clap::{Parser, Subcommand}; - -#[derive(Parser)] -#[command(author, version, about, long_about = None)] -#[command(arg_required_else_help(true))] -struct Cli { - #[command(subcommand)] - command: Command, - - #[arg(short, long, default_value_t = false)] - delete: bool, -} - -#[derive(Subcommand, Debug)] -enum Command { - FindGarbage { - #[arg(short, long)] - node_kind: NodeKind, - #[arg(short, long, default_value_t=TraversingDepth::Tenant)] - depth: TraversingDepth, - #[arg(short, long, default_value_t = String::from("garbage.json"))] - output_path: String, - }, - PurgeGarbage { - #[arg(short, long)] - input_path: String, - #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)] - mode: PurgeMode, - }, - ScanMetadata { - #[arg(short, long, default_value_t = false)] - json: bool, - #[arg(long = "tenant-id", num_args = 0..)] - tenant_ids: Vec, - }, -} - -#[tokio::main] -async fn main() -> anyhow::Result<()> { - let cli = Cli::parse(); - - let 
bucket_config = BucketConfig::from_env()?; - - let command_log_name = match &cli.command { - Command::ScanMetadata { .. } => "scan", - Command::FindGarbage { .. } => "find-garbage", - Command::PurgeGarbage { .. } => "purge-garbage", - }; - let _guard = init_logging(&format!( - "{}_{}_{}_{}.log", - std::env::args().next().unwrap(), - command_log_name, - bucket_config.bucket, - chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") - )); - - match cli.command { - Command::ScanMetadata { json, tenant_ids } => { - match scan_metadata(bucket_config.clone(), tenant_ids).await { - Err(e) => { - tracing::error!("Failed: {e}"); - Err(e) - } - Ok(summary) => { - if json { - println!("{}", serde_json::to_string(&summary).unwrap()) - } else { - println!("{}", summary.summary_string()); - } - if summary.is_fatal() { - Err(anyhow::anyhow!("Fatal scrub errors detected")) - } else if summary.is_empty() { - // Strictly speaking an empty bucket is a valid bucket, but if someone ran the - // scrubber they were likely expecting to scan something, and if we see no timelines - // at all then it's likely due to some configuration issues like a bad prefix - Err(anyhow::anyhow!( - "No timelines found in bucket {} prefix {}", - bucket_config.bucket, - bucket_config - .prefix_in_bucket - .unwrap_or("".to_string()) - )) - } else { - Ok(()) - } - } - } - } - Command::FindGarbage { - node_kind, - depth, - output_path, - } => { - let console_config = ConsoleConfig::from_env()?; - find_garbage(bucket_config, console_config, depth, node_kind, output_path).await - } - Command::PurgeGarbage { input_path, mode } => { - purge_garbage(input_path, mode, !cli.delete).await - } - } -} diff --git a/s3_scrubber/src/metadata_stream.rs b/s3_scrubber/src/metadata_stream.rs deleted file mode 100644 index 073f37f319..0000000000 --- a/s3_scrubber/src/metadata_stream.rs +++ /dev/null @@ -1,136 +0,0 @@ -use anyhow::Context; -use async_stream::{stream, try_stream}; -use aws_sdk_s3::{types::ObjectIdentifier, Client}; -use tokio_stream::Stream; - -use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId}; -use pageserver_api::shard::TenantShardId; -use utils::id::TimelineId; - -/// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2 -pub fn stream_tenants<'a>( - s3_client: &'a Client, - target: &'a RootTarget, -) -> impl Stream> + 'a { - try_stream! { - let mut continuation_token = None; - let tenants_target = target.tenants_root(); - loop { - let fetch_response = - list_objects_with_retries(s3_client, &tenants_target, continuation_token.clone()).await?; - - let new_entry_ids = fetch_response - .common_prefixes() - .iter() - .filter_map(|prefix| prefix.prefix()) - .filter_map(|prefix| -> Option<&str> { - prefix - .strip_prefix(&tenants_target.prefix_in_bucket)? - .strip_suffix('/') - }).map(|entry_id_str| { - entry_id_str - .parse() - .with_context(|| format!("Incorrect entry id str: {entry_id_str}")) - }); - - for i in new_entry_ids { - yield i?; - } - - match fetch_response.next_continuation_token { - Some(new_token) => continuation_token = Some(new_token), - None => break, - } - } - } -} - -/// Given a TenantShardId, output a stream of the timelines within that tenant, discovered -/// using ListObjectsv2. The listing is done before the stream is built, so that this -/// function can be used to generate concurrency on a stream using buffer_unordered. 
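As the doc comment above says, `stream_tenant_timelines` lists eagerly and only then yields, so callers can push many such streams through `buffer_unordered` without each one stalling the combinator on S3 I/O. A toy illustration of that consumption pattern, where `fetch_ids` stands in for the per-tenant listing (only the `futures` and `tokio` crates assumed):

```rust
use futures::stream::{self, StreamExt};

// Stand-in for one tenant's (pre-collected) timeline listing.
async fn fetch_ids(tenant: u32) -> Vec<u32> {
    vec![tenant * 10, tenant * 10 + 1]
}

#[tokio::main]
async fn main() {
    let tenants = vec![1, 2, 3, 4];
    // Up to 8 per-tenant listings run concurrently; results arrive in
    // completion order, which is fine for a scan.
    let timelines: Vec<u32> = stream::iter(tenants)
        .map(fetch_ids)
        .buffer_unordered(8)
        .flat_map(stream::iter)
        .collect()
        .await;
    println!("discovered {} timelines", timelines.len());
}
```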
-pub async fn stream_tenant_timelines<'a>( - s3_client: &'a Client, - target: &'a RootTarget, - tenant: TenantShardId, -) -> anyhow::Result> + 'a> { - let mut timeline_ids: Vec> = Vec::new(); - let mut continuation_token = None; - let timelines_target = target.timelines_root(&tenant); - - loop { - tracing::info!("Listing in {}", tenant); - let fetch_response = - list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone()) - .await; - let fetch_response = match fetch_response { - Err(e) => { - timeline_ids.push(Err(e)); - break; - } - Ok(r) => r, - }; - - let new_entry_ids = fetch_response - .common_prefixes() - .iter() - .filter_map(|prefix| prefix.prefix()) - .filter_map(|prefix| -> Option<&str> { - prefix - .strip_prefix(&timelines_target.prefix_in_bucket)? - .strip_suffix('/') - }) - .map(|entry_id_str| { - entry_id_str - .parse::() - .with_context(|| format!("Incorrect entry id str: {entry_id_str}")) - }); - - for i in new_entry_ids { - timeline_ids.push(i); - } - - match fetch_response.next_continuation_token { - Some(new_token) => continuation_token = Some(new_token), - None => break, - } - } - - tracing::info!("Yielding for {}", tenant); - Ok(stream! { - for i in timeline_ids { - let id = i?; - yield Ok(TenantShardTimelineId::new(tenant, id)); - } - }) -} - -pub(crate) fn stream_listing<'a>( - s3_client: &'a Client, - target: &'a S3Target, -) -> impl Stream> + 'a { - try_stream! { - let mut continuation_token = None; - loop { - let fetch_response = - list_objects_with_retries(s3_client, target, continuation_token.clone()).await?; - - if target.delimiter.is_empty() { - for object_key in fetch_response.contents().iter().filter_map(|object| object.key()) - { - let object_id = ObjectIdentifier::builder().key(object_key).build()?; - yield object_id; - } - } else { - for prefix in fetch_response.common_prefixes().iter().filter_map(|p| p.prefix()) { - let object_id = ObjectIdentifier::builder().key(prefix).build()?; - yield object_id; - } - } - - match fetch_response.next_continuation_token { - Some(new_token) => continuation_token = Some(new_token), - None => break, - } - } - } -} diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 364cad7892..0fdb3147bf 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -23,7 +23,6 @@ clap = { workspace = true, features = ["derive"] } const_format.workspace = true crc32c.workspace = true fail.workspace = true -fs2.workspace = true git-version.workspace = true hex.workspace = true humantime.workspace = true @@ -33,6 +32,7 @@ once_cell.workspace = true parking_lot.workspace = true postgres.workspace = true postgres-protocol.workspace = true +rand.workspace = true regex.workspace = true scopeguard.workspace = true reqwest = { workspace = true, features = ["json"] } @@ -40,11 +40,14 @@ serde.workspace = true serde_json.workspace = true serde_with.workspace = true signal-hook.workspace = true +strum.workspace = true +strum_macros.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["fs"] } tokio-util = { workspace = true } tokio-io-timeout.workspace = true tokio-postgres.workspace = true +tokio-tar.workspace = true toml_edit.workspace = true tracing.workspace = true url.workspace = true @@ -61,3 +64,10 @@ tokio-stream.workspace = true utils.workspace = true workspace_hack.workspace = true + +[dev-dependencies] +walproposer.workspace = true +rand.workspace = true +desim.workspace = true +tracing.workspace = true +tracing-subscriber = { workspace = true, features = ["json"] } diff 
diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs
index bf4905aaa7..b8bc3f3e06 100644
--- a/safekeeper/src/auth.rs
+++ b/safekeeper/src/auth.rs
@@ -12,9 +12,15 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
             }
             Ok(())
         }
-        (Scope::PageServerApi, _) => Err(AuthError(
-            "PageServerApi scope makes no sense for Safekeeper".into(),
-        )),
+        (Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi | Scope::Scrubber, _) => {
+            Err(AuthError(
+                format!(
+                    "JWT scope '{:?}' is ineligible for Safekeeper auth",
+                    claims.scope
+                )
+                .into(),
+            ))
+        }
         (Scope::SafekeeperData, _) => Ok(()),
     }
 }
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 33047051df..41c2d3fe08 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -12,31 +12,31 @@ use sd_notify::NotifyState;
 use tokio::runtime::Handle;
 use tokio::signal::unix::{signal, SignalKind};
 use tokio::task::JoinError;
-use toml_edit::Document;
+use utils::logging::SecretString;
 
+use std::env::{var, VarError};
 use std::fs::{self, File};
 use std::io::{ErrorKind, Write};
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
 use storage_broker::Uri;
-use tokio::sync::mpsc;
 use tracing::*;
 use utils::pid_file;
 
 use metrics::set_build_info_metric;
 use safekeeper::defaults::{
-    DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
-    DEFAULT_PG_LISTEN_ADDR,
+    DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT,
+    DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY,
+    DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
 };
+use safekeeper::http;
 use safekeeper::wal_service;
 use safekeeper::GlobalTimelines;
 use safekeeper::SafeKeeperConf;
 use safekeeper::{broker, WAL_SERVICE_RUNTIME};
 use safekeeper::{control_file, BROKER_RUNTIME};
-use safekeeper::{http, WAL_REMOVER_RUNTIME};
-use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
 use safekeeper::{wal_backup, HTTP_RUNTIME};
 use storage_broker::DEFAULT_ENDPOINT;
 use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
@@ -125,7 +125,7 @@ struct Args {
     peer_recovery: bool,
     /// Remote storage configuration for WAL backup (offloading to s3) as TOML
     /// inline table, e.g.
-    /// {"max_concurrent_syncs" = 17, "max_sync_errors": 13, "bucket_name": "", "bucket_region":"", "concurrency_limit": 119}
+    /// {max_concurrent_syncs = 17, max_sync_errors = 13, bucket_name = "", bucket_region = "", concurrency_limit = 119}
     /// Safekeeper offloads WAL to
    /// [prefix_in_bucket/]<tenant_id>/<timeline_id>/<segment_file>, mirroring
     /// structure on the file system.
@@ -166,6 +166,35 @@ struct Args {
     /// useful for debugging.
     #[arg(long)]
     current_thread_runtime: bool,
+    /// Keep horizon for walsenders, i.e. don't remove WAL segments that are
+    /// still needed for existing replication connection.
+    #[arg(long)]
+    walsenders_keep_horizon: bool,
+    /// Controls how long backup will wait until uploading the partial segment.
+    #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)]
+    partial_backup_timeout: Duration,
+    /// Disable task to push messages to broker every second. Supposed to
+    /// be used in tests.
+    #[arg(long)]
+    disable_periodic_broker_push: bool,
+    /// Enable automatic switching to offloaded state.
+    #[arg(long)]
+    enable_offload: bool,
+    /// Delete local WAL files after offloading. When disabled, they will be left on disk.
+    #[arg(long)]
+    delete_offloaded_wal: bool,
+    /// Pending updates to control file will be automatically saved after this interval.
+    #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_CONTROL_FILE_SAVE_INTERVAL)]
+    control_file_save_interval: Duration,
+    /// Number of allowed concurrent uploads of partial segments to remote storage.
+    #[arg(long, default_value = DEFAULT_PARTIAL_BACKUP_CONCURRENCY)]
+    partial_backup_concurrency: usize,
+    /// How long a timeline must be resident before it is eligible for eviction.
+    /// Usually, timeline eviction has to wait for `partial_backup_timeout` before being eligible for eviction,
+    /// but if a timeline is un-evicted and then _not_ written to, it would immediately flap to evicting again,
+    /// if it weren't for `eviction_min_resident` preventing that.
+    #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)]
+    eviction_min_resident: Duration,
 }
 
 // Like PathBufValueParser, but allows empty string.
@@ -274,6 +303,22 @@ async fn main() -> anyhow::Result<()> {
         }
     };
 
+    // Load JWT auth token to connect to other safekeepers for pull_timeline.
+    let sk_auth_token = match var("SAFEKEEPER_AUTH_TOKEN") {
+        Ok(v) => {
+            info!("loaded JWT token for authentication with safekeepers");
+            Some(SecretString::from(v))
+        }
+        Err(VarError::NotPresent) => {
+            info!("no JWT token for authentication with safekeepers detected");
+            None
+        }
+        Err(_) => {
+            warn!("JWT token for authentication with safekeepers is not unicode");
+            None
+        }
+    };
+
     let conf = SafeKeeperConf {
         workdir,
         my_id: id,
@@ -294,7 +339,16 @@ async fn main() -> anyhow::Result<()> {
         pg_auth,
         pg_tenant_only_auth,
         http_auth,
+        sk_auth_token,
         current_thread_runtime: args.current_thread_runtime,
+        walsenders_keep_horizon: args.walsenders_keep_horizon,
+        partial_backup_timeout: args.partial_backup_timeout,
+        disable_periodic_broker_push: args.disable_periodic_broker_push,
+        enable_offload: args.enable_offload,
+        delete_offloaded_wal: args.delete_offloaded_wal,
+        control_file_save_interval: args.control_file_save_interval,
+        partial_backup_concurrency: args.partial_backup_concurrency,
+        eviction_min_resident: args.eviction_min_resident,
     };
 
     // initialize sentry if SENTRY_DSN is provided
@@ -358,7 +412,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
     let timeline_collector = safekeeper::metrics::TimelineCollector::new();
     metrics::register_internal(Box::new(timeline_collector))?;
 
-    let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
+    wal_backup::init_remote_storage(&conf).await;
 
     // Keep handles to main tasks to die if any of them disappears.
     let mut tasks_handles: FuturesUnordered> =
@@ -370,19 +424,9 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
     let current_thread_rt = conf
         .current_thread_runtime
         .then(|| Handle::try_current().expect("no runtime in main"));
-    let conf_ = conf.clone();
-    let wal_backup_handle = current_thread_rt
-        .as_ref()
-        .unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
-        .spawn(wal_backup::wal_backup_launcher_task_main(
-            conf_,
-            wal_backup_launcher_rx,
-        ))
-        .map(|res| ("WAL backup launcher".to_owned(), res));
-    tasks_handles.push(Box::pin(wal_backup_handle));
 
     // Load all timelines from disk to memory.
-    GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx).await?;
+    GlobalTimelines::init(conf.clone()).await?;
 
     let conf_ = conf.clone();
     // Run everything in current thread rt, if asked.
@@ -402,6 +446,19 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
         .map(|res| ("WAL service main".to_owned(), res));
     tasks_handles.push(Box::pin(wal_service_handle));
 
+    let timeline_housekeeping_handle = current_thread_rt
+        .as_ref()
+        .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
+        .spawn(async move {
+            const TOMBSTONE_TTL: Duration = Duration::from_secs(3600 * 24);
+            loop {
+                tokio::time::sleep(TOMBSTONE_TTL).await;
+                GlobalTimelines::housekeeping(&TOMBSTONE_TTL);
+            }
+        })
+        .map(|res| ("Timeline map housekeeping".to_owned(), res));
+    tasks_handles.push(Box::pin(timeline_housekeeping_handle));
+
     if let Some(pg_listener_tenant_only) = pg_listener_tenant_only {
         let conf_ = conf.clone();
         let wal_service_handle = current_thread_rt
@@ -433,14 +490,6 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
         .map(|res| ("broker main".to_owned(), res));
     tasks_handles.push(Box::pin(broker_task_handle));
 
-    let conf_ = conf.clone();
-    let wal_remover_handle = current_thread_rt
-        .as_ref()
-        .unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle())
-        .spawn(remove_wal::task_main(conf_))
-        .map(|res| ("WAL remover".to_owned(), res));
-    tasks_handles.push(Box::pin(wal_remover_handle));
-
     set_build_info_metric(GIT_VERSION, BUILD_TAG);
 
     // TODO: update tokio-stream, convert to real async Stream with
@@ -517,16 +566,8 @@ fn set_id(workdir: &Utf8Path, given_id: Option<NodeId>) -> Result<NodeId> {
     Ok(my_id)
 }
 
-// Parse RemoteStorage from TOML table.
 fn parse_remote_storage(storage_conf: &str) -> anyhow::Result<RemoteStorageConfig> {
-    // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse
-    let storage_conf_toml = format!("remote_storage = {storage_conf}");
-    let parsed_toml = storage_conf_toml.parse::<Document>()?; // parse
-    let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again
-    RemoteStorageConfig::from_toml(storage_conf_parsed_toml).and_then(|parsed_config| {
-        // XXX: Don't print the original toml here, there might be some sensitive data
-        parsed_config.context("Incorrectly parsed remote storage toml as no remote storage config")
-    })
+    RemoteStorageConfig::from_toml(&storage_conf.parse()?)
}
 
 #[test]
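The deleted body of `parse_remote_storage` worked around the fact that a bare TOML inline table is not a valid document on its own: wrap it under a key, parse the whole string, then strip the key back off. A minimal sketch of that trick, using the `toml` crate for illustration (the original used `toml_edit`, and `parse_inline_table` is a hypothetical name):

```rust
use anyhow::Context;

/// Parse a bare inline table like `{bucket_name = "b", bucket_region = "r"}`
/// by wrapping it in a key, parsing the result as a full document, and
/// unwrapping the key again.
fn parse_inline_table(s: &str) -> anyhow::Result<toml::Value> {
    let doc: toml::Value = format!("wrapped = {s}")
        .parse()
        .context("invalid TOML inline table")?;
    doc.get("wrapped")
        .cloned()
        .context("wrapped key vanished after parsing")
}
```

After this change the wrapping lives behind `RemoteStorageConfig::from_toml(&storage_conf.parse()?)`, so the call site no longer needs the dance.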
diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs
index 2b1db2714b..485816408f 100644
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -10,11 +10,20 @@ use anyhow::Result;
 use storage_broker::parse_proto_ttid;
 use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
+use storage_broker::proto::FilterTenantTimelineId;
+use storage_broker::proto::MessageType;
+use storage_broker::proto::SafekeeperDiscoveryResponse;
+use storage_broker::proto::SubscribeByFilterRequest;
 use storage_broker::proto::SubscribeSafekeeperInfoRequest;
+use storage_broker::proto::TypeSubscription;
+use storage_broker::proto::TypedMessage;
 use storage_broker::Request;
 
+use std::sync::atomic::AtomicU64;
+use std::sync::Arc;
 use std::time::Duration;
 use std::time::Instant;
+use std::time::UNIX_EPOCH;
 use tokio::task::JoinHandle;
 use tokio::time::sleep;
 use tracing::*;
@@ -31,6 +40,14 @@ const PUSH_INTERVAL_MSEC: u64 = 1000;
 
 /// Push once in a while data about all active timelines to the broker.
 async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
+    if conf.disable_periodic_broker_push {
+        info!("broker push_loop is disabled, doing nothing...");
+        futures::future::pending::<()>().await; // sleep forever
+        return Ok(());
+    }
+
+    let active_timelines_set = GlobalTimelines::get_global_broker_active_set();
+
     let mut client =
         storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
     let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
@@ -42,15 +59,9 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
             // sensitive and there is no risk of deadlock as we don't await while
             // lock is held.
             let now = Instant::now();
-            let all_tlis = GlobalTimelines::get_all();
+            let all_tlis = active_timelines_set.get_all();
             let mut n_pushed_tlis = 0;
             for tli in &all_tlis {
-                // filtering alternative futures::stream::iter(all_tlis)
-                // .filter(|tli| {let tli = tli.clone(); async move { tli.is_active().await}}).collect::<Vec<_>>().await;
-                // doesn't look better, and I'm not sure how to do that without collect.
-                if !tli.is_active().await {
-                    continue;
-                }
                 let sk_info = tli.get_safekeeper_info(&conf).await;
                 yield sk_info;
                 BROKER_PUSHED_UPDATES.inc();
@@ -75,7 +86,8 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
 }
 
 /// Subscribe and fetch all the interesting data from the broker.
-async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
+#[instrument(name = "broker_pull", skip_all)]
+async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
     let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;
 
     // TODO: subscribe only to local timelines instead of all
@@ -94,6 +106,8 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
     let err_counter = BROKER_PULLED_UPDATES.with_label_values(&["error"]);
 
     while let Some(msg) = stream.message().await? {
+        stats.update_pulled();
+
         let proto_ttid = msg
             .tenant_timeline_id
             .as_ref()
@@ -119,12 +133,94 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
     bail!("end of stream");
 }
 
+/// Process incoming discover requests. This is done in a separate task to avoid
+/// interfering with the normal pull/push loops.
+async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
+    let mut client =
+        storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
+
+    let request = SubscribeByFilterRequest {
+        types: vec![TypeSubscription {
+            r#type: MessageType::SafekeeperDiscoveryRequest as i32,
+        }],
+        tenant_timeline_id: Some(FilterTenantTimelineId {
+            enabled: false,
+            tenant_timeline_id: None,
+        }),
+    };
+
+    let mut stream = client
+        .subscribe_by_filter(request)
+        .await
+        .context("subscribe_by_filter request failed")?
+        .into_inner();
+
+    let discover_counter = BROKER_PULLED_UPDATES.with_label_values(&["discover"]);
+
+    while let Some(typed_msg) = stream.message().await? {
+        stats.update_pulled();
+
+        match typed_msg.r#type() {
+            MessageType::SafekeeperDiscoveryRequest => {
+                let msg = typed_msg
+                    .safekeeper_discovery_request
+                    .expect("proto type mismatch from broker message");
+
+                let proto_ttid = msg
+                    .tenant_timeline_id
+                    .as_ref()
+                    .ok_or_else(|| anyhow!("missing tenant_timeline_id"))?;
+                let ttid = parse_proto_ttid(proto_ttid)?;
+                if let Ok(tli) = GlobalTimelines::get(ttid) {
+                    // we received a discovery request for a timeline we know about
+                    discover_counter.inc();
+
+                    // create and reply with discovery response
+                    let sk_info = tli.get_safekeeper_info(&conf).await;
+                    let response = SafekeeperDiscoveryResponse {
+                        safekeeper_id: sk_info.safekeeper_id,
+                        tenant_timeline_id: sk_info.tenant_timeline_id,
+                        commit_lsn: sk_info.commit_lsn,
+                        safekeeper_connstr: sk_info.safekeeper_connstr,
+                        availability_zone: sk_info.availability_zone,
+                        standby_horizon: 0,
+                    };
+
+                    // note this is a blocking call
+                    client
+                        .publish_one(TypedMessage {
+                            r#type: MessageType::SafekeeperDiscoveryResponse as i32,
+                            safekeeper_timeline_info: None,
+                            safekeeper_discovery_request: None,
+                            safekeeper_discovery_response: Some(response),
+                        })
+                        .await?;
+                }
+            }
+
+            _ => {
+                warn!(
+                    "unexpected message type i32 {}, {:?}",
+                    typed_msg.r#type,
+                    typed_msg.r#type()
+                );
+            }
+        }
+    }
+    bail!("end of stream");
+}
+
 pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
     info!("started, broker endpoint {:?}", conf.broker_endpoint);
 
     let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC));
     let mut push_handle: Option<JoinHandle<anyhow::Result<()>>> = None;
     let mut pull_handle: Option<JoinHandle<anyhow::Result<()>>> = None;
+    let mut discover_handle: Option<JoinHandle<anyhow::Result<()>>> = None;
+
+    let stats = Arc::new(BrokerStats::new());
+    let stats_task = task_stats(stats.clone());
+    tokio::pin!(stats_task);
 
     // Selecting on JoinHandles requires some squats; is there a better way to
     // reap tasks individually?
@@ -153,13 +249,77 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
                };
                pull_handle = None;
            },
+            res = async { discover_handle.as_mut().unwrap().await }, if discover_handle.is_some() => {
+                // was it panic or normal error?
+                match res {
+                    Ok(res_internal) => if let Err(err_inner) = res_internal {
+                        warn!("discover task failed: {:?}", err_inner);
+                    }
+                    Err(err_outer) => { warn!("discover task panicked: {:?}", err_outer) }
+                };
+                discover_handle = None;
+            },
             _ = ticker.tick() => {
                 if push_handle.is_none() {
                     push_handle = Some(tokio::spawn(push_loop(conf.clone())));
                 }
                 if pull_handle.is_none() {
-                    pull_handle = Some(tokio::spawn(pull_loop(conf.clone())));
+                    pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), stats.clone())));
                 }
+                if discover_handle.is_none() {
+                    discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), stats.clone())));
+                }
+            },
+            _ = &mut stats_task => {}
        }
    }
}
+
+struct BrokerStats {
+    /// Timestamp of the last received message from the broker.
+    last_pulled_ts: AtomicU64,
+}
+
+impl BrokerStats {
+    fn new() -> Self {
+        BrokerStats {
+            last_pulled_ts: AtomicU64::new(0),
+        }
+    }
+
+    fn now_millis() -> u64 {
+        std::time::SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .expect("time is before epoch")
+            .as_millis() as u64
+    }
+
+    /// Update last_pulled timestamp to current time.
+    fn update_pulled(&self) {
+        self.last_pulled_ts
+            .store(Self::now_millis(), std::sync::atomic::Ordering::Relaxed);
+    }
+}
+
+/// Periodically write to logs if there are issues with receiving data from the broker.
+async fn task_stats(stats: Arc<BrokerStats>) {
+    let warn_duration = Duration::from_secs(10);
+    let mut ticker = tokio::time::interval(warn_duration);
+
+    loop {
+        tokio::select! {
+            _ = ticker.tick() => {
+                let last_pulled = stats.last_pulled_ts.load(std::sync::atomic::Ordering::SeqCst);
+                if last_pulled == 0 {
+                    // no broker updates yet
+                    continue;
+                }
+
+                let now = BrokerStats::now_millis();
+                if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 {
+                    let ts = chrono::DateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp");
+                    info!("no broker updates for some time, last update: {:?}", ts);
+                }
+            }
+        }
+    }
+}
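The `BrokerStats` plumbing above is a small last-activity watchdog: the receive paths store a timestamp on every message, and a separate ticker task warns when it goes stale. A condensed sketch of the same pattern (names are illustrative, not from the codebase):

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::{Duration, SystemTime, UNIX_EPOCH};

/// Last-activity tracker: 0 means "nothing received yet".
struct LastSeen(AtomicU64);

impl LastSeen {
    const fn new() -> Self {
        LastSeen(AtomicU64::new(0))
    }

    fn now_millis() -> u64 {
        SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("time is before epoch")
            .as_millis() as u64
    }

    /// Called on every received message.
    fn touch(&self) {
        self.0.store(Self::now_millis(), Ordering::Relaxed);
    }

    /// None until the first message, otherwise time since the last one.
    fn staleness(&self) -> Option<Duration> {
        match self.0.load(Ordering::Relaxed) {
            0 => None,
            last => Some(Duration::from_millis(
                Self::now_millis().saturating_sub(last),
            )),
        }
    }
}
```

Relaxed ordering suffices here because the counter is a monotonically advancing hint consumed only for logging, not for synchronization between tasks.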
diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs
index f1daddd7c3..8b252b4ab4 100644
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -2,39 +2,39 @@
 use anyhow::{bail, ensure, Context, Result};
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
-use camino::Utf8PathBuf;
-use tokio::fs::{self, File};
+use camino::{Utf8Path, Utf8PathBuf};
+use tokio::fs::File;
 use tokio::io::AsyncWriteExt;
+use utils::crashsafe::durable_rename;
 
+use std::future::Future;
 use std::io::Read;
 use std::ops::Deref;
 use std::path::Path;
 use std::time::Instant;
 
-use crate::control_file_upgrade::upgrade_control_file;
+use crate::control_file_upgrade::downgrade_v9_to_v8;
 use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
-use crate::state::TimelinePersistentState;
+use crate::state::{EvictionState, TimelinePersistentState};
+use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir};
 use utils::{bin_ser::LeSer, id::TenantTimelineId};
 
 use crate::SafeKeeperConf;
 
-use std::convert::TryInto;
-
 pub const SK_MAGIC: u32 = 0xcafeceefu32;
-pub const SK_FORMAT_VERSION: u32 = 7;
+pub const SK_FORMAT_VERSION: u32 = 9;
 
 // contains persistent metadata for safekeeper
-const CONTROL_FILE_NAME: &str = "safekeeper.control";
+pub const CONTROL_FILE_NAME: &str = "safekeeper.control";
 // needed to atomically update the state using `rename`
 const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial";
-pub const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
+pub const CHECKSUM_SIZE: usize = size_of::<u32>();
 
 /// Storage should keep actual state inside of it. It should implement Deref
 /// trait to access state fields and have persist method for updating that state.
-#[async_trait::async_trait]
 pub trait Storage: Deref<Target = TimelinePersistentState> {
     /// Persist safekeeper state on disk and update internal state.
-    async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()>;
+    fn persist(&mut self, s: &TimelinePersistentState) -> impl Future<Output = Result<()>> + Send;
 
     /// Timestamp of last persist.
     fn last_persist_at(&self) -> Instant;
@@ -44,7 +44,7 @@ pub trait Storage: Deref<Target = TimelinePersistentState> {
 pub struct FileStorage {
     // save timeline dir to avoid reconstructing it every time
     timeline_dir: Utf8PathBuf,
-    conf: SafeKeeperConf,
+    no_sync: bool,
 
     /// Last state persisted to disk.
     state: TimelinePersistentState,
@@ -55,13 +55,12 @@ impl FileStorage {
     /// Initialize storage by loading state from disk.
     pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result<FileStorage> {
-        let timeline_dir = conf.timeline_dir(ttid);
-
-        let state = Self::load_control_file_conf(conf, ttid)?;
+        let timeline_dir = get_timeline_dir(conf, ttid);
+        let state = Self::load_control_file_from_dir(&timeline_dir)?;
 
         Ok(FileStorage {
             timeline_dir,
-            conf: conf.clone(),
+            no_sync: conf.no_sync,
             state,
             last_persist_at: Instant::now(),
         })
@@ -73,9 +72,12 @@ impl FileStorage {
         conf: &SafeKeeperConf,
         state: TimelinePersistentState,
     ) -> Result<FileStorage> {
+        // we don't support creating new timelines in offloaded state
+        assert!(matches!(state.eviction_state, EvictionState::Present));
+
         let store = FileStorage {
             timeline_dir,
-            conf: conf.clone(),
+            no_sync: conf.no_sync,
             state,
             last_persist_at: Instant::now(),
         };
@@ -103,12 +105,9 @@ impl FileStorage {
         upgrade_control_file(buf, version)
     }
 
-    /// Load control file for given ttid at path specified by conf.
-    pub fn load_control_file_conf(
-        conf: &SafeKeeperConf,
-        ttid: &TenantTimelineId,
-    ) -> Result<TimelinePersistentState> {
-        let path = conf.timeline_dir(ttid).join(CONTROL_FILE_NAME);
+    /// Load control file from given directory.
+    fn load_control_file_from_dir(timeline_dir: &Utf8Path) -> Result<TimelinePersistentState> {
+        let path = timeline_dir.join(CONTROL_FILE_NAME);
         Self::load_control_file(path)
     }
 
@@ -165,7 +164,30 @@ impl Deref for FileStorage {
     }
 }
 
-#[async_trait::async_trait]
+impl TimelinePersistentState {
+    pub(crate) fn write_to_buf(&self) -> Result<Vec<u8>> {
+        let mut buf: Vec<u8> = Vec::new();
+        WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
+
+        if self.eviction_state == EvictionState::Present {
+            // temp hack for forward compatibility
+            const PREV_FORMAT_VERSION: u32 = 8;
+            let prev = downgrade_v9_to_v8(self);
+            WriteBytesExt::write_u32::<LittleEndian>(&mut buf, PREV_FORMAT_VERSION)?;
+            prev.ser_into(&mut buf)?;
+        } else {
+            // otherwise, we write the current format version
+            WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
+            self.ser_into(&mut buf)?;
+        }
+
+        // calculate checksum before resize
+        let checksum = crc32c::crc32c(&buf);
+        buf.extend_from_slice(&checksum.to_le_bytes());
+        Ok(buf)
+    }
+}
+
 impl Storage for FileStorage {
     /// Persists state durably to the underlying storage.
/// @@ -181,14 +203,8 @@ impl Storage for FileStorage { &control_partial_path ) })?; - let mut buf: Vec = Vec::new(); - WriteBytesExt::write_u32::(&mut buf, SK_MAGIC)?; - WriteBytesExt::write_u32::(&mut buf, SK_FORMAT_VERSION)?; - s.ser_into(&mut buf)?; - // calculate checksum before resize - let checksum = crc32c::crc32c(&buf); - buf.extend_from_slice(&checksum.to_le_bytes()); + let buf: Vec = s.write_to_buf()?; control_partial.write_all(&buf).await.with_context(|| { format!( @@ -203,35 +219,8 @@ impl Storage for FileStorage { ) })?; - // fsync the file - if !self.conf.no_sync { - control_partial.sync_all().await.with_context(|| { - format!( - "failed to sync partial control file at {}", - control_partial_path - ) - })?; - } - let control_path = self.timeline_dir.join(CONTROL_FILE_NAME); - - // rename should be atomic - fs::rename(&control_partial_path, &control_path).await?; - // this sync is not required by any standard but postgres does this (see durable_rename) - if !self.conf.no_sync { - let new_f = File::open(&control_path).await?; - new_f - .sync_all() - .await - .with_context(|| format!("failed to sync control file at: {}", &control_path))?; - - // fsync the directory (linux specific) - let tli_dir = File::open(&self.timeline_dir).await?; - tli_dir - .sync_all() - .await - .context("failed to sync control file directory")?; - } + durable_rename(&control_partial_path, &control_path, !self.no_sync).await?; // update internal state self.state = s.clone(); @@ -245,11 +234,9 @@ impl Storage for FileStorage { #[cfg(test)] mod test { - use super::FileStorage; use super::*; - use crate::SafeKeeperConf; - use anyhow::Result; - use utils::{id::TenantTimelineId, lsn::Lsn}; + use tokio::fs; + use utils::lsn::Lsn; fn stub_conf() -> SafeKeeperConf { let workdir = camino_tempfile::tempdir().unwrap().into_path(); @@ -263,12 +250,13 @@ mod test { conf: &SafeKeeperConf, ttid: &TenantTimelineId, ) -> Result<(FileStorage, TimelinePersistentState)> { - fs::create_dir_all(conf.timeline_dir(ttid)) + let timeline_dir = get_timeline_dir(conf, ttid); + fs::create_dir_all(&timeline_dir) .await .expect("failed to create timeline dir"); Ok(( FileStorage::restore_new(ttid, conf)?, - FileStorage::load_control_file_conf(conf, ttid)?, + FileStorage::load_control_file_from_dir(&timeline_dir)?, )) } @@ -276,11 +264,11 @@ mod test { conf: &SafeKeeperConf, ttid: &TenantTimelineId, ) -> Result<(FileStorage, TimelinePersistentState)> { - fs::create_dir_all(conf.timeline_dir(ttid)) + let timeline_dir = get_timeline_dir(conf, ttid); + fs::create_dir_all(&timeline_dir) .await .expect("failed to create timeline dir"); let state = TimelinePersistentState::empty(); - let timeline_dir = conf.timeline_dir(ttid); let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?; Ok((storage, state)) } @@ -321,7 +309,7 @@ mod test { .await .expect("failed to persist state"); } - let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME); + let control_path = get_timeline_dir(&conf, &ttid).join(CONTROL_FILE_NAME); let mut data = fs::read(&control_path).await.unwrap(); data[0] += 1; // change the first byte of the file to fail checksum validation fs::write(&control_path, &data) diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 2fd719326d..a4b4670e42 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -1,7 +1,8 @@ //! 
Code to deal with safekeeper control file upgrades use crate::{ safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn}, - state::{PersistedPeers, TimelinePersistentState}, + state::{EvictionState, PersistedPeers, TimelinePersistentState}, + wal_backup_partial, }; use anyhow::{bail, Result}; use pq_proto::SystemId; @@ -138,6 +139,99 @@ pub struct SafeKeeperStateV4 { pub peers: PersistedPeers, } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SafeKeeperStateV7 { + #[serde(with = "hex")] + pub tenant_id: TenantId, + #[serde(with = "hex")] + pub timeline_id: TimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealt with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Since which LSN this timeline generally starts. Safekeeper might have + /// joined later. + pub timeline_start_lsn: Lsn, + /// Since which LSN safekeeper has (had) WAL for this timeline. + /// All WAL segments next to one containing local_start_lsn are + /// filled with data from the beginning. + pub local_start_lsn: Lsn, + /// Part of WAL acknowledged by quorum *and available locally*. Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. Updates are currently drived + /// only by walproposer. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + // Peers and their state as we remember it. Knowing peers themselves is + // fundamental; but state is saved here only for informational purposes and + // obviously can be stale. (Currently not saved at all, but let's provision + // place to have less file version upgrades). + pub peers: PersistedPeers, +} + +/// Persistent information stored on safekeeper node about timeline. +/// On disk data is prefixed by magic and format version and followed by checksum. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SafeKeeperStateV8 { + #[serde(with = "hex")] + pub tenant_id: TenantId, + #[serde(with = "hex")] + pub timeline_id: TimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealt with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Since which LSN this timeline generally starts. Safekeeper might have + /// joined later. + pub timeline_start_lsn: Lsn, + /// Since which LSN safekeeper has (had) WAL for this timeline. + /// All WAL segments next to one containing local_start_lsn are + /// filled with data from the beginning. + pub local_start_lsn: Lsn, + /// Part of WAL acknowledged by quorum *and available locally*. 
Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. Updates are currently drived + /// only by walproposer. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + /// Peers and their state as we remember it. Knowing peers themselves is + /// fundamental; but state is saved here only for informational purposes and + /// obviously can be stale. (Currently not saved at all, but let's provision + /// place to have less file version upgrades). + pub peers: PersistedPeers, + /// Holds names of partial segments uploaded to remote storage. Used to + /// clean up old objects without leaving garbage in remote storage. + pub partial_backup: wal_backup_partial::State, +} + pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { // migrate to storing full term history if version == 1 { @@ -167,6 +261,8 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result Result Result Result Result SafeKeeperStateV8 { + assert!(state.eviction_state == EvictionState::Present); + SafeKeeperStateV8 { + tenant_id: state.tenant_id, + timeline_id: state.timeline_id, + acceptor_state: state.acceptor_state.clone(), + server: state.server.clone(), + proposer_uuid: state.proposer_uuid, + timeline_start_lsn: state.timeline_start_lsn, + local_start_lsn: state.local_start_lsn, + commit_lsn: state.commit_lsn, + backup_lsn: state.backup_lsn, + peer_horizon_lsn: state.peer_horizon_lsn, + remote_consistent_lsn: state.remote_consistent_lsn, + peers: state.peers.clone(), + partial_backup: state.partial_backup.clone(), + } +} + #[cfg(test)] mod tests { use std::str::FromStr; diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 5bc877adbd..220988c3ce 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -15,10 +15,10 @@ use crate::{ control_file::{FileStorage, Storage}, pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline}, state::TimelinePersistentState, - timeline::{Timeline, TimelineError}, + timeline::{Timeline, TimelineError, WalResidentTimeline}, wal_backup::copy_s3_segments, wal_storage::{wal_file_paths, WalReader}, - GlobalTimelines, SafeKeeperConf, + GlobalTimelines, }; // we don't want to have more than 10 segments on disk after copy, because they take space @@ -46,12 +46,14 @@ pub async fn handle_request(request: Request) -> Result<()> { } } + let source_tli = request.source.wal_residence_guard().await?; + let conf = &GlobalTimelines::get_global_config(); let ttid = request.destination_ttid; let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; - let (mem_state, state) = request.source.get_state().await; + let (mem_state, state) = source_tli.get_state().await; let start_lsn = state.timeline_start_lsn; if start_lsn == Lsn::INVALID { bail!("timeline is not initialized"); @@ -60,7 +62,7 @@ pub async fn handle_request(request: Request) 
-> Result<()> { { let commit_lsn = mem_state.commit_lsn; - let flush_lsn = request.source.get_flush_lsn().await; + let flush_lsn = source_tli.get_flush_lsn().await; info!( "collected info about source timeline: start_lsn={}, backup_lsn={}, commit_lsn={}, flush_lsn={}", @@ -72,10 +74,16 @@ pub async fn handle_request(request: Request) -> Result<()> { assert!(flush_lsn >= start_lsn); if request.until_lsn > flush_lsn { - bail!("requested LSN is beyond the end of the timeline"); + bail!(format!( + "requested LSN {} is beyond the end of the timeline {}", + request.until_lsn, flush_lsn + )); } if request.until_lsn < start_lsn { - bail!("requested LSN is before the start of the timeline"); + bail!(format!( + "requested LSN {} is before the start of the timeline {}", + request.until_lsn, start_lsn + )); } if request.until_lsn > commit_lsn { @@ -127,10 +135,8 @@ pub async fn handle_request(request: Request) -> Result<()> { .await?; copy_disk_segments( - conf, - &state, + &source_tli, wal_seg_size, - &request.source.ttid, new_backup_lsn, request.until_lsn, &tli_dir_path, @@ -159,21 +165,13 @@ pub async fn handle_request(request: Request) -> Result<()> { } async fn copy_disk_segments( - conf: &SafeKeeperConf, - persisted_state: &TimelinePersistentState, + tli: &WalResidentTimeline, wal_seg_size: usize, - source_ttid: &TenantTimelineId, start_lsn: Lsn, end_lsn: Lsn, tli_dir_path: &Utf8PathBuf, ) -> Result<()> { - let mut wal_reader = WalReader::new( - conf.workdir.clone(), - conf.timeline_dir(source_ttid), - persisted_state, - start_lsn, - true, - )?; + let mut wal_reader = tli.get_walreader(start_lsn).await?; let mut buf = [0u8; MAX_SEND_SIZE]; @@ -191,7 +189,7 @@ async fn copy_disk_segments( let copy_end = copy_end - segment_start; let wal_file_path = { - let (normal, partial) = wal_file_paths(tli_dir_path, segment, wal_seg_size)?; + let (normal, partial) = wal_file_paths(tli_dir_path, segment, wal_seg_size); if segment == last_segment { partial @@ -225,6 +223,7 @@ async fn write_segment( assert!(from <= to); assert!(to <= wal_seg_size); + #[allow(clippy::suspicious_open_options)] let mut file = OpenOptions::new() .create(true) .write(true) diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index b50f2e1158..15b0272cd9 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -10,6 +10,7 @@ use std::sync::Arc; use anyhow::bail; use anyhow::Result; use camino::Utf8Path; +use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use postgres_ffi::XLogSegNo; use postgres_ffi::MAX_SEND_SIZE; @@ -26,7 +27,9 @@ use crate::safekeeper::TermHistory; use crate::send_wal::WalSenderState; use crate::state::TimelineMemState; use crate::state::TimelinePersistentState; -use crate::wal_storage::WalReader; +use crate::timeline::get_timeline_dir; +use crate::timeline::WalResidentTimeline; +use crate::timeline_manager; use crate::GlobalTimelines; use crate::SafeKeeperConf; @@ -68,6 +71,7 @@ pub struct Response { pub struct TimelineDumpSer { pub tli: Arc, pub args: Args, + pub timeline_dir: Utf8PathBuf, pub runtime: Arc, } @@ -85,14 +89,20 @@ impl Serialize for TimelineDumpSer { where S: serde::Serializer, { - let dump = self - .runtime - .block_on(build_from_tli_dump(self.tli.clone(), self.args.clone())); + let dump = self.runtime.block_on(build_from_tli_dump( + &self.tli, + &self.args, + &self.timeline_dir, + )); dump.serialize(serializer) } } -async fn build_from_tli_dump(timeline: Arc, args: Args) -> Timeline { +async fn build_from_tli_dump( + timeline: &Arc, + args: &Args, + 
timeline_dir: &Utf8Path, +) -> Timeline { let control_file = if args.dump_control_file { let mut state = timeline.get_state().await.1; if !args.dump_term_history { @@ -112,7 +122,8 @@ async fn build_from_tli_dump(timeline: Arc, args: Arg let disk_content = if args.dump_disk_content { // build_disk_content can fail, but we don't want to fail the whole // request because of that. - build_disk_content(&timeline.timeline_dir).ok() + // Note: timeline can be in offloaded state, this is not a problem. + build_disk_content(timeline_dir).ok() } else { None }; @@ -158,6 +169,7 @@ pub struct Memory { pub last_removed_segno: XLogSegNo, pub epoch_start_lsn: Lsn, pub mem_state: TimelineMemState, + pub mgr_status: timeline_manager::Status, // PhysicalStorage state. pub write_lsn: Lsn, @@ -186,6 +198,7 @@ pub struct FileInfo { pub async fn build(args: Args) -> Result { let start_time = Utc::now(); let timelines_count = GlobalTimelines::timelines_count(); + let config = GlobalTimelines::get_global_config(); let ptrs_snapshot = if args.tenant_id.is_some() && args.timeline_id.is_some() { // If both tenant_id and timeline_id are specified, we can just get the @@ -223,12 +236,11 @@ pub async fn build(args: Args) -> Result { timelines.push(TimelineDumpSer { tli, args: args.clone(), + timeline_dir: get_timeline_dir(&config, &ttid), runtime: runtime.clone(), }); } - let config = GlobalTimelines::get_global_config(); - Ok(Response { start_time, finish_time: Utc::now(), @@ -316,27 +328,19 @@ pub struct TimelineDigest { } pub async fn calculate_digest( - tli: &Arc, + tli: &WalResidentTimeline, request: TimelineDigestRequest, ) -> Result { if request.from_lsn > request.until_lsn { bail!("from_lsn is greater than until_lsn"); } - let conf = GlobalTimelines::get_global_config(); let (_, persisted_state) = tli.get_state().await; - if persisted_state.timeline_start_lsn > request.from_lsn { bail!("requested LSN is before the start of the timeline"); } - let mut wal_reader = WalReader::new( - conf.workdir.clone(), - tli.timeline_dir.clone(), - &persisted_state, - request.from_lsn, - true, - )?; + let mut wal_reader = tli.get_walreader(request.from_lsn).await?; let mut hasher = Sha256::new(); let mut buf = [0u8; MAX_SEND_SIZE]; diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 761541168c..2c519433ef 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -2,8 +2,7 @@ //! protocol commands. 
use anyhow::Context; -use std::str::FromStr; -use std::str::{self}; +use std::str::{self, FromStr}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info, info_span, Instrument}; @@ -16,8 +15,8 @@ use crate::safekeeper::Term; use crate::timeline::TimelineError; use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; +use postgres_backend::PostgresBackend; use postgres_backend::QueryError; -use postgres_backend::{self, PostgresBackend}; use postgres_ffi::PG_TLI; use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}; use regex::Regex; @@ -144,7 +143,12 @@ impl postgres_backend::Handler self.tenant_id.unwrap_or(TenantId::from([0u8; 16])), self.timeline_id.unwrap_or(TimelineId::from([0u8; 16])), ); - tracing::Span::current().record("ttid", tracing::field::display(ttid)); + tracing::Span::current() + .record("ttid", tracing::field::display(ttid)) + .record( + "application_name", + tracing::field::debug(self.appname.clone()), + ); Ok(()) } else { diff --git a/safekeeper/src/http/client.rs b/safekeeper/src/http/client.rs new file mode 100644 index 0000000000..c56f7880d4 --- /dev/null +++ b/safekeeper/src/http/client.rs @@ -0,0 +1,140 @@ +//! Safekeeper http client. +//! +//! Partially copied from pageserver client; some parts might be better to be +//! united. +//! +//! It would be also good to move it out to separate crate, but this needs +//! duplication of internal-but-reported structs like WalSenderState, ServerInfo +//! etc. + +use reqwest::{IntoUrl, Method, StatusCode}; +use utils::{ + http::error::HttpErrorBody, + id::{NodeId, TenantId, TimelineId}, + logging::SecretString, +}; + +use super::routes::TimelineStatus; + +#[derive(Debug, Clone)] +pub struct Client { + mgmt_api_endpoint: String, + authorization_header: Option, + client: reqwest::Client, +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + /// Failed to receive body (reqwest error). + #[error("receive body: {0}")] + ReceiveBody(reqwest::Error), + + /// Status is not ok, but failed to parse body as `HttpErrorBody`. + #[error("receive error body: {0}")] + ReceiveErrorBody(String), + + /// Status is not ok; parsed error in body as `HttpErrorBody`. + #[error("safekeeper API: {1}")] + ApiError(StatusCode, String), +} + +pub type Result = std::result::Result; + +pub trait ResponseErrorMessageExt: Sized { + fn error_from_body(self) -> impl std::future::Future> + Send; +} + +/// If status is not ok, try to extract error message from the body. 
+impl ResponseErrorMessageExt for reqwest::Response { + async fn error_from_body(self) -> Result { + let status = self.status(); + if !(status.is_client_error() || status.is_server_error()) { + return Ok(self); + } + + let url = self.url().to_owned(); + Err(match self.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("http error ({}) at {}.", status.as_u16(), url)) + } + }) + } +} + +impl Client { + pub fn new(mgmt_api_endpoint: String, jwt: Option) -> Self { + Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt) + } + + pub fn from_client( + client: reqwest::Client, + mgmt_api_endpoint: String, + jwt: Option, + ) -> Self { + Self { + mgmt_api_endpoint, + authorization_header: jwt + .map(|jwt| SecretString::from(format!("Bearer {}", jwt.get_contents()))), + client, + } + } + + pub async fn timeline_status( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + let resp = self.get(&uri).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + + pub async fn snapshot( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + stream_to: NodeId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/snapshot/{}", + self.mgmt_api_endpoint, tenant_id, timeline_id, stream_to.0 + ); + self.get(&uri).await + } + + async fn get(&self, uri: U) -> Result { + self.request(Method::GET, uri, ()).await + } + + /// Send the request and check that the status code is good. + async fn request( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + let res = self.request_noerror(method, uri, body).await?; + let response = res.error_from_body().await?; + Ok(response) + } + + /// Just send the request. + async fn request_noerror( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + let req = self.client.request(method, uri); + let req = if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value.get_contents()) + } else { + req + }; + req.json(&body).send().await.map_err(Error::ReceiveBody) + } +} diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 2a9570595f..52fb13ff5b 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -1,3 +1,4 @@ +pub mod client; pub mod routes; pub use routes::make_router; diff --git a/safekeeper/src/http/openapi_spec.yaml b/safekeeper/src/http/openapi_spec.yaml index a617e0310c..70999853c2 100644 --- a/safekeeper/src/http/openapi_spec.yaml +++ b/safekeeper/src/http/openapi_spec.yaml @@ -86,42 +86,6 @@ paths: default: $ref: "#/components/responses/GenericError" - /v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - format: hex - - name: source_timeline_id - in: path - required: true - schema: - type: string - format: hex - - post: - tags: - - "Timeline" - summary: Register new timeline as copy of existing timeline - description: "" - operationId: v1CopyTenantTimeline - requestBody: - content: - application/json: - schema: - $ref: "#/components/schemas/TimelineCopyRequest" - responses: - "201": - description: Timeline created - # TODO: return timeline info? 
- "403": - $ref: "#/components/responses/ForbiddenError" - default: - $ref: "#/components/responses/GenericError" - - /v1/tenant/{tenant_id}/timeline/{timeline_id}: parameters: - name: tenant_id @@ -179,6 +143,40 @@ paths: default: $ref: "#/components/responses/GenericError" + /v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: source_timeline_id + in: path + required: true + schema: + type: string + format: hex + + post: + tags: + - "Timeline" + summary: Register new timeline as copy of existing timeline + description: "" + operationId: v1CopyTenantTimeline + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/TimelineCopyRequest" + responses: + "201": + description: Timeline created + # TODO: return timeline info? + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" /v1/record_safekeeper_info/{tenant_id}/{timeline_id}: parameters: diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 919b6b2982..9b7424a818 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,38 +1,25 @@ use hyper::{Body, Request, Response, StatusCode, Uri}; - use once_cell::sync::Lazy; -use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::fmt; +use std::io::Write as _; use std::str::FromStr; use std::sync::Arc; use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; -use tokio::fs::File; -use tokio::io::AsyncReadExt; +use tokio::sync::mpsc; +use tokio::task; +use tokio_stream::wrappers::ReceiverStream; use tokio_util::sync::CancellationToken; +use tracing::{info_span, Instrument}; use utils::failpoint_support::failpoints_handler; +use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWriter}; use utils::http::request::parse_query_param; -use std::io::Write as _; -use tokio::sync::mpsc; -use tokio_stream::wrappers::ReceiverStream; -use tracing::{info_span, Instrument}; -use utils::http::endpoint::{request_span, ChannelWriter}; - -use crate::debug_dump::TimelineDigestRequest; -use crate::receive_wal::WalReceiverState; -use crate::safekeeper::Term; -use crate::safekeeper::{ServerInfo, TermLsn}; -use crate::send_wal::WalSenderState; -use crate::timeline::PeerInfo; -use crate::{copy_timeline, debug_dump, pull_timeline}; - -use crate::timelines_global_map::TimelineDeleteForceResult; -use crate::GlobalTimelines; -use crate::SafeKeeperConf; +use postgres_ffi::WAL_SEGMENT_SIZE; +use safekeeper_api::models::TimelineCreateRequest; +use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; use utils::{ auth::SwappableJwtAuth, http::{ @@ -46,7 +33,16 @@ use utils::{ lsn::Lsn, }; -use super::models::TimelineCreateRequest; +use crate::debug_dump::TimelineDigestRequest; +use crate::receive_wal::WalReceiverState; +use crate::safekeeper::Term; +use crate::safekeeper::{ServerInfo, TermLsn}; +use crate::send_wal::WalSenderState; +use crate::timeline::PeerInfo; +use crate::timelines_global_map::TimelineDeleteForceResult; +use crate::GlobalTimelines; +use crate::SafeKeeperConf; +use crate::{copy_timeline, debug_dump, patch_control_file, pull_timeline}; #[derive(Debug, Serialize)] struct SafekeeperStatus { @@ -85,11 +81,11 @@ impl From for TermLsn 
{ } } -/// Augment AcceptorState with epoch for convenience +/// Augment AcceptorState with last_log_term for convenience #[derive(Debug, Serialize, Deserialize)] pub struct AcceptorStateStatus { pub term: Term, - pub epoch: Term, + pub epoch: Term, // aka last_log_term pub term_history: Vec, } @@ -118,54 +114,25 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res }) } -/// Report info about timeline. -async fn timeline_status_handler(request: Request) -> Result, ApiError> { - let ttid = TenantTimelineId::new( - parse_request_param(&request, "tenant_id")?, - parse_request_param(&request, "timeline_id")?, - ); - check_permission(&request, Some(ttid.tenant_id))?; - - let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; - let (inmem, state) = tli.get_state().await; - let flush_lsn = tli.get_flush_lsn().await; - - let epoch = state.acceptor_state.get_epoch(flush_lsn); - let term_history = state - .acceptor_state - .term_history - .0 - .into_iter() - .map(|ts| TermSwitchApiEntry { - term: ts.term, - lsn: ts.lsn, - }) - .collect(); - let acc_state = AcceptorStateStatus { - term: state.acceptor_state.term, - epoch, - term_history, - }; - - let conf = get_conf(&request); - // Note: we report in memory values which can be lost. - let status = TimelineStatus { - tenant_id: ttid.tenant_id, - timeline_id: ttid.timeline_id, - acceptor_state: acc_state, - pg_info: state.server, - flush_lsn, - timeline_start_lsn: state.timeline_start_lsn, - local_start_lsn: state.local_start_lsn, - commit_lsn: inmem.commit_lsn, - backup_lsn: inmem.backup_lsn, - peer_horizon_lsn: inmem.peer_horizon_lsn, - remote_consistent_lsn: inmem.remote_consistent_lsn, - peers: tli.get_peers(conf).await, - walsenders: tli.get_walsenders().get_all(), - walreceivers: tli.get_walreceivers().get_all(), - }; - json_response(StatusCode::OK, status) +/// Deactivates all timelines for the tenant and removes its data directory. +/// See `timeline_delete_handler`. +async fn tenant_delete_handler(mut request: Request) -> Result, ApiError> { + let tenant_id = parse_request_param(&request, "tenant_id")?; + let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false); + check_permission(&request, Some(tenant_id))?; + ensure_no_body(&mut request).await?; + // FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons; + // Using an `InternalServerError` should be fixed when the types support it + let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id, only_local) + .await + .map_err(ApiError::InternalServerError)?; + json_response( + StatusCode::OK, + delete_info + .iter() + .map(|(ttid, resp)| (format!("{}", ttid.timeline_id), *resp)) + .collect::>(), + ) } async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { @@ -194,18 +161,140 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, ApiError> { + check_permission(&request, None)?; + let res: Vec = GlobalTimelines::get_all() + .iter() + .map(|tli| tli.ttid) + .collect(); + json_response(StatusCode::OK, res) +} + +/// Report info about timeline. 
+async fn timeline_status_handler(request: Request) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + let (inmem, state) = tli.get_state().await; + let flush_lsn = tli.get_flush_lsn().await; + + let last_log_term = state.acceptor_state.get_last_log_term(flush_lsn); + let term_history = state + .acceptor_state + .term_history + .0 + .into_iter() + .map(|ts| TermSwitchApiEntry { + term: ts.term, + lsn: ts.lsn, + }) + .collect(); + let acc_state = AcceptorStateStatus { + term: state.acceptor_state.term, + epoch: last_log_term, + term_history, + }; + + let conf = get_conf(&request); + // Note: we report in memory values which can be lost. + let status = TimelineStatus { + tenant_id: ttid.tenant_id, + timeline_id: ttid.timeline_id, + acceptor_state: acc_state, + pg_info: state.server, + flush_lsn, + timeline_start_lsn: state.timeline_start_lsn, + local_start_lsn: state.local_start_lsn, + commit_lsn: inmem.commit_lsn, + backup_lsn: inmem.backup_lsn, + peer_horizon_lsn: inmem.peer_horizon_lsn, + remote_consistent_lsn: inmem.remote_consistent_lsn, + peers: tli.get_peers(conf).await, + walsenders: tli.get_walsenders().get_all(), + walreceivers: tli.get_walreceivers().get_all(), + }; + json_response(StatusCode::OK, status) +} + +/// Deactivates the timeline and removes its data directory. +async fn timeline_delete_handler(mut request: Request) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false); + check_permission(&request, Some(ttid.tenant_id))?; + ensure_no_body(&mut request).await?; + // FIXME: `delete_force` can fail from both internal errors and bad requests. Add better + // error handling here when we're able to. + let resp = GlobalTimelines::delete(&ttid, only_local) + .await + .map_err(ApiError::InternalServerError)?; + json_response(StatusCode::OK, resp) +} + /// Pull timeline from peer safekeeper instances. async fn timeline_pull_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; let data: pull_timeline::Request = json_request(&mut request).await?; + let conf = get_conf(&request); - let resp = pull_timeline::handle_request(data) + let resp = pull_timeline::handle_request(data, conf.sk_auth_token.clone()) .await .map_err(ApiError::InternalServerError)?; json_response(StatusCode::OK, resp) } +/// Stream tar archive with all timeline data. +async fn timeline_snapshot_handler(request: Request) -> Result, ApiError> { + let destination = parse_request_param(&request, "destination_id")?; + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + // Note: with evicted timelines it should work better then de-evict them and + // stream; probably start_snapshot would copy partial s3 file to dest path + // and stream control file, or return WalResidentTimeline if timeline is not + // evicted. 
+ let tli = tli + .wal_residence_guard() + .await + .map_err(ApiError::InternalServerError)?; + + // To stream the body use wrap_stream which wants Stream of Result, + // so create the chan and write to it in another task. + let (tx, rx) = mpsc::channel(1); + + let conf = get_conf(&request); + task::spawn(pull_timeline::stream_snapshot( + tli, + conf.my_id, + destination, + tx, + )); + + let rx_stream = ReceiverStream::new(rx); + let body = Body::wrap_stream(rx_stream); + + let response = Response::builder() + .status(200) + .header(hyper::header::CONTENT_TYPE, "application/octet-stream") + .body(body) + .unwrap(); + + Ok(response) +} + async fn timeline_copy_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -229,6 +318,46 @@ async fn timeline_copy_handler(mut request: Request) -> Result, +) -> Result, ApiError> { + check_permission(&request, None)?; + + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + + let patch_request: patch_control_file::Request = json_request(&mut request).await?; + let response = patch_control_file::handle_request(tli, patch_request) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, response) +} + +/// Force persist control file. +async fn timeline_checkpoint_handler(request: Request) -> Result, ApiError> { + check_permission(&request, None)?; + + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + + let tli = GlobalTimelines::get(ttid)?; + tli.write_shared_state() + .await + .sk + .state_mut() + .flush() + .await + .map_err(ApiError::InternalServerError)?; + json_response(StatusCode::OK, ()) +} + async fn timeline_digest_handler(request: Request) -> Result, ApiError> { let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, @@ -249,6 +378,10 @@ async fn timeline_digest_handler(request: Request) -> Result) -> Result) -> Result, ApiError> { +/// Unevict timeline and remove uploaded partial segment(s) from the remote storage. +/// Successfull response returns list of segments existed before the deletion. +/// Aimed for one-off usage not normally needed. +async fn timeline_backup_partial_reset(request: Request) -> Result, ApiError> { let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); check_permission(&request, Some(ttid.tenant_id))?; - let filename: String = parse_request_param(&request, "filename")?; - let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; - let filepath = tli.timeline_dir.join(filename); - let mut file = File::open(&filepath) - .await - .map_err(|e| ApiError::InternalServerError(e.into()))?; - - let mut content = Vec::new(); - // TODO: don't store files in memory - file.read_to_end(&mut content) - .await - .map_err(|e| ApiError::InternalServerError(e.into()))?; - - Response::builder() - .status(StatusCode::OK) - .header("Content-Type", "application/octet-stream") - .body(Body::from(content)) - .map_err(|e| ApiError::InternalServerError(e.into())) -} - -/// Deactivates the timeline and removes its data directory. 
-async fn timeline_delete_handler(mut request: Request) -> Result, ApiError> { - let ttid = TenantTimelineId::new( - parse_request_param(&request, "tenant_id")?, - parse_request_param(&request, "timeline_id")?, - ); - let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false); - check_permission(&request, Some(ttid.tenant_id))?; - ensure_no_body(&mut request).await?; - // FIXME: `delete_force` can fail from both internal errors and bad requests. Add better - // error handling here when we're able to. - let resp = GlobalTimelines::delete(&ttid, only_local) + let response = tli + .backup_partial_reset() .await .map_err(ApiError::InternalServerError)?; - json_response(StatusCode::OK, resp) -} - -/// Deactivates all timelines for the tenant and removes its data directory. -/// See `timeline_delete_handler`. -async fn tenant_delete_handler(mut request: Request) -> Result, ApiError> { - let tenant_id = parse_request_param(&request, "tenant_id")?; - let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false); - check_permission(&request, Some(tenant_id))?; - ensure_no_body(&mut request).await?; - // FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons; - // Using an `InternalServerError` should be fixed when the types support it - let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id, only_local) - .await - .map_err(ApiError::InternalServerError)?; - json_response( - StatusCode::OK, - delete_info - .iter() - .map(|(ttid, resp)| (format!("{}", ttid.timeline_id), *resp)) - .collect::>(), - ) + json_response(StatusCode::OK, response) } /// Used only in tests to hand craft required data. @@ -350,6 +433,7 @@ async fn record_safekeeper_info(mut request: Request) -> Result RouterBuilder router .data(Arc::new(conf)) .data(auth) + .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .get("/v1/status", |r| request_span(r, status_handler)) .put("/v1/failpoints", |r| { request_span(r, move |r| async { + check_permission(&r, None)?; let cancel = CancellationToken::new(); failpoints_handler(r, cancel).await }) }) + .delete("/v1/tenant/:tenant_id", |r| { + request_span(r, tenant_delete_handler) + }) // Will be used in the future instead of implicit timeline creation .post("/v1/tenant/timeline", |r| { request_span(r, timeline_create_handler) }) + .get("/v1/tenant/timeline", |r| { + request_span(r, timeline_list_handler) + }) .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { request_span(r, timeline_status_handler) }) .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { request_span(r, timeline_delete_handler) }) - .delete("/v1/tenant/:tenant_id", |r| { - request_span(r, tenant_delete_handler) - }) .post("/v1/pull_timeline", |r| { request_span(r, timeline_pull_handler) }) .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename", - |r| request_span(r, timeline_files_handler), + "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id", + |r| request_span(r, timeline_snapshot_handler), ) .post( "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy", |r| request_span(r, timeline_copy_handler), ) - // for tests + .patch( + "/v1/tenant/:tenant_id/timeline/:timeline_id/control_file", + |r| request_span(r, patch_control_file_handler), + ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", + |r| request_span(r, timeline_checkpoint_handler), + ) + .get("/v1/tenant/:tenant_id/timeline/:timeline_id/digest", |r| { + request_span(r, timeline_digest_handler) + }) + 
.post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/backup_partial_reset", + |r| request_span(r, timeline_backup_partial_reset), + ) .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| { request_span(r, record_safekeeper_info) }) .get("/v1/debug_dump", |r| request_span(r, dump_debug_handler)) - .get("/v1/tenant/:tenant_id/timeline/:timeline_id/digest", |r| { - request_span(r, timeline_digest_handler) - }) } #[cfg(test)] diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 32d5889803..7fe924a08e 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -6,8 +6,6 @@ //! modifications in tests. //! -use std::sync::Arc; - use anyhow::Context; use bytes::Bytes; use postgres_backend::QueryError; @@ -23,7 +21,7 @@ use crate::safekeeper::{ }; use crate::safekeeper::{Term, TermHistory, TermLsn}; use crate::state::TimelinePersistentState; -use crate::timeline::Timeline; +use crate::timeline::WalResidentTimeline; use crate::GlobalTimelines; use postgres_backend::PostgresBackend; use postgres_ffi::encode_logical_message; @@ -104,8 +102,8 @@ pub async fn handle_json_ctrl( async fn prepare_safekeeper( ttid: TenantTimelineId, pg_version: u32, -) -> anyhow::Result> { - GlobalTimelines::create( +) -> anyhow::Result { + let tli = GlobalTimelines::create( ttid, ServerInfo { pg_version, @@ -115,10 +113,16 @@ async fn prepare_safekeeper( Lsn::INVALID, Lsn::INVALID, ) - .await + .await?; + + tli.wal_residence_guard().await } -async fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> anyhow::Result<()> { +async fn send_proposer_elected( + tli: &WalResidentTimeline, + term: Term, + lsn: Lsn, +) -> anyhow::Result<()> { // add new term to existing history let history = tli.get_state().await.1.acceptor_state.term_history; let history = history.up_to(lsn.checked_sub(1u64).unwrap()); @@ -147,7 +151,7 @@ pub struct InsertedWAL { /// Extend local WAL with new LogicalMessage record. To do that, /// create AppendRequest with new WAL and pass it to safekeeper. 
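For reference, the `/snapshot/:destination_id` route registered above produces its response by bridging an mpsc channel into the hyper body. A minimal standalone sketch of that bridge, assuming hyper 0.14 with the `stream` feature and the `tokio-stream` wrapper crate (the real handler uses `pull_timeline::stream_snapshot` as the producer):

```rust
use bytes::Bytes;
use hyper::{Body, Response};
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;

fn streaming_response() -> Response<Body> {
    // Capacity 1 gives natural backpressure: the producer task blocks on
    // `send` until the HTTP client has consumed the previous chunk.
    let (tx, rx) = mpsc::channel::<anyhow::Result<Bytes>>(1);
    tokio::spawn(async move {
        // The producer writes chunks here; sending an Err terminates the
        // body early, which the client must treat as a failed transfer.
        let _ = tx.send(Ok(Bytes::from_static(b"tar bytes..."))).await;
    });
    Response::builder()
        .header(hyper::header::CONTENT_TYPE, "application/octet-stream")
        .body(Body::wrap_stream(ReceiverStream::new(rx)))
        .unwrap()
}
```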
pub async fn append_logical_message( - tli: &Arc, + tli: &WalResidentTimeline, msg: &AppendLogicalMessage, ) -> anyhow::Result { let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); @@ -165,7 +169,7 @@ pub async fn append_logical_message( let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest { h: AppendRequestHeader { term: msg.term, - epoch_start_lsn: begin_lsn, + term_start_lsn: begin_lsn, begin_lsn, end_lsn, commit_lsn, diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index f18a1ec22d..2e11a279ca 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -7,10 +7,7 @@ use tokio::runtime::Runtime; use std::time::Duration; use storage_broker::Uri; -use utils::{ - auth::SwappableJwtAuth, - id::{NodeId, TenantId, TenantTimelineId}, -}; +use utils::{auth::SwappableJwtAuth, id::NodeId, logging::SecretString}; mod auth; pub mod broker; @@ -22,7 +19,9 @@ pub mod handler; pub mod http; pub mod json_ctrl; pub mod metrics; +pub mod patch_control_file; pub mod pull_timeline; +pub mod rate_limit; pub mod receive_wal; pub mod recovery; pub mod remove_wal; @@ -30,7 +29,12 @@ pub mod safekeeper; pub mod send_wal; pub mod state; pub mod timeline; +pub mod timeline_eviction; +pub mod timeline_guard; +pub mod timeline_manager; +pub mod timelines_set; pub mod wal_backup; +pub mod wal_backup_partial; pub mod wal_service; pub mod wal_storage; @@ -47,6 +51,15 @@ pub mod defaults { pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms"; pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20); + pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m"; + pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s"; + pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5"; + pub const DEFAULT_EVICTION_CONCURRENCY: usize = 2; + + // By default, our required residency before eviction is the same as the period that passes + // before uploading a partial segment, so that in normal operation the eviction can happen + // as soon as we have done the partial segment upload. + pub const DEFAULT_EVICTION_MIN_RESIDENT: &str = DEFAULT_PARTIAL_BACKUP_TIMEOUT; } #[derive(Debug, Clone)] @@ -76,19 +89,20 @@ pub struct SafeKeeperConf { pub pg_auth: Option>, pub pg_tenant_only_auth: Option>, pub http_auth: Option>, + /// JWT token to connect to other safekeepers with. 
+ pub sk_auth_token: Option, pub current_thread_runtime: bool, + pub walsenders_keep_horizon: bool, + pub partial_backup_timeout: Duration, + pub disable_periodic_broker_push: bool, + pub enable_offload: bool, + pub delete_offloaded_wal: bool, + pub control_file_save_interval: Duration, + pub partial_backup_concurrency: usize, + pub eviction_min_resident: Duration, } impl SafeKeeperConf { - pub fn tenant_dir(&self, tenant_id: &TenantId) -> Utf8PathBuf { - self.workdir.join(tenant_id.to_string()) - } - - pub fn timeline_dir(&self, ttid: &TenantTimelineId) -> Utf8PathBuf { - self.tenant_dir(&ttid.tenant_id) - .join(ttid.timeline_id.to_string()) - } - pub fn is_wal_backup_enabled(&self) -> bool { self.remote_storage.is_some() && self.wal_backup_enabled } @@ -117,9 +131,18 @@ impl SafeKeeperConf { pg_auth: None, pg_tenant_only_auth: None, http_auth: None, + sk_auth_token: None, heartbeat_timeout: Duration::new(5, 0), max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, current_thread_runtime: false, + walsenders_keep_horizon: false, + partial_backup_timeout: Duration::from_secs(0), + disable_periodic_broker_push: false, + enable_offload: false, + delete_offloaded_wal: false, + control_file_save_interval: Duration::from_secs(1), + partial_backup_concurrency: 1, + eviction_min_resident: Duration::ZERO, } } } @@ -150,15 +173,6 @@ pub static BROKER_RUNTIME: Lazy = Lazy::new(|| { .expect("Failed to create broker runtime") }); -pub static WAL_REMOVER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("WAL remover") - .worker_threads(1) - .enable_all() - .build() - .expect("Failed to create broker runtime") -}); - pub static WAL_BACKUP_RUNTIME: Lazy = Lazy::new(|| { tokio::runtime::Builder::new_multi_thread() .thread_name("WAL backup worker") @@ -166,12 +180,3 @@ pub static WAL_BACKUP_RUNTIME: Lazy = Lazy::new(|| { .build() .expect("Failed to create WAL backup runtime") }); - -pub static METRICS_SHIFTER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("metric shifter") - .worker_threads(1) - .enable_all() - .build() - .expect("Failed to create broker runtime") -}); diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index fbba2e00fc..aa2bafbe92 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -5,14 +5,15 @@ use std::{ time::{Instant, SystemTime}, }; -use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS}; +use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_FSYNC_SECONDS_BUCKETS}; use anyhow::Result; use futures::Future; use metrics::{ core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, - register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, Gauge, - IntCounter, IntCounterPairVec, IntCounterVec, IntGaugeVec, + register_histogram_vec, register_int_counter, register_int_counter_pair, + register_int_counter_pair_vec, register_int_counter_vec, Gauge, HistogramVec, IntCounter, + IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec, }; use once_cell::sync::Lazy; @@ -47,7 +48,7 @@ pub static WRITE_WAL_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_write_wal_seconds", "Seconds spent writing and syncing WAL to a disk in a single request", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_write_wal_seconds histogram") }); @@ -55,7 +56,7 @@ pub static 
FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_flush_wal_seconds", "Seconds spent syncing WAL to a disk", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_flush_wal_seconds histogram") }); @@ -63,10 +64,28 @@ pub static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_persist_control_file_seconds", "Seconds to persist and sync control file", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec") }); +pub static WAL_STORAGE_OPERATION_SECONDS: Lazy = Lazy::new(|| { + register_histogram_vec!( + "safekeeper_wal_storage_operation_seconds", + "Seconds spent on WAL storage operations", + &["operation"], + DISK_FSYNC_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_wal_storage_operation_seconds histogram vec") +}); +pub static MISC_OPERATION_SECONDS: Lazy = Lazy::new(|| { + register_histogram_vec!( + "safekeeper_misc_operation_seconds", + "Seconds spent on miscellaneous operations", + &["operation"], + DISK_FSYNC_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_misc_operation_seconds histogram vec") +}); pub static PG_IO_BYTES: Lazy = Lazy::new(|| { register_int_counter_vec!( "safekeeper_pg_io_bytes_total", @@ -125,7 +144,7 @@ pub static BROKER_PUSH_ALL_UPDATES_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_broker_push_update_seconds", "Seconds to push all timeline updates to the broker", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_broker_push_update_seconds histogram vec") }); @@ -140,6 +159,77 @@ pub static BROKER_ITERATION_TIMELINES: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_broker_iteration_timelines histogram vec") }); +pub static RECEIVED_PS_FEEDBACKS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_received_ps_feedbacks_total", + "Number of pageserver feedbacks received" + ) + .expect("Failed to register safekeeper_received_ps_feedbacks_total counter") +}); +pub static PARTIAL_BACKUP_UPLOADS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_partial_backup_uploads_total", + "Number of partial backup uploads to the S3", + &["result"] + ) + .expect("Failed to register safekeeper_partial_backup_uploads_total counter") +}); +pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_partial_backup_uploaded_bytes_total", + "Number of bytes uploaded to the S3 during partial backup" + ) + .expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter") +}); +pub static MANAGER_ITERATIONS_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_manager_iterations_total", + "Number of iterations of the timeline manager task" + ) + .expect("Failed to register safekeeper_manager_iterations_total counter") +}); +pub static MANAGER_ACTIVE_CHANGES: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_manager_active_changes_total", + "Number of timeline active status changes in the timeline manager task" + ) + .expect("Failed to register safekeeper_manager_active_changes_total counter") +}); +pub static WAL_BACKUP_TASKS: Lazy = Lazy::new(|| { + register_int_counter_pair!( + "safekeeper_wal_backup_tasks_started_total", + "Number of active WAL backup tasks", + "safekeeper_wal_backup_tasks_finished_total", + "Number 
of finished WAL backup tasks", + ) + .expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter") +}); + +// Metrics collected on operations on the storage repository. +#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum EvictionEvent { + Evict, + Restore, +} + +pub(crate) static EVICTION_EVENTS_STARTED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_eviction_events_started_total", + "Number of eviction state changes, incremented when they start", + &["kind"] + ) + .expect("Failed to register metric") +}); + +pub(crate) static EVICTION_EVENTS_COMPLETED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_eviction_events_completed_total", + "Number of eviction state changes, incremented when they complete", + &["kind"] + ) + .expect("Failed to register metric") +}); pub const LABEL_UNKNOWN: &str = "unknown"; @@ -301,7 +391,8 @@ pub async fn time_io_closure>( #[derive(Clone)] pub struct FullTimelineInfo { pub ttid: TenantTimelineId, - pub ps_feedback: PageserverFeedback, + pub ps_feedback_count: u64, + pub last_ps_feedback: PageserverFeedback, pub wal_backup_active: bool, pub timeline_is_active: bool, pub num_computes: u32, @@ -327,6 +418,7 @@ pub struct TimelineCollector { remote_consistent_lsn: GenericGaugeVec, ps_last_received_lsn: GenericGaugeVec, feedback_last_time_seconds: GenericGaugeVec, + ps_feedback_count: GenericGaugeVec, timeline_active: GenericGaugeVec, wal_backup_active: GenericGaugeVec, connected_computes: IntGaugeVec, @@ -430,6 +522,15 @@ impl TimelineCollector { .unwrap(); descs.extend(feedback_last_time_seconds.desc().into_iter().cloned()); + let ps_feedback_count = GenericGaugeVec::new( + Opts::new( + "safekeeper_ps_feedback_count_total", + "Number of feedbacks received from the pageserver", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + let timeline_active = GenericGaugeVec::new( Opts::new( "safekeeper_timeline_active", @@ -538,6 +639,7 @@ impl TimelineCollector { remote_consistent_lsn, ps_last_received_lsn, feedback_last_time_seconds, + ps_feedback_count, timeline_active, wal_backup_active, connected_computes, @@ -570,6 +672,7 @@ impl Collector for TimelineCollector { self.remote_consistent_lsn.reset(); self.ps_last_received_lsn.reset(); self.feedback_last_time_seconds.reset(); + self.ps_feedback_count.reset(); self.timeline_active.reset(); self.wal_backup_active.reset(); self.connected_computes.reset(); @@ -579,8 +682,7 @@ impl Collector for TimelineCollector { self.written_wal_seconds.reset(); self.flushed_wal_seconds.reset(); - let timelines = GlobalTimelines::get_all(); - let timelines_count = timelines.len(); + let timelines_count = GlobalTimelines::get_all().len(); let mut active_timelines_count = 0; // Prometheus Collector is sync, and data is stored under async lock. 
To @@ -646,9 +748,12 @@ impl Collector for TimelineCollector { self.ps_last_received_lsn .with_label_values(labels) - .set(tli.ps_feedback.last_received_lsn.0); + .set(tli.last_ps_feedback.last_received_lsn.0); + self.ps_feedback_count + .with_label_values(labels) + .set(tli.ps_feedback_count); if let Ok(unix_time) = tli - .ps_feedback + .last_ps_feedback .replytime .duration_since(SystemTime::UNIX_EPOCH) { @@ -679,6 +784,7 @@ impl Collector for TimelineCollector { mfs.extend(self.remote_consistent_lsn.collect()); mfs.extend(self.ps_last_received_lsn.collect()); mfs.extend(self.feedback_last_time_seconds.collect()); + mfs.extend(self.ps_feedback_count.collect()); mfs.extend(self.timeline_active.collect()); mfs.extend(self.wal_backup_active.collect()); mfs.extend(self.connected_computes.collect()); @@ -695,9 +801,11 @@ impl Collector for TimelineCollector { // report total number of timelines self.timelines_count.set(timelines_count as i64); + mfs.extend(self.timelines_count.collect()); + self.active_timelines_count .set(active_timelines_count as i64); - mfs.extend(self.timelines_count.collect()); + mfs.extend(self.active_timelines_count.collect()); mfs } @@ -705,9 +813,9 @@ impl Collector for TimelineCollector { async fn collect_timeline_metrics() -> Vec { let mut res = vec![]; - let timelines = GlobalTimelines::get_all(); + let active_timelines = GlobalTimelines::get_global_broker_active_set().get_all(); - for tli in timelines { + for tli in active_timelines { if let Some(info) = tli.info_for_metrics().await { res.push(info); } diff --git a/safekeeper/src/patch_control_file.rs b/safekeeper/src/patch_control_file.rs new file mode 100644 index 0000000000..2136d1b5f7 --- /dev/null +++ b/safekeeper/src/patch_control_file.rs @@ -0,0 +1,85 @@ +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use tracing::info; + +use crate::{state::TimelinePersistentState, timeline::Timeline}; + +#[derive(Deserialize, Debug, Clone)] +pub struct Request { + /// JSON object with fields to update + pub updates: serde_json::Value, + /// List of fields to apply + pub apply_fields: Vec, +} + +#[derive(Serialize)] +pub struct Response { + pub old_control_file: TimelinePersistentState, + pub new_control_file: TimelinePersistentState, +} + +/// Patch control file with given request. Will update the persistent state using +/// fields from the request and persist the new state on disk. 
+pub async fn handle_request(tli: Arc, request: Request) -> anyhow::Result { + let response = tli + .map_control_file(|state| { + let old_control_file = state.clone(); + let new_control_file = state_apply_diff(&old_control_file, &request)?; + + info!( + "patching control file, old: {:?}, new: {:?}, patch: {:?}", + old_control_file, new_control_file, request + ); + *state = new_control_file.clone(); + + Ok(Response { + old_control_file, + new_control_file, + }) + }) + .await?; + + Ok(response) +} + +fn state_apply_diff( + state: &TimelinePersistentState, + request: &Request, +) -> anyhow::Result { + let mut json_value = serde_json::to_value(state)?; + + if let Value::Object(a) = &mut json_value { + if let Value::Object(b) = &request.updates { + json_apply_diff(a, b, &request.apply_fields)?; + } else { + anyhow::bail!("request.updates is not a json object") + } + } else { + anyhow::bail!("TimelinePersistentState is not a json object") + } + + let new_state: TimelinePersistentState = serde_json::from_value(json_value)?; + Ok(new_state) +} + +fn json_apply_diff( + object: &mut serde_json::Map, + updates: &serde_json::Map, + apply_keys: &Vec, +) -> anyhow::Result<()> { + for key in apply_keys { + if let Some(new_value) = updates.get(key) { + if let Some(existing_value) = object.get_mut(key) { + *existing_value = new_value.clone(); + } else { + anyhow::bail!("key not found in original object: {}", key); + } + } else { + anyhow::bail!("key not found in request.updates: {}", key); + } + } + + Ok(()) +} diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 93b51f32c0..64585f5edc 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -1,27 +1,283 @@ -use std::sync::Arc; - +use anyhow::{anyhow, bail, Context, Result}; +use bytes::Bytes; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; use chrono::{DateTime, Utc}; +use futures::{SinkExt, StreamExt, TryStreamExt}; +use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use serde::{Deserialize, Serialize}; - -use anyhow::{bail, Context, Result}; -use tokio::io::AsyncWriteExt; -use tracing::info; -use utils::{ - id::{TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, +use std::{ + cmp::min, + io::{self, ErrorKind}, + sync::Arc, }; +use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task}; +use tokio_tar::{Archive, Builder, Header}; +use tokio_util::{ + io::{CopyToBytes, SinkWriter}, + sync::PollSender, +}; +use tracing::{error, info, instrument}; use crate::{ - control_file, debug_dump, - http::routes::TimelineStatus, - timeline::{Timeline, TimelineError}, - wal_storage::{self, Storage}, + control_file::{self, CONTROL_FILE_NAME}, + debug_dump, + http::{ + client::{self, Client}, + routes::TimelineStatus, + }, + safekeeper::Term, + state::TimelinePersistentState, + timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError, WalResidentTimeline}, + wal_backup, + wal_storage::{self, open_wal_file, Storage}, GlobalTimelines, SafeKeeperConf, }; +use utils::{ + crashsafe::{durable_rename, fsync_async_opt}, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + logging::SecretString, + lsn::Lsn, + pausable_failpoint, +}; -/// Info about timeline on safekeeper ready for reporting. +/// Stream tar archive of timeline to tx. 
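The `patch_control_file` module above expects a JSON body with `updates` and `apply_fields`; only the listed fields are copied into the control file, and unknown keys are rejected. A hedged sketch of such a body (`backup_lsn` is just one example of a `TimelinePersistentState` field; the value must match that field's serde representation):

```rust
use serde_json::json;

// Example body for PATCH /v1/tenant/:tenant_id/timeline/:timeline_id/control_file.
fn example_patch_body() -> serde_json::Value {
    json!({
        // Fields to change, keyed by TimelinePersistentState field names.
        "updates": { "backup_lsn": "0/2000000" },
        // Only fields listed here are actually applied.
        "apply_fields": ["backup_lsn"]
    })
}
```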
+#[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))]
+pub async fn stream_snapshot(
+    tli: WalResidentTimeline,
+    source: NodeId,
+    destination: NodeId,
+    tx: mpsc::Sender<Result<Bytes>>,
+) {
+    if let Err(e) = stream_snapshot_guts(tli, source, destination, tx.clone()).await {
+        // The error type/contents don't matter, as they can't reach the client
+        // (hyper likely doesn't do anything with them), but the http stream will
+        // be prematurely terminated. It would be nice to try to send the error
+        // in trailers though.
+        tx.send(Err(anyhow!("snapshot failed"))).await.ok();
+        error!("snapshot failed: {:#}", e);
+    }
+}
+
+/// State needed while streaming the snapshot.
+pub struct SnapshotContext {
+    pub from_segno: XLogSegNo, // including
+    pub upto_segno: XLogSegNo, // including
+    pub term: Term,
+    pub last_log_term: Term,
+    pub flush_lsn: Lsn,
+    pub wal_seg_size: usize,
+    // Used to remove the WAL hold-off in Drop.
+    pub tli: WalResidentTimeline,
+}
+
+impl Drop for SnapshotContext {
+    fn drop(&mut self) {
+        let tli = self.tli.clone();
+        task::spawn(async move {
+            let mut shared_state = tli.write_shared_state().await;
+            shared_state.wal_removal_on_hold = false;
+        });
+    }
+}
+
+pub async fn stream_snapshot_guts(
+    tli: WalResidentTimeline,
+    source: NodeId,
+    destination: NodeId,
+    tx: mpsc::Sender<Result<Bytes>>,
+) -> Result<()> {
+    // tokio-tar wants a Write implementor, but we have an mpsc tx <Result<Bytes>>;
+    // use SinkWriter as a Write impl. That is,
+    // - create a Sink from the tx. It returns PollSendError if the chan is closed.
+    let sink = PollSender::new(tx);
+    // - SinkWriter needs the sink error to be an io one, so map it.
+    let sink_io_err = sink.sink_map_err(|_| io::Error::from(ErrorKind::BrokenPipe));
+    // - SinkWriter wants the sink type to be just Bytes, not Result<Bytes>, so map
+    //   it with with(). Note that with() accepts an async function, which we don't
+    //   need, and allows the map to fail, which we don't need either; hence the
+    //   two Oks.
+    let oksink = sink_io_err.with(|b: Bytes| async { io::Result::Ok(Result::Ok(b)) });
+    // - SinkWriter (not surprisingly) wants a sink of &[u8], not Bytes, so wrap it
+    //   into CopyToBytes. This is a data copy.
+    let copy_to_bytes = CopyToBytes::new(oksink);
+    let mut writer = SinkWriter::new(copy_to_bytes);
+    let pinned_writer = std::pin::pin!(writer);
+
+    // Note that tokio_tar append_* funcs use tokio::io::copy with an 8KB buffer,
+    // which is also likely suboptimal.
+    let mut ar = Builder::new_non_terminated(pinned_writer);
+
+    let bctx = tli.start_snapshot(&mut ar, source, destination).await?;
+    pausable_failpoint!("sk-snapshot-after-list-pausable");
+
+    let tli_dir = tli.get_timeline_dir();
+    info!(
+        "sending {} segments [{:#X}-{:#X}], term={}, last_log_term={}, flush_lsn={}",
+        bctx.upto_segno - bctx.from_segno + 1,
+        bctx.from_segno,
+        bctx.upto_segno,
+        bctx.term,
+        bctx.last_log_term,
+        bctx.flush_lsn,
+    );
+    for segno in bctx.from_segno..=bctx.upto_segno {
+        let (mut sf, is_partial) = open_wal_file(&tli_dir, segno, bctx.wal_seg_size).await?;
+        let mut wal_file_name = XLogFileName(PG_TLI, segno, bctx.wal_seg_size);
+        if is_partial {
+            wal_file_name.push_str(".partial");
+        }
+        ar.append_file(&wal_file_name, &mut sf).await?;
+    }
+
+    // Do the term check before ar.finish so the archive ends up corrupted in
+    // case of a term change. The client shouldn't ignore an abrupt stream end
+    // anyway, but just to be sure.
+    tli.finish_snapshot(&bctx).await?;
+
+    ar.finish().await?;
+
+    Ok(())
+}
+
+impl WalResidentTimeline {
+    /// Start streaming the tar archive with the timeline:
+    /// 1) stream the control file under lock;
+    /// 2) hold off WAL removal;
+    /// 3) collect a SnapshotContext to understand which WAL segments should be
+    ///    streamed.
+    ///
+    /// Snapshot streams data up to flush_lsn. To make this safe, we must check
+    /// that the term doesn't change during the procedure, or we risk sending a
+    /// mix of WAL from different histories. The term is remembered in the
+    /// SnapshotContext and checked in finish_snapshot. Note that in the last
+    /// segment some WAL higher than the flush_lsn set here might be streamed;
+    /// that's fine as long as the term doesn't change.
+    ///
+    /// Alternatively we could send only up to commit_lsn to get some valid
+    /// state which the compute would later recover; in that case the term check
+    /// is not needed, but we likely don't want that, as there might be no
+    /// compute which could perform the recovery.
+    ///
+    /// When the returned SnapshotContext is dropped, the WAL hold is removed.
+    async fn start_snapshot<W: AsyncWrite + Unpin + Send>(
+        &self,
+        ar: &mut tokio_tar::Builder<W>,
+        source: NodeId,
+        destination: NodeId,
+    ) -> Result<SnapshotContext> {
+        let mut shared_state = self.write_shared_state().await;
+        let wal_seg_size = shared_state.get_wal_seg_size();
+
+        let mut control_store = TimelinePersistentState::clone(shared_state.sk.state());
+        // Modify the partial segment of the in-memory copy of the control file to
+        // point to the destination safekeeper.
+        let replace = control_store
+            .partial_backup
+            .replace_uploaded_segment(source, destination)?;
+
+        if let Some(replace) = replace {
+            // The deserialized control file has an uploaded partial. We upload a copy
+            // of it to object storage for the destination safekeeper and send an updated
+            // control file in the snapshot.
+            tracing::info!(
+                "Replacing uploaded partial segment in in-mem control file: {replace:?}"
+            );
+
+            let remote_timeline_path = &self.tli.remote_path;
+            wal_backup::copy_partial_segment(
+                &replace.previous.remote_path(remote_timeline_path),
+                &replace.current.remote_path(remote_timeline_path),
+            )
+            .await?;
+        }
+
+        let buf = control_store
+            .write_to_buf()
+            .with_context(|| "failed to serialize control store")?;
+        let mut header = Header::new_gnu();
+        header.set_size(buf.len().try_into().expect("never breaches u64"));
+        ar.append_data(&mut header, CONTROL_FILE_NAME, buf.as_slice())
+            .await
+            .with_context(|| "failed to append to archive")?;
+
+        // We need to stream from the oldest segment that someone (s3 or the
+        // pageserver) still needs. This duplicates the calc_horizon_lsn logic.
+        //
+        // We know that WAL wasn't removed up to this point because it cannot be
+        // removed beyond `backup_lsn`. Since we're holding the shared_state
+        // lock and setting `wal_removal_on_hold` later, this guarantees that WAL
+        // won't be removed until we're done.
+        let from_lsn = min(
+            shared_state.sk.state().remote_consistent_lsn,
+            shared_state.sk.state().backup_lsn,
+        );
+        if from_lsn == Lsn::INVALID {
+            // this is possible if snapshot is called before handling the first
+            // elected message
+            bail!("snapshot is called on uninitialized timeline");
+        }
+        let from_segno = from_lsn.segment_number(wal_seg_size);
+        let term = shared_state.sk.state().acceptor_state.term;
+        let last_log_term = shared_state.sk.last_log_term();
+        let flush_lsn = shared_state.sk.flush_lsn();
+        let upto_segno = flush_lsn.segment_number(wal_seg_size);
+        // have some limit on the max number of segments as a sanity check
+        const MAX_ALLOWED_SEGS: u64 = 1000;
+        let num_segs = upto_segno - from_segno + 1;
+        if num_segs > MAX_ALLOWED_SEGS {
+            bail!(
+                "snapshot is called on timeline with {} segments, but the limit is {}",
+                num_segs,
+                MAX_ALLOWED_SEGS
+            );
+        }
+
+        // Prevent WAL removal while we're streaming data.
+        //
+        // Since this is a flag, not a counter, just bail out if it's already
+        // set; we shouldn't need concurrent snapshotting.
+        if shared_state.wal_removal_on_hold {
+            bail!("wal_removal_on_hold is already true");
+        }
+        shared_state.wal_removal_on_hold = true;
+
+        // Drop shared_state to release the lock before calling wal_residence_guard().
+        drop(shared_state);
+
+        let tli_copy = self.wal_residence_guard().await?;
+        let bctx = SnapshotContext {
+            from_segno,
+            upto_segno,
+            term,
+            last_log_term,
+            flush_lsn,
+            wal_seg_size,
+            tli: tli_copy,
+        };
+
+        Ok(bctx)
+    }
+
+    /// Finish snapshotting: check that the term(s) haven't changed.
+    ///
+    /// Note that the WAL gc hold-off is removed in SnapshotContext's Drop so
+    /// that it isn't forgotten if snapshotting fails midway.
+    pub async fn finish_snapshot(&self, bctx: &SnapshotContext) -> Result<()> {
+        let shared_state = self.read_shared_state().await;
+        let term = shared_state.sk.state().acceptor_state.term;
+        let last_log_term = shared_state.sk.last_log_term();
+        // There are some cases where this check could be relaxed (e.g.
+        // last_log_term might change, but as long as the older history is
+        // strictly a prefix of the new one that's fine), but there is no need
+        // to do so.
+        if bctx.term != term || bctx.last_log_term != last_log_term {
+            bail!("term(s) changed during snapshot: were term={}, last_log_term={}, now term={}, last_log_term={}",
+              bctx.term, bctx.last_log_term, term, last_log_term);
+        }
+        Ok(())
+    }
+}
+
+/// pull_timeline request body.
 #[derive(Debug, Serialize, Deserialize)]
 pub struct Request {
     pub tenant_id: TenantId,
@@ -47,7 +303,10 @@ pub struct DebugDumpResponse {
 }
 
 /// Find the most advanced safekeeper and pull timeline from it.
-pub async fn handle_request(request: Request) -> Result<Response> {
+pub async fn handle_request(
+    request: Request,
+    sk_auth_token: Option<SecretString>,
+) -> Result<Response> {
     let existing_tli = GlobalTimelines::get(TenantTimelineId::new(
         request.tenant_id,
         request.timeline_id,
@@ -56,28 +315,26 @@ pub async fn handle_request(request: Request) -> Result<Response> {
         bail!("Timeline {} already exists", request.timeline_id);
     }
 
-    let client = reqwest::Client::new();
     let http_hosts = request.http_hosts.clone();
 
-    // Send request to /v1/tenant/:tenant_id/timeline/:timeline_id
-    let responses = futures::future::join_all(http_hosts.iter().map(|url| {
-        let url = format!(
-            "{}/v1/tenant/{}/timeline/{}",
-            url, request.tenant_id, request.timeline_id
-        );
-        client.get(url).send()
-    }))
-    .await;
+    // Figure out statuses of potential donors.
+ let responses: Vec> = + futures::future::join_all(http_hosts.iter().map(|url| async { + let cclient = Client::new(url.clone(), sk_auth_token.clone()); + let info = cclient + .timeline_status(request.tenant_id, request.timeline_id) + .await?; + Ok(info) + })) + .await; let mut statuses = Vec::new(); for (i, response) in responses.into_iter().enumerate() { - let response = response.context(format!("Failed to get status from {}", http_hosts[i]))?; - let status: crate::http::routes::TimelineStatus = response.json().await?; + let status = response.context(format!("fetching status from {}", http_hosts[i]))?; statuses.push((status, i)); } // Find the most advanced safekeeper - // TODO: current logic may be wrong, fix it later let (status, i) = statuses .into_iter() .max_by_key(|(status, _)| { @@ -93,10 +350,14 @@ pub async fn handle_request(request: Request) -> Result { assert!(status.tenant_id == request.tenant_id); assert!(status.timeline_id == request.timeline_id); - pull_timeline(status, safekeeper_host).await + pull_timeline(status, safekeeper_host, sk_auth_token).await } -async fn pull_timeline(status: TimelineStatus, host: String) -> Result { +async fn pull_timeline( + status: TimelineStatus, + host: String, + sk_auth_token: Option, +) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); info!( "pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}", @@ -110,86 +371,53 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result let conf = &GlobalTimelines::get_global_config(); - let client = reqwest::Client::new(); - // TODO: don't use debug dump, it should be used only in tests. - // This is a proof of concept, we should figure out a way - // to use scp without implementing it manually. - - // Implementing our own scp over HTTP. - // At first, we need to fetch list of files from safekeeper. - let dump: DebugDumpResponse = client - .get(format!( - "{}/v1/debug_dump?dump_all=true&tenant_id={}&timeline_id={}", - host, status.tenant_id, status.timeline_id - )) - .send() - .await? - .json() - .await?; - - if dump.timelines.len() != 1 { - bail!( - "expected to fetch single timeline, got {} timelines", - dump.timelines.len() - ); - } - - let timeline = dump.timelines.into_iter().next().unwrap(); - let disk_content = timeline.disk_content.ok_or(anyhow::anyhow!( - "timeline {} doesn't have disk content", - ttid - ))?; - - let mut filenames = disk_content - .files - .iter() - .map(|file| file.name.clone()) - .collect::>(); - - // Sort filenames to make sure we pull files in correct order - // After sorting, we should have: - // - 000000010000000000000001 - // - ... - // - 000000010000000000000002.partial - // - safekeeper.control - filenames.sort(); - - // safekeeper.control should be the first file, so we need to move it to the beginning - let control_file_index = filenames - .iter() - .position(|name| name == "safekeeper.control") - .ok_or(anyhow::anyhow!("safekeeper.control not found"))?; - filenames.remove(control_file_index); - filenames.insert(0, "safekeeper.control".to_string()); - - info!( - "downloading {} files from safekeeper {}", - filenames.len(), - host - ); - let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; - // Note: some time happens between fetching list of files and fetching files themselves. - // It's possible that some files will be removed from safekeeper and we will fail to fetch them. - // This function will fail in this case, should be retried by the caller. 
- for filename in filenames { - let file_path = tli_dir_path.join(&filename); - // /v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename - let http_url = format!( - "{}/v1/tenant/{}/timeline/{}/file/{}", - host, status.tenant_id, status.timeline_id, filename - ); + let client = Client::new(host.clone(), sk_auth_token.clone()); + // Request stream with basebackup archive. + let bb_resp = client + .snapshot(status.tenant_id, status.timeline_id, conf.my_id) + .await?; - let mut file = tokio::fs::File::create(&file_path).await?; - let mut response = client.get(&http_url).send().await?; - while let Some(chunk) = response.chunk().await? { - file.write_all(&chunk).await?; - file.flush().await?; + // Make Stream of Bytes from it... + let bb_stream = bb_resp.bytes_stream().map_err(std::io::Error::other); + // and turn it into StreamReader implementing AsyncRead. + let bb_reader = tokio_util::io::StreamReader::new(bb_stream); + + // Extract it on the fly to the disk. We don't use simple unpack() to fsync + // files. + let mut entries = Archive::new(bb_reader).entries()?; + while let Some(base_tar_entry) = entries.next().await { + let mut entry = base_tar_entry?; + let header = entry.header(); + let file_path = header.path()?.into_owned(); + match header.entry_type() { + tokio_tar::EntryType::Regular => { + let utf8_file_path = + Utf8PathBuf::from_path_buf(file_path).expect("non-Unicode path"); + let dst_path = tli_dir_path.join(utf8_file_path); + let mut f = OpenOptions::new() + .create(true) + .truncate(true) + .write(true) + .open(&dst_path) + .await?; + tokio::io::copy(&mut entry, &mut f).await?; + // fsync the file + f.sync_all().await?; + } + _ => { + bail!( + "entry {} in backup tar archive is of unexpected type: {:?}", + file_path.display(), + header.entry_type() + ); + } } } - // TODO: fsync? + // fsync temp timeline directory to remember its contents. + fsync_async_opt(&tli_dir_path, !conf.no_sync).await?; // Let's create timeline from temp directory and verify that it's correct let (commit_lsn, flush_lsn) = validate_temp_timeline(conf, ttid, &tli_dir_path).await?; @@ -256,6 +484,7 @@ pub async fn validate_temp_timeline( } /// Move timeline from a temp directory to the main storage, and load it to the global map. +/// /// This operation is done under a lock to prevent bugs if several concurrent requests are /// trying to load the same timeline. Note that it doesn't guard against creating the /// timeline with the same ttid, but no one should be doing this anyway. @@ -273,14 +502,16 @@ pub async fn load_temp_timeline( } // Move timeline dir to the correct location - let timeline_path = conf.timeline_dir(&ttid); + let timeline_path = get_timeline_dir(conf, &ttid); info!( "moving timeline {} from {} to {}", ttid, tmp_path, timeline_path ); - tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?; - tokio::fs::rename(tmp_path, &timeline_path).await?; + tokio::fs::create_dir_all(get_tenant_dir(conf, &ttid.tenant_id)).await?; + // fsync tenant dir creation + fsync_async_opt(&conf.workdir, !conf.no_sync).await?; + durable_rename(tmp_path, &timeline_path, !conf.no_sync).await?; let tli = GlobalTimelines::load_timeline(&guard, ttid) .await diff --git a/safekeeper/src/rate_limit.rs b/safekeeper/src/rate_limit.rs new file mode 100644 index 0000000000..72373b5786 --- /dev/null +++ b/safekeeper/src/rate_limit.rs @@ -0,0 +1,49 @@ +use std::sync::Arc; + +use rand::Rng; + +use crate::metrics::MISC_OPERATION_SECONDS; + +/// Global rate limiter for background tasks. 
+#[derive(Clone)] +pub struct RateLimiter { + partial_backup: Arc, + eviction: Arc, +} + +impl RateLimiter { + /// Create a new rate limiter. + /// - `partial_backup_max`: maximum number of concurrent partial backups. + /// - `eviction_max`: maximum number of concurrent timeline evictions. + pub fn new(partial_backup_max: usize, eviction_max: usize) -> Self { + Self { + partial_backup: Arc::new(tokio::sync::Semaphore::new(partial_backup_max)), + eviction: Arc::new(tokio::sync::Semaphore::new(eviction_max)), + } + } + + /// Get a permit for partial backup. This will block if the maximum number of concurrent + /// partial backups is reached. + pub async fn acquire_partial_backup(&self) -> tokio::sync::OwnedSemaphorePermit { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["partial_permit_acquire"]) + .start_timer(); + self.partial_backup + .clone() + .acquire_owned() + .await + .expect("semaphore is closed") + } + + /// Try to get a permit for timeline eviction. This will return None if the maximum number of + /// concurrent timeline evictions is reached. + pub fn try_acquire_eviction(&self) -> Option { + self.eviction.clone().try_acquire_owned().ok() + } +} + +/// Generate a random duration that is a fraction of the given duration. +pub fn rand_duration(duration: &std::time::Duration) -> std::time::Duration { + let randf64 = rand::thread_rng().gen_range(0.0..1.0); + duration.mul_f64(randf64) +} diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 9ce9b049ba..e35f806e90 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -6,7 +6,7 @@ use crate::handler::SafekeeperPostgresHandler; use crate::safekeeper::AcceptorProposerMessage; use crate::safekeeper::ProposerAcceptorMessage; use crate::safekeeper::ServerInfo; -use crate::timeline::Timeline; +use crate::timeline::WalResidentTimeline; use crate::wal_service::ConnectionId; use crate::GlobalTimelines; use anyhow::{anyhow, Context}; @@ -36,11 +36,18 @@ use tokio::time::Instant; use tracing::*; use utils::id::TenantTimelineId; use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; + +const DEFAULT_FEEDBACK_CAPACITY: usize = 8; /// Registry of WalReceivers (compute connections). Timeline holds it (wrapped /// in Arc). pub struct WalReceivers { mutex: Mutex, + pageserver_feedback_tx: tokio::sync::broadcast::Sender, + + num_computes_tx: tokio::sync::watch::Sender, + num_computes_rx: tokio::sync::watch::Receiver, } /// Id under which walreceiver is registered in shmem. @@ -48,15 +55,24 @@ type WalReceiverId = usize; impl WalReceivers { pub fn new() -> Arc { + let (pageserver_feedback_tx, _) = + tokio::sync::broadcast::channel(DEFAULT_FEEDBACK_CAPACITY); + + let (num_computes_tx, num_computes_rx) = tokio::sync::watch::channel(0usize); + Arc::new(WalReceivers { mutex: Mutex::new(WalReceiversShared { slots: Vec::new() }), + pageserver_feedback_tx, + num_computes_tx, + num_computes_rx, }) } /// Register new walreceiver. Returned guard provides access to the slot and /// automatically deregisters in Drop. 
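A usage sketch for the `RateLimiter` from `rate_limit.rs` above; the surrounding task bodies are hypothetical, only the limiter calls come from this diff:

```rust
use std::time::Duration;
use safekeeper::rate_limit::{rand_duration, RateLimiter};

async fn partial_backup_tick(limiter: &RateLimiter) {
    // Blocks until one of the global partial-backup permits is free; the
    // permit is released when `_permit` drops at the end of the scope.
    let _permit = limiter.acquire_partial_backup().await;
    // ... upload one partial segment ...
}

async fn eviction_tick(limiter: &RateLimiter) {
    // Evictions use try_acquire: if all permits are busy, skip this round
    // rather than queue behind other timelines.
    if let Some(_permit) = limiter.try_acquire_eviction() {
        // ... evict this timeline ...
    }
    // Jitter the next wakeup so timelines don't run in lockstep.
    tokio::time::sleep(rand_duration(&Duration::from_secs(60))).await;
}
```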
pub fn register(self: &Arc, conn_id: Option) -> WalReceiverGuard { - let slots = &mut self.mutex.lock().slots; + let mut shared = self.mutex.lock(); + let slots = &mut shared.slots; let walreceiver = WalReceiverState { conn_id, status: WalReceiverStatus::Voting, @@ -70,6 +86,9 @@ impl WalReceivers { slots.push(Some(walreceiver)); pos }; + + self.update_num(&shared); + WalReceiverGuard { id: pos, walreceivers: self.clone(), @@ -91,7 +110,18 @@ impl WalReceivers { /// Get number of walreceivers (compute connections). pub fn get_num(self: &Arc) -> usize { - self.mutex.lock().slots.iter().flatten().count() + self.mutex.lock().get_num() + } + + /// Get channel for number of walreceivers. + pub fn get_num_rx(self: &Arc) -> tokio::sync::watch::Receiver { + self.num_computes_rx.clone() + } + + /// Should get called after every update of slots. + fn update_num(self: &Arc, shared: &MutexGuard) { + let num = shared.get_num(); + self.num_computes_tx.send_replace(num); } /// Get state of all walreceivers. @@ -115,6 +145,13 @@ impl WalReceivers { fn unregister(self: &Arc, id: WalReceiverId) { let mut shared = self.mutex.lock(); shared.slots[id] = None; + self.update_num(&shared); + } + + /// Broadcast pageserver feedback to connected walproposers. + pub fn broadcast_pageserver_feedback(&self, feedback: PageserverFeedback) { + // Err means there is no subscribers, it is fine. + let _ = self.pageserver_feedback_tx.send(feedback); } } @@ -123,6 +160,13 @@ struct WalReceiversShared { slots: Vec>, } +impl WalReceiversShared { + /// Get number of walreceivers (compute connections). + fn get_num(&self) -> usize { + self.slots.iter().flatten().count() + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct WalReceiverState { /// None means it is recovery initiated by us (this safekeeper). @@ -169,9 +213,19 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, ) -> Result<(), QueryError> { - if let Err(end) = self.handle_start_wal_push_guts(pgb).await { + let mut tli: Option = None; + if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await { // Log the result and probably send it to the client, closing the stream. - pgb.handle_copy_stream_end(end).await; + let handle_end_fut = pgb.handle_copy_stream_end(end); + // If we managed to create the timeline, augment logging with current LSNs etc. + if let Some(tli) = tli { + let info = tli.get_safekeeper_info(&self.conf).await; + handle_end_fut + .instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.commit_lsn))) + .await; + } else { + handle_end_fut.await; + } } Ok(()) } @@ -179,6 +233,7 @@ impl SafekeeperPostgresHandler { pub async fn handle_start_wal_push_guts( &mut self, pgb: &mut PostgresBackend, + tli: &mut Option, ) -> Result<(), CopyStreamHandlerEnd> { // Notify the libpq client that it's allowed to send `CopyData` messages pgb.write_message(&BeMessage::CopyBothResponse).await?; @@ -197,17 +252,32 @@ impl SafekeeperPostgresHandler { // sends, so this avoids deadlocks. let mut pgb_reader = pgb.split().context("START_WAL_PUSH split")?; let peer_addr = *pgb.get_peer_addr(); - let network_reader = NetworkReader { + let mut network_reader = NetworkReader { ttid: self.ttid, conn_id: self.conn_id, pgb_reader: &mut pgb_reader, peer_addr, acceptor_handle: &mut acceptor_handle, }; - let res = tokio::select! 
{ - // todo: add read|write .context to these errors - r = network_reader.run(msg_tx, msg_rx, reply_tx) => r, - r = network_write(pgb, reply_rx) => r, + + // Read first message and create timeline if needed. + let res = network_reader.read_first_message().await; + + let network_res = if let Ok((timeline, next_msg)) = res { + let pageserver_feedback_rx: tokio::sync::broadcast::Receiver = + timeline + .get_walreceivers() + .pageserver_feedback_tx + .subscribe(); + *tli = Some(timeline.wal_residence_guard().await?); + + tokio::select! { + // todo: add read|write .context to these errors + r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline, next_msg) => r, + r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r, + } + } else { + res.map(|_| ()) }; // Join pg backend back. @@ -219,13 +289,13 @@ impl SafekeeperPostgresHandler { match acceptor_handle { None => { // failed even before spawning; read_network should have error - Err(res.expect_err("no error with WalAcceptor not spawn")) + Err(network_res.expect_err("no error with WalAcceptor not spawn")) } Some(handle) => { let wal_acceptor_res = handle.await; // If there was any network error, return it. - res?; + network_res?; // Otherwise, WalAcceptor thread must have errored. match wal_acceptor_res { @@ -251,12 +321,9 @@ struct NetworkReader<'a, IO> { } impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { - async fn run( - self, - msg_tx: Sender, - msg_rx: Receiver, - reply_tx: Sender, - ) -> Result<(), CopyStreamHandlerEnd> { + async fn read_first_message( + &mut self, + ) -> Result<(WalResidentTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> { // Receive information about server to create timeline, if not yet. let next_msg = read_message(self.pgb_reader).await?; let tli = match next_msg { @@ -270,7 +337,10 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { system_id: greeting.system_id, wal_seg_size: greeting.wal_seg_size, }; - GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID).await? + let tli = + GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID) + .await?; + tli.wal_residence_guard().await? } _ => { return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!( @@ -278,9 +348,19 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { ))) } }; + Ok((tli, next_msg)) + } + async fn run( + self, + msg_tx: Sender, + msg_rx: Receiver, + reply_tx: Sender, + tli: WalResidentTimeline, + next_msg: ProposerAcceptorMessage, + ) -> Result<(), CopyStreamHandlerEnd> { *self.acceptor_handle = Some(WalAcceptor::spawn( - tli.clone(), + tli, msg_rx, reply_tx, Some(self.conn_id), @@ -320,18 +400,46 @@ async fn read_network_loop( async fn network_write( pgb_writer: &mut PostgresBackend, mut reply_rx: Receiver, + mut pageserver_feedback_rx: tokio::sync::broadcast::Receiver, ) -> Result<(), CopyStreamHandlerEnd> { let mut buf = BytesMut::with_capacity(128); + // storing append_response to inject PageserverFeedback into it + let mut last_append_response = None; + loop { - match reply_rx.recv().await { - Some(msg) => { - buf.clear(); - msg.serialize(&mut buf)?; - pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?; + // trying to read either AcceptorProposerMessage or PageserverFeedback + let msg = tokio::select! 
{ + reply = reply_rx.recv() => { + if let Some(msg) = reply { + if let AcceptorProposerMessage::AppendResponse(append_response) = &msg { + last_append_response = Some(append_response.clone()); + } + Some(msg) + } else { + return Ok(()); // chan closed, WalAcceptor terminated + } } - None => return Ok(()), // chan closed, WalAcceptor terminated - } + + feedback = pageserver_feedback_rx.recv() => + match (feedback, &last_append_response) { + (Ok(feedback), Some(append_response)) => { + // clone AppendResponse and inject PageserverFeedback into it + let mut append_response = append_response.clone(); + append_response.pageserver_feedback = Some(feedback); + Some(AcceptorProposerMessage::AppendResponse(append_response)) + } + _ => None, + } + }; + + let Some(msg) = msg else { + continue; + }; + + buf.clear(); + msg.serialize(&mut buf)?; + pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?; } } @@ -340,10 +448,12 @@ async fn network_write( const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1); /// Encapsulates a task which takes messages from msg_rx, processes and pushes -/// replies to reply_tx; reading from socket and writing to disk in parallel is -/// beneficial for performance, this struct provides writing to disk part. +/// replies to reply_tx. +/// +/// Reading from socket and writing to disk in parallel is beneficial for +/// performance, this struct provides the writing to disk part. pub struct WalAcceptor { - tli: Arc, + tli: WalResidentTimeline, msg_rx: Receiver, reply_tx: Sender, conn_id: Option, @@ -356,7 +466,7 @@ impl WalAcceptor { /// /// conn_id None means WalAcceptor is used by recovery initiated at this safekeeper. pub fn spawn( - tli: Arc, + tli: WalResidentTimeline, msg_rx: Receiver, reply_tx: Sender, conn_id: Option, @@ -381,14 +491,7 @@ impl WalAcceptor { /// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed; /// it must mean that network thread terminated. async fn run(&mut self) -> anyhow::Result<()> { - // Register the connection and defer unregister. - // Order of the next two lines is important: we want first to remove our entry and then - // update status which depends on registered connections. - let _compute_conn_guard = ComputeConnectionGuard { - timeline: Arc::clone(&self.tli), - }; let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id); - self.tli.update_status_notify().await?; // After this timestamp we will stop processing AppendRequests and send a response // to the walproposer. walproposer sends at least one AppendRequest per second, @@ -454,19 +557,3 @@ impl WalAcceptor { } } } - -/// Calls update_status_notify in drop to update timeline status. -struct ComputeConnectionGuard { - timeline: Arc, -} - -impl Drop for ComputeConnectionGuard { - fn drop(&mut self) { - let tli = self.timeline.clone(); - tokio::spawn(async move { - if let Err(e) = tli.update_status_notify().await { - error!("failed to update timeline status: {}", e); - } - }); - } -} diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index e8fa6c55f4..9c4149d8f1 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -2,7 +2,7 @@ //! provide it, i.e. safekeeper lags too much. 
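The pageserver-feedback fan-out added to `receive_wal.rs` above rests on `tokio::sync::broadcast` semantics: the sender never blocks on slow subscribers, and sending with no subscribers is an expected no-op. A standalone sketch of just that pattern, with plain `u64` payloads standing in for `PageserverFeedback`:

```rust
use tokio::sync::broadcast;

#[tokio::main]
async fn main() {
    // Capacity mirrors DEFAULT_FEEDBACK_CAPACITY; lagging subscribers lose
    // the oldest items instead of blocking the sender.
    let (tx, _) = broadcast::channel::<u64>(8);

    // With zero subscribers, send() returns Err; the diff deliberately
    // ignores that case.
    assert!(tx.send(1).is_err());

    let mut rx = tx.subscribe();
    tx.send(2).unwrap();
    // Subscribers only observe values sent after they subscribed.
    assert_eq!(rx.recv().await.unwrap(), 2);
}
```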
 use std::time::SystemTime;
-use std::{fmt, pin::pin, sync::Arc};
+use std::{fmt, pin::pin};
 
 use anyhow::{bail, Context};
 use futures::StreamExt;
@@ -21,6 +21,7 @@ use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config}
 
 use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
 use crate::safekeeper::{AppendRequest, AppendRequestHeader};
+use crate::timeline::WalResidentTimeline;
 use crate::{
     http::routes::TimelineStatus,
     receive_wal::MSG_QUEUE_SIZE,
@@ -28,31 +29,106 @@ use crate::{
         AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory,
         TermLsn, VoteRequest,
     },
-    timeline::{PeerInfo, Timeline},
+    timeline::PeerInfo,
     SafeKeeperConf,
 };
 
 /// Entrypoint for per timeline task which always runs, checking whether
 /// recovery for this safekeeper is needed and starting it if so.
-#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
-pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
+#[instrument(name = "recovery", skip_all, fields(ttid = %tli.ttid))]
+pub async fn recovery_main(tli: WalResidentTimeline, conf: SafeKeeperConf) {
     info!("started");
 
-    let mut cancellation_rx = match tli.get_cancellation_rx() {
-        Ok(rx) => rx,
-        Err(_) => {
-            info!("timeline canceled during task start");
-            return;
-        }
-    };
+    let cancel = tli.cancel.clone();
     select! {
         _ = recovery_main_loop(tli, conf) => { unreachable!() }
-        _ = cancellation_rx.changed() => {
+        _ = cancel.cancelled() => {
             info!("stopped");
         }
     }
 }
 
+/// Should we start fetching WAL from a peer safekeeper, and if yes, from
+/// which one? The answer is yes (i.e. .donors is not empty) if 1) there is
+/// something to fetch and we can do that without running elections; 2) there
+/// is no actively streaming compute, as we don't want to compete with it.
+///
+/// If donor(s) are chosen, their term is guaranteed to be equal to their
+/// last_log_term, so we are sure such a leader has actually been elected.
+///
+/// All possible donors are returned so that we can keep the connection to the
+/// current one if it is good, even if it slightly lags behind.
+///
+/// Note that the term conditions above might not be met while safekeepers are
+/// still not aligned on the last flush_lsn. Generally, in this case, until
+/// elections are run it is not possible to say which safekeeper should
+/// recover from which one -- the history which would be committed differs
+/// depending on the assembled quorum (e.g. the classic figure 8 from the Raft
+/// paper). Thus we don't try to predict it here.
+async fn recovery_needed(
+    tli: &WalResidentTimeline,
+    heartbeat_timeout: Duration,
+) -> RecoveryNeededInfo {
+    let ss = tli.read_shared_state().await;
+    let term = ss.sk.state().acceptor_state.term;
+    let last_log_term = ss.sk.last_log_term();
+    let flush_lsn = ss.sk.flush_lsn();
+    // note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us.
+    let mut peers = ss.get_peers(heartbeat_timeout);
+    // Sort by <last_log_term, lsn> pairs.
+    peers.sort_by(|p1, p2| {
+        let tl1 = TermLsn {
+            term: p1.last_log_term,
+            lsn: p1.flush_lsn,
+        };
+        let tl2 = TermLsn {
+            term: p2.last_log_term,
+            lsn: p2.flush_lsn,
+        };
+        tl2.cmp(&tl1) // desc
+    });
+    let num_streaming_computes = tli.get_walreceivers().get_num_streaming();
+    let donors = if num_streaming_computes > 0 {
+        vec![] // If there is a streaming compute, don't try to recover, to avoid interfering.
+    } else {
+        peers
+            .iter()
+            .filter_map(|candidate| {
+                // Are we interested in this candidate?
+                let candidate_tl = TermLsn {
+                    term: candidate.last_log_term,
+                    lsn: candidate.flush_lsn,
+                };
+                let my_tl = TermLsn {
+                    term: last_log_term,
+                    lsn: flush_lsn,
+                };
+                if my_tl < candidate_tl {
+                    // Yes, we are interested. Can we pull from it without
+                    // (re)running elections? It is possible if 1) its term
+                    // is equal to its last_log_term, so we can act on behalf
+                    // of the leader of this term (we must be sure it was
+                    // ever elected), and 2) our term is not higher, or we'll
+                    // refuse the data.
+                    if candidate.term == candidate.last_log_term && candidate.term >= term {
+                        Some(Donor::from(candidate))
+                    } else {
+                        None
+                    }
+                } else {
+                    None
+                }
+            })
+            .collect()
+    };
+    RecoveryNeededInfo {
+        term,
+        last_log_term,
+        flush_lsn,
+        peers,
+        num_streaming_computes,
+        donors,
+    }
+}

/// Result of Timeline::recovery_needed, contains donor(s) if recovery needed and
/// fields to explain the choice.
#[derive(Debug)]
@@ -119,17 +195,22 @@ impl From<&PeerInfo> for Donor {

const CHECK_INTERVAL_MS: u64 = 2000;

/// Check regularly whether we need to start recovery.
-async fn recovery_main_loop(tli: Arc<Timeline>, conf: SafeKeeperConf) {
+async fn recovery_main_loop(tli: WalResidentTimeline, conf: SafeKeeperConf) {
    let check_duration = Duration::from_millis(CHECK_INTERVAL_MS);
    loop {
-        let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
+        let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await;
        match recovery_needed_info.donors.first() {
            Some(donor) => {
                info!(
                    "starting recovery from donor {}: {}",
                    donor.sk_id, recovery_needed_info
                );
-                match recover(tli.clone(), donor, &conf).await {
+                let res = tli.wal_residence_guard().await;
+                if let Err(e) = res {
+                    warn!("failed to obtain guard: {}", e);
+                    continue;
+                }
+                match recover(res.unwrap(), donor, &conf).await {
                    // Note: 'write_wal rewrites WAL written before' error is
                    // expected here and might happen if compute and recovery
                    // concurrently write the same data. Eventually compute
@@ -152,7 +233,7 @@ async fn recovery_main_loop(tli: Arc<Timeline>, conf: SafeKeeperConf) {

/// Recover from the specified donor. Returns message explaining normal finish
/// reason or error.
async fn recover(
-    tli: Arc<Timeline>,
+    tli: WalResidentTimeline,
    donor: &Donor,
    conf: &SafeKeeperConf,
) -> anyhow::Result<String> {
@@ -238,7 +319,7 @@ async fn recover(

// Pull WAL from donor, assuming handshake is already done.
async fn recovery_stream(
-    tli: Arc<Timeline>,
+    tli: WalResidentTimeline,
    donor: &Donor,
    start_streaming_at: Lsn,
    conf: &SafeKeeperConf,
@@ -288,10 +369,10 @@ async fn recovery_stream(
    // As in normal walreceiver, do networking and writing to disk in parallel.
    let (msg_tx, msg_rx) = channel(MSG_QUEUE_SIZE);
    let (reply_tx, reply_rx) = channel(REPLY_QUEUE_SIZE);
-    let wa = WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, None);
+    let wa = WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, None);

    let res = tokio::select!
{
-        r = network_io(physical_stream, msg_tx, donor.clone(), tli.clone(), conf.clone()) => r,
+        r = network_io(physical_stream, msg_tx, donor.clone(), tli, conf.clone()) => r,
        r = read_replies(reply_rx, donor.term) => r.map(|()| None),
    };
@@ -322,7 +403,7 @@ async fn network_io(
    physical_stream: ReplicationStream,
    msg_tx: Sender<ProposerAcceptorMessage>,
    donor: Donor,
-    tli: Arc<Timeline>,
+    tli: WalResidentTimeline,
    conf: SafeKeeperConf,
) -> anyhow::Result<Option<String>> {
    let mut physical_stream = pin!(physical_stream);
@@ -343,7 +424,7 @@ async fn network_io(
            ReplicationMessage::XLogData(xlog_data) => {
                let ar_hdr = AppendRequestHeader {
                    term: donor.term,
-                    epoch_start_lsn: Lsn::INVALID, // unused
+                    term_start_lsn: Lsn::INVALID, // unused
                    begin_lsn: Lsn(xlog_data.wal_start()),
                    end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
                    commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it
@@ -371,7 +452,7 @@
            }
            ReplicationMessage::PrimaryKeepAlive(_) => {
                // keepalive means nothing is being streamed for a while. Check whether we need to stop.
-                let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
+                let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await;
                // do current donors still contain one we currently connected to?
                if !recovery_needed_info
                    .donors
diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs
index 9dce06a886..16239d847b 100644
--- a/safekeeper/src/remove_wal.rs
+++ b/safekeeper/src/remove_wal.rs
@@ -1,52 +1,25 @@
-//! Thread removing old WAL.
+use utils::lsn::Lsn;

-use std::time::Duration;
+use crate::timeline_manager::StateSnapshot;

-use tokio::time::sleep;
-use tracing::*;
+/// Get the oldest LSN we still need to keep. We hold WAL till it is consumed
+/// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3
+/// offloading.
+/// While it is safe to use inmem values for determining the horizon, we use
+/// persistent values to make the possible observed states less surprising.
+/// All segments covering LSNs before horizon_lsn can be removed.
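// Worked example (hypothetical values, not from this patch): with
// cfile_remote_consistent_lsn = 0/3000000, cfile_peer_horizon_lsn = 0/2800000
// and cfile_backup_lsn = 0/2000000, the horizon computed below is their
// minimum, 0/2000000; an extra_horizon_lsn of 0/1000000 (e.g. held for a
// snapshot) would clamp it further down. All segments entirely below the
// resulting horizon are then removable.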
+pub(crate) fn calc_horizon_lsn(state: &StateSnapshot, extra_horizon_lsn: Option) -> Lsn { + use std::cmp::min; -use crate::{GlobalTimelines, SafeKeeperConf}; - -const ALLOW_INACTIVE_TIMELINES: bool = true; - -pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { - let wal_removal_interval = Duration::from_millis(5000); - loop { - let now = tokio::time::Instant::now(); - let mut active_timelines = 0; - - let tlis = GlobalTimelines::get_all(); - for tli in &tlis { - let is_active = tli.is_active().await; - if is_active { - active_timelines += 1; - } - if !ALLOW_INACTIVE_TIMELINES && !is_active { - continue; - } - let ttid = tli.ttid; - async { - if let Err(e) = tli.maybe_persist_control_file().await { - warn!("failed to persist control file: {e}"); - } - if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled).await { - error!("failed to remove WAL: {}", e); - } - } - .instrument(info_span!("WAL removal", ttid = %ttid)) - .await; - } - - let elapsed = now.elapsed(); - let total_timelines = tlis.len(); - - if elapsed > wal_removal_interval { - info!( - "WAL removal is too long, processed {} active timelines ({} total) in {:?}", - active_timelines, total_timelines, elapsed - ); - } - - sleep(wal_removal_interval).await; + let mut horizon_lsn = min( + state.cfile_remote_consistent_lsn, + state.cfile_peer_horizon_lsn, + ); + // we don't want to remove WAL that is not yet offloaded to s3 + horizon_lsn = min(horizon_lsn, state.cfile_backup_lsn); + if let Some(extra_horizon_lsn) = extra_horizon_lsn { + horizon_lsn = min(horizon_lsn, extra_horizon_lsn); } + + horizon_lsn } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index d66db9b652..dbe0034de2 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -4,18 +4,18 @@ use anyhow::{bail, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use postgres_ffi::{TimeLineID, XLogSegNo, MAX_SEND_SIZE}; +use postgres_ffi::{TimeLineID, MAX_SEND_SIZE}; use serde::{Deserialize, Serialize}; use std::cmp::max; use std::cmp::min; use std::fmt; use std::io::Read; -use std::time::Duration; use storage_broker::proto::SafekeeperTimelineInfo; use tracing::*; use crate::control_file; +use crate::metrics::MISC_OPERATION_SECONDS; use crate::send_wal::HotStandbyFeedback; use crate::state::TimelineState; @@ -92,7 +92,7 @@ impl TermHistory { } /// Find point of divergence between leader (walproposer) term history and - /// safekeeper. Arguments are not symmetrics as proposer history ends at + /// safekeeper. Arguments are not symmetric as proposer history ends at /// +infinity while safekeeper at flush_lsn. /// C version is at walproposer SendProposerElected. pub fn find_highest_common_point( @@ -188,8 +188,8 @@ pub struct AcceptorState { } impl AcceptorState { - /// acceptor's epoch is the term of the highest entry in the log - pub fn get_epoch(&self, flush_lsn: Lsn) -> Term { + /// acceptor's last_log_term is the term of the highest entry in the log + pub fn get_last_log_term(&self, flush_lsn: Lsn) -> Term { let th = self.term_history.up_to(flush_lsn); match th.0.last() { Some(e) => e.term, @@ -305,9 +305,9 @@ pub struct AppendRequest { pub struct AppendRequestHeader { // safekeeper's current term; if it is higher than proposer's, the compute is out of date. pub term: Term, - // TODO: remove this field, it in unused -- LSN of term switch can be taken - // from ProposerElected (as well as from term history). 
-    pub epoch_start_lsn: Lsn,
+    // TODO: remove this field from the protocol, it is unused -- LSN of term
+    // switch can be taken from ProposerElected (as well as from term history).
+    pub term_start_lsn: Lsn,

    /// start position of message in WAL
    pub begin_lsn: Lsn,
    /// end position of message in WAL
@@ -321,20 +321,21 @@ pub struct AppendRequestHeader {
}

/// Report safekeeper state to proposer
-#[derive(Debug, Serialize)]
+#[derive(Debug, Serialize, Clone)]
pub struct AppendResponse {
    // Current term of the safekeeper; if it is higher than proposer's, the
    // compute is out of date.
    pub term: Term,
-    // NOTE: this is physical end of wal on safekeeper; currently it doesn't
-    // make much sense without taking epoch into account, as history can be
-    // diverged.
+    // Flushed end of WAL on safekeeper; one should always be mindful of which
+    // term history this value comes from, either by checking the history
+    // directly or by observing the term being set to one for which WAL
+    // truncation is known to have happened.
    pub flush_lsn: Lsn,
    // We report back our awareness about which WAL is committed, as this is
    // a criterion for walproposer --sync mode exit
    pub commit_lsn: Lsn,
    pub hs_feedback: HotStandbyFeedback,
-    pub pageserver_feedback: PageserverFeedback,
+    pub pageserver_feedback: Option<PageserverFeedback>,
}

impl AppendResponse {
@@ -344,7 +345,7 @@ impl AppendResponse {
            flush_lsn: Lsn(0),
            commit_lsn: Lsn(0),
            hs_feedback: HotStandbyFeedback::empty(),
-            pageserver_feedback: PageserverFeedback::empty(),
+            pageserver_feedback: None,
        }
    }
}
@@ -462,7 +463,11 @@ impl AcceptorProposerMessage {
                buf.put_u64_le(msg.hs_feedback.xmin);
                buf.put_u64_le(msg.hs_feedback.catalog_xmin);

-                msg.pageserver_feedback.serialize(buf);
+                // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback
+                // if it is not present.
+                if let Some(ref msg) = msg.pageserver_feedback {
+                    msg.serialize(buf);
+                }
            }
        }
    }
@@ -478,8 +483,8 @@ impl AcceptorProposerMessage {
///  - messages from broker peers
pub struct SafeKeeper<CTRL: control_file::Storage, WAL: wal_storage::Storage> {
    /// LSN since which the proposer this safekeeper currently talks to appends WAL;
-    /// determines epoch switch point.
-    pub epoch_start_lsn: Lsn,
+    /// determines last_log_term switch point.
+    pub term_start_lsn: Lsn,

    pub state: TimelineState<CTRL>, // persistent state storage
    pub wal_store: WAL,
@@ -495,7 +500,11 @@ where
    /// Accepts a control file storage containing the safekeeper state.
    /// State must be initialized, i.e. contain filled `tenant_id`, `timeline_id`
    /// and `server` (`wal_seg_size` inside it) fields.
-    pub fn new(state: CTRL, wal_store: WAL, node_id: NodeId) -> Result<SafeKeeper<CTRL, WAL>> {
+    pub fn new(
+        state: TimelineState<CTRL>,
+        wal_store: WAL,
+        node_id: NodeId,
+    ) -> Result<SafeKeeper<CTRL, WAL>> {
        if state.tenant_id == TenantId::from([0u8; 16])
            || state.timeline_id == TimelineId::from([0u8; 16])
        {
@@ -507,8 +516,8 @@ where
        }

        Ok(SafeKeeper {
-            epoch_start_lsn: Lsn(0),
-            state: TimelineState::new(state),
+            term_start_lsn: Lsn(0),
+            state,
            wal_store,
            node_id,
        })
@@ -522,13 +531,10 @@ where
            .up_to(self.flush_lsn())
    }

-    /// Get current term.
-    pub fn get_term(&self) -> Term {
-        self.state.acceptor_state.term
-    }
-
-    pub fn get_epoch(&self) -> Term {
-        self.state.acceptor_state.get_epoch(self.flush_lsn())
+    pub fn get_last_log_term(&self) -> Term {
+        self.state
+            .acceptor_state
+            .get_last_log_term(self.flush_lsn())
    }

    /// wal_store wrapper avoiding commit_lsn <= flush_lsn violation when we don't have WAL yet.
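// A minimal sketch (not part of this patch) of the wire pattern used above for
// the optional trailing field: the feedback is appended only when present, and
// a reader that stops at the end of the message keeps decoding old,
// feedback-less responses correctly. `Feedback` is a hypothetical stand-in for
// PageserverFeedback.
use bytes::{BufMut, BytesMut};

struct Feedback {
    last_received_lsn: u64,
}

impl Feedback {
    fn serialize(&self, buf: &mut BytesMut) {
        buf.put_u64_le(self.last_received_lsn);
    }
}

fn put_trailing_optional(buf: &mut BytesMut, feedback: &Option<Feedback>) {
    if let Some(fb) = feedback {
        fb.serialize(buf); // present: bytes appended at the end of the message
    }
    // absent: nothing is written; the reader detects this from the message length
}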
@@ -681,7 +687,7 @@ where
            commit_lsn: self.state.commit_lsn,
            // will be filled by the upper code to avoid bothering safekeeper
            hs_feedback: HotStandbyFeedback::empty(),
-            pageserver_feedback: PageserverFeedback::empty(),
+            pageserver_feedback: None,
        };
        trace!("formed AppendResponse {:?}", ar);
        ar
@@ -691,7 +697,17 @@ where
        &mut self,
        msg: &ProposerElected,
    ) -> Result<Option<AcceptorProposerMessage>> {
-        info!("received ProposerElected {:?}", msg);
+        let _timer = MISC_OPERATION_SECONDS
+            .with_label_values(&["handle_elected"])
+            .start_timer();
+
+        info!(
+            "received ProposerElected {:?}, term={}, last_log_term={}, flush_lsn={}",
+            msg,
+            self.state.acceptor_state.term,
+            self.get_last_log_term(),
+            self.flush_lsn()
+        );
        if self.state.acceptor_state.term < msg.term {
            let mut state = self.state.start_change();
            state.acceptor_state.term = msg.term;
@@ -703,26 +719,56 @@ where
            return Ok(None);
        }

-        // This might happen in a rare race when another (old) connection from
-        // the same walproposer writes + flushes WAL after this connection
-        // already sent flush_lsn in VoteRequest. It is generally safe to
-        // proceed, but to prevent commit_lsn surprisingly going down we should
-        // either refuse the session (simpler) or skip the part we already have
-        // from the stream (can be implemented).
-        if msg.term == self.get_epoch() && self.flush_lsn() > msg.start_streaming_at {
-            bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help",
-                   msg.term, self.flush_lsn(), msg.start_streaming_at)
+        // Before truncating WAL, cross-check the divergence point received
+        // from the walproposer.
+        let sk_th = self.get_term_history();
+        let last_common_point = match TermHistory::find_highest_common_point(
+            &msg.term_history,
+            &sk_th,
+            self.flush_lsn(),
+        ) {
+            // No common point. Expect streaming from the beginning of the
+            // history, like walproposer does while we don't have proper init.
+            None => *msg.term_history.0.first().ok_or(anyhow::anyhow!(
+                "empty walproposer term history {:?}",
+                msg.term_history
+            ))?,
+            Some(lcp) => lcp,
+        };
+        // This is expected to happen in a rare race when another connection
+        // from the same walproposer writes + flushes WAL after this connection
+        // sent flush_lsn in VoteRequest; for instance, on very late
+        // ProposerElected message delivery after another connection was
+        // established and wrote WAL. In such cases the error is transient:
+        // reconnection makes the safekeeper send the newest term history and
+        // flush_lsn, and walproposer recalculates the streaming point. OTOH,
+        // a repeating error indicates a serious bug.
+        if last_common_point.lsn != msg.start_streaming_at {
+            bail!("refusing ProposerElected with unexpected truncation point: lcp={:?} start_streaming_at={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}",
+                  last_common_point, msg.start_streaming_at,
+                  self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history,
+            );
        }
-        // Otherwise we must never attempt to truncate committed data.
+
+        // We are also expected to never attempt to truncate committed data.
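+        // (For example, with hypothetical values: if inmem.commit_lsn is 0/28,
+        // any acceptable start_streaming_at must be >= 0/28 -- bytes below
+        // commit_lsn may already be acknowledged to clients and must never be
+        // rewritten.)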
        assert!(
            msg.start_streaming_at >= self.state.inmem.commit_lsn,
-            "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}",
-            msg.start_streaming_at,
-            self.state.inmem.commit_lsn
+            "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}",
+            msg.start_streaming_at, self.state.inmem.commit_lsn,
+            self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history,
        );

-        // TODO: cross check divergence point, check if msg.start_streaming_at corresponds to
-        // intersection of our history and history from msg
+        // Before the first WAL write, initialize its segment. This makes the
+        // first segment pg_waldump'able because the stream from compute
+        // doesn't include segment and page headers.
+        //
+        // If we fail before the first WAL write is flushed, this action will
+        // be repeated; that's ok because it is idempotent.
+        if self.wal_store.flush_lsn() == Lsn::INVALID {
+            self.wal_store
+                .initialize_first_segment(msg.start_streaming_at)
+                .await?;
+        }

        // truncate wal, update the LSNs
        self.wal_store.truncate_wal(msg.start_streaming_at).await?;
@@ -762,6 +808,9 @@ where
            // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment.
            state.backup_lsn = max(state.backup_lsn, state.timeline_start_lsn);
+            // similar for remote_consistent_lsn
+            state.remote_consistent_lsn =
+                max(state.remote_consistent_lsn, state.timeline_start_lsn);

            state.acceptor_state.term_history = msg.term_history.clone();
            self.state.finish_change(&state).await?;
@@ -772,7 +821,7 @@ where
        // Cache LSN where term starts to immediately fsync control file with
        // commit_lsn once we reach it -- sync-safekeepers finishes when
        // persisted commit_lsn on majority of safekeepers aligns.
-        self.epoch_start_lsn = match msg.term_history.0.last() {
+        self.term_start_lsn = match msg.term_history.0.last() {
            None => bail!("proposer elected with empty term history"),
            Some(term_lsn_start) => term_lsn_start.lsn,
        };
@@ -798,35 +847,17 @@ where

        self.state.inmem.commit_lsn = commit_lsn;

-        // If new commit_lsn reached epoch switch, force sync of control
+        // If new commit_lsn reached term switch, force sync of control
        // file: walproposer in sync mode is very interested when this
        // happens. Note: this is for sync-safekeepers mode only, as
-        // otherwise commit_lsn might jump over epoch_start_lsn.
-        if commit_lsn >= self.epoch_start_lsn && self.state.commit_lsn < self.epoch_start_lsn {
+        // otherwise commit_lsn might jump over term_start_lsn.
+        if commit_lsn >= self.term_start_lsn && self.state.commit_lsn < self.term_start_lsn {
            self.state.flush().await?;
        }

        Ok(())
    }

-    /// Persist control file if there is something to save and enough time
-    /// passed after the last save.
-    pub async fn maybe_persist_inmem_control_file(&mut self) -> Result<()> {
-        const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
-        if self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
-            return Ok(());
-        }
-        let need_persist = self.state.inmem.commit_lsn > self.state.commit_lsn
-            || self.state.inmem.backup_lsn > self.state.backup_lsn
-            || self.state.inmem.peer_horizon_lsn > self.state.peer_horizon_lsn
-            || self.state.inmem.remote_consistent_lsn > self.state.remote_consistent_lsn;
-        if need_persist {
-            self.state.flush().await?;
-            trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
-        }
-        Ok(())
-    }
-
    /// Handle request to append WAL.
#[allow(clippy::comparison_chain)] async fn handle_append_request( @@ -844,6 +875,29 @@ where return Ok(Some(AcceptorProposerMessage::AppendResponse(resp))); } + // Disallow any non-sequential writes, which can result in gaps or + // overwrites. If we need to move the pointer, ProposerElected message + // should have truncated WAL first accordingly. Note that the first + // condition (WAL rewrite) is quite expected in real world; it happens + // when walproposer reconnects to safekeeper and writes some more data + // while first connection still gets some packets later. It might be + // better to not log this as error! above. + let write_lsn = self.wal_store.write_lsn(); + if write_lsn > msg.h.begin_lsn { + bail!( + "append request rewrites WAL written before, write_lsn={}, msg lsn={}", + write_lsn, + msg.h.begin_lsn + ); + } + if write_lsn < msg.h.begin_lsn && write_lsn != Lsn(0) { + bail!( + "append request creates gap in written WAL, write_lsn={}, msg lsn={}", + write_lsn, + msg.h.begin_lsn, + ); + } + // Now we know that we are in the same term as the proposer, // processing the message. @@ -909,71 +963,27 @@ where ))) } - /// Update timeline state with peer safekeeper data. + /// Update commit_lsn from peer safekeeper data. pub async fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> { - let mut sync_control_file = false; - if (Lsn(sk_info.commit_lsn) != Lsn::INVALID) && (sk_info.last_log_term != INVALID_TERM) { // Note: the check is too restrictive, generally we can update local // commit_lsn if our history matches (is part of) history of advanced // commit_lsn provider. - if sk_info.last_log_term == self.get_epoch() { + if sk_info.last_log_term == self.get_last_log_term() { self.update_commit_lsn(Lsn(sk_info.commit_lsn)).await?; } } - - self.state.inmem.backup_lsn = max(Lsn(sk_info.backup_lsn), self.state.inmem.backup_lsn); - sync_control_file |= self.state.backup_lsn + (self.state.server.wal_seg_size as u64) - < self.state.inmem.backup_lsn; - - self.state.inmem.remote_consistent_lsn = max( - Lsn(sk_info.remote_consistent_lsn), - self.state.inmem.remote_consistent_lsn, - ); - sync_control_file |= self.state.remote_consistent_lsn - + (self.state.server.wal_seg_size as u64) - < self.state.inmem.remote_consistent_lsn; - - self.state.inmem.peer_horizon_lsn = max( - Lsn(sk_info.peer_horizon_lsn), - self.state.inmem.peer_horizon_lsn, - ); - sync_control_file |= self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64) - < self.state.inmem.peer_horizon_lsn; - - if sync_control_file { - self.state.flush().await?; - } Ok(()) } - - /// Get oldest segno we still need to keep. We hold WAL till it is consumed - /// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3 - /// offloading. - /// While it is safe to use inmem values for determining horizon, - /// we use persistent to make possible normal states less surprising. 
- pub fn get_horizon_segno(&self, wal_backup_enabled: bool) -> XLogSegNo { - let mut horizon_lsn = min( - self.state.remote_consistent_lsn, - self.state.peer_horizon_lsn, - ); - if wal_backup_enabled { - horizon_lsn = min(horizon_lsn, self.state.backup_lsn); - } - horizon_lsn.segment_number(self.state.server.wal_seg_size as usize) - } } #[cfg(test)] mod tests { use futures::future::BoxFuture; - use postgres_ffi::WAL_SEGMENT_SIZE; + use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; use super::*; - use crate::{ - state::{PersistedPeers, TimelinePersistentState}, - wal_storage::Storage, - }; + use crate::state::{EvictionState, PersistedPeers, TimelinePersistentState}; use std::{ops::Deref, str::FromStr, time::Instant}; // fake storage for tests @@ -981,7 +991,6 @@ mod tests { persisted_state: TimelinePersistentState, } - #[async_trait::async_trait] impl control_file::Storage for InMemoryState { async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> { self.persisted_state = s.clone(); @@ -1013,12 +1022,19 @@ mod tests { lsn: Lsn, } - #[async_trait::async_trait] impl wal_storage::Storage for DummyWalStore { + fn write_lsn(&self) -> Lsn { + self.lsn + } + fn flush_lsn(&self) -> Lsn { self.lsn } + async fn initialize_first_segment(&mut self, _init_lsn: Lsn) -> Result<()> { + Ok(()) + } + async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { self.lsn = startpos + buf.len() as u64; Ok(()) @@ -1048,7 +1064,7 @@ mod tests { persisted_state: test_sk_state(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let mut sk = SafeKeeper::new(storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -1064,7 +1080,7 @@ mod tests { persisted_state: state, }; - sk = SafeKeeper::new(storage, sk.wal_store, NodeId(0)).unwrap(); + sk = SafeKeeper::new(TimelineState::new(storage), sk.wal_store, NodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request).await; @@ -1075,17 +1091,17 @@ mod tests { } #[tokio::test] - async fn test_epoch_switch() { + async fn test_last_log_term_switch() { let storage = InMemoryState { persisted_state: test_sk_state(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let mut sk = SafeKeeper::new(storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { - term: 1, - epoch_start_lsn: Lsn(3), + term: 2, + term_start_lsn: Lsn(3), begin_lsn: Lsn(1), end_lsn: Lsn(2), commit_lsn: Lsn(0), @@ -1098,38 +1114,94 @@ mod tests { }; let pem = ProposerElected { - term: 1, + term: 2, start_streaming_at: Lsn(1), - term_history: TermHistory(vec![TermLsn { - term: 1, - lsn: Lsn(3), - }]), - timeline_start_lsn: Lsn(0), + term_history: TermHistory(vec![ + TermLsn { + term: 1, + lsn: Lsn(1), + }, + TermLsn { + term: 2, + lsn: Lsn(3), + }, + ]), + timeline_start_lsn: Lsn(1), }; sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) .await .unwrap(); - // check that AppendRequest before epochStartLsn doesn't switch epoch - let resp = sk - .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) - .await; - assert!(resp.is_ok()); - assert_eq!(sk.get_epoch(), 0); + // check that AppendRequest before term_start_lsn doesn't switch last_log_term. 
+        sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
+            .await
+            .unwrap();
+        assert_eq!(sk.get_last_log_term(), 1);

-        // but record at epochStartLsn does the switch
+        // but record at term_start_lsn does the switch
        ar_hdr.begin_lsn = Lsn(2);
        ar_hdr.end_lsn = Lsn(3);
        append_request = AppendRequest {
            h: ar_hdr,
            wal_data: Bytes::from_static(b"b"),
        };
-        let resp = sk
-            .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
-            .await;
-        assert!(resp.is_ok());
-        sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %)
-        assert_eq!(sk.get_epoch(), 1);
+        sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
+            .await
+            .unwrap();
+        assert_eq!(sk.get_last_log_term(), 2);
+    }
+
+    #[tokio::test]
+    async fn test_non_consecutive_write() {
+        let storage = InMemoryState {
+            persisted_state: test_sk_state(),
+        };
+        let wal_store = DummyWalStore { lsn: Lsn(0) };
+
+        let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap();
+
+        let pem = ProposerElected {
+            term: 1,
+            start_streaming_at: Lsn(1),
+            term_history: TermHistory(vec![TermLsn {
+                term: 1,
+                lsn: Lsn(1),
+            }]),
+            timeline_start_lsn: Lsn(1),
+        };
+        sk.process_msg(&ProposerAcceptorMessage::Elected(pem))
+            .await
+            .unwrap();
+
+        let ar_hdr = AppendRequestHeader {
+            term: 1,
+            term_start_lsn: Lsn(3),
+            begin_lsn: Lsn(1),
+            end_lsn: Lsn(2),
+            commit_lsn: Lsn(0),
+            truncate_lsn: Lsn(0),
+            proposer_uuid: [0; 16],
+        };
+        let append_request = AppendRequest {
+            h: ar_hdr.clone(),
+            wal_data: Bytes::from_static(b"b"),
+        };
+
+        // do write ending at 2, it should be ok
+        sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
+            .await
+            .unwrap();
+        let mut ar_hdr2 = ar_hdr.clone();
+        ar_hdr2.begin_lsn = Lsn(4);
+        ar_hdr2.end_lsn = Lsn(5);
+        let append_request = AppendRequest {
+            h: ar_hdr2,
+            wal_data: Bytes::from_static(b"b"),
+        };
+        // and now starting at 4, it must fail
+        sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
+            .await
+            .unwrap_err();
    }

    #[test]
@@ -1233,6 +1305,8 @@ mod tests {
                    commit_lsn: Lsn(1234567600),
                },
            )]),
+            partial_backup: crate::wal_backup_partial::State::default(),
+            eviction_state: EvictionState::Present,
        };

        let ser = state.ser().unwrap();
@@ -1278,6 +1352,10 @@ mod tests {
            0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
            0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
+            // partial_backup
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            // eviction_state
+            0x00, 0x00, 0x00, 0x00,
        ];

        assert_eq!(Hex(&ser), Hex(&expected));

diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index ee3e4c8ead..6d677f405a 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -2,8 +2,10 @@
//! with the "START_REPLICATION" message, and registry of walsenders.
use crate::handler::SafekeeperPostgresHandler; +use crate::metrics::RECEIVED_PS_FEEDBACKS; +use crate::receive_wal::WalReceivers; use crate::safekeeper::{Term, TermLsn}; -use crate::timeline::Timeline; +use crate::timeline::WalResidentTimeline; use crate::wal_service::ConnectionId; use crate::wal_storage::WalReader; use crate::GlobalTimelines; @@ -83,19 +85,30 @@ impl StandbyReply { #[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub struct StandbyFeedback { - reply: StandbyReply, - hs_feedback: HotStandbyFeedback, + pub reply: StandbyReply, + pub hs_feedback: HotStandbyFeedback, +} + +impl StandbyFeedback { + pub fn empty() -> Self { + StandbyFeedback { + reply: StandbyReply::empty(), + hs_feedback: HotStandbyFeedback::empty(), + } + } } /// WalSenders registry. Timeline holds it (wrapped in Arc). pub struct WalSenders { mutex: Mutex, + walreceivers: Arc, } impl WalSenders { - pub fn new() -> Arc { + pub fn new(walreceivers: Arc) -> Arc { Arc::new(WalSenders { mutex: Mutex::new(WalSendersShared::new()), + walreceivers, }) } @@ -136,28 +149,54 @@ impl WalSenders { self.mutex.lock().slots.iter().flatten().cloned().collect() } - /// Get aggregated pageserver feedback. - pub fn get_ps_feedback(self: &Arc) -> PageserverFeedback { - self.mutex.lock().agg_ps_feedback + /// Get LSN of the most lagging pageserver receiver. Return None if there are no + /// active walsenders. + pub fn laggard_lsn(self: &Arc) -> Option { + self.mutex + .lock() + .slots + .iter() + .flatten() + .filter_map(|s| match s.feedback { + ReplicationFeedback::Pageserver(feedback) => Some(feedback.last_received_lsn), + ReplicationFeedback::Standby(_) => None, + }) + .min() } - /// Get aggregated pageserver and hot standby feedback (we send them to compute). - pub fn get_feedbacks(self: &Arc) -> (PageserverFeedback, HotStandbyFeedback) { + /// Returns total counter of pageserver feedbacks received and last feedback. + pub fn get_ps_feedback_stats(self: &Arc) -> (u64, PageserverFeedback) { let shared = self.mutex.lock(); - (shared.agg_ps_feedback, shared.agg_hs_feedback) + (shared.ps_feedback_counter, shared.last_ps_feedback) + } + + /// Get aggregated hot standby feedback (we send it to compute). + pub fn get_hotstandby(self: &Arc) -> StandbyFeedback { + self.mutex.lock().agg_standby_feedback } /// Record new pageserver feedback, update aggregated values. fn record_ps_feedback(self: &Arc, id: WalSenderId, feedback: &PageserverFeedback) { let mut shared = self.mutex.lock(); shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback); - shared.update_ps_feedback(); + shared.last_ps_feedback = *feedback; + shared.ps_feedback_counter += 1; + drop(shared); + + RECEIVED_PS_FEEDBACKS.inc(); + + // send feedback to connected walproposers + self.walreceivers.broadcast_pageserver_feedback(*feedback); } /// Record standby reply. fn record_standby_reply(self: &Arc, id: WalSenderId, reply: &StandbyReply) { let mut shared = self.mutex.lock(); let slot = shared.get_slot_mut(id); + debug!( + "Record standby reply: ts={} apply_lsn={}", + reply.reply_ts, reply.apply_lsn + ); match &mut slot.feedback { ReplicationFeedback::Standby(sf) => sf.reply = *reply, ReplicationFeedback::Pageserver(_) => { @@ -182,7 +221,7 @@ impl WalSenders { }) } } - shared.update_hs_feedback(); + shared.update_reply_feedback(); } /// Get remote_consistent_lsn reported by the pageserver. 
Returns None if
@@ -200,23 +239,26 @@ impl WalSenders {
    fn unregister(self: &Arc<Self>, id: WalSenderId) {
        let mut shared = self.mutex.lock();
        shared.slots[id] = None;
-        shared.update_hs_feedback();
+        shared.update_reply_feedback();
    }
}

struct WalSendersShared {
    // aggregated over all walsenders value
-    agg_hs_feedback: HotStandbyFeedback,
-    // aggregated over all walsenders value
-    agg_ps_feedback: PageserverFeedback,
+    agg_standby_feedback: StandbyFeedback,
+    // last feedback ever received from any pageserver, empty if none
+    last_ps_feedback: PageserverFeedback,
+    // total counter of pageserver feedbacks received
+    ps_feedback_counter: u64,
    slots: Vec<Option<WalSenderState>>,
}

impl WalSendersShared {
    fn new() -> Self {
        WalSendersShared {
-            agg_hs_feedback: HotStandbyFeedback::empty(),
-            agg_ps_feedback: PageserverFeedback::empty(),
+            agg_standby_feedback: StandbyFeedback::empty(),
+            last_ps_feedback: PageserverFeedback::empty(),
+            ps_feedback_counter: 0,
            slots: Vec::new(),
        }
    }
@@ -231,10 +273,11 @@ impl WalSendersShared {
        self.slots[id].as_mut().expect("walsender doesn't exist")
    }

-    /// Update aggregated hot standby feedback. We just take the min of valid
-    /// xmins and ts.
-    fn update_hs_feedback(&mut self) {
+    /// Update aggregated hot standby and normal reply feedbacks. We take the
+    /// min of valid xmins and the max of ts.
+    fn update_reply_feedback(&mut self) {
        let mut agg = HotStandbyFeedback::empty();
+        let mut reply_agg = StandbyReply::empty();
        for ws_state in self.slots.iter().flatten() {
            if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback {
                let hs_feedback = standby_feedback.hs_feedback;
@@ -247,7 +290,7 @@ impl WalSendersShared {
                    } else {
                        agg.xmin = hs_feedback.xmin;
                    }
-                    agg.ts = min(agg.ts, hs_feedback.ts);
+                    agg.ts = max(agg.ts, hs_feedback.ts);
                }
                if hs_feedback.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
                    if agg.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
@@ -255,42 +298,43 @@ impl WalSendersShared {
                    } else {
                        agg.catalog_xmin = hs_feedback.catalog_xmin;
                    }
-                    agg.ts = min(agg.ts, hs_feedback.ts);
+                    agg.ts = max(agg.ts, hs_feedback.ts);
+                }
+                let reply = standby_feedback.reply;
+                if reply.write_lsn != Lsn::INVALID {
+                    if reply_agg.write_lsn != Lsn::INVALID {
+                        reply_agg.write_lsn = Lsn::min(reply_agg.write_lsn, reply.write_lsn);
+                    } else {
+                        reply_agg.write_lsn = reply.write_lsn;
+                    }
+                }
+                if reply.flush_lsn != Lsn::INVALID {
+                    if reply_agg.flush_lsn != Lsn::INVALID {
+                        reply_agg.flush_lsn = Lsn::min(reply_agg.flush_lsn, reply.flush_lsn);
+                    } else {
+                        reply_agg.flush_lsn = reply.flush_lsn;
+                    }
+                }
+                if reply.apply_lsn != Lsn::INVALID {
+                    if reply_agg.apply_lsn != Lsn::INVALID {
+                        reply_agg.apply_lsn = Lsn::min(reply_agg.apply_lsn, reply.apply_lsn);
+                    } else {
+                        reply_agg.apply_lsn = reply.apply_lsn;
+                    }
+                }
+                if reply.reply_ts != 0 {
+                    if reply_agg.reply_ts != 0 {
+                        reply_agg.reply_ts = TimestampTz::min(reply_agg.reply_ts, reply.reply_ts);
+                    } else {
+                        reply_agg.reply_ts = reply.reply_ts;
+                    }
                }
            }
        }
-        self.agg_hs_feedback = agg;
-    }
-
-    /// Update aggregated pageserver feedback. LSNs (last_received,
-    /// disk_consistent, remote_consistent) and reply timestamp are just
-    /// maximized; timeline_size is taken from the feedback with the highest
-    /// last_received lsn. This is generally reasonable, but we might want to
-    /// implement other policies once multiple pageservers start to be actively
-    /// used.
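// (Sketch of the aggregation rule above, with hypothetical values: each reply
// LSN field takes the minimum over senders that reported a valid value, so
// apply_lsns of 0/40, INVALID and 0/30 aggregate to 0/30 -- the slowest live
// standby -- while INVALID entries are ignored rather than dragging the
// minimum to zero. Valid reply_ts values are likewise minimized, whereas the
// hot-standby feedback ts is maximized, i.e. the freshest report wins.)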
-    fn update_ps_feedback(&mut self) {
-        let init = PageserverFeedback::empty();
-        let acc =
-            self.slots
-                .iter()
-                .flatten()
-                .fold(init, |mut acc, ws_state| match ws_state.feedback {
-                    ReplicationFeedback::Pageserver(feedback) => {
-                        if feedback.last_received_lsn > acc.last_received_lsn {
-                            acc.current_timeline_size = feedback.current_timeline_size;
-                        }
-                        acc.last_received_lsn =
-                            max(feedback.last_received_lsn, acc.last_received_lsn);
-                        acc.disk_consistent_lsn =
-                            max(feedback.disk_consistent_lsn, acc.disk_consistent_lsn);
-                        acc.remote_consistent_lsn =
-                            max(feedback.remote_consistent_lsn, acc.remote_consistent_lsn);
-                        acc.replytime = max(feedback.replytime, acc.replytime);
-                        acc
-                    }
-                    ReplicationFeedback::Standby(_) => acc,
-                });
-        self.agg_ps_feedback = acc;
+        self.agg_standby_feedback = StandbyFeedback {
+            reply: reply_agg,
+            hs_feedback: agg,
+        };
    }
}

@@ -342,12 +386,18 @@ impl SafekeeperPostgresHandler {
        start_pos: Lsn,
        term: Option<Term>,
    ) -> Result<(), QueryError> {
+        let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
+        let residence_guard = tli.wal_residence_guard().await?;
+
        if let Err(end) = self
-            .handle_start_replication_guts(pgb, start_pos, term)
+            .handle_start_replication_guts(pgb, start_pos, term, residence_guard)
            .await
        {
+            let info = tli.get_safekeeper_info(&self.conf).await;
            // Log the result and probably send it to the client, closing the stream.
-            pgb.handle_copy_stream_end(end).await;
+            pgb.handle_copy_stream_end(end)
+                .instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.commit_lsn)))
+                .await;
        }
        Ok(())
    }
@@ -357,10 +407,9 @@ impl SafekeeperPostgresHandler {
        pgb: &mut PostgresBackend<IO>,
        start_pos: Lsn,
        term: Option<Term>,
+        tli: WalResidentTimeline,
    ) -> Result<(), CopyStreamHandlerEnd> {
        let appname = self.appname.clone();
-        let tli =
-            GlobalTimelines::get(self.ttid).map_err(|e| CopyStreamHandlerEnd::Other(e.into()))?;

        // Use a guard object to remove our entry from the timeline when we are done.
        let ws_guard = Arc::new(tli.get_walsenders().register(
@@ -401,14 +450,7 @@ impl SafekeeperPostgresHandler {
        // switch to copy
        pgb.write_message(&BeMessage::CopyBothResponse).await?;

-        let (_, persisted_state) = tli.get_state().await;
-        let wal_reader = WalReader::new(
-            self.conf.workdir.clone(),
-            self.conf.timeline_dir(&tli.ttid),
-            &persisted_state,
-            start_pos,
-            self.conf.is_wal_backup_enabled(),
-        )?;
+        let wal_reader = tli.get_walreader(start_pos).await?;

        // Split to concurrently receive and send data; replies are generally
        // not synchronized with sends, so this avoids deadlocks.
@@ -416,7 +458,8 @@ impl SafekeeperPostgresHandler {

        let mut sender = WalSender {
            pgb,
-            tli: tli.clone(),
+            // should succeed since we're already holding another guard
+            tli: tli.wal_residence_guard().await?,
            appname,
            start_pos,
            end_pos,
@@ -428,7 +471,7 @@ impl SafekeeperPostgresHandler {
        };
        let mut reply_reader = ReplyReader {
            reader,
-            ws_guard,
+            ws_guard: ws_guard.clone(),
            tli,
        };

@@ -437,6 +480,18 @@ impl SafekeeperPostgresHandler {
            r = sender.run() => r,
            r = reply_reader.run() => r,
        };
+
+        let ws_state = ws_guard
+            .walsenders
+            .mutex
+            .lock()
+            .get_slot(ws_guard.id)
+            .clone();
+        info!(
+            "finished streaming to {}, feedback={:?}",
+            ws_state.addr, ws_state.feedback,
+        );
+
        // Join pg backend back.
        pgb.unsplit(reply_reader.reader)?;

@@ -473,7 +528,7 @@ impl EndWatch {
/// The half driving WAL sending.
struct WalSender<'a, IO> {
    pgb: &'a mut PostgresBackend<IO>,
-    tli: Arc<Timeline>,
+    tli: WalResidentTimeline,
    appname: Option<String>,
    // Position since which we are sending next chunk.
    start_pos: Lsn,
@@ -496,6 +551,8 @@ struct WalSender<'a, IO> {
    send_buf: [u8; MAX_SEND_SIZE],
}

+const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
+
impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
    /// Send WAL until
    /// - an error occurs
@@ -574,14 +631,22 @@ impl WalSender<'_, IO> {
    async fn wait_wal(&mut self) -> Result<(), CopyStreamHandlerEnd> {
        loop {
            self.end_pos = self.end_watch.get();
-            if self.end_pos > self.start_pos {
-                // We have something to send.
+            let have_something_to_send = (|| {
+                fail::fail_point!(
+                    "sk-pause-send",
+                    self.appname.as_deref() != Some("pageserver"),
+                    |_| { false }
+                );
+                self.end_pos > self.start_pos
+            })();
+
+            if have_something_to_send {
                trace!("got end_pos {:?}, streaming", self.end_pos);
                return Ok(());
            }

            // Wait for WAL to appear, now self.end_pos == self.start_pos.
-            if let Some(lsn) = wait_for_lsn(&mut self.end_watch, self.term, self.start_pos).await? {
+            if let Some(lsn) = self.wait_for_lsn().await? {
                self.end_pos = lsn;
                trace!("got end_pos {:?}, streaming", self.end_pos);
                return Ok(());
@@ -618,13 +683,61 @@ impl WalSender<'_, IO> {
                .await?;
        }
    }
+
+    /// Wait until we have available WAL > start_pos or the timeout expires. Returns
+    /// - Ok(Some(end_pos)) if the needed lsn is successfully observed;
+    /// - Ok(None) if the timeout expired;
+    /// - Err in case of error -- only if 1) the term changed while fetching in
+    ///   recovery mode 2) the watch channel closed, which must never happen.
+    async fn wait_for_lsn(&mut self) -> anyhow::Result<Option<Lsn>> {
+        let fp = (|| {
+            fail::fail_point!(
+                "sk-pause-send",
+                self.appname.as_deref() != Some("pageserver"),
+                |_| { true }
+            );
+            false
+        })();
+        if fp {
+            tokio::time::sleep(POLL_STATE_TIMEOUT).await;
+            return Ok(None);
+        }
+
+        let res = timeout(POLL_STATE_TIMEOUT, async move {
+            loop {
+                let end_pos = self.end_watch.get();
+                if end_pos > self.start_pos {
+                    return Ok(end_pos);
+                }
+                if let EndWatch::Flush(rx) = &self.end_watch {
+                    let curr_term = rx.borrow().term;
+                    if let Some(client_term) = self.term {
+                        if curr_term != client_term {
+                            bail!("term changed: requested {}, now {}", client_term, curr_term);
+                        }
+                    }
+                }
+                self.end_watch.changed().await?;
+            }
+        })
+        .await;

+        match res {
+            // success
+            Ok(Ok(commit_lsn)) => Ok(Some(commit_lsn)),
+            // error inside closure
+            Ok(Err(err)) => Err(err),
+            // timeout
+            Err(_) => Ok(None),
+        }
+    }
}

/// The half driving reply receiving.
struct ReplyReader<IO> {
    reader: PostgresBackendReader<IO>,
    ws_guard: Arc<WalSenderGuard>,
-    tli: Arc<Timeline>,
+    tli: WalResidentTimeline,
}

impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
@@ -639,8 +752,14 @@
        match msg.first().cloned() {
            Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => {
                // Note: deserializing is on m[1..] because we skip the tag byte.
-                let hs_feedback = HotStandbyFeedback::des(&msg[1..])
+                let mut hs_feedback = HotStandbyFeedback::des(&msg[1..])
                    .context("failed to deserialize HotStandbyFeedback")?;
+                // TODO: xmin/catalog_xmin are serialized by walreceiver.c in this way:
+                //   pq_sendint32(&reply_message, xmin);
+                //   pq_sendint32(&reply_message, xmin_epoch);
+                // So it is two big-endian 32-bit words in little-endian word order!
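+                // (Hypothetical example, assuming the u64 is decoded
+                // big-endian as the TODO above implies: xmin = 5 in epoch 1
+                // arrives as 0x00000005_00000001; rotate_left(32) swaps the
+                // two 32-bit words, yielding 0x00000001_00000005, i.e. a
+                // FullTransactionId with epoch 1 in the high word and xmin 5
+                // in the low word.)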
+ hs_feedback.xmin = hs_feedback.xmin.rotate_left(32); + hs_feedback.catalog_xmin = hs_feedback.catalog_xmin.rotate_left(32); self.ws_guard .walsenders .record_hs_feedback(self.ws_guard.id, &hs_feedback); @@ -675,50 +794,8 @@ impl ReplyReader { } } -const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); - -/// Wait until we have available WAL > start_pos or timeout expires. Returns -/// - Ok(Some(end_pos)) if needed lsn is successfully observed; -/// - Ok(None) if timeout expired; -/// - Err in case of error -- only if 1) term changed while fetching in recovery -/// mode 2) watch channel closed, which must never happen. -async fn wait_for_lsn( - rx: &mut EndWatch, - client_term: Option, - start_pos: Lsn, -) -> anyhow::Result> { - let res = timeout(POLL_STATE_TIMEOUT, async move { - loop { - let end_pos = rx.get(); - if end_pos > start_pos { - return Ok(end_pos); - } - if let EndWatch::Flush(rx) = rx { - let curr_term = rx.borrow().term; - if let Some(client_term) = client_term { - if curr_term != client_term { - bail!("term changed: requested {}, now {}", client_term, curr_term); - } - } - } - rx.changed().await?; - } - }) - .await; - - match res { - // success - Ok(Ok(commit_lsn)) => Ok(Some(commit_lsn)), - // error inside closure - Ok(Err(err)) => Err(err), - // timeout - Err(_) => Ok(None), - } -} - #[cfg(test)] mod tests { - use postgres_protocol::PG_EPOCH; use utils::id::{TenantId, TimelineId}; use super::*; @@ -764,8 +841,11 @@ mod tests { fn test_hs_feedback_no_valid() { let mut wss = WalSendersShared::new(); push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID)); - wss.update_hs_feedback(); - assert_eq!(wss.agg_hs_feedback.xmin, INVALID_FULL_TRANSACTION_ID); + wss.update_reply_feedback(); + assert_eq!( + wss.agg_standby_feedback.hs_feedback.xmin, + INVALID_FULL_TRANSACTION_ID + ); } #[test] @@ -774,30 +854,7 @@ mod tests { push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID)); push_feedback(&mut wss, hs_feedback(1, 42)); push_feedback(&mut wss, hs_feedback(1, 64)); - wss.update_hs_feedback(); - assert_eq!(wss.agg_hs_feedback.xmin, 42); - } - - // form pageserver feedback with given last_record_lsn / tli size and the - // rest set to dummy values. - fn ps_feedback(current_timeline_size: u64, last_received_lsn: Lsn) -> ReplicationFeedback { - ReplicationFeedback::Pageserver(PageserverFeedback { - current_timeline_size, - last_received_lsn, - disk_consistent_lsn: Lsn::INVALID, - remote_consistent_lsn: Lsn::INVALID, - replytime: *PG_EPOCH, - }) - } - - // test that ps aggregation works as expected - #[test] - fn test_ps_feedback() { - let mut wss = WalSendersShared::new(); - push_feedback(&mut wss, ps_feedback(8, Lsn(42))); - push_feedback(&mut wss, ps_feedback(4, Lsn(84))); - wss.update_ps_feedback(); - assert_eq!(wss.agg_ps_feedback.current_timeline_size, 4); - assert_eq!(wss.agg_ps_feedback.last_received_lsn, Lsn(84)); + wss.update_reply_feedback(); + assert_eq!(wss.agg_standby_feedback.hs_feedback.xmin, 42); } } diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 82f7954051..97eeae3638 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -13,6 +13,7 @@ use utils::{ use crate::{ control_file, safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory}, + wal_backup_partial::{self}, }; /// Persistent information stored on safekeeper node about timeline. @@ -54,16 +55,34 @@ pub struct TimelinePersistentState { /// pushed to s3. We don't remove WAL beyond it. 
Persisted only for
/// informational purposes; we receive it from pageserver (or broker).
    pub remote_consistent_lsn: Lsn,
-    // Peers and their state as we remember it. Knowing peers themselves is
-    // fundamental; but state is saved here only for informational purposes and
-    // obviously can be stale. (Currently not saved at all, but let's provision
-    // place to have less file version upgrades).
+    /// Peers and their state as we remember it. Knowing peers themselves is
+    /// fundamental; but state is saved here only for informational purposes and
+    /// obviously can be stale. (Currently not saved at all, but let's provision
+    /// a place to have fewer file version upgrades.)
    pub peers: PersistedPeers,
+    /// Holds names of partial segments uploaded to remote storage. Used to
+    /// clean up old objects without leaving garbage in remote storage.
+    pub partial_backup: wal_backup_partial::State,
+    /// Eviction state of the timeline. If it's Offloaded, we should download
+    /// WAL files from remote storage to serve the timeline.
+    pub eviction_state: EvictionState,
}

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>);

+/// State of the local WAL files. Tracks the current timeline state, which is
+/// either: all WAL files are present on disk, or the last partial segment has
+/// been offloaded to remote storage.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
+pub enum EvictionState {
+    /// WAL files are present on disk.
+    Present,
+    /// Last partial segment is offloaded to remote storage.
+    /// Contains flush_lsn of the last offloaded segment.
+    Offloaded(Lsn),
+}
+
impl TimelinePersistentState {
    pub fn new(
        ttid: &TenantTimelineId,
@@ -93,6 +112,8 @@ impl TimelinePersistentState {
                    .map(|p| (*p, PersistedPeerInfo::new()))
                    .collect(),
            ),
+            partial_backup: wal_backup_partial::State::default(),
+            eviction_state: EvictionState::Present,
        }
    }
@@ -126,9 +147,11 @@ pub struct TimelineMemState {
    pub proposer_uuid: PgUuid,
}

-/// Safekeeper persistent state plus in memory layer, to avoid frequent fsyncs
-/// when we update fields like commit_lsn which don't need immediate
-/// persistence. Provides transactional like API to atomically update the state.
+/// Safekeeper persistent state plus in memory layer.
+///
+/// Allows us to avoid frequent fsyncs when we update fields like commit_lsn
+/// which don't need immediate persistence. Provides a transaction-like API
+/// to atomically update the state.
///
/// Implements Deref into *persistent* part.
pub struct TimelineState<CTRL: control_file::Storage> {
@@ -168,7 +191,12 @@ where
    /// Persist given state. cf. start_change.
pub async fn finish_change(&mut self, s: &TimelinePersistentState) -> Result<()> { - self.pers.persist(s).await?; + if s.eq(&*self.pers) { + // nothing to do if state didn't change + } else { + self.pers.persist(s).await?; + } + // keep in memory values up to date self.inmem.commit_lsn = s.commit_lsn; self.inmem.backup_lsn = s.backup_lsn; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index ec7dd7d89b..6fd5de0ad6 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -3,18 +3,19 @@ use anyhow::{anyhow, bail, Result}; use camino::Utf8PathBuf; -use postgres_ffi::XLogSegNo; +use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; -use tokio::fs; +use tokio::fs::{self}; +use tokio_util::sync::CancellationToken; +use utils::id::TenantId; use std::cmp::max; +use std::ops::{Deref, DerefMut}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::Arc; use std::time::Duration; -use tokio::sync::{Mutex, MutexGuard}; -use tokio::{ - sync::{mpsc::Sender, watch}, - time::Instant, -}; +use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; +use tokio::{sync::watch, time::Instant}; use tracing::*; use utils::http::error::ApiError; use utils::{ @@ -25,20 +26,24 @@ use utils::{ use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; +use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; -use crate::recovery::{recovery_main, Donor, RecoveryNeededInfo}; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn, INVALID_TERM, }; use crate::send_wal::WalSenders; -use crate::state::{TimelineMemState, TimelinePersistentState}; -use crate::wal_backup::{self}; +use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, TimelineState}; +use crate::timeline_guard::ResidenceGuard; +use crate::timeline_manager::{AtomicStatus, ManagerCtl}; +use crate::timelines_set::TimelinesSet; +use crate::wal_backup::{self, remote_timeline_path}; +use crate::wal_backup_partial::PartialRemoteSegment; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; -use crate::metrics::FullTimelineInfo; -use crate::wal_storage::Storage as wal_storage_iface; -use crate::{debug_dump, wal_storage}; +use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS}; +use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; +use crate::{debug_dump, timeline_manager, wal_storage}; use crate::{GlobalTimelines, SafeKeeperConf}; /// Things safekeeper should know about timeline state on peers. @@ -51,8 +56,7 @@ pub struct PeerInfo { /// LSN of the last record. pub flush_lsn: Lsn, pub commit_lsn: Lsn, - /// Since which LSN safekeeper has WAL. TODO: remove this once we fill new - /// sk since backup_lsn. + /// Since which LSN safekeeper has WAL. pub local_start_lsn: Lsn, /// When info was received. Serde annotations are not very useful but make /// the code compile -- we don't rely on this field externally. @@ -97,25 +101,223 @@ impl PeersInfo { } } +pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>; + +/// WriteGuardSharedState is a wrapper around `RwLockWriteGuard` that +/// automatically updates `watch::Sender` channels with state on drop. 
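// A sketch (hypothetical consumer, not from this patch) of what the on-drop
// notifications enable: tasks park on the watch channels and wake up only when
// a guarded mutation actually changed the published value.
use tokio::sync::watch;
use utils::lsn::Lsn;

async fn wait_commit_past(mut rx: watch::Receiver<Lsn>, target: Lsn) -> anyhow::Result<Lsn> {
    loop {
        let lsn = *rx.borrow();
        if lsn >= target {
            return Ok(lsn);
        }
        // Woken by WriteGuardSharedState::drop via send_if_modified.
        rx.changed().await?;
    }
}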
+pub struct WriteGuardSharedState<'a> { + tli: Arc, + guard: RwLockWriteGuard<'a, SharedState>, + skip_update: bool, +} + +impl<'a> WriteGuardSharedState<'a> { + fn new(tli: Arc, guard: RwLockWriteGuard<'a, SharedState>) -> Self { + WriteGuardSharedState { + tli, + guard, + skip_update: false, + } + } +} + +impl<'a> Deref for WriteGuardSharedState<'a> { + type Target = SharedState; + + fn deref(&self) -> &Self::Target { + &self.guard + } +} + +impl<'a> DerefMut for WriteGuardSharedState<'a> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.guard + } +} + +impl<'a> Drop for WriteGuardSharedState<'a> { + fn drop(&mut self) { + let term_flush_lsn = + TermLsn::from((self.guard.sk.last_log_term(), self.guard.sk.flush_lsn())); + let commit_lsn = self.guard.sk.state().inmem.commit_lsn; + + let _ = self.tli.term_flush_lsn_watch_tx.send_if_modified(|old| { + if *old != term_flush_lsn { + *old = term_flush_lsn; + true + } else { + false + } + }); + + let _ = self.tli.commit_lsn_watch_tx.send_if_modified(|old| { + if *old != commit_lsn { + *old = commit_lsn; + true + } else { + false + } + }); + + if !self.skip_update { + // send notification about shared state update + self.tli.shared_state_version_tx.send_modify(|old| { + *old += 1; + }); + } + } +} + +/// This structure is stored in shared state and represents the state of the timeline. +/// +/// Usually it holds SafeKeeper, but it also supports offloaded timeline state. In this +/// case, SafeKeeper is not available (because WAL is not present on disk) and all +/// operations can be done only with control file. +pub enum StateSK { + Loaded(SafeKeeper), + Offloaded(Box>), + // Not used, required for moving between states. + Empty, +} + +impl StateSK { + pub fn flush_lsn(&self) -> Lsn { + match self { + StateSK::Loaded(sk) => sk.wal_store.flush_lsn(), + StateSK::Offloaded(state) => match state.eviction_state { + EvictionState::Offloaded(flush_lsn) => flush_lsn, + _ => panic!("StateSK::Offloaded mismatches with eviction_state from control_file"), + }, + StateSK::Empty => unreachable!(), + } + } + + /// Get a reference to the control file's timeline state. + pub fn state(&self) -> &TimelineState { + match self { + StateSK::Loaded(sk) => &sk.state, + StateSK::Offloaded(ref s) => s, + StateSK::Empty => unreachable!(), + } + } + + pub fn state_mut(&mut self) -> &mut TimelineState { + match self { + StateSK::Loaded(sk) => &mut sk.state, + StateSK::Offloaded(ref mut s) => s, + StateSK::Empty => unreachable!(), + } + } + + pub fn last_log_term(&self) -> Term { + self.state() + .acceptor_state + .get_last_log_term(self.flush_lsn()) + } + + /// Close open WAL files to release FDs. + fn close_wal_store(&mut self) { + if let StateSK::Loaded(sk) = self { + sk.wal_store.close(); + } + } + + /// Update timeline state with peer safekeeper data. 
+ pub async fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> { + // update commit_lsn if safekeeper is loaded + match self { + StateSK::Loaded(sk) => sk.record_safekeeper_info(sk_info).await?, + StateSK::Offloaded(_) => {} + StateSK::Empty => unreachable!(), + } + + // update everything else, including remote_consistent_lsn and backup_lsn + let mut sync_control_file = false; + let state = self.state_mut(); + let wal_seg_size = state.server.wal_seg_size as u64; + + state.inmem.backup_lsn = max(Lsn(sk_info.backup_lsn), state.inmem.backup_lsn); + sync_control_file |= state.backup_lsn + wal_seg_size < state.inmem.backup_lsn; + + state.inmem.remote_consistent_lsn = max( + Lsn(sk_info.remote_consistent_lsn), + state.inmem.remote_consistent_lsn, + ); + sync_control_file |= + state.remote_consistent_lsn + wal_seg_size < state.inmem.remote_consistent_lsn; + + state.inmem.peer_horizon_lsn = + max(Lsn(sk_info.peer_horizon_lsn), state.inmem.peer_horizon_lsn); + sync_control_file |= state.peer_horizon_lsn + wal_seg_size < state.inmem.peer_horizon_lsn; + + if sync_control_file { + state.flush().await?; + } + Ok(()) + } + + /// Previously known as epoch_start_lsn. Needed only for reference in some APIs. + pub fn term_start_lsn(&self) -> Lsn { + match self { + StateSK::Loaded(sk) => sk.term_start_lsn, + StateSK::Offloaded(_) => Lsn(0), + StateSK::Empty => unreachable!(), + } + } + + /// Used for metrics only. + pub fn wal_storage_metrics(&self) -> WalStorageMetrics { + match self { + StateSK::Loaded(sk) => sk.wal_store.get_metrics(), + StateSK::Offloaded(_) => WalStorageMetrics::default(), + StateSK::Empty => unreachable!(), + } + } + + /// Returns WAL storage internal LSNs for debug dump. + pub fn wal_storage_internal_state(&self) -> (Lsn, Lsn, Lsn, bool) { + match self { + StateSK::Loaded(sk) => sk.wal_store.internal_state(), + StateSK::Offloaded(_) => { + let flush_lsn = self.flush_lsn(); + (flush_lsn, flush_lsn, flush_lsn, false) + } + StateSK::Empty => unreachable!(), + } + } + + /// Access to SafeKeeper object. Panics if offloaded, should be good to use from WalResidentTimeline. + pub fn safekeeper( + &mut self, + ) -> &mut SafeKeeper { + match self { + StateSK::Loaded(sk) => sk, + StateSK::Offloaded(_) => { + panic!("safekeeper is offloaded, cannot be used") + } + StateSK::Empty => unreachable!(), + } + } + + /// Moves control file's state structure out of the enum. Used to switch states. + fn take_state(self) -> TimelineState { + match self { + StateSK::Loaded(sk) => sk.state, + StateSK::Offloaded(state) => *state, + StateSK::Empty => unreachable!(), + } + } +} + /// Shared state associated with database instance pub struct SharedState { /// Safekeeper object - sk: SafeKeeper, + pub(crate) sk: StateSK, /// In memory list containing state of peers sent in latest messages from them. - peers_info: PeersInfo, - /// True when WAL backup launcher oversees the timeline, making sure WAL is - /// offloaded, allows to bother launcher less. - wal_backup_active: bool, - /// True whenever there is at least some pending activity on timeline: live - /// compute connection, pageserver is not caughtup (it must have latest WAL - /// for new compute start) or WAL backuping is not finished. Practically it - /// means safekeepers broadcast info to peers about the timeline, old WAL is - /// trimmed. - /// - /// TODO: it might be better to remove tli completely from GlobalTimelines - /// when tli is inactive instead of having this flag. 
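+    // (Sketch of the flush hysteresis in record_safekeeper_info above, with
+    // hypothetical values: the control file is rewritten only once an
+    // in-memory LSN outruns the persisted one by a full WAL segment. With a
+    // 16 MiB segment, persisted backup_lsn 0/0 and inmem backup_lsn 0/1000000
+    // do not trigger a flush, while 0/1000001 does -- bounding fsync
+    // frequency per timeline.)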
- active: bool, - last_removed_segno: XLogSegNo, + pub(crate) peers_info: PeersInfo, + // True value hinders old WAL removal; this is used by snapshotting. We + // could make it a counter, but there is no need to. + pub(crate) wal_removal_on_hold: bool, } impl SharedState { @@ -143,111 +345,59 @@ impl SharedState { // We don't want to write anything to disk, because we may have existing timeline there. // These functions should not change anything on disk. - let timeline_dir = conf.timeline_dir(ttid); - let control_store = control_file::FileStorage::create_new(timeline_dir, conf, state)?; + let timeline_dir = get_timeline_dir(conf, ttid); + let control_store = + control_file::FileStorage::create_new(timeline_dir.clone(), conf, state)?; let wal_store = - wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?; - let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; + wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?; + let sk = SafeKeeper::new(TimelineState::new(control_store), wal_store, conf.my_id)?; Ok(Self { - sk, + sk: StateSK::Loaded(sk), peers_info: PeersInfo(vec![]), - wal_backup_active: false, - active: false, - last_removed_segno: 0, + wal_removal_on_hold: false, }) } /// Restore SharedState from control file. If file doesn't exist, bails out. fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result { + let timeline_dir = get_timeline_dir(conf, ttid); let control_store = control_file::FileStorage::restore_new(ttid, conf)?; if control_store.server.wal_seg_size == 0 { bail!(TimelineError::UninitializedWalSegSize(*ttid)); } - let wal_store = - wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?; + let sk = match control_store.eviction_state { + EvictionState::Present => { + let wal_store = + wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?; + StateSK::Loaded(SafeKeeper::new( + TimelineState::new(control_store), + wal_store, + conf.my_id, + )?) + } + EvictionState::Offloaded(_) => { + StateSK::Offloaded(Box::new(TimelineState::new(control_store))) + } + }; Ok(Self { - sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?, + sk, peers_info: PeersInfo(vec![]), - wal_backup_active: false, - active: false, - last_removed_segno: 0, + wal_removal_on_hold: false, }) } - fn is_active(&self, num_computes: usize) -> bool { - self.is_wal_backup_required(num_computes) - // FIXME: add tracking of relevant pageservers and check them here individually, - // otherwise migration won't work (we suspend too early). - || self.sk.state.inmem.remote_consistent_lsn < self.sk.state.inmem.commit_lsn - } - - /// Mark timeline active/inactive and return whether s3 offloading requires - /// start/stop action. If timeline is deactivated, control file is persisted - /// as maintenance task does that only for active timelines. - async fn update_status(&mut self, num_computes: usize, ttid: TenantTimelineId) -> bool { - let is_active = self.is_active(num_computes); - if self.active != is_active { - info!( - "timeline {} active={} now, remote_consistent_lsn={}, commit_lsn={}", - ttid, - is_active, - self.sk.state.inmem.remote_consistent_lsn, - self.sk.state.inmem.commit_lsn - ); - if !is_active { - if let Err(e) = self.sk.state.flush().await { - warn!("control file save in update_status failed: {:?}", e); - } - } - } - self.active = is_active; - self.is_wal_backup_action_pending(num_computes) - } - - /// Should we run s3 offloading in current state? 
- fn is_wal_backup_required(&self, num_computes: usize) -> bool { - let seg_size = self.get_wal_seg_size(); - num_computes > 0 || - // Currently only the whole segment is offloaded, so compare segment numbers. - (self.sk.state.inmem.commit_lsn.segment_number(seg_size) > - self.sk.state.inmem.backup_lsn.segment_number(seg_size)) - } - - /// Is current state of s3 offloading is not what it ought to be? - fn is_wal_backup_action_pending(&self, num_computes: usize) -> bool { - let res = self.wal_backup_active != self.is_wal_backup_required(num_computes); - if res { - let action_pending = if self.is_wal_backup_required(num_computes) { - "start" - } else { - "stop" - }; - trace!( - "timeline {} s3 offloading action {} pending: num_computes={}, commit_lsn={}, backup_lsn={}", - self.sk.state.timeline_id, action_pending, num_computes, self.sk.state.inmem.commit_lsn, self.sk.state.inmem.backup_lsn - ); - } - res - } - - /// Returns whether s3 offloading is required and sets current status as - /// matching. - fn wal_backup_attend(&mut self, num_computes: usize) -> bool { - self.wal_backup_active = self.is_wal_backup_required(num_computes); - self.wal_backup_active - } - - fn get_wal_seg_size(&self) -> usize { - self.sk.state.server.wal_seg_size as usize + pub(crate) fn get_wal_seg_size(&self) -> usize { + self.sk.state().server.wal_seg_size as usize } fn get_safekeeper_info( &self, ttid: &TenantTimelineId, conf: &SafeKeeperConf, + standby_apply_lsn: Lsn, ) -> SafekeeperTimelineInfo { SafekeeperTimelineInfo { safekeeper_id: conf.my_id.0, @@ -255,28 +405,29 @@ impl SharedState { tenant_id: ttid.tenant_id.as_ref().to_owned(), timeline_id: ttid.timeline_id.as_ref().to_owned(), }), - term: self.sk.state.acceptor_state.term, - last_log_term: self.sk.get_epoch(), + term: self.sk.state().acceptor_state.term, + last_log_term: self.sk.last_log_term(), flush_lsn: self.sk.flush_lsn().0, // note: this value is not flushed to control file yet and can be lost - commit_lsn: self.sk.state.inmem.commit_lsn.0, - remote_consistent_lsn: self.sk.state.inmem.remote_consistent_lsn.0, - peer_horizon_lsn: self.sk.state.inmem.peer_horizon_lsn.0, + commit_lsn: self.sk.state().inmem.commit_lsn.0, + remote_consistent_lsn: self.sk.state().inmem.remote_consistent_lsn.0, + peer_horizon_lsn: self.sk.state().inmem.peer_horizon_lsn.0, safekeeper_connstr: conf .advertise_pg_addr .to_owned() .unwrap_or(conf.listen_pg_addr.clone()), http_connstr: conf.listen_http_addr.to_owned(), - backup_lsn: self.sk.state.inmem.backup_lsn.0, - local_start_lsn: self.sk.state.local_start_lsn.0, + backup_lsn: self.sk.state().inmem.backup_lsn.0, + local_start_lsn: self.sk.state().local_start_lsn.0, availability_zone: conf.availability_zone.clone(), + standby_horizon: standby_apply_lsn.0, } } /// Get our latest view of alive peers status on the timeline. /// We pass our own info through the broker as well, so when we don't have connection /// to the broker returned vec is empty. - fn get_peers(&self, heartbeat_timeout: Duration) -> Vec { + pub(crate) fn get_peers(&self, heartbeat_timeout: Duration) -> Vec { let now = Instant::now(); self.peers_info .0 @@ -320,11 +471,7 @@ impl From for ApiError { /// It also holds SharedState and provides mutually exclusive access to it. pub struct Timeline { pub ttid: TenantTimelineId, - - /// Sending here asks for wal backup launcher attention (start/stop - /// offloading). Sending ttid instead of concrete command allows to do - /// sending without timeline lock. 
- pub wal_backup_launcher_tx: Sender, + pub remote_path: RemotePath, /// Used to broadcast commit_lsn updates to all background jobs. commit_lsn_watch_tx: watch::Sender, @@ -337,55 +484,64 @@ pub struct Timeline { term_flush_lsn_watch_tx: watch::Sender, term_flush_lsn_watch_rx: watch::Receiver, + /// Broadcasts shared state updates. + shared_state_version_tx: watch::Sender, + shared_state_version_rx: watch::Receiver, + /// Safekeeper and other state, that should remain consistent and /// synchronized with the disk. This is tokio mutex as we write WAL to disk /// while holding it, ensuring that consensus checks are in order. - mutex: Mutex, + mutex: RwLock, walsenders: Arc, walreceivers: Arc, + timeline_dir: Utf8PathBuf, + manager_ctl: ManagerCtl, - /// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal. - cancellation_tx: watch::Sender, + /// Delete/cancel will trigger this, background tasks should drop out as soon as it fires + pub(crate) cancel: CancellationToken, - /// Timeline should not be used after cancellation. Background tasks should - /// monitor this channel and stop eventually after receiving `true` from this channel. - cancellation_rx: watch::Receiver, - - /// Directory where timeline state is stored. - pub timeline_dir: Utf8PathBuf, + // timeline_manager controlled state + pub(crate) broker_active: AtomicBool, + pub(crate) wal_backup_active: AtomicBool, + pub(crate) last_removed_segno: AtomicU64, + pub(crate) mgr_status: AtomicStatus, } impl Timeline { /// Load existing timeline from disk. - pub fn load_timeline( - conf: &SafeKeeperConf, - ttid: TenantTimelineId, - wal_backup_launcher_tx: Sender, - ) -> Result { + pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result { let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); let shared_state = SharedState::restore(conf, &ttid)?; let (commit_lsn_watch_tx, commit_lsn_watch_rx) = - watch::channel(shared_state.sk.state.commit_lsn); + watch::channel(shared_state.sk.state().commit_lsn); let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = watch::channel(TermLsn::from(( - shared_state.sk.get_term(), + shared_state.sk.last_log_term(), shared_state.sk.flush_lsn(), ))); - let (cancellation_tx, cancellation_rx) = watch::channel(false); + let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); + let walreceivers = WalReceivers::new(); + let remote_path = remote_timeline_path(&ttid)?; Ok(Timeline { ttid, - wal_backup_launcher_tx, + remote_path, commit_lsn_watch_tx, commit_lsn_watch_rx, term_flush_lsn_watch_tx, term_flush_lsn_watch_rx, - mutex: Mutex::new(shared_state), - walsenders: WalSenders::new(), - walreceivers: WalReceivers::new(), - cancellation_rx, - cancellation_tx, - timeline_dir: conf.timeline_dir(&ttid), + shared_state_version_tx, + shared_state_version_rx, + mutex: RwLock::new(shared_state), + walsenders: WalSenders::new(walreceivers.clone()), + walreceivers, + cancel: CancellationToken::default(), + timeline_dir: get_timeline_dir(conf, &ttid), + manager_ctl: ManagerCtl::new(), + broker_active: AtomicBool::new(false), + wal_backup_active: AtomicBool::new(false), + last_removed_segno: AtomicU64::new(0), + mgr_status: AtomicStatus::new(), }) } @@ -393,7 +549,6 @@ impl Timeline { pub fn create_empty( conf: &SafeKeeperConf, ttid: TenantTimelineId, - wal_backup_launcher_tx: Sender, server_info: ServerInfo, commit_lsn: Lsn, local_start_lsn: Lsn, @@ -401,23 +556,32 @@ impl Timeline { let (commit_lsn_watch_tx, commit_lsn_watch_rx) 
= watch::channel(Lsn::INVALID); let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID))); - let (cancellation_tx, cancellation_rx) = watch::channel(false); + let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); + let state = TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); + let walreceivers = WalReceivers::new(); + let remote_path = remote_timeline_path(&ttid)?; Ok(Timeline { ttid, - wal_backup_launcher_tx, + remote_path, commit_lsn_watch_tx, commit_lsn_watch_rx, term_flush_lsn_watch_tx, term_flush_lsn_watch_rx, - mutex: Mutex::new(SharedState::create_new(conf, &ttid, state)?), - walsenders: WalSenders::new(), - walreceivers: WalReceivers::new(), - cancellation_rx, - cancellation_tx, - timeline_dir: conf.timeline_dir(&ttid), + shared_state_version_tx, + shared_state_version_rx, + mutex: RwLock::new(SharedState::create_new(conf, &ttid, state)?), + walsenders: WalSenders::new(walreceivers.clone()), + walreceivers, + cancel: CancellationToken::default(), + timeline_dir: get_timeline_dir(conf, &ttid), + manager_ctl: ManagerCtl::new(), + broker_active: AtomicBool::new(false), + wal_backup_active: AtomicBool::new(false), + last_removed_segno: AtomicU64::new(0), + mgr_status: AtomicStatus::new(), }) } @@ -428,8 +592,10 @@ impl Timeline { /// and state on disk should remain unchanged. pub async fn init_new( self: &Arc, - shared_state: &mut MutexGuard<'_, SharedState>, + shared_state: &mut WriteGuardSharedState<'_>, conf: &SafeKeeperConf, + broker_active_set: Arc, + partial_backup_rate_limiter: RateLimiter, ) -> Result<()> { match fs::metadata(&self.timeline_dir).await { Ok(_) => { @@ -447,7 +613,7 @@ impl Timeline { fs::create_dir_all(&self.timeline_dir).await?; // Write timeline to disk and start background tasks. - if let Err(e) = shared_state.sk.state.flush().await { + if let Err(e) = shared_state.sk.state_mut().flush().await { // Bootstrap failed, cancel timeline and remove timeline directory. self.cancel(shared_state); @@ -460,16 +626,29 @@ impl Timeline { return Err(e); } - self.bootstrap(conf); + self.bootstrap(conf, broker_active_set, partial_backup_rate_limiter); Ok(()) } - /// Bootstrap new or existing timeline starting background stasks. - pub fn bootstrap(self: &Arc, conf: &SafeKeeperConf) { - // Start recovery task which always runs on the timeline. - if conf.peer_recovery_enabled { - tokio::spawn(recovery_main(self.clone(), conf.clone())); - } + /// Bootstrap new or existing timeline starting background tasks. + pub fn bootstrap( + self: &Arc, + conf: &SafeKeeperConf, + broker_active_set: Arc, + partial_backup_rate_limiter: RateLimiter, + ) { + let (tx, rx) = self.manager_ctl.bootstrap_manager(); + + // Start manager task which will monitor timeline state and update + // background tasks. + tokio::spawn(timeline_manager::main_task( + ManagerTimeline { tli: self.clone() }, + conf.clone(), + broker_active_set, + tx, + rx, + partial_backup_rate_limiter, + )); } /// Delete timeline from disk completely, by removing timeline directory. @@ -479,10 +658,9 @@ impl Timeline { /// deletion API endpoint is retriable. 
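The `init_new` path above follows a create/persist/rollback shape: create the timeline directory, flush the control file, and on failure cancel the timeline and remove the directory so the bootstrap stays retriable. A minimal sketch of that shape; `persist` here is a hypothetical stand-in, not the Neon API:

```rust
use std::path::Path;

// Hypothetical stand-in for flushing the control file; not the Neon API.
async fn persist(_dir: &Path) -> anyhow::Result<()> {
    Ok(())
}

async fn init_new(dir: &Path) -> anyhow::Result<()> {
    tokio::fs::create_dir_all(dir).await?;
    if let Err(e) = persist(dir).await {
        // Bootstrap failed: remove the half-created directory so the
        // caller can safely retry the whole operation from scratch.
        let _ = tokio::fs::remove_dir_all(dir).await;
        return Err(e);
    }
    Ok(())
}
```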
pub async fn delete( &self, - shared_state: &mut MutexGuard<'_, SharedState>, + shared_state: &mut WriteGuardSharedState<'_>, only_local: bool, - ) -> Result<(bool, bool)> { - let was_active = shared_state.active; + ) -> Result { self.cancel(shared_state); // TODO: It's better to wait for s3 offloader termination before @@ -496,107 +674,31 @@ impl Timeline { wal_backup::delete_timeline(&self.ttid).await?; } let dir_existed = delete_dir(&self.timeline_dir).await?; - Ok((dir_existed, was_active)) + Ok(dir_existed) } /// Cancel timeline to prevent further usage. Background tasks will stop /// eventually after receiving cancellation signal. - /// - /// Note that we can't notify backup launcher here while holding - /// shared_state lock, as this is a potential deadlock: caller is - /// responsible for that. Generally we should probably make WAL backup tasks - /// to shut down on their own, checking once in a while whether it is the - /// time. - fn cancel(&self, shared_state: &mut MutexGuard<'_, SharedState>) { + fn cancel(&self, shared_state: &mut WriteGuardSharedState<'_>) { info!("timeline {} is cancelled", self.ttid); - let _ = self.cancellation_tx.send(true); + self.cancel.cancel(); // Close associated FDs. Nobody will be able to touch timeline data once // it is cancelled, so WAL storage won't be opened again. - shared_state.sk.wal_store.close(); + shared_state.sk.close_wal_store(); } /// Returns if timeline is cancelled. pub fn is_cancelled(&self) -> bool { - *self.cancellation_rx.borrow() - } - - /// Returns watch channel which gets value when timeline is cancelled. It is - /// guaranteed to have not cancelled value observed (errors otherwise). - pub fn get_cancellation_rx(&self) -> Result> { - let rx = self.cancellation_rx.clone(); - if *rx.borrow() { - bail!(TimelineError::Cancelled(self.ttid)); - } - Ok(rx) + self.cancel.is_cancelled() } /// Take a writing mutual exclusive lock on timeline shared_state. - pub async fn write_shared_state(&self) -> MutexGuard { - self.mutex.lock().await + pub async fn write_shared_state<'a>(self: &'a Arc) -> WriteGuardSharedState<'a> { + WriteGuardSharedState::new(self.clone(), self.mutex.write().await) } - async fn update_status(&self, shared_state: &mut SharedState) -> bool { - shared_state - .update_status(self.walreceivers.get_num(), self.ttid) - .await - } - - /// Update timeline status and kick wal backup launcher to stop/start offloading if needed. - pub async fn update_status_notify(&self) -> Result<()> { - if self.is_cancelled() { - bail!(TimelineError::Cancelled(self.ttid)); - } - let is_wal_backup_action_pending: bool = { - let mut shared_state = self.write_shared_state().await; - self.update_status(&mut shared_state).await - }; - if is_wal_backup_action_pending { - // Can fail only if channel to a static thread got closed, which is not normal at all. - self.wal_backup_launcher_tx.send(self.ttid).await?; - } - Ok(()) - } - - /// Returns true if walsender should stop sending WAL to pageserver. We - /// terminate it if remote_consistent_lsn reached commit_lsn and there is no - /// computes. While there might be nothing to stream already, we learn about - /// remote_consistent_lsn update through replication feedback, and we want - /// to stop pushing to the broker if pageserver is fully caughtup. 
- pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool { - if self.is_cancelled() { - return true; - } - let shared_state = self.write_shared_state().await; - if self.walreceivers.get_num() == 0 { - return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet - reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn; - } - false - } - - /// Ensure taht current term is t, erroring otherwise, and lock the state. - pub async fn acquire_term(&self, t: Term) -> Result> { - let ss = self.write_shared_state().await; - if ss.sk.state.acceptor_state.term != t { - bail!( - "failed to acquire term {}, current term {}", - t, - ss.sk.state.acceptor_state.term - ); - } - Ok(ss) - } - - /// Returns whether s3 offloading is required and sets current status as - /// matching it. - pub async fn wal_backup_attend(&self) -> bool { - if self.is_cancelled() { - return false; - } - - self.write_shared_state() - .await - .wal_backup_attend(self.walreceivers.get_num()) + pub async fn read_shared_state(&self) -> ReadGuardSharedState { + self.mutex.read().await } /// Returns commit_lsn watch channel. @@ -609,72 +711,38 @@ impl Timeline { self.term_flush_lsn_watch_rx.clone() } - /// Pass arrived message to the safekeeper. - pub async fn process_msg( - &self, - msg: &ProposerAcceptorMessage, - ) -> Result> { - if self.is_cancelled() { - bail!(TimelineError::Cancelled(self.ttid)); - } - - let mut rmsg: Option; - let commit_lsn: Lsn; - let term_flush_lsn: TermLsn; - { - let mut shared_state = self.write_shared_state().await; - rmsg = shared_state.sk.process_msg(msg).await?; - - // if this is AppendResponse, fill in proper pageserver and hot - // standby feedback. - if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { - let (ps_feedback, hs_feedback) = self.walsenders.get_feedbacks(); - resp.hs_feedback = hs_feedback; - resp.pageserver_feedback = ps_feedback; - } - - commit_lsn = shared_state.sk.state.inmem.commit_lsn; - term_flush_lsn = - TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn())); - } - self.commit_lsn_watch_tx.send(commit_lsn)?; - self.term_flush_lsn_watch_tx.send(term_flush_lsn)?; - Ok(rmsg) + /// Returns watch channel for SharedState update version. + pub fn get_state_version_rx(&self) -> watch::Receiver { + self.shared_state_version_rx.clone() } /// Returns wal_seg_size. pub async fn get_wal_seg_size(&self) -> usize { - self.write_shared_state().await.get_wal_seg_size() - } - - /// Returns true only if the timeline is loaded and active. - pub async fn is_active(&self) -> bool { - if self.is_cancelled() { - return false; - } - - self.write_shared_state().await.active + self.read_shared_state().await.get_wal_seg_size() } /// Returns state of the timeline. pub async fn get_state(&self) -> (TimelineMemState, TimelinePersistentState) { - let state = self.write_shared_state().await; - (state.sk.state.inmem.clone(), state.sk.state.clone()) + let state = self.read_shared_state().await; + ( + state.sk.state().inmem.clone(), + TimelinePersistentState::clone(state.sk.state()), + ) } /// Returns latest backup_lsn. pub async fn get_wal_backup_lsn(&self) -> Lsn { - self.write_shared_state().await.sk.state.inmem.backup_lsn + self.read_shared_state().await.sk.state().inmem.backup_lsn } /// Sets backup_lsn to the given value. 
- pub async fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> { + pub async fn set_wal_backup_lsn(self: &Arc, backup_lsn: Lsn) -> Result<()> { if self.is_cancelled() { bail!(TimelineError::Cancelled(self.ttid)); } let mut state = self.write_shared_state().await; - state.sk.state.inmem.backup_lsn = max(state.sk.state.inmem.backup_lsn, backup_lsn); + state.sk.state_mut().inmem.backup_lsn = max(state.sk.state().inmem.backup_lsn, backup_lsn); // we should check whether to shut down offloader, but this will be done // soon by peer communication anyway. Ok(()) @@ -682,121 +750,30 @@ impl Timeline { /// Get safekeeper info for broadcasting to broker and other peers. pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo { - let shared_state = self.write_shared_state().await; - shared_state.get_safekeeper_info(&self.ttid, conf) + let standby_apply_lsn = self.walsenders.get_hotstandby().reply.apply_lsn; + let shared_state = self.read_shared_state().await; + shared_state.get_safekeeper_info(&self.ttid, conf, standby_apply_lsn) } /// Update timeline state with peer safekeeper data. - pub async fn record_safekeeper_info(&self, sk_info: SafekeeperTimelineInfo) -> Result<()> { - let is_wal_backup_action_pending: bool; - let commit_lsn: Lsn; + pub async fn record_safekeeper_info( + self: &Arc, + sk_info: SafekeeperTimelineInfo, + ) -> Result<()> { { let mut shared_state = self.write_shared_state().await; shared_state.sk.record_safekeeper_info(&sk_info).await?; let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now()); shared_state.peers_info.upsert(&peer_info); - is_wal_backup_action_pending = self.update_status(&mut shared_state).await; - commit_lsn = shared_state.sk.state.inmem.commit_lsn; - } - self.commit_lsn_watch_tx.send(commit_lsn)?; - // Wake up wal backup launcher, if it is time to stop the offloading. - if is_wal_backup_action_pending { - self.wal_backup_launcher_tx.send(self.ttid).await?; } Ok(()) } - /// Update in memory remote consistent lsn. - pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) { - let mut shared_state = self.write_shared_state().await; - shared_state.sk.state.inmem.remote_consistent_lsn = - max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate); - } - pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec { - let shared_state = self.write_shared_state().await; + let shared_state = self.read_shared_state().await; shared_state.get_peers(conf.heartbeat_timeout) } - /// Should we start fetching WAL from a peer safekeeper, and if yes, from - /// which? Answer is yes, i.e. .donors is not empty if 1) there is something - /// to fetch, and we can do that without running elections; 2) there is no - /// actively streaming compute, as we don't want to compete with it. - /// - /// If donor(s) are choosen, theirs last_log_term is guaranteed to be equal - /// to its last_log_term so we are sure such a leader ever had been elected. - /// - /// All possible donors are returned so that we could keep connection to the - /// current one if it is good even if it slightly lags behind. - /// - /// Note that term conditions above might be not met, but safekeepers are - /// still not aligned on last flush_lsn. Generally in this case until - /// elections are run it is not possible to say which safekeeper should - /// recover from which one -- history which would be committed is different - /// depending on assembled quorum (e.g. classic picture 8 from Raft paper). - /// Thus we don't try to predict it here. 
- pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo { - let ss = self.write_shared_state().await; - let term = ss.sk.state.acceptor_state.term; - let last_log_term = ss.sk.get_epoch(); - let flush_lsn = ss.sk.flush_lsn(); - // note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us. - let mut peers = ss.get_peers(heartbeat_timeout); - // Sort by pairs. - peers.sort_by(|p1, p2| { - let tl1 = TermLsn { - term: p1.last_log_term, - lsn: p1.flush_lsn, - }; - let tl2 = TermLsn { - term: p2.last_log_term, - lsn: p2.flush_lsn, - }; - tl2.cmp(&tl1) // desc - }); - let num_streaming_computes = self.walreceivers.get_num_streaming(); - let donors = if num_streaming_computes > 0 { - vec![] // If there is a streaming compute, don't try to recover to not intervene. - } else { - peers - .iter() - .filter_map(|candidate| { - // Are we interested in this candidate? - let candidate_tl = TermLsn { - term: candidate.last_log_term, - lsn: candidate.flush_lsn, - }; - let my_tl = TermLsn { - term: last_log_term, - lsn: flush_lsn, - }; - if my_tl < candidate_tl { - // Yes, we are interested. Can we pull from it without - // (re)running elections? It is possible if 1) his term - // is equal to his last_log_term so we could act on - // behalf of leader of this term (we must be sure he was - // ever elected) and 2) our term is not higher, or we'll refuse data. - if candidate.term == candidate.last_log_term && candidate.term >= term { - Some(Donor::from(candidate)) - } else { - None - } - } else { - None - } - }) - .collect() - }; - RecoveryNeededInfo { - term, - last_log_term, - flush_lsn, - peers, - num_streaming_computes, - donors, - } - } - pub fn get_walsenders(&self) -> &Arc { &self.walsenders } @@ -807,100 +784,367 @@ impl Timeline { /// Returns flush_lsn. pub async fn get_flush_lsn(&self) -> Lsn { - self.write_shared_state().await.sk.wal_store.flush_lsn() + self.read_shared_state().await.sk.flush_lsn() } - /// Delete WAL segments from disk that are no longer needed. This is determined - /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn. - pub async fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> { - if self.is_cancelled() { - bail!(TimelineError::Cancelled(self.ttid)); - } - - let horizon_segno: XLogSegNo; - let remover = { - let shared_state = self.write_shared_state().await; - horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled); - if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { - return Ok(()); // nothing to do - } - - // release the lock before removing - shared_state.sk.wal_store.remove_up_to(horizon_segno - 1) - }; - - // delete old WAL files - remover.await?; - - // update last_removed_segno - let mut shared_state = self.write_shared_state().await; - shared_state.last_removed_segno = horizon_segno; - Ok(()) - } - - /// Persist control file if there is something to save and enough time - /// passed after the last save. This helps to keep remote_consistent_lsn up - /// to date so that storage nodes restart doesn't cause many pageserver -> - /// safekeeper reconnections. - pub async fn maybe_persist_control_file(&self) -> Result<()> { - self.write_shared_state() - .await - .sk - .maybe_persist_inmem_control_file() - .await - } - - /// Gather timeline data for metrics. If the timeline is not active, returns - /// None, we do not collect these. + /// Gather timeline data for metrics. 
pub async fn info_for_metrics(&self) -> Option { if self.is_cancelled() { return None; } - let ps_feedback = self.walsenders.get_ps_feedback(); - let state = self.write_shared_state().await; - if state.active { - Some(FullTimelineInfo { - ttid: self.ttid, - ps_feedback, - wal_backup_active: state.wal_backup_active, - timeline_is_active: state.active, - num_computes: self.walreceivers.get_num() as u32, - last_removed_segno: state.last_removed_segno, - epoch_start_lsn: state.sk.epoch_start_lsn, - mem_state: state.sk.state.inmem.clone(), - persisted_state: state.sk.state.clone(), - flush_lsn: state.sk.wal_store.flush_lsn(), - wal_storage: state.sk.wal_store.get_metrics(), - }) - } else { - None - } + let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats(); + let state = self.read_shared_state().await; + Some(FullTimelineInfo { + ttid: self.ttid, + ps_feedback_count, + last_ps_feedback, + wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), + timeline_is_active: self.broker_active.load(Ordering::Relaxed), + num_computes: self.walreceivers.get_num() as u32, + last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed), + epoch_start_lsn: state.sk.term_start_lsn(), + mem_state: state.sk.state().inmem.clone(), + persisted_state: TimelinePersistentState::clone(state.sk.state()), + flush_lsn: state.sk.flush_lsn(), + wal_storage: state.sk.wal_storage_metrics(), + }) } /// Returns in-memory timeline state to build a full debug dump. pub async fn memory_dump(&self) -> debug_dump::Memory { - let state = self.write_shared_state().await; + let state = self.read_shared_state().await; let (write_lsn, write_record_lsn, flush_lsn, file_open) = - state.sk.wal_store.internal_state(); + state.sk.wal_storage_internal_state(); debug_dump::Memory { is_cancelled: self.is_cancelled(), peers_info_len: state.peers_info.0.len(), walsenders: self.walsenders.get_all(), - wal_backup_active: state.wal_backup_active, - active: state.active, + wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), + active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, - last_removed_segno: state.last_removed_segno, - epoch_start_lsn: state.sk.epoch_start_lsn, - mem_state: state.sk.state.inmem.clone(), + last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed), + epoch_start_lsn: state.sk.term_start_lsn(), + mem_state: state.sk.state().inmem.clone(), + mgr_status: self.mgr_status.get(), write_lsn, write_record_lsn, flush_lsn, file_open, } } + + /// Apply a function to the control file state and persist it. + pub async fn map_control_file( + self: &Arc, + f: impl FnOnce(&mut TimelinePersistentState) -> Result, + ) -> Result { + let mut state = self.write_shared_state().await; + let mut persistent_state = state.sk.state_mut().start_change(); + // If f returns error, we abort the change and don't persist anything. + let res = f(&mut persistent_state)?; + // If persisting fails, we abort the change and return error. + state + .sk + .state_mut() + .finish_change(&persistent_state) + .await?; + Ok(res) + } + + /// Get the timeline guard for reading/writing WAL files. + /// If WAL files are not present on disk (evicted), they will be automatically + /// downloaded from remote storage. This is done in the manager task, which is + /// responsible for issuing all guards. + /// + /// NB: don't use this function from timeline_manager, it will deadlock. + /// NB: don't use this function while holding shared_state lock. 
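The `map_control_file` helper above wraps the control file's `start_change`/`finish_change` protocol: the closure edits a scratch copy, nothing is persisted if the closure fails, and the in-memory state is swapped only after the durable write succeeds. A sketch of the same copy-modify-persist shape with toy types (the real persistence and fsync logic lives in the control file storage):

```rust
#[derive(Clone)]
struct PersistentState {
    commit_lsn: u64,
}

struct ControlFile {
    state: PersistentState,
}

impl ControlFile {
    fn start_change(&self) -> PersistentState {
        self.state.clone() // scratch copy; edits are invisible until persisted
    }
    async fn finish_change(&mut self, new: &PersistentState) -> anyhow::Result<()> {
        // ... a real implementation would fsync `new` to disk here ...
        self.state = new.clone(); // publish only after the durable write
        Ok(())
    }
}

async fn map_control_file<T>(
    cf: &mut ControlFile,
    f: impl FnOnce(&mut PersistentState) -> anyhow::Result<T>,
) -> anyhow::Result<T> {
    let mut scratch = cf.start_change();
    let res = f(&mut scratch)?;        // closure failed: nothing persisted
    cf.finish_change(&scratch).await?; // persist, then swap in memory
    Ok(res)
}
```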
+ pub async fn wal_residence_guard(self: &Arc) -> Result { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + + debug!("requesting WalResidentTimeline guard"); + let started_at = Instant::now(); + let status_before = self.mgr_status.get(); + + // Wait 30 seconds for the guard to be acquired. It can time out if someone is + // holding the lock (e.g. during `SafeKeeper::process_msg()`) or manager task + // is stuck. + let res = tokio::time::timeout_at( + started_at + Duration::from_secs(30), + self.manager_ctl.wal_residence_guard(), + ) + .await; + + let guard = match res { + Ok(Ok(guard)) => { + let finished_at = Instant::now(); + let elapsed = finished_at - started_at; + MISC_OPERATION_SECONDS + .with_label_values(&["wal_residence_guard"]) + .observe(elapsed.as_secs_f64()); + + guard + } + Ok(Err(e)) => { + warn!( + "error while acquiring WalResidentTimeline guard, statuses {:?} => {:?}", + status_before, + self.mgr_status.get() + ); + return Err(e); + } + Err(_) => { + warn!( + "timeout while acquiring WalResidentTimeline guard, statuses {:?} => {:?}", + status_before, + self.mgr_status.get() + ); + anyhow::bail!("timeout while acquiring WalResidentTimeline guard"); + } + }; + + Ok(WalResidentTimeline::new(self.clone(), guard)) + } + + pub async fn backup_partial_reset(self: &Arc) -> Result> { + self.manager_ctl.backup_partial_reset().await + } +} + +/// This is a guard that allows to read/write disk timeline state. +/// All tasks that are trying to read/write WAL from disk should use this guard. +pub struct WalResidentTimeline { + pub tli: Arc, + _guard: ResidenceGuard, +} + +impl WalResidentTimeline { + pub fn new(tli: Arc, _guard: ResidenceGuard) -> Self { + WalResidentTimeline { tli, _guard } + } +} + +impl Deref for WalResidentTimeline { + type Target = Arc; + + fn deref(&self) -> &Self::Target { + &self.tli + } +} + +impl WalResidentTimeline { + /// Returns true if walsender should stop sending WAL to pageserver. We + /// terminate it if remote_consistent_lsn reached commit_lsn and there is no + /// computes. While there might be nothing to stream already, we learn about + /// remote_consistent_lsn update through replication feedback, and we want + /// to stop pushing to the broker if pageserver is fully caughtup. + pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool { + if self.is_cancelled() { + return true; + } + let shared_state = self.read_shared_state().await; + if self.walreceivers.get_num() == 0 { + return shared_state.sk.state().inmem.commit_lsn == Lsn(0) || // no data at all yet + reported_remote_consistent_lsn >= shared_state.sk.state().inmem.commit_lsn; + } + false + } + + /// Ensure that current term is t, erroring otherwise, and lock the state. + pub async fn acquire_term(&self, t: Term) -> Result { + let ss = self.read_shared_state().await; + if ss.sk.state().acceptor_state.term != t { + bail!( + "failed to acquire term {}, current term {}", + t, + ss.sk.state().acceptor_state.term + ); + } + Ok(ss) + } + + /// Pass arrived message to the safekeeper. + pub async fn process_msg( + &self, + msg: &ProposerAcceptorMessage, + ) -> Result> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + + let mut rmsg: Option; + { + let mut shared_state = self.write_shared_state().await; + rmsg = shared_state.sk.safekeeper().process_msg(msg).await?; + + // if this is AppendResponse, fill in proper hot standby feedback. 
+ if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { + resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback; + } + } + Ok(rmsg) + } + + pub async fn get_walreader(&self, start_lsn: Lsn) -> Result { + let (_, persisted_state) = self.get_state().await; + let enable_remote_read = GlobalTimelines::get_global_config().is_wal_backup_enabled(); + + WalReader::new( + &self.ttid, + self.timeline_dir.clone(), + &persisted_state, + start_lsn, + enable_remote_read, + ) + } + + pub fn get_timeline_dir(&self) -> Utf8PathBuf { + self.timeline_dir.clone() + } + + /// Update in memory remote consistent lsn. + pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) { + let mut shared_state = self.write_shared_state().await; + shared_state.sk.state_mut().inmem.remote_consistent_lsn = max( + shared_state.sk.state().inmem.remote_consistent_lsn, + candidate, + ); + } +} + +/// This struct contains methods that are used by timeline manager task. +pub(crate) struct ManagerTimeline { + pub(crate) tli: Arc, +} + +impl Deref for ManagerTimeline { + type Target = Arc; + + fn deref(&self) -> &Self::Target { + &self.tli + } +} + +impl ManagerTimeline { + pub(crate) fn timeline_dir(&self) -> &Utf8PathBuf { + &self.tli.timeline_dir + } + + /// Manager requests this state on startup. + pub(crate) async fn bootstrap_mgr(&self) -> (bool, Option) { + let shared_state = self.read_shared_state().await; + let is_offloaded = matches!( + shared_state.sk.state().eviction_state, + EvictionState::Offloaded(_) + ); + let partial_backup_uploaded = shared_state.sk.state().partial_backup.uploaded_segment(); + + (is_offloaded, partial_backup_uploaded) + } + + /// Try to switch state Present->Offloaded. + pub(crate) async fn switch_to_offloaded( + &self, + partial: &PartialRemoteSegment, + ) -> anyhow::Result<()> { + let mut shared = self.write_shared_state().await; + + // updating control file + let mut pstate = shared.sk.state_mut().start_change(); + + if !matches!(pstate.eviction_state, EvictionState::Present) { + bail!( + "cannot switch to offloaded state, current state is {:?}", + pstate.eviction_state + ); + } + + if partial.flush_lsn != shared.sk.flush_lsn() { + bail!( + "flush_lsn mismatch in partial backup, expected {}, got {}", + shared.sk.flush_lsn(), + partial.flush_lsn + ); + } + + if partial.commit_lsn != pstate.commit_lsn { + bail!( + "commit_lsn mismatch in partial backup, expected {}, got {}", + pstate.commit_lsn, + partial.commit_lsn + ); + } + + if partial.term != shared.sk.last_log_term() { + bail!( + "term mismatch in partial backup, expected {}, got {}", + shared.sk.last_log_term(), + partial.term + ); + } + + pstate.eviction_state = EvictionState::Offloaded(shared.sk.flush_lsn()); + shared.sk.state_mut().finish_change(&pstate).await?; + // control file is now switched to Offloaded state + + // now we can switch shared.sk to Offloaded, shouldn't fail + let prev_sk = std::mem::replace(&mut shared.sk, StateSK::Empty); + let cfile_state = prev_sk.take_state(); + shared.sk = StateSK::Offloaded(Box::new(cfile_state)); + + Ok(()) + } + + /// Try to switch state Offloaded->Present. 
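`switch_to_offloaded` above needs to move the control-file state out of `shared.sk` while it is only reachable through `&mut`, which is why the otherwise unused `StateSK::Empty` placeholder exists: `std::mem::replace` parks `Empty` in the slot, the old value is consumed by `take_state`, and the new variant is written back. A self-contained sketch of the same move-out-and-switch pattern:

```rust
enum Slot {
    Loaded(Vec<u8>),
    Offloaded(usize),
    Empty, // only ever observed transiently, during a switch
}

impl Slot {
    fn take_len(self) -> usize {
        match self {
            Slot::Loaded(data) => data.len(),
            Slot::Offloaded(len) => len,
            Slot::Empty => unreachable!("Empty is a move placeholder"),
        }
    }
}

fn switch_to_offloaded(slot: &mut Slot) {
    // Move the old value out from behind &mut by parking a placeholder.
    let prev = std::mem::replace(slot, Slot::Empty);
    *slot = Slot::Offloaded(prev.take_len());
}

fn main() {
    let mut s = Slot::Loaded(vec![0; 16]);
    switch_to_offloaded(&mut s);
    assert!(matches!(s, Slot::Offloaded(16)));
}
```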
+ pub(crate) async fn switch_to_present(&self) -> anyhow::Result<()> { + let conf = GlobalTimelines::get_global_config(); + let mut shared = self.write_shared_state().await; + + // trying to restore WAL storage + let wal_store = wal_storage::PhysicalStorage::new( + &self.ttid, + self.timeline_dir.clone(), + &conf, + shared.sk.state(), + )?; + + // updating control file + let mut pstate = shared.sk.state_mut().start_change(); + + if !matches!(pstate.eviction_state, EvictionState::Offloaded(_)) { + bail!( + "cannot switch to present state, current state is {:?}", + pstate.eviction_state + ); + } + + if wal_store.flush_lsn() != shared.sk.flush_lsn() { + bail!( + "flush_lsn mismatch in restored WAL, expected {}, got {}", + shared.sk.flush_lsn(), + wal_store.flush_lsn() + ); + } + + pstate.eviction_state = EvictionState::Present; + shared.sk.state_mut().finish_change(&pstate).await?; + + // now we can switch shared.sk to Present, shouldn't fail + let prev_sk = std::mem::replace(&mut shared.sk, StateSK::Empty); + let cfile_state = prev_sk.take_state(); + shared.sk = StateSK::Loaded(SafeKeeper::new(cfile_state, wal_store, conf.my_id)?); + + Ok(()) + } + + /// Update current manager state, useful for debugging manager deadlocks. + pub(crate) fn set_status(&self, status: timeline_manager::Status) { + self.mgr_status.store(status, Ordering::Relaxed); + } } /// Deletes directory and it's contents. Returns false if directory does not exist. @@ -911,3 +1155,16 @@ async fn delete_dir(path: &Utf8PathBuf) -> Result { Err(e) => Err(e.into()), } } + +/// Get a path to the tenant directory. If you just need to get a timeline directory, +/// use WalResidentTimeline::get_timeline_dir instead. +pub(crate) fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf { + conf.workdir.join(tenant_id.to_string()) +} + +/// Get a path to the timeline directory. If you need to read WAL files from disk, +/// use WalResidentTimeline::get_timeline_dir instead. This function does not check +/// timeline eviction status and WAL files might not be present on disk. +pub(crate) fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf { + get_tenant_dir(conf, &ttid.tenant_id).join(ttid.timeline_id.to_string()) +} diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs new file mode 100644 index 0000000000..5aa4921a92 --- /dev/null +++ b/safekeeper/src/timeline_eviction.rs @@ -0,0 +1,395 @@ +//! Code related to evicting WAL files to remote storage. +//! +//! The actual upload is done by the partial WAL backup code. This file has +//! code to delete and re-download WAL files, cross-validate with partial WAL +//! backup if local file is still present. + +use anyhow::Context; +use camino::Utf8PathBuf; +use remote_storage::RemotePath; +use tokio::{ + fs::File, + io::{AsyncRead, AsyncWriteExt}, +}; +use tracing::{debug, info, instrument, warn}; +use utils::crashsafe::durable_rename; + +use crate::{ + metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED}, + rate_limit::rand_duration, + timeline_manager::{Manager, StateSnapshot}, + wal_backup, + wal_backup_partial::{self, PartialRemoteSegment}, + wal_storage::wal_file_paths, +}; + +impl Manager { + /// Returns true if the timeline is ready for eviction. 
+ /// Current criteria: + /// - no active tasks + /// - control file is flushed (no next event scheduled) + /// - no WAL residence guards + /// - no pushes to the broker + /// - last partial WAL segment is uploaded + /// - all local segments before the uploaded partial are committed and uploaded + pub(crate) fn ready_for_eviction( + &self, + next_event: &Option, + state: &StateSnapshot, + ) -> bool { + let ready = self.backup_task.is_none() + && self.recovery_task.is_none() + && self.wal_removal_task.is_none() + && self.partial_backup_task.is_none() + && next_event.is_none() + && self.access_service.is_empty() + && !self.tli_broker_active.get() + // Partial segment of current flush_lsn is uploaded up to this flush_lsn. + && !wal_backup_partial::needs_uploading(state, &self.partial_backup_uploaded) + // And it is the next one after the last removed. Given that local + // WAL is removed only after it is uploaded to s3 (and pageserver + // advancing remote_consistent_lsn) which happens only after WAL is + // committed, true means all this is done. + // + // This also works for the first segment despite last_removed_segno + // being 0 on init because this 0 triggers run of wal_removal_task + // on success of which manager updates the horizon. + && self + .partial_backup_uploaded + .as_ref() + .unwrap() + .flush_lsn + .segment_number(self.wal_seg_size) + == self.last_removed_segno + 1; + ready + } + + /// Evict the timeline to remote storage. + #[instrument(name = "evict_timeline", skip_all)] + pub(crate) async fn evict_timeline(&mut self) { + assert!(!self.is_offloaded); + let partial_backup_uploaded = match &self.partial_backup_uploaded { + Some(p) => p.clone(), + None => { + warn!("no partial backup uploaded, skipping eviction"); + return; + } + }; + + info!("starting eviction, using {:?}", partial_backup_uploaded); + + EVICTION_EVENTS_STARTED + .with_label_values(&[EvictionEvent::Evict.into()]) + .inc(); + let _guard = scopeguard::guard((), |_| { + EVICTION_EVENTS_COMPLETED + .with_label_values(&[EvictionEvent::Evict.into()]) + .inc(); + }); + + if let Err(e) = do_eviction(self, &partial_backup_uploaded).await { + warn!("failed to evict timeline: {:?}", e); + return; + } + + info!("successfully evicted timeline"); + } + + /// Attempt to restore evicted timeline from remote storage; it must be + /// offloaded. + #[instrument(name = "unevict_timeline", skip_all)] + pub(crate) async fn unevict_timeline(&mut self) { + assert!(self.is_offloaded); + let partial_backup_uploaded = match &self.partial_backup_uploaded { + Some(p) => p.clone(), + None => { + warn!("no partial backup uploaded, cannot unevict"); + return; + } + }; + + info!("starting uneviction, using {:?}", partial_backup_uploaded); + + EVICTION_EVENTS_STARTED + .with_label_values(&[EvictionEvent::Restore.into()]) + .inc(); + let _guard = scopeguard::guard((), |_| { + EVICTION_EVENTS_COMPLETED + .with_label_values(&[EvictionEvent::Restore.into()]) + .inc(); + }); + + if let Err(e) = do_uneviction(self, &partial_backup_uploaded).await { + warn!("failed to unevict timeline: {:?}", e); + return; + } + + self.evict_not_before = + tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident); + + info!("successfully restored evicted timeline"); + } +} + +/// Ensure that content matches the remote partial backup, if local segment exists. +/// Then change state in control file and in-memory. If `delete_offloaded_wal` is set, +/// delete the local segment. 
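Both eviction paths above bracket their work with paired `EVICTION_EVENTS_STARTED`/`EVICTION_EVENTS_COMPLETED` counters, relying on `scopeguard` so the completion counter is bumped on every exit path, including early returns. A minimal sketch of that pattern with `println!` standing in for the metrics:

```rust
fn evict() -> Result<(), String> {
    println!("eviction started");
    // The closure runs when `_guard` is dropped, on any exit path.
    let _guard = scopeguard::guard((), |_| {
        println!("eviction completed (success or not)");
    });

    do_fallible_step()?; // early return via `?` still triggers the guard
    Ok(())
}

fn do_fallible_step() -> Result<(), String> {
    Err("disk error".into())
}

fn main() {
    let _ = evict(); // prints both "started" and "completed"
}
```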
+async fn do_eviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> { + compare_local_segment_with_remote(mgr, partial).await?; + + mgr.tli.switch_to_offloaded(partial).await?; + // switch manager state as soon as possible + mgr.is_offloaded = true; + + if mgr.conf.delete_offloaded_wal { + delete_local_segment(mgr, partial).await?; + } + + Ok(()) +} + +/// Ensure that content matches the remote partial backup, if local segment exists. +/// Then download segment to local disk and change state in control file and in-memory. +async fn do_uneviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> { + // if the local segment is present, validate it + compare_local_segment_with_remote(mgr, partial).await?; + + // atomically download the partial segment + redownload_partial_segment(mgr, partial).await?; + + mgr.tli.switch_to_present().await?; + // switch manager state as soon as possible + mgr.is_offloaded = false; + + Ok(()) +} + +/// Delete local WAL segment. +async fn delete_local_segment(mgr: &Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> { + let local_path = local_segment_path(mgr, partial); + + info!("deleting WAL file to evict: {}", local_path); + tokio::fs::remove_file(&local_path).await?; + Ok(()) +} + +/// Redownload partial segment from remote storage. +/// The segment is downloaded to a temporary file and then renamed to the final path. +async fn redownload_partial_segment( + mgr: &Manager, + partial: &PartialRemoteSegment, +) -> anyhow::Result<()> { + let tmp_file = mgr.tli.timeline_dir().join("remote_partial.tmp"); + let remote_segfile = remote_segment_path(mgr, partial); + + debug!( + "redownloading partial segment: {} -> {}", + remote_segfile, tmp_file + ); + + let mut reader = wal_backup::read_object(&remote_segfile, 0).await?; + let mut file = File::create(&tmp_file).await?; + + let actual_len = tokio::io::copy(&mut reader, &mut file).await?; + let expected_len = partial.flush_lsn.segment_offset(mgr.wal_seg_size); + + if actual_len != expected_len as u64 { + anyhow::bail!( + "partial downloaded {} bytes, expected {}", + actual_len, + expected_len + ); + } + + if actual_len > mgr.wal_seg_size as u64 { + anyhow::bail!( + "remote segment is too long: {} bytes, expected {}", + actual_len, + mgr.wal_seg_size + ); + } + file.set_len(mgr.wal_seg_size as u64).await?; + file.flush().await?; + + let final_path = local_segment_path(mgr, partial); + info!("downloaded {actual_len} bytes, renaming to {final_path}"); + if let Err(e) = durable_rename(&tmp_file, &final_path, !mgr.conf.no_sync).await { + // Probably rename succeeded, but fsync of it failed. Remove + // the file then to avoid using it. + tokio::fs::remove_file(tmp_file) + .await + .or_else(utils::fs_ext::ignore_not_found)?; + return Err(e.into()); + } + + Ok(()) +} + +/// Compare local WAL segment with partial WAL backup in remote storage. +/// If the local segment is not present, the function does nothing. +/// If the local segment is present, it compares the local segment with the remote one. 
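`redownload_partial_segment` above never writes the segment under its final name directly: the bytes go to `remote_partial.tmp` first and are renamed into place only once fully written and sized, so a crash mid-download cannot leave a truncated file that looks like a valid segment. A simplified sketch of the same download-then-rename shape (plain `rename` instead of Neon's `durable_rename`, no directory fsync):

```rust
use tokio::io::AsyncWriteExt;

async fn atomic_download(
    mut src: impl tokio::io::AsyncRead + Unpin,
    final_path: &std::path::Path,
) -> anyhow::Result<()> {
    let tmp = final_path.with_extension("tmp");

    // Stream the remote object into a temp file next to the final path.
    let mut file = tokio::fs::File::create(&tmp).await?;
    tokio::io::copy(&mut src, &mut file).await?;
    file.flush().await?;

    // Point of atomicity: readers either see the old state or the full file.
    tokio::fs::rename(&tmp, final_path).await?;
    Ok(())
}
```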
+async fn compare_local_segment_with_remote(
+    mgr: &Manager,
+    partial: &PartialRemoteSegment,
+) -> anyhow::Result<()> {
+    let local_path = local_segment_path(mgr, partial);
+
+    match File::open(&local_path).await {
+        Ok(mut local_file) => do_validation(mgr, &mut local_file, mgr.wal_seg_size, partial)
+            .await
+            .context("validation failed"),
+        Err(_) => {
+            info!(
+                "local WAL file {} is not present, skipping validation",
+                local_path
+            );
+            Ok(())
+        }
+    }
+}
+
+/// Compare opened local WAL segment with partial WAL backup in remote storage.
+/// Validate full content of both files.
+async fn do_validation(
+    mgr: &Manager,
+    file: &mut File,
+    wal_seg_size: usize,
+    partial: &PartialRemoteSegment,
+) -> anyhow::Result<()> {
+    let local_size = file.metadata().await?.len() as usize;
+    if local_size != wal_seg_size {
+        anyhow::bail!(
+            "local segment size is invalid: found {}, expected {}",
+            local_size,
+            wal_seg_size
+        );
+    }
+
+    let remote_segfile = remote_segment_path(mgr, partial);
+    let mut remote_reader: std::pin::Pin<Box<dyn AsyncRead + Send + Sync>> =
+        wal_backup::read_object(&remote_segfile, 0).await?;
+
+    // remote segment should have bytes exactly up to `flush_lsn`
+    let expected_remote_size = partial.flush_lsn.segment_offset(mgr.wal_seg_size);
+    // let's compare the first `expected_remote_size` bytes
+    compare_n_bytes(&mut remote_reader, file, expected_remote_size).await?;
+    // and check that the remote segment ends here
+    check_end(&mut remote_reader).await?;
+
+    // if local segment is longer, the rest should be zeroes
+    read_n_zeroes(file, mgr.wal_seg_size - expected_remote_size).await?;
+    // and check that the local segment ends here
+    check_end(file).await?;
+
+    Ok(())
+}
+
+fn local_segment_path(mgr: &Manager, partial: &PartialRemoteSegment) -> Utf8PathBuf {
+    let flush_lsn = partial.flush_lsn;
+    let segno = flush_lsn.segment_number(mgr.wal_seg_size);
+    let (_, local_partial_segfile) =
+        wal_file_paths(mgr.tli.timeline_dir(), segno, mgr.wal_seg_size);
+    local_partial_segfile
+}
+
+fn remote_segment_path(mgr: &Manager, partial: &PartialRemoteSegment) -> RemotePath {
+    partial.remote_path(&mgr.tli.remote_path)
+}
+
+/// Compare first `n` bytes of two readers. If the bytes differ, return an error.
+/// If the readers are shorter than `n`, return an error.
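The validation in `do_validation` above is driven by plain segment arithmetic: the remote partial object holds exactly `flush_lsn % wal_seg_size` bytes, while the local file is always a full segment, zero-padded past that point. A worked example with the usual 16 MiB segment size:

```rust
fn main() {
    let wal_seg_size: u64 = 16 * 1024 * 1024; // 16 MiB, the common default
    let flush_lsn: u64 = 3 * wal_seg_size + 5_000; // 5000 bytes into segment 3

    let segno = flush_lsn / wal_seg_size; // what segment_number() computes
    let expected_remote_size = flush_lsn % wal_seg_size; // segment_offset()

    assert_eq!(segno, 3);
    assert_eq!(expected_remote_size, 5_000);
    // Bytes [0, 5000) of the local file must match the remote object;
    // bytes [5000, 16 MiB) must be all zeroes.
}
```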
+async fn compare_n_bytes<R1, R2>(reader1: &mut R1, reader2: &mut R2, n: usize) -> anyhow::Result<()>
+where
+    R1: AsyncRead + Unpin,
+    R2: AsyncRead + Unpin,
+{
+    use tokio::io::AsyncReadExt;
+
+    const BUF_SIZE: usize = 32 * 1024;
+
+    let mut buffer1 = vec![0u8; BUF_SIZE];
+    let mut buffer2 = vec![0u8; BUF_SIZE];
+
+    let mut offset = 0;
+
+    while offset < n {
+        let bytes_to_read = std::cmp::min(BUF_SIZE, n - offset);
+
+        let bytes_read1 = reader1
+            .read(&mut buffer1[..bytes_to_read])
+            .await
+            .with_context(|| format!("failed to read from reader1 at offset {}", offset))?;
+        if bytes_read1 == 0 {
+            anyhow::bail!("unexpected EOF from reader1 at offset {}", offset);
+        }
+
+        let bytes_read2 = reader2
+            .read_exact(&mut buffer2[..bytes_read1])
+            .await
+            .with_context(|| {
+                format!(
+                    "failed to read {} bytes from reader2 at offset {}",
+                    bytes_read1, offset
+                )
+            })?;
+        assert!(bytes_read2 == bytes_read1);
+
+        if buffer1[..bytes_read1] != buffer2[..bytes_read2] {
+            let diff_offset = buffer1[..bytes_read1]
+                .iter()
+                .zip(buffer2[..bytes_read2].iter())
+                .position(|(a, b)| a != b)
+                .expect("mismatched buffers, but no difference found");
+            anyhow::bail!("mismatch at offset {}", offset + diff_offset);
+        }
+
+        offset += bytes_read1;
+    }
+
+    Ok(())
+}
+
+async fn check_end<R>(mut reader: R) -> anyhow::Result<()>
+where
+    R: AsyncRead + Unpin,
+{
+    use tokio::io::AsyncReadExt;
+
+    let mut buffer = [0u8; 1];
+    let bytes_read = reader.read(&mut buffer).await?;
+    if bytes_read != 0 {
+        anyhow::bail!("expected EOF, found bytes");
+    }
+    Ok(())
+}
+
+async fn read_n_zeroes<R>(reader: &mut R, n: usize) -> anyhow::Result<()>
+where
+    R: AsyncRead + Unpin,
+{
+    use tokio::io::AsyncReadExt;
+
+    const BUF_SIZE: usize = 32 * 1024;
+    let mut buffer = vec![0u8; BUF_SIZE];
+    let mut offset = 0;
+
+    while offset < n {
+        let bytes_to_read = std::cmp::min(BUF_SIZE, n - offset);
+
+        let bytes_read = reader
+            .read(&mut buffer[..bytes_to_read])
+            .await
+            .context("expected zeroes, got read error")?;
+        if bytes_read == 0 {
+            anyhow::bail!("expected zeroes, got EOF");
+        }
+
+        if buffer[..bytes_read].iter().all(|&b| b == 0) {
+            offset += bytes_read;
+        } else {
+            anyhow::bail!("non-zero byte found");
+        }
+    }
+
+    Ok(())
+}
diff --git a/safekeeper/src/timeline_guard.rs b/safekeeper/src/timeline_guard.rs
new file mode 100644
index 0000000000..1ddac573d2
--- /dev/null
+++ b/safekeeper/src/timeline_guard.rs
@@ -0,0 +1,73 @@
+//! Timeline residence guard
+//!
+//! It is needed to ensure that WAL segments are present on disk
+//! as long as the code is holding the guard. This file implements the guard
+//! logic: issuing and dropping guards, and notifying the manager when a guard
+//! is dropped.

+use std::collections::HashSet;
+
+use tracing::debug;
+
+use crate::timeline_manager::ManagerCtlMessage;
+
+#[derive(Debug, Clone, Copy)]
+pub struct GuardId(u64);
+
+pub struct ResidenceGuard {
+    manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
+    guard_id: GuardId,
+}
+
+impl Drop for ResidenceGuard {
+    fn drop(&mut self) {
+        // notify the manager that the guard is dropped
+        let res = self
+            .manager_tx
+            .send(ManagerCtlMessage::GuardDrop(self.guard_id));
+        if let Err(e) = res {
+            debug!("failed to send GuardDrop message: {:?}", e);
+        }
+    }
+}
+
+/// AccessService is responsible for issuing and dropping residence guards.
+/// All guards are stored in the `guards` set.
+/// TODO: it's possible to add a `String` name to each guard, for better observability.
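The `ResidenceGuard` above is a drop-notification guard: it carries a channel back to its issuer, so simply dropping the guard tells the manager that one more user of the on-disk WAL is gone. A self-contained sketch of the idea with a bare `u64` id in place of `GuardId`:

```rust
use tokio::sync::mpsc;

struct Guard {
    tx: mpsc::UnboundedSender<u64>,
    id: u64,
}

impl Drop for Guard {
    fn drop(&mut self) {
        // Best effort: the receiving side may already be gone during shutdown.
        let _ = self.tx.send(self.id);
    }
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::unbounded_channel();
    {
        let _g = Guard { tx: tx.clone(), id: 7 };
        // guard dropped here, at end of scope
    }
    // The issuer learns about the drop without any explicit "release" call.
    assert_eq!(rx.recv().await, Some(7));
}
```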
+pub(crate) struct AccessService {
+    next_guard_id: u64,
+    guards: HashSet<u64>,
+    manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
+}
+
+impl AccessService {
+    pub(crate) fn new(manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>) -> Self {
+        Self {
+            next_guard_id: 0,
+            guards: HashSet::new(),
+            manager_tx,
+        }
+    }
+
+    pub(crate) fn is_empty(&self) -> bool {
+        self.guards.is_empty()
+    }
+
+    pub(crate) fn create_guard(&mut self) -> ResidenceGuard {
+        let guard_id = self.next_guard_id;
+        self.next_guard_id += 1;
+        self.guards.insert(guard_id);
+
+        let guard_id = GuardId(guard_id);
+        debug!("issued a new guard {:?}", guard_id);
+
+        ResidenceGuard {
+            manager_tx: self.manager_tx.clone(),
+            guard_id,
+        }
+    }
+
+    pub(crate) fn drop_guard(&mut self, guard_id: GuardId) {
+        debug!("dropping guard {:?}", guard_id);
+        assert!(self.guards.remove(&guard_id.0));
+    }
+}
diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs
new file mode 100644
index 0000000000..6be75479db
--- /dev/null
+++ b/safekeeper/src/timeline_manager.rs
@@ -0,0 +1,763 @@
+//! The timeline manager task is responsible for managing the timeline's background tasks.
+//!
+//! It is spawned alongside each timeline and exits when the timeline is deleted.
+//! It watches for changes in the timeline state and decides when to spawn or kill background tasks.
+//! It can also manage some reactive state, such as whether the timeline should be active for broker pushes.
+//!
+//! Be aware that you need to be extra careful with manager code, because it is not respawned on panic.
+//! Also, if it gets stuck in some branch, it will prevent any further progress in the timeline.
+
+use std::{
+    sync::{atomic::AtomicUsize, Arc},
+    time::Duration,
+};
+
+use futures::channel::oneshot;
+use postgres_ffi::XLogSegNo;
+use serde::{Deserialize, Serialize};
+use tokio::{
+    task::{JoinError, JoinHandle},
+    time::Instant,
+};
+use tokio_util::sync::CancellationToken;
+use tracing::{debug, info, info_span, instrument, warn, Instrument};
+use utils::lsn::Lsn;
+
+use crate::{
+    control_file::{FileStorage, Storage},
+    metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS},
+    rate_limit::{rand_duration, RateLimiter},
+    recovery::recovery_main,
+    remove_wal::calc_horizon_lsn,
+    safekeeper::Term,
+    send_wal::WalSenders,
+    state::TimelineState,
+    timeline::{ManagerTimeline, PeerInfo, ReadGuardSharedState, StateSK, WalResidentTimeline},
+    timeline_guard::{AccessService, GuardId, ResidenceGuard},
+    timelines_set::{TimelineSetGuard, TimelinesSet},
+    wal_backup::{self, WalBackupTaskHandle},
+    wal_backup_partial::{self, PartialBackup, PartialRemoteSegment},
+    SafeKeeperConf,
+};
+
+pub(crate) struct StateSnapshot {
+    // inmem values
+    pub(crate) commit_lsn: Lsn,
+    pub(crate) backup_lsn: Lsn,
+    pub(crate) remote_consistent_lsn: Lsn,
+
+    // persistent control file values
+    pub(crate) cfile_peer_horizon_lsn: Lsn,
+    pub(crate) cfile_remote_consistent_lsn: Lsn,
+    pub(crate) cfile_backup_lsn: Lsn,
+
+    // latest state
+    pub(crate) flush_lsn: Lsn,
+    pub(crate) last_log_term: Term,
+
+    // misc
+    pub(crate) cfile_last_persist_at: std::time::Instant,
+    pub(crate) inmem_flush_pending: bool,
+    pub(crate) wal_removal_on_hold: bool,
+    pub(crate) peers: Vec<PeerInfo>,
+}
+
+impl StateSnapshot {
+    /// Create a new snapshot of the timeline state.
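`StateSnapshot` above exists so the manager can copy out the handful of fields it needs while briefly holding the shared-state read lock, then make all decisions on the owned copy with the lock released. A minimal sketch of that snapshot-under-read-lock shape:

```rust
use std::sync::Arc;
use tokio::sync::RwLock;

struct Shared {
    commit_lsn: u64,
    flush_lsn: u64,
}

struct Snapshot {
    commit_lsn: u64,
    flush_lsn: u64,
}

async fn snapshot(shared: &Arc<RwLock<Shared>>) -> Snapshot {
    // The read lock is held only for the duration of these plain copies.
    let guard = shared.read().await;
    Snapshot {
        commit_lsn: guard.commit_lsn,
        flush_lsn: guard.flush_lsn,
    }
    // guard dropped here; slow decisions and I/O run on the owned Snapshot
}
```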
+    fn new(read_guard: ReadGuardSharedState, heartbeat_timeout: Duration) -> Self {
+        let state = read_guard.sk.state();
+        Self {
+            commit_lsn: state.inmem.commit_lsn,
+            backup_lsn: state.inmem.backup_lsn,
+            remote_consistent_lsn: state.inmem.remote_consistent_lsn,
+            cfile_peer_horizon_lsn: state.peer_horizon_lsn,
+            cfile_remote_consistent_lsn: state.remote_consistent_lsn,
+            cfile_backup_lsn: state.backup_lsn,
+            flush_lsn: read_guard.sk.flush_lsn(),
+            last_log_term: read_guard.sk.last_log_term(),
+            cfile_last_persist_at: state.pers.last_persist_at(),
+            inmem_flush_pending: Self::has_unflushed_inmem_state(state),
+            wal_removal_on_hold: read_guard.wal_removal_on_hold,
+            peers: read_guard.get_peers(heartbeat_timeout),
+        }
+    }
+
+    fn has_unflushed_inmem_state(state: &TimelineState<FileStorage>) -> bool {
+        state.inmem.commit_lsn > state.commit_lsn
+            || state.inmem.backup_lsn > state.backup_lsn
+            || state.inmem.peer_horizon_lsn > state.peer_horizon_lsn
+            || state.inmem.remote_consistent_lsn > state.remote_consistent_lsn
+    }
+}
+
+/// Control how often the manager task should wake up to check updates.
+/// There is no need to check for updates more often than this.
+const REFRESH_INTERVAL: Duration = Duration::from_millis(300);
+
+pub enum ManagerCtlMessage {
+    /// Request to get a guard for WalResidentTimeline, with WAL files available locally.
+    GuardRequest(tokio::sync::oneshot::Sender<anyhow::Result<ResidenceGuard>>),
+    /// Request to drop the guard.
+    GuardDrop(GuardId),
+    /// Request to reset uploaded partial backup state.
+    BackupPartialReset(oneshot::Sender<anyhow::Result<Vec<String>>>),
+}
+
+impl std::fmt::Debug for ManagerCtlMessage {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"),
+            ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id),
+            ManagerCtlMessage::BackupPartialReset(_) => write!(f, "BackupPartialReset"),
+        }
+    }
+}
+
+pub struct ManagerCtl {
+    manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
+
+    // this is used to initialize manager, it will be moved out in bootstrap().
+    init_manager_rx:
+        std::sync::Mutex<Option<tokio::sync::mpsc::UnboundedReceiver<ManagerCtlMessage>>>,
+}
+
+impl Default for ManagerCtl {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ManagerCtl {
+    pub fn new() -> Self {
+        let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
+        Self {
+            manager_tx: tx,
+            init_manager_rx: std::sync::Mutex::new(Some(rx)),
+        }
+    }
+
+    /// Issue a new guard and wait for manager to prepare the timeline.
+    /// Sends a message to the manager and waits for the response.
+    /// Can be blocked indefinitely if the manager is stuck.
+    pub async fn wal_residence_guard(&self) -> anyhow::Result<ResidenceGuard> {
+        let (tx, rx) = tokio::sync::oneshot::channel();
+        self.manager_tx.send(ManagerCtlMessage::GuardRequest(tx))?;
+
+        // wait for the manager to respond with the guard
+        rx.await
+            .map_err(|e| anyhow::anyhow!("response read fail: {:?}", e))
+            .and_then(std::convert::identity)
+    }
+
+    /// Request timeline manager to reset uploaded partial segment state and
+    /// wait for the result.
+    pub async fn backup_partial_reset(&self) -> anyhow::Result<Vec<String>> {
+        let (tx, rx) = oneshot::channel();
+        self.manager_tx
+            .send(ManagerCtlMessage::BackupPartialReset(tx))
+            .expect("manager task is not running");
+        match rx.await {
+            Ok(res) => res,
+            Err(_) => anyhow::bail!("timeline manager is gone"),
+        }
+    }
+
+    /// Must be called exactly once to bootstrap the manager.
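`wal_residence_guard` and `backup_partial_reset` above both use the classic actor request/response shape: the request message carries a `oneshot` sender, the manager task answers through it, and the caller awaits the reply. A self-contained sketch:

```rust
use tokio::sync::{mpsc, oneshot};

enum Msg {
    // The request carries the channel on which the reply is expected.
    Ping(oneshot::Sender<String>),
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::unbounded_channel::<Msg>();

    // The "manager": owns the receiver and answers requests in a loop.
    tokio::spawn(async move {
        while let Some(Msg::Ping(reply)) = rx.recv().await {
            let _ = reply.send("pong".to_string());
        }
    });

    // The caller: sends a request and awaits the oneshot reply.
    let (reply_tx, reply_rx) = oneshot::channel();
    tx.send(Msg::Ping(reply_tx)).unwrap();
    assert_eq!(reply_rx.await.unwrap(), "pong");
}
```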
+    pub fn bootstrap_manager(
+        &self,
+    ) -> (
+        tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
+        tokio::sync::mpsc::UnboundedReceiver<ManagerCtlMessage>,
+    ) {
+        let rx = self
+            .init_manager_rx
+            .lock()
+            .expect("mutex init_manager_rx poisoned")
+            .take()
+            .expect("manager already bootstrapped");
+
+        (self.manager_tx.clone(), rx)
+    }
+}
+
+pub(crate) struct Manager {
+    // configuration & dependencies
+    pub(crate) tli: ManagerTimeline,
+    pub(crate) conf: SafeKeeperConf,
+    pub(crate) wal_seg_size: usize,
+    pub(crate) walsenders: Arc<WalSenders>,
+
+    // current state
+    pub(crate) state_version_rx: tokio::sync::watch::Receiver<usize>,
+    pub(crate) num_computes_rx: tokio::sync::watch::Receiver<usize>,
+    pub(crate) tli_broker_active: TimelineSetGuard,
+    pub(crate) last_removed_segno: XLogSegNo,
+    pub(crate) is_offloaded: bool,
+
+    // background tasks
+    pub(crate) backup_task: Option<WalBackupTaskHandle>,
+    pub(crate) recovery_task: Option<JoinHandle<()>>,
+    pub(crate) wal_removal_task: Option<JoinHandle<anyhow::Result<XLogSegNo>>>,
+
+    // partial backup
+    pub(crate) partial_backup_task:
+        Option<(JoinHandle<Option<PartialRemoteSegment>>, CancellationToken)>,
+    pub(crate) partial_backup_uploaded: Option<PartialRemoteSegment>,
+
+    // misc
+    pub(crate) access_service: AccessService,
+    pub(crate) global_rate_limiter: RateLimiter,
+
+    // Anti-flapping state: we evict timelines eagerly if they are inactive, but should not
+    // evict them if they go inactive very soon after being restored.
+    pub(crate) evict_not_before: Instant,
+}
+
+/// This task gets spawned alongside each timeline and is responsible for managing the timeline's
+/// background tasks.
+/// Be careful, this task is not respawned on panic, so it should not panic.
+#[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))]
+pub async fn main_task(
+    tli: ManagerTimeline,
+    conf: SafeKeeperConf,
+    broker_active_set: Arc<TimelinesSet>,
+    manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
+    mut manager_rx: tokio::sync::mpsc::UnboundedReceiver<ManagerCtlMessage>,
+    global_rate_limiter: RateLimiter,
+) {
+    tli.set_status(Status::Started);
+
+    let defer_tli = tli.tli.clone();
+    scopeguard::defer! {
+        if defer_tli.is_cancelled() {
+            info!("manager task finished");
+        } else {
+            warn!("manager task finished prematurely");
+        }
+    };
+
+    let mut mgr = Manager::new(
+        tli,
+        conf,
+        broker_active_set,
+        manager_tx,
+        global_rate_limiter,
+    )
+    .await;
+
+    // Start recovery task which always runs on the timeline.
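+    // (more precisely: only while the timeline is resident and
+    // `peer_recovery_enabled` is set in the configuration)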
+ if !mgr.is_offloaded && mgr.conf.peer_recovery_enabled { + let tli = mgr.wal_resident_timeline(); + mgr.recovery_task = Some(tokio::spawn(recovery_main(tli, mgr.conf.clone()))); + } + + let last_state = 'outer: loop { + MANAGER_ITERATIONS_TOTAL.inc(); + + mgr.set_status(Status::StateSnapshot); + let state_snapshot = mgr.state_snapshot().await; + + let mut next_event: Option = None; + if !mgr.is_offloaded { + let num_computes = *mgr.num_computes_rx.borrow(); + + mgr.set_status(Status::UpdateBackup); + let is_wal_backup_required = mgr.update_backup(num_computes, &state_snapshot).await; + mgr.update_is_active(is_wal_backup_required, num_computes, &state_snapshot); + + mgr.set_status(Status::UpdateControlFile); + mgr.update_control_file_save(&state_snapshot, &mut next_event) + .await; + + mgr.set_status(Status::UpdateWalRemoval); + mgr.update_wal_removal(&state_snapshot).await; + + mgr.set_status(Status::UpdatePartialBackup); + mgr.update_partial_backup(&state_snapshot).await; + + let now = Instant::now(); + if mgr.evict_not_before > now { + // we should wait until evict_not_before + update_next_event(&mut next_event, mgr.evict_not_before); + } + + if mgr.conf.enable_offload + && mgr.evict_not_before <= now + && mgr.ready_for_eviction(&next_event, &state_snapshot) + { + // check rate limiter and evict timeline if possible + match mgr.global_rate_limiter.try_acquire_eviction() { + Some(_permit) => { + mgr.set_status(Status::EvictTimeline); + mgr.evict_timeline().await; + } + None => { + // we can't evict timeline now, will try again later + mgr.evict_not_before = + Instant::now() + rand_duration(&mgr.conf.eviction_min_resident); + update_next_event(&mut next_event, mgr.evict_not_before); + } + } + } + } + + mgr.set_status(Status::Wait); + // wait until something changes. tx channels are stored under Arc, so they will not be + // dropped until the manager task is finished. + tokio::select! { + _ = mgr.tli.cancel.cancelled() => { + // timeline was deleted + break 'outer state_snapshot; + } + _ = async { + // don't wake up on every state change, but at most every REFRESH_INTERVAL + tokio::time::sleep(REFRESH_INTERVAL).await; + let _ = mgr.state_version_rx.changed().await; + } => { + // state was updated + } + _ = mgr.num_computes_rx.changed() => { + // number of connected computes was updated + } + _ = sleep_until(&next_event) => { + // we were waiting for some event (e.g. 
cfile save) + } + res = await_task_finish(mgr.wal_removal_task.as_mut()) => { + // WAL removal task finished + mgr.wal_removal_task = None; + mgr.update_wal_removal_end(res); + } + res = await_task_finish(mgr.partial_backup_task.as_mut().map(|(handle, _)| handle)) => { + // partial backup task finished + mgr.partial_backup_task = None; + mgr.update_partial_backup_end(res); + } + + msg = manager_rx.recv() => { + mgr.set_status(Status::HandleMessage); + mgr.handle_message(msg).await; + } + } + }; + mgr.set_status(Status::Exiting); + + // remove timeline from the broker active set sooner, before waiting for background tasks + mgr.tli_broker_active.set(false); + + // shutdown background tasks + if mgr.conf.is_wal_backup_enabled() { + wal_backup::update_task(&mut mgr, false, &last_state).await; + } + + if let Some(recovery_task) = &mut mgr.recovery_task { + if let Err(e) = recovery_task.await { + warn!("recovery task failed: {:?}", e); + } + } + + if let Some((handle, cancel)) = &mut mgr.partial_backup_task { + cancel.cancel(); + if let Err(e) = handle.await { + warn!("partial backup task failed: {:?}", e); + } + } + + if let Some(wal_removal_task) = &mut mgr.wal_removal_task { + let res = wal_removal_task.await; + mgr.update_wal_removal_end(res); + } + + mgr.set_status(Status::Finished); +} + +impl Manager { + async fn new( + tli: ManagerTimeline, + conf: SafeKeeperConf, + broker_active_set: Arc, + manager_tx: tokio::sync::mpsc::UnboundedSender, + global_rate_limiter: RateLimiter, + ) -> Manager { + let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await; + Manager { + wal_seg_size: tli.get_wal_seg_size().await, + walsenders: tli.get_walsenders().clone(), + state_version_rx: tli.get_state_version_rx(), + num_computes_rx: tli.get_walreceivers().get_num_rx(), + tli_broker_active: broker_active_set.guard(tli.clone()), + last_removed_segno: 0, + is_offloaded, + backup_task: None, + recovery_task: None, + wal_removal_task: None, + partial_backup_task: None, + partial_backup_uploaded, + access_service: AccessService::new(manager_tx), + tli, + global_rate_limiter, + // to smooth out evictions spike after restart + evict_not_before: Instant::now() + rand_duration(&conf.eviction_min_resident), + conf, + } + } + + fn set_status(&self, status: Status) { + self.tli.set_status(status); + } + + /// Get a WalResidentTimeline. + /// Manager code must use this function instead of one from `Timeline` + /// directly, because it will deadlock. + pub(crate) fn wal_resident_timeline(&mut self) -> WalResidentTimeline { + assert!(!self.is_offloaded); + let guard = self.access_service.create_guard(); + WalResidentTimeline::new(self.tli.clone(), guard) + } + + /// Get a snapshot of the timeline state. + async fn state_snapshot(&self) -> StateSnapshot { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["state_snapshot"]) + .start_timer(); + + StateSnapshot::new( + self.tli.read_shared_state().await, + self.conf.heartbeat_timeout, + ) + } + + /// Spawns/kills backup task and returns true if backup is required. 
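+    /// The task's presence is also mirrored into `Timeline::wal_backup_active`,
+    /// so other code can read it without taking any locks.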
+    async fn update_backup(&mut self, num_computes: usize, state: &StateSnapshot) -> bool {
+        let is_wal_backup_required =
+            wal_backup::is_wal_backup_required(self.wal_seg_size, num_computes, state);
+
+        if self.conf.is_wal_backup_enabled() {
+            wal_backup::update_task(self, is_wal_backup_required, state).await;
+        }
+
+        // update the state in Arc
+        self.tli.wal_backup_active.store(
+            self.backup_task.is_some(),
+            std::sync::atomic::Ordering::Relaxed,
+        );
+        is_wal_backup_required
+    }
+
+    /// Update the is_active flag and propagate it to the broker timeline set
+    /// and the shared `Timeline` state.
+    fn update_is_active(
+        &mut self,
+        is_wal_backup_required: bool,
+        num_computes: usize,
+        state: &StateSnapshot,
+    ) {
+        let is_active = is_wal_backup_required
+            || num_computes > 0
+            || state.remote_consistent_lsn < state.commit_lsn;
+
+        // update the broker timeline set
+        if self.tli_broker_active.set(is_active) {
+            // write log if state has changed
+            info!(
+                "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}",
+                is_active, state.remote_consistent_lsn, state.commit_lsn,
+            );
+
+            MANAGER_ACTIVE_CHANGES.inc();
+        }
+
+        // update the state in Arc
+        self.tli
+            .broker_active
+            .store(is_active, std::sync::atomic::Ordering::Relaxed);
+    }
+
+    /// Save the control file if it's time to do so; otherwise set `next_event`
+    /// to the time when the next save should happen.
+    async fn update_control_file_save(
+        &self,
+        state: &StateSnapshot,
+        next_event: &mut Option<Instant>,
+    ) {
+        if !state.inmem_flush_pending {
+            return;
+        }
+
+        if state.cfile_last_persist_at.elapsed() > self.conf.control_file_save_interval {
+            let mut write_guard = self.tli.write_shared_state().await;
+            // Ideally this would happen in the background, because it blocks the
+            // manager task, but flush() should be fast enough not to be a problem now.
+            if let Err(e) = write_guard.sk.state_mut().flush().await {
+                warn!("failed to save control file: {:?}", e);
+            }
+        } else {
+            // wait until enough time has passed for the next save
+            update_next_event(
+                next_event,
+                (state.cfile_last_persist_at + self.conf.control_file_save_interval).into(),
+            );
+        }
+    }
+
+    /// Spawns WAL removal task if needed.
+    async fn update_wal_removal(&mut self, state: &StateSnapshot) {
+        if self.wal_removal_task.is_some() || state.wal_removal_on_hold {
+            // WAL removal is already in progress or on hold
+            return;
+        }
+
+        // If enabled, we use the LSN of the most lagging walsender as the WAL removal horizon.
+        // This allows lagging pageservers to get better read speed,
+        // at the cost of keeping more WAL on disk.
+        let replication_horizon_lsn = if self.conf.walsenders_keep_horizon {
+            self.walsenders.laggard_lsn()
+        } else {
+            None
+        };
+
+        let removal_horizon_lsn = calc_horizon_lsn(state, replication_horizon_lsn);
+        let removal_horizon_segno = removal_horizon_lsn
+            .segment_number(self.wal_seg_size)
+            .saturating_sub(1);
+
+        if removal_horizon_segno > self.last_removed_segno {
+            // we need to remove WAL
+            let remover = match self.tli.read_shared_state().await.sk {
+                StateSK::Loaded(ref sk) => {
+                    crate::wal_storage::Storage::remove_up_to(&sk.wal_store, removal_horizon_segno)
+                }
+                StateSK::Offloaded(_) => {
+                    // we can't remove WAL if it's not loaded
+                    warn!("unexpectedly trying to run WAL removal on offloaded timeline");
+                    return;
+                }
+                StateSK::Empty => unreachable!(),
+            };
+
+            self.wal_removal_task = Some(tokio::spawn(
+                async move {
+                    remover.await?;
+                    Ok(removal_horizon_segno)
+                }
+                .instrument(info_span!("WAL removal", ttid=%self.tli.ttid)),
+            ));
+        }
+    }
+
+    /// Update the state after WAL removal task finished.
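+    /// On success advances `last_removed_segno` (also mirrored into the
+    /// `Timeline` atomic); task failures and panics are logged and otherwise
+    /// ignored.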
+    fn update_wal_removal_end(&mut self, res: Result<anyhow::Result<XLogSegNo>, JoinError>) {
+        let new_last_removed_segno = match res {
+            Ok(Ok(segno)) => segno,
+            Err(e) => {
+                warn!("WAL removal task failed: {:?}", e);
+                return;
+            }
+            Ok(Err(e)) => {
+                warn!("WAL removal task failed: {:?}", e);
+                return;
+            }
+        };
+
+        self.last_removed_segno = new_last_removed_segno;
+        // update the state in Arc
+        self.tli
+            .last_removed_segno
+            .store(new_last_removed_segno, std::sync::atomic::Ordering::Relaxed);
+    }
+
+    /// Spawns partial WAL backup task if needed.
+    async fn update_partial_backup(&mut self, state: &StateSnapshot) {
+        // check if WAL backup is enabled and should be started
+        if !self.conf.is_wal_backup_enabled() {
+            return;
+        }
+
+        if self.partial_backup_task.is_some() {
+            // partial backup is already running
+            return;
+        }
+
+        if !wal_backup_partial::needs_uploading(state, &self.partial_backup_uploaded) {
+            // nothing to upload
+            return;
+        }
+
+        // Get WalResidentTimeline and start partial backup task.
+        let cancel = CancellationToken::new();
+        let handle = tokio::spawn(wal_backup_partial::main_task(
+            self.wal_resident_timeline(),
+            self.conf.clone(),
+            self.global_rate_limiter.clone(),
+            cancel.clone(),
+        ));
+        self.partial_backup_task = Some((handle, cancel));
+    }
+
+    /// Update the state after partial WAL backup task finished.
+    fn update_partial_backup_end(&mut self, res: Result<Option<PartialRemoteSegment>, JoinError>) {
+        match res {
+            Ok(new_upload_state) => {
+                self.partial_backup_uploaded = new_upload_state;
+            }
+            Err(e) => {
+                warn!("partial backup task panicked: {:?}", e);
+            }
+        }
+    }
+
+    /// Reset partial backup state and remove its remote storage data. Since the
+    /// task might be concurrently uploading something, cancel it first.
+    async fn backup_partial_reset(&mut self) -> anyhow::Result<Vec<String>> {
+        info!("resetting partial backup state");
+        // Force unevict the timeline if it is evicted before erasing partial backup
+        // state. The intended use of this function is to drop corrupted remote
+        // state; we haven't enabled local file deletion anywhere yet,
+        // so the direct switch is safe.
+        if self.is_offloaded {
+            self.tli.switch_to_present().await?;
+            // switch manager state as soon as possible
+            self.is_offloaded = false;
+        }
+
+        if let Some((handle, cancel)) = &mut self.partial_backup_task {
+            cancel.cancel();
+            info!("cancelled partial backup task, awaiting it");
+            // we're going to reset .partial_backup_uploaded to None anyway, so ignore the result
+            handle.await.ok();
+            self.partial_backup_task = None;
+        }
+
+        let tli = self.wal_resident_timeline();
+        let mut partial_backup = PartialBackup::new(tli, self.conf.clone()).await;
+        // Reset might fail, e.g. when the cfile is already reset but s3 removal
+        // failed, so set the manager state to None beforehand. In any case the
+        // caller is expected to retry until success.
+        self.partial_backup_uploaded = None;
+        let res = partial_backup.reset().await?;
+        info!("reset is done");
+        Ok(res)
+    }
+
+    /// Handle a message that arrived from ManagerCtl.
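+    /// Guard requests trigger an unevict attempt first, because a
+    /// `ResidenceGuard` is only valid while WAL files are present locally.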
+    async fn handle_message(&mut self, msg: Option<ManagerCtlMessage>) {
+        debug!("received manager message: {:?}", msg);
+        match msg {
+            Some(ManagerCtlMessage::GuardRequest(tx)) => {
+                if self.is_offloaded {
+                    // try to unevict the timeline, without a guarantee that it will succeed
+                    self.unevict_timeline().await;
+                }
+
+                let guard = if self.is_offloaded {
+                    Err(anyhow::anyhow!("timeline is offloaded, can't get a guard"))
+                } else {
+                    Ok(self.access_service.create_guard())
+                };
+
+                if tx.send(guard).is_err() {
+                    warn!("failed to reply with a guard, receiver dropped");
+                }
+            }
+            Some(ManagerCtlMessage::GuardDrop(guard_id)) => {
+                self.access_service.drop_guard(guard_id);
+            }
+            Some(ManagerCtlMessage::BackupPartialReset(tx)) => {
+                info!("resetting uploaded partial backup state");
+                let res = self.backup_partial_reset().await;
+                if let Err(ref e) = res {
+                    warn!("failed to reset partial backup state: {:?}", e);
+                }
+                if tx.send(res).is_err() {
+                    warn!("failed to send partial backup reset result, receiver dropped");
+                }
+            }
+            None => {
+                // can't happen, we're holding the sender
+                unreachable!();
+            }
+        }
+    }
+}
+
+// utility functions
+async fn sleep_until(option: &Option<Instant>) {
+    if let Some(timeout) = option {
+        tokio::time::sleep_until(*timeout).await;
+    } else {
+        futures::future::pending::<()>().await;
+    }
+}
+
+/// Future that resolves when the task finishes, or never if the task is None.
+///
+/// Note: it accepts Option<&mut> instead of &mut Option<> because mapping the
+/// option to get the latter is hard.
+async fn await_task_finish<T>(option: Option<&mut JoinHandle<T>>) -> Result<T, JoinError> {
+    if let Some(task) = option {
+        task.await
+    } else {
+        futures::future::pending().await
+    }
+}
+
+/// Update next_event if candidate is earlier.
+fn update_next_event(next_event: &mut Option<Instant>, candidate: Instant) {
+    if let Some(next) = next_event {
+        if candidate < *next {
+            *next = candidate;
+        }
+    } else {
+        *next_event = Some(candidate);
+    }
+}
+
+#[repr(usize)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+pub enum Status {
+    NotStarted,
+    Started,
+    StateSnapshot,
+    UpdateBackup,
+    UpdateControlFile,
+    UpdateWalRemoval,
+    UpdatePartialBackup,
+    EvictTimeline,
+    Wait,
+    HandleMessage,
+    Exiting,
+    Finished,
+}
+
+/// AtomicStatus is a wrapper around AtomicUsize adapted for the Status enum.
+pub struct AtomicStatus {
+    inner: AtomicUsize,
+}
+
+impl Default for AtomicStatus {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl AtomicStatus {
+    pub fn new() -> Self {
+        AtomicStatus {
+            inner: AtomicUsize::new(Status::NotStarted as usize),
+        }
+    }
+
+    pub fn load(&self, order: std::sync::atomic::Ordering) -> Status {
+        // Safety: this uses `std::mem::transmute` to reinterpret the loaded value as `Status`.
+        // It is safe to use `transmute` in this context because `Status` is a repr(usize) enum,
+        // which means it has the same memory layout as usize.
+        // However, it is important to ensure that the loaded value is a valid variant of `Status`,
+        // otherwise the behavior will be undefined.
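+        // In this type the only writes go through `store`, which always casts
+        // from a valid `Status`, so the loaded value is a valid variant.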
+        unsafe { std::mem::transmute(self.inner.load(order)) }
+    }
+
+    pub fn get(&self) -> Status {
+        self.load(std::sync::atomic::Ordering::Relaxed)
+    }
+
+    pub fn store(&self, val: Status, order: std::sync::atomic::Ordering) {
+        self.inner.store(val as usize, order);
+    }
+}
diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs
index 079e706ff8..6662e18817 100644
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -2,8 +2,11 @@
 //! All timelines should always be present in this map, this is done by loading them
 //! all from the disk on startup and keeping them in memory.
 
+use crate::defaults::DEFAULT_EVICTION_CONCURRENCY;
+use crate::rate_limit::RateLimiter;
 use crate::safekeeper::ServerInfo;
-use crate::timeline::{Timeline, TimelineError};
+use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError};
+use crate::timelines_set::TimelinesSet;
 use crate::SafeKeeperConf;
 use anyhow::{bail, Context, Result};
 use camino::Utf8PathBuf;
@@ -11,17 +14,25 @@ use once_cell::sync::Lazy;
 use serde::Serialize;
 use std::collections::HashMap;
 use std::str::FromStr;
+use std::sync::atomic::Ordering;
 use std::sync::{Arc, Mutex};
-use tokio::sync::mpsc::Sender;
+use std::time::{Duration, Instant};
 use tracing::*;
 use utils::id::{TenantId, TenantTimelineId, TimelineId};
 use utils::lsn::Lsn;
 
 struct GlobalTimelinesState {
     timelines: HashMap<TenantTimelineId, Arc<Timeline>>,
-    wal_backup_launcher_tx: Option<Sender<TenantTimelineId>>,
+
+    // A tombstone indicates that this timeline used to exist and has been deleted. These are used to prevent
+    // on-demand timeline creation from recreating deleted timelines. This is only soft-enforced, as
+    // this map is dropped on restart.
+    tombstones: HashMap<TenantTimelineId, Instant>,
+
     conf: Option<SafeKeeperConf>,
+    broker_active_set: Arc<TimelinesSet>,
     load_lock: Arc<tokio::sync::Mutex<TimelineLoadLock>>,
+    global_rate_limiter: RateLimiter,
 }
 
 // Used to prevent concurrent timeline loading.
@@ -36,10 +47,11 @@ impl GlobalTimelinesState {
     }
 
     /// Get dependencies for a timeline constructor.
-    fn get_dependencies(&self) -> (SafeKeeperConf, Sender<TenantTimelineId>) {
+    fn get_dependencies(&self) -> (SafeKeeperConf, Arc<TimelinesSet>, RateLimiter) {
         (
             self.get_conf().clone(),
-            self.wal_backup_launcher_tx.as_ref().unwrap().clone(),
+            self.broker_active_set.clone(),
+            self.global_rate_limiter.clone(),
         )
     }
@@ -60,14 +72,21 @@ impl GlobalTimelinesState {
             .cloned()
             .ok_or(TimelineError::NotFound(*ttid))
     }
+
+    fn delete(&mut self, ttid: TenantTimelineId) {
+        self.timelines.remove(&ttid);
+        self.tombstones.insert(ttid, Instant::now());
+    }
 }
 
 static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
     Mutex::new(GlobalTimelinesState {
         timelines: HashMap::new(),
-        wal_backup_launcher_tx: None,
+        tombstones: HashMap::new(),
         conf: None,
+        broker_active_set: Arc::new(TimelinesSet::default()),
         load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)),
+        global_rate_limiter: RateLimiter::new(1, 1),
     })
 });
 
@@ -76,16 +95,15 @@
 pub struct GlobalTimelines;
 
 impl GlobalTimelines {
     /// Inject dependencies needed for the timeline constructors and load all timelines to memory.
-    pub async fn init(
-        conf: SafeKeeperConf,
-        wal_backup_launcher_tx: Sender<TenantTimelineId>,
-    ) -> Result<()> {
+    pub async fn init(conf: SafeKeeperConf) -> Result<()> {
         // clippy isn't smart enough to understand that drop(state) releases the
         // lock, so use explicit block
         let tenants_dir = {
             let mut state = TIMELINES_STATE.lock().unwrap();
-            assert!(state.wal_backup_launcher_tx.is_none());
-            state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx);
+            state.global_rate_limiter = RateLimiter::new(
+                conf.partial_backup_concurrency,
+                DEFAULT_EVICTION_CONCURRENCY,
+            );
             state.conf = Some(conf);
 
             // Iterate through all directories and load tenants for all directories
@@ -129,15 +147,12 @@ impl GlobalTimelines {
     /// this function is called during init when nothing else is running, so
     /// this is fine.
     async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> {
-        let (conf, wal_backup_launcher_tx) = {
+        let (conf, broker_active_set, partial_backup_rate_limiter) = {
             let state = TIMELINES_STATE.lock().unwrap();
-            (
-                state.get_conf().clone(),
-                state.wal_backup_launcher_tx.as_ref().unwrap().clone(),
-            )
+            state.get_dependencies()
         };
 
-        let timelines_dir = conf.tenant_dir(&tenant_id);
+        let timelines_dir = get_tenant_dir(&conf, &tenant_id);
         for timelines_dir_entry in std::fs::read_dir(&timelines_dir)
             .with_context(|| format!("failed to list timelines dir {}", timelines_dir))?
         {
@@ -147,7 +162,7 @@ impl GlobalTimelines {
                 TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
             {
                 let ttid = TenantTimelineId::new(tenant_id, timeline_id);
-                match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx.clone()) {
+                match Timeline::load_timeline(&conf, ttid) {
                     Ok(timeline) => {
                         let tli = Arc::new(timeline);
                         TIMELINES_STATE
                            .lock()
                            .unwrap()
                            .timelines
                            .insert(ttid, tli.clone());
-                        tli.bootstrap(&conf);
-                        tli.update_status_notify().await.unwrap();
+                        tli.bootstrap(
+                            &conf,
+                            broker_active_set.clone(),
+                            partial_backup_rate_limiter.clone(),
+                        );
                     }
                     // If we can't load a timeline, it's most likely because of a corrupted
                    // directory. We will log an error and won't allow to delete/recreate
@@ -189,20 +207,27 @@ impl GlobalTimelines {
         _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>,
         ttid: TenantTimelineId,
    ) -> Result<Arc<Timeline>> {
-        let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies();
+        let (conf, broker_active_set, partial_backup_rate_limiter) =
+            TIMELINES_STATE.lock().unwrap().get_dependencies();
 
-        match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) {
+        match Timeline::load_timeline(&conf, ttid) {
             Ok(timeline) => {
                 let tli = Arc::new(timeline);
 
                 // TODO: prevent concurrent timeline creation/loading
-                TIMELINES_STATE
-                    .lock()
-                    .unwrap()
-                    .timelines
-                    .insert(ttid, tli.clone());
+                {
+                    let mut state = TIMELINES_STATE.lock().unwrap();
 
-                tli.bootstrap(&conf);
+                    // We may have been asked to load a timeline that was previously deleted (e.g. from `pull_timeline.rs`). We trust
+                    // that the human doing this manual intervention knows what they are doing, and remove the tombstone.
+ if state.tombstones.remove(&ttid).is_some() { + warn!("Un-deleted timeline {ttid}"); + } + + state.timelines.insert(ttid, tli.clone()); + } + + tli.bootstrap(&conf, broker_active_set, partial_backup_rate_limiter); Ok(tli) } @@ -221,20 +246,29 @@ impl GlobalTimelines { TIMELINES_STATE.lock().unwrap().get_conf().clone() } + pub fn get_global_broker_active_set() -> Arc { + TIMELINES_STATE.lock().unwrap().broker_active_set.clone() + } + /// Create a new timeline with the given id. If the timeline already exists, returns /// an existing timeline. - pub async fn create( + pub(crate) async fn create( ttid: TenantTimelineId, server_info: ServerInfo, commit_lsn: Lsn, local_start_lsn: Lsn, ) -> Result> { - let (conf, wal_backup_launcher_tx) = { + let (conf, broker_active_set, partial_backup_rate_limiter) = { let state = TIMELINES_STATE.lock().unwrap(); if let Ok(timeline) = state.get(&ttid) { // Timeline already exists, return it. return Ok(timeline); } + + if state.tombstones.contains_key(&ttid) { + anyhow::bail!("Timeline {ttid} is deleted, refusing to recreate"); + } + state.get_dependencies() }; @@ -243,7 +277,6 @@ impl GlobalTimelines { let timeline = Arc::new(Timeline::create_empty( &conf, ttid, - wal_backup_launcher_tx, server_info, commit_lsn, local_start_lsn, @@ -264,7 +297,15 @@ impl GlobalTimelines { // Write the new timeline to the disk and start background workers. // Bootstrap is transactional, so if it fails, the timeline will be deleted, // and the state on disk should remain unchanged. - if let Err(e) = timeline.init_new(&mut shared_state, &conf).await { + if let Err(e) = timeline + .init_new( + &mut shared_state, + &conf, + broker_active_set, + partial_backup_rate_limiter, + ) + .await + { // Note: the most likely reason for init failure is that the timeline // directory already exists on disk. This happens when timeline is corrupted // and wasn't loaded from disk on startup because of that. We want to preserve @@ -281,25 +322,25 @@ impl GlobalTimelines { // We are done with bootstrap, release the lock, return the timeline. // {} block forces release before .await } - timeline.update_status_notify().await?; - timeline.wal_backup_launcher_tx.send(timeline.ttid).await?; Ok(timeline) } /// Get a timeline from the global map. If it's not present, it doesn't exist on disk, /// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid, /// i.e. loaded in memory and not cancelled. - pub fn get(ttid: TenantTimelineId) -> Result, TimelineError> { - let res = TIMELINES_STATE.lock().unwrap().get(&ttid); - - match res { + pub(crate) fn get(ttid: TenantTimelineId) -> Result, TimelineError> { + let tli_res = { + let state = TIMELINES_STATE.lock().unwrap(); + state.get(&ttid) + }; + match tli_res { Ok(tli) => { if tli.is_cancelled() { return Err(TimelineError::Cancelled(ttid)); } Ok(tli) } - _ => res, + _ => tli_res, } } @@ -328,37 +369,43 @@ impl GlobalTimelines { /// Cancels timeline, then deletes the corresponding data directory. /// If only_local, doesn't remove WAL segments in remote storage. - pub async fn delete( + pub(crate) async fn delete( ttid: &TenantTimelineId, only_local: bool, ) -> Result { - let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid); - match tli_res { + let tli_res = { + let state = TIMELINES_STATE.lock().unwrap(); + + if state.tombstones.contains_key(ttid) { + // Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do. 
+ info!("Timeline {ttid} was already deleted"); + return Ok(TimelineDeleteForceResult { + dir_existed: false, + was_active: false, + }); + } + + state.get(ttid) + }; + + let result = match tli_res { Ok(timeline) => { + let was_active = timeline.broker_active.load(Ordering::Relaxed); + // Take a lock and finish the deletion holding this mutex. let mut shared_state = timeline.write_shared_state().await; info!("deleting timeline {}, only_local={}", ttid, only_local); - let (dir_existed, was_active) = - timeline.delete(&mut shared_state, only_local).await?; - - // Remove timeline from the map. - // FIXME: re-enable it once we fix the issue with recreation of deleted timelines - // https://github.com/neondatabase/neon/issues/3146 - // TIMELINES_STATE.lock().unwrap().timelines.remove(ttid); + let dir_existed = timeline.delete(&mut shared_state, only_local).await?; Ok(TimelineDeleteForceResult { dir_existed, - was_active, + was_active, // TODO: we probably should remove this field }) } Err(_) => { // Timeline is not memory, but it may still exist on disk in broken state. - let dir_path = TIMELINES_STATE - .lock() - .unwrap() - .get_conf() - .timeline_dir(ttid); + let dir_path = get_timeline_dir(TIMELINES_STATE.lock().unwrap().get_conf(), ttid); let dir_existed = delete_dir(dir_path)?; Ok(TimelineDeleteForceResult { @@ -366,7 +413,14 @@ impl GlobalTimelines { was_active: false, }) } - } + }; + + // Finalize deletion, by dropping Timeline objects and storing smaller tombstones. The tombstones + // are used to prevent still-running computes from re-creating the same timeline when they send data, + // and to speed up repeated deletion calls by avoiding re-listing objects. + TIMELINES_STATE.lock().unwrap().delete(*ttid); + + result } /// Deactivates and deletes all timelines for the tenant. Returns map of all timelines which @@ -407,27 +461,25 @@ impl GlobalTimelines { // Note that we could concurrently create new timelines while we were deleting them, // so the directory may be not empty. In this case timelines will have bad state // and timeline background jobs can panic. - delete_dir( - TIMELINES_STATE - .lock() - .unwrap() - .get_conf() - .tenant_dir(tenant_id), - )?; - - // FIXME: we temporarily disabled removing timelines from the map, see `delete_force` - // let tlis_after_delete = Self::get_all_for_tenant(*tenant_id); - // if !tlis_after_delete.is_empty() { - // // Some timelines were created while we were deleting them, returning error - // // to the caller, so it can retry later. - // bail!( - // "failed to delete all timelines for tenant {}: some timelines were created while we were deleting them", - // tenant_id - // ); - // } + delete_dir(get_tenant_dir( + TIMELINES_STATE.lock().unwrap().get_conf(), + tenant_id, + ))?; Ok(deleted) } + + pub fn housekeeping(tombstone_ttl: &Duration) { + let mut state = TIMELINES_STATE.lock().unwrap(); + + // We keep tombstones long enough to have a good chance of preventing rogue computes from re-creating deleted + // timelines. If a compute kept running for longer than this TTL (or across a safekeeper restart) then they + // may recreate a deleted timeline. 
+        let now = Instant::now();
+        state
+            .tombstones
+            .retain(|_, v| now.duration_since(*v) < *tombstone_ttl);
+    }
 }
 
 #[derive(Clone, Copy, Serialize)]
diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs
new file mode 100644
index 0000000000..096e348295
--- /dev/null
+++ b/safekeeper/src/timelines_set.rs
@@ -0,0 +1,95 @@
+use std::{collections::HashMap, sync::Arc};
+
+use utils::id::TenantTimelineId;
+
+use crate::timeline::Timeline;
+
+/// Set of timelines, supporting the following operations:
+/// - add timeline
+/// - remove timeline
+/// - clone the set
+///
+/// Usually used for keeping a subset of timelines, for example the active timelines that require broker push.
+pub struct TimelinesSet {
+    timelines: std::sync::Mutex<HashMap<TenantTimelineId, Arc<Timeline>>>,
+}
+
+impl Default for TimelinesSet {
+    fn default() -> Self {
+        Self {
+            timelines: std::sync::Mutex::new(HashMap::new()),
+        }
+    }
+}
+
+impl TimelinesSet {
+    pub fn insert(&self, tli: Arc<Timeline>) {
+        self.timelines.lock().unwrap().insert(tli.ttid, tli);
+    }
+
+    pub fn delete(&self, ttid: &TenantTimelineId) {
+        self.timelines.lock().unwrap().remove(ttid);
+    }
+
+    /// If `present` is true, adds the timeline to the set, otherwise removes it.
+    pub fn set_present(&self, tli: Arc<Timeline>, present: bool) {
+        if present {
+            self.insert(tli);
+        } else {
+            self.delete(&tli.ttid);
+        }
+    }
+
+    pub fn is_present(&self, ttid: &TenantTimelineId) -> bool {
+        self.timelines.lock().unwrap().contains_key(ttid)
+    }
+
+    /// Returns all timelines in the set.
+    pub fn get_all(&self) -> Vec<Arc<Timeline>> {
+        self.timelines.lock().unwrap().values().cloned().collect()
+    }
+
+    /// Returns a timeline guard for easy presence control.
+    pub fn guard(self: &Arc<Self>, tli: Arc<Timeline>) -> TimelineSetGuard {
+        let is_present = self.is_present(&tli.ttid);
+        TimelineSetGuard {
+            timelines_set: self.clone(),
+            tli,
+            is_present,
+        }
+    }
+}
+
+/// Guard used to add or remove a timeline from the set.
+///
+/// If the timeline is present in the set, it will be removed from it on drop.
+/// Note: do not use more than one guard for the same timeline, because each guard caches the presence state.
+/// It is designed to be used in the manager task only.
+pub struct TimelineSetGuard {
+    timelines_set: Arc<TimelinesSet>,
+    tli: Arc<Timeline>,
+    is_present: bool,
+}
+
+impl TimelineSetGuard {
+    /// Returns true if the state was changed.
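+    /// Typical use in the manager loop (illustrative):
+    /// ```ignore
+    /// if guard.set(is_active) {
+    ///     info!("timeline active={} now", is_active);
+    /// }
+    /// ```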
+ pub fn set(&mut self, present: bool) -> bool { + if present == self.is_present { + return false; + } + self.is_present = present; + self.timelines_set.set_present(self.tli.clone(), present); + true + } + + pub fn get(&self) -> bool { + self.is_present + } +} + +impl Drop for TimelineSetGuard { + fn drop(&mut self) { + // remove timeline from the map on drop + self.timelines_set.delete(&self.tli.ttid); + } +} diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index c47381351d..ef26ac99c5 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -9,30 +9,29 @@ use utils::backoff; use utils::id::NodeId; use std::cmp::min; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; +use std::num::NonZeroU32; use std::pin::Pin; -use std::sync::Arc; use std::time::Duration; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; use postgres_ffi::XLogFileName; use postgres_ffi::{XLogSegNo, PG_TLI}; -use remote_storage::{GenericRemoteStorage, RemotePath}; +use remote_storage::{GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata}; use tokio::fs::File; use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; -use tokio::sync::watch; +use tokio::sync::{watch, OnceCell}; use tokio::time::sleep; use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; -use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS}; -use crate::timeline::{PeerInfo, Timeline}; -use crate::{GlobalTimelines, SafeKeeperConf}; - -use once_cell::sync::OnceCell; +use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; +use crate::timeline::{PeerInfo, WalResidentTimeline}; +use crate::timeline_manager::{Manager, StateSnapshot}; +use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME}; const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10; const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; @@ -40,35 +39,75 @@ const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; /// Default buffer size when interfacing with [`tokio::fs::File`]. const BUFFER_SIZE: usize = 32 * 1024; -/// Check whether wal backup is required for timeline. If yes, mark that launcher is -/// aware of current status and return the timeline. -async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option> { - match GlobalTimelines::get(ttid).ok() { - Some(tli) => { - tli.wal_backup_attend().await; - Some(tli) - } - None => None, - } -} - -struct WalBackupTaskHandle { +pub struct WalBackupTaskHandle { shutdown_tx: Sender<()>, handle: JoinHandle<()>, } -struct WalBackupTimelineEntry { - timeline: Arc, - handle: Option, +/// Do we have anything to upload to S3, i.e. should safekeepers run backup activity? +pub(crate) fn is_wal_backup_required( + wal_seg_size: usize, + num_computes: usize, + state: &StateSnapshot, +) -> bool { + num_computes > 0 || + // Currently only the whole segment is offloaded, so compare segment numbers. + (state.commit_lsn.segment_number(wal_seg_size) > state.backup_lsn.segment_number(wal_seg_size)) } -async fn shut_down_task(ttid: TenantTimelineId, entry: &mut WalBackupTimelineEntry) { - if let Some(wb_handle) = entry.handle.take() { +/// Based on peer information determine which safekeeper should offload; if it +/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task +/// is running, kill it. 
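+/// The election is deterministic (see `determine_offloader`), so all
+/// safekeepers converge on the same offloader without extra coordination.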
+pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &StateSnapshot) { + let (offloader, election_dbg_str) = + determine_offloader(&state.peers, state.backup_lsn, mgr.tli.ttid, &mgr.conf); + let elected_me = Some(mgr.conf.my_id) == offloader; + + let should_task_run = need_backup && elected_me; + + // start or stop the task + if should_task_run != (mgr.backup_task.is_some()) { + if should_task_run { + info!("elected for backup: {}", election_dbg_str); + + let (shutdown_tx, shutdown_rx) = mpsc::channel(1); + + let async_task = backup_task_main( + mgr.wal_resident_timeline(), + mgr.conf.backup_parallel_jobs, + shutdown_rx, + ); + + let handle = if mgr.conf.current_thread_runtime { + tokio::spawn(async_task) + } else { + WAL_BACKUP_RUNTIME.spawn(async_task) + }; + + mgr.backup_task = Some(WalBackupTaskHandle { + shutdown_tx, + handle, + }); + } else { + if !need_backup { + // don't need backup at all + info!("stepping down from backup, need_backup={}", need_backup); + } else { + // someone else has been elected + info!("stepping down from backup: {}", election_dbg_str); + } + shut_down_task(&mut mgr.backup_task).await; + } + } +} + +async fn shut_down_task(entry: &mut Option) { + if let Some(wb_handle) = entry.take() { // Tell the task to shutdown. Error means task exited earlier, that's ok. let _ = wb_handle.shutdown_tx.send(()).await; // Await the task itself. TODO: restart panicked tasks earlier. if let Err(e) = wb_handle.handle.await { - warn!("WAL backup task for {} panicked: {}", ttid, e); + warn!("WAL backup task panicked: {}", e); } } } @@ -78,6 +117,7 @@ async fn shut_down_task(ttid: TenantTimelineId, entry: &mut WalBackupTimelineEnt /// time we have several ones as they PUT the same files. Also, /// - frequently changing the offloader would be bad; /// - electing seriously lagging safekeeper is undesirable; +/// /// So we deterministically choose among the reasonably caught up candidates. /// TODO: take into account failed attempts to deal with hypothetical situation /// where s3 is unreachable only for some sks. @@ -125,50 +165,7 @@ fn determine_offloader( } } -/// Based on peer information determine which safekeeper should offload; if it -/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task -/// is running, kill it. -async fn update_task( - conf: &SafeKeeperConf, - ttid: TenantTimelineId, - entry: &mut WalBackupTimelineEntry, -) { - let alive_peers = entry.timeline.get_peers(conf).await; - let wal_backup_lsn = entry.timeline.get_wal_backup_lsn().await; - let (offloader, election_dbg_str) = - determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf); - let elected_me = Some(conf.my_id) == offloader; - - if elected_me != (entry.handle.is_some()) { - if elected_me { - info!("elected for backup: {}", election_dbg_str); - - let (shutdown_tx, shutdown_rx) = mpsc::channel(1); - let timeline_dir = conf.timeline_dir(&ttid); - - let handle = tokio::spawn( - backup_task_main( - ttid, - timeline_dir, - conf.workdir.clone(), - conf.backup_parallel_jobs, - shutdown_rx, - ) - .in_current_span(), - ); - - entry.handle = Some(WalBackupTaskHandle { - shutdown_tx, - handle, - }); - } else { - info!("stepping down from backup: {}", election_dbg_str); - shut_down_task(ttid, entry).await; - } - } -} - -static REMOTE_STORAGE: OnceCell> = OnceCell::new(); +static REMOTE_STORAGE: OnceCell> = OnceCell::const_new(); // Storage must be configured and initialized when this is called. 
fn get_configured_remote_storage() -> &'static GenericRemoteStorage { @@ -179,106 +176,47 @@ fn get_configured_remote_storage() -> &'static GenericRemoteStorage { .unwrap() } -const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000; - -/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup -/// tasks. Having this in separate task simplifies locking, allows to reap -/// panics and separate elections from offloading itself. -pub async fn wal_backup_launcher_task_main( - conf: SafeKeeperConf, - mut wal_backup_launcher_rx: Receiver, -) -> anyhow::Result<()> { - info!( - "WAL backup launcher started, remote config {:?}", - conf.remote_storage - ); - - let conf_ = conf.clone(); - REMOTE_STORAGE.get_or_init(|| { - conf_ - .remote_storage - .as_ref() - .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage")) - }); - - // Presence in this map means launcher is aware s3 offloading is needed for - // the timeline, but task is started only if it makes sense for to offload - // from this safekeeper. - let mut tasks: HashMap = HashMap::new(); - - let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC)); - loop { - tokio::select! { - ttid = wal_backup_launcher_rx.recv() => { - // channel is never expected to get closed - let ttid = ttid.unwrap(); - if !conf.is_wal_backup_enabled() { - continue; /* just drain the channel and do nothing */ - } - async { - let timeline = is_wal_backup_required(ttid).await; - // do we need to do anything at all? - if timeline.is_some() != tasks.contains_key(&ttid) { - if let Some(timeline) = timeline { - // need to start the task - let entry = tasks.entry(ttid).or_insert(WalBackupTimelineEntry { - timeline, - handle: None, - }); - update_task(&conf, ttid, entry).await; - } else { - // need to stop the task - info!("stopping WAL backup task"); - let mut entry = tasks.remove(&ttid).unwrap(); - shut_down_task(ttid, &mut entry).await; - } - } - }.instrument(info_span!("WAL backup", ttid = %ttid)).await; +pub async fn init_remote_storage(conf: &SafeKeeperConf) { + // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide + // dependencies to all tasks instead. + REMOTE_STORAGE + .get_or_init(|| async { + if let Some(conf) = conf.remote_storage.as_ref() { + Some( + GenericRemoteStorage::from_config(conf) + .await + .expect("failed to create remote storage"), + ) + } else { + None } - // For each timeline needing offloading, check if this safekeeper - // should do the job and start/stop the task accordingly. - _ = ticker.tick() => { - for (ttid, entry) in tasks.iter_mut() { - update_task(&conf, *ttid, entry) - .instrument(info_span!("WAL backup", ttid = %ttid)) - .await; - } - } - } - } + }) + .await; } struct WalBackupTask { - timeline: Arc, + timeline: WalResidentTimeline, timeline_dir: Utf8PathBuf, - workspace_dir: Utf8PathBuf, wal_seg_size: usize, parallel_jobs: usize, commit_lsn_watch_rx: watch::Receiver, } /// Offload single timeline. 
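+/// Runs until a shutdown message arrives, waking whenever `commit_lsn`
+/// advances and retrying failed uploads with a backoff.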
+#[instrument(name = "wal_backup", skip_all, fields(ttid = %tli.ttid))] async fn backup_task_main( - ttid: TenantTimelineId, - timeline_dir: Utf8PathBuf, - workspace_dir: Utf8PathBuf, + tli: WalResidentTimeline, parallel_jobs: usize, mut shutdown_rx: Receiver<()>, ) { + let _guard = WAL_BACKUP_TASKS.guard(); info!("started"); - let res = GlobalTimelines::get(ttid); - if let Err(e) = res { - error!("backup error: {}", e); - return; - } - let tli = res.unwrap(); let mut wb = WalBackupTask { wal_seg_size: tli.get_wal_seg_size().await, commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), + timeline_dir: tli.get_timeline_dir(), timeline: tli, - timeline_dir, - workspace_dir, parallel_jobs, }; @@ -345,7 +283,6 @@ impl WalBackupTask { commit_lsn, self.wal_seg_size, &self.timeline_dir, - &self.workspace_dir, self.parallel_jobs, ) .await @@ -367,18 +304,18 @@ impl WalBackupTask { } async fn backup_lsn_range( - timeline: &Arc, + timeline: &WalResidentTimeline, backup_lsn: &mut Lsn, end_lsn: Lsn, wal_seg_size: usize, timeline_dir: &Utf8Path, - workspace_dir: &Utf8Path, parallel_jobs: usize, ) -> Result<()> { if parallel_jobs < 1 { anyhow::bail!("parallel_jobs must be >= 1"); } + let remote_timeline_path = &timeline.remote_path; let start_lsn = *backup_lsn; let segments = get_segments(start_lsn, end_lsn, wal_seg_size); @@ -391,7 +328,7 @@ async fn backup_lsn_range( loop { let added_task = match iter.next() { Some(s) => { - uploads.push_back(backup_single_segment(s, timeline_dir, workspace_dir)); + uploads.push_back(backup_single_segment(s, timeline_dir, remote_timeline_path)); true } None => false, @@ -429,18 +366,10 @@ async fn backup_lsn_range( async fn backup_single_segment( seg: &Segment, timeline_dir: &Utf8Path, - workspace_dir: &Utf8Path, + remote_timeline_path: &RemotePath, ) -> Result { let segment_file_path = seg.file_path(timeline_dir)?; - let remote_segment_path = segment_file_path - .strip_prefix(workspace_dir) - .context("Failed to strip workspace dir prefix") - .and_then(RemotePath::new) - .with_context(|| { - format!( - "Failed to resolve remote part of path {segment_file_path:?} for base {workspace_dir:?}", - ) - })?; + let remote_segment_path = seg.remote_path(remote_timeline_path); let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await; if res.is_ok() { @@ -478,6 +407,10 @@ impl Segment { Ok(timeline_dir.join(self.object_name())) } + pub fn remote_path(self, remote_timeline_path: &RemotePath) -> RemotePath { + remote_timeline_path.join(self.object_name()) + } + pub fn size(self) -> usize { (u64::from(self.end_lsn) - u64::from(self.start_lsn)) as usize } @@ -510,7 +443,50 @@ async fn backup_object( let file = tokio_util::io::ReaderStream::with_capacity(file, BUFFER_SIZE); - storage.upload_storage_object(file, size, target_file).await + let cancel = CancellationToken::new(); + + storage + .upload_storage_object(file, size, target_file, &cancel) + .await +} + +pub(crate) async fn backup_partial_segment( + source_file: &Utf8Path, + target_file: &RemotePath, + size: usize, +) -> Result<()> { + let storage = get_configured_remote_storage(); + + let file = File::open(&source_file) + .await + .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?; + + // limiting the file to read only the first `size` bytes + let limited_file = tokio::io::AsyncReadExt::take(file, size as u64); + + let file = tokio_util::io::ReaderStream::with_capacity(limited_file, BUFFER_SIZE); + + let cancel = CancellationToken::new(); + + storage + .upload( + file, + 
size, + target_file, + Some(StorageMetadata::from([("sk_type", "partial_segment")])), + &cancel, + ) + .await +} + +pub(crate) async fn copy_partial_segment( + source: &RemotePath, + destination: &RemotePath, +) -> Result<()> { + let storage = get_configured_remote_storage(); + let cancel = CancellationToken::new(); + + storage.copy_object(source, destination, &cancel).await } pub async fn read_object( @@ -525,8 +501,10 @@ pub async fn read_object( info!("segment download about to start from remote path {file_path:?} at offset {offset}"); + let cancel = CancellationToken::new(); + let download = storage - .download_storage_object(Some((offset, None)), file_path) + .download_storage_object(Some((offset, None)), file_path, &cancel) .await .with_context(|| { format!("Failed to open WAL segment download stream for remote path {file_path:?}") @@ -543,8 +521,11 @@ pub async fn read_object( /// when called. pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { let storage = get_configured_remote_storage(); - let ttid_path = Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string()); - let remote_path = RemotePath::new(&ttid_path)?; + let remote_path = remote_timeline_path(ttid)?; + + // see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE + // const Option unwrap is not stable, otherwise it would be const. + let batch_size: NonZeroU32 = NonZeroU32::new(1000).unwrap(); // A backoff::retry is used here for two reasons: // - To provide a backoff rather than busy-polling the API on errors @@ -554,24 +535,61 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { // Note: listing segments might take a long time if there are many of them. // We don't currently have http requests timeout cancellation, but if/once // we have listing should get streaming interface to make progress. - let token = CancellationToken::new(); // not really used + + let cancel = CancellationToken::new(); // not really used backoff::retry( || async { - let files = storage.list_files(Some(&remote_path)).await?; - storage.delete_objects(&files).await?; - Ok(()) + // Do list-delete in batch_size batches to make progress even if there a lot of files. + // Alternatively we could make remote storage list return iterator, but it is more complicated and + // I'm not sure deleting while iterating is expected in s3. + loop { + let files = storage + .list( + Some(&remote_path), + ListingMode::NoDelimiter, + Some(batch_size), + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect::>(); + if files.is_empty() { + return Ok(()); // done + } + // (at least) s3 results are sorted, so can log min/max: + // "List results are always returned in UTF-8 binary order." + info!( + "deleting batch of {} WAL segments [{}-{}]", + files.len(), + files.first().unwrap().object_name().unwrap_or(""), + files.last().unwrap().object_name().unwrap_or("") + ); + storage.delete_objects(&files, &cancel).await?; + } }, + // consider TimeoutOrCancel::caused_by_cancel when using cancellation |_| false, 3, 10, "executing WAL segments deletion batch", - backoff::Cancel::new(token, || anyhow::anyhow!("canceled")), + &cancel, ) - .await?; + .await + .ok_or_else(|| anyhow::anyhow!("canceled")) + .and_then(|x| x)?; Ok(()) } +/// Used by wal_backup_partial. 
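+/// Deletes the given paths in a single remote storage call; errors are
+/// propagated to the caller.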
+pub async fn delete_objects(paths: &[RemotePath]) -> Result<()> { + let cancel = CancellationToken::new(); // not really used + let storage = get_configured_remote_storage(); + storage.delete_objects(paths, &cancel).await +} + /// Copy segments from one timeline to another. Used in copy_timeline. pub async fn copy_s3_segments( wal_seg_size: usize, @@ -588,15 +606,23 @@ pub async fn copy_s3_segments( .as_ref() .unwrap(); - let relative_dst_path = - Utf8Path::new(&dst_ttid.tenant_id.to_string()).join(dst_ttid.timeline_id.to_string()); + let remote_dst_path = remote_timeline_path(dst_ttid)?; - let remote_path = RemotePath::new(&relative_dst_path)?; + let cancel = CancellationToken::new(); + + let files = storage + .list( + Some(&remote_dst_path), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? + .keys; - let files = storage.list_files(Some(&remote_path)).await?; let uploaded_segments = &files .iter() - .filter_map(|file| file.object_name().map(ToOwned::to_owned)) + .filter_map(|o| o.key.object_name().map(ToOwned::to_owned)) .collect::>(); debug!( @@ -604,9 +630,6 @@ pub async fn copy_s3_segments( uploaded_segments ); - let relative_src_path = - Utf8Path::new(&src_ttid.tenant_id.to_string()).join(src_ttid.timeline_id.to_string()); - for segno in from_segment..to_segment { if segno % SEGMENTS_PROGRESS_REPORT_INTERVAL == 0 { info!("copied all segments from {} until {}", from_segment, segno); @@ -618,10 +641,10 @@ pub async fn copy_s3_segments( } debug!("copying segment {}", segment_name); - let from = RemotePath::new(&relative_src_path.join(&segment_name))?; - let to = RemotePath::new(&relative_dst_path.join(&segment_name))?; + let from = remote_timeline_path(src_ttid)?.join(&segment_name); + let to = remote_dst_path.join(&segment_name); - storage.copy_object(&from, &to).await?; + storage.copy_object(&from, &to, &cancel).await?; } info!( @@ -630,3 +653,8 @@ pub async fn copy_s3_segments( ); Ok(()) } + +/// Get S3 (remote_storage) prefix path used for timeline files. +pub fn remote_timeline_path(ttid: &TenantTimelineId) -> Result { + RemotePath::new(&Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string())) +} diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs new file mode 100644 index 0000000000..bddfca50e4 --- /dev/null +++ b/safekeeper/src/wal_backup_partial.rs @@ -0,0 +1,566 @@ +//! Safekeeper timeline has a background task which is subscribed to `commit_lsn` +//! and `flush_lsn` updates. +//! +//! After the partial segment was updated (`flush_lsn` was changed), the segment +//! will be uploaded to S3 within the configured `partial_backup_timeout`. +//! +//! The filename format for partial segments is +//! `Segment_Term_Flush_Commit_skNN.partial`, where: +//! - `Segment` – the segment name, like `000000010000000000000001` +//! - `Term` – current term +//! - `Flush` – flush_lsn in hex format `{:016X}`, e.g. `00000000346BC568` +//! - `Commit` – commit_lsn in the same hex format +//! - `NN` – safekeeper_id, like `1` +//! +//! The full object name example: +//! `000000010000000000000002_2_0000000002534868_0000000002534410_sk1.partial` +//! +//! Each safekeeper will keep info about remote partial segments in its control +//! file. Code updates state in the control file before doing any S3 operations. +//! This way control file stores information about all potentially existing +//! remote partial segments and can clean them up after uploading a newer version. 
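+//!
+//! The resulting upload protocol (see `do_upload` and `gc` below) is:
+//! 1. persist the new segment in the control file as `InProgress`;
+//! 2. upload the object to the remote storage;
+//! 3. persist it as `Uploaded`, marking all older segments `Deleting`;
+//! 4. delete the old objects and drop them from the control file.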
+use camino::Utf8PathBuf;
+use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
+use remote_storage::RemotePath;
+use serde::{Deserialize, Serialize};
+
+use tokio_util::sync::CancellationToken;
+use tracing::{debug, error, info, instrument, warn};
+use utils::{id::NodeId, lsn::Lsn};
+
+use crate::{
+    metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
+    rate_limit::{rand_duration, RateLimiter},
+    safekeeper::Term,
+    timeline::WalResidentTimeline,
+    timeline_manager::StateSnapshot,
+    wal_backup::{self},
+    SafeKeeperConf,
+};
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub enum UploadStatus {
+    /// Upload is in progress. This status should be used only for garbage collection,
+    /// don't read data from the remote storage with this status.
+    InProgress,
+    /// Upload is finished. There is always at most one segment with this status.
+    /// It means that the segment is up to date and can be used.
+    Uploaded,
+    /// Deletion is in progress. This status should be used only for garbage collection,
+    /// don't read data from the remote storage with this status.
+    Deleting,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct PartialRemoteSegment {
+    pub status: UploadStatus,
+    pub name: String,
+    pub commit_lsn: Lsn,
+    pub flush_lsn: Lsn,
+    // We should use last_log_term here, otherwise it's possible to have inconsistent data in the
+    // remote storage.
+    //
+    // More info here: https://github.com/neondatabase/neon/pull/8022#discussion_r1654738405
+    pub term: Term,
+}
+
+impl PartialRemoteSegment {
+    fn eq_without_status(&self, other: &Self) -> bool {
+        self.name == other.name
+            && self.commit_lsn == other.commit_lsn
+            && self.flush_lsn == other.flush_lsn
+            && self.term == other.term
+    }
+
+    pub(crate) fn remote_path(&self, remote_timeline_path: &RemotePath) -> RemotePath {
+        remote_timeline_path.join(&self.name)
+    }
+}
+
+// NB: these structures are a part of a control_file, you can't change them without
+// changing the control file format version.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
+pub struct State {
+    pub segments: Vec<PartialRemoteSegment>,
+}
+
+#[derive(Debug)]
+pub(crate) struct ReplaceUploadedSegment {
+    pub(crate) previous: PartialRemoteSegment,
+    pub(crate) current: PartialRemoteSegment,
+}
+
+impl State {
+    /// Find an Uploaded segment. There should be only one Uploaded segment at a time.
+    pub(crate) fn uploaded_segment(&self) -> Option<PartialRemoteSegment> {
+        self.segments
+            .iter()
+            .find(|seg| seg.status == UploadStatus::Uploaded)
+            .cloned()
+    }
+
+    /// Replace the name of the Uploaded segment (if one exists) in order to match
+    /// the `destination` safekeeper. Returns a description of the change, or None,
+    /// wrapped in anyhow::Result.
+    pub(crate) fn replace_uploaded_segment(
+        &mut self,
+        source: NodeId,
+        destination: NodeId,
+    ) -> anyhow::Result<Option<ReplaceUploadedSegment>> {
+        let current = self
+            .segments
+            .iter_mut()
+            .find(|seg| seg.status == UploadStatus::Uploaded);
+
+        let current = match current {
+            Some(some) => some,
+            None => {
+                return anyhow::Ok(None);
+            }
+        };
+
+        // Sanity check that the partial segment we are replacing belongs
+        // to the `source` SK.
+ if !current + .name + .ends_with(format!("sk{}.partial", source.0).as_str()) + { + anyhow::bail!( + "Partial segment name ({}) doesn't match self node id ({})", + current.name, + source + ); + } + + let previous = current.clone(); + + let new_name = current.name.replace( + format!("_sk{}", source.0).as_str(), + format!("_sk{}", destination.0).as_str(), + ); + + current.name = new_name; + + anyhow::Ok(Some(ReplaceUploadedSegment { + previous, + current: current.clone(), + })) + } +} + +pub struct PartialBackup { + wal_seg_size: usize, + tli: WalResidentTimeline, + conf: SafeKeeperConf, + local_prefix: Utf8PathBuf, + remote_timeline_path: RemotePath, + + state: State, +} + +impl PartialBackup { + pub async fn new(tli: WalResidentTimeline, conf: SafeKeeperConf) -> PartialBackup { + let (_, persistent_state) = tli.get_state().await; + let wal_seg_size = tli.get_wal_seg_size().await; + + let local_prefix = tli.get_timeline_dir(); + let remote_timeline_path = tli.remote_path.clone(); + + PartialBackup { + wal_seg_size, + tli, + state: persistent_state.partial_backup, + conf, + local_prefix, + remote_timeline_path, + } + } + + // Read-only methods for getting segment names + fn segno(&self, lsn: Lsn) -> XLogSegNo { + lsn.segment_number(self.wal_seg_size) + } + + fn segment_name(&self, segno: u64) -> String { + XLogFileName(PG_TLI, segno, self.wal_seg_size) + } + + fn remote_segment_name( + &self, + segno: u64, + term: u64, + commit_lsn: Lsn, + flush_lsn: Lsn, + ) -> String { + format!( + "{}_{}_{:016X}_{:016X}_sk{}.partial", + self.segment_name(segno), + term, + flush_lsn.0, + commit_lsn.0, + self.conf.my_id.0, + ) + } + + fn local_segment_name(&self, segno: u64) -> String { + format!("{}.partial", self.segment_name(segno)) + } +} + +impl PartialBackup { + /// Takes a lock to read actual safekeeper state and returns a segment that should be uploaded. + async fn prepare_upload(&self) -> PartialRemoteSegment { + // this operation takes a lock to get the actual state + let sk_info = self.tli.get_safekeeper_info(&self.conf).await; + let flush_lsn = Lsn(sk_info.flush_lsn); + let commit_lsn = Lsn(sk_info.commit_lsn); + let last_log_term = sk_info.last_log_term; + let segno = self.segno(flush_lsn); + + let name = self.remote_segment_name(segno, last_log_term, commit_lsn, flush_lsn); + + PartialRemoteSegment { + status: UploadStatus::InProgress, + name, + commit_lsn, + flush_lsn, + term: last_log_term, + } + } + + /// Reads segment from disk and uploads it to the remote storage. + async fn upload_segment(&mut self, prepared: PartialRemoteSegment) -> anyhow::Result<()> { + let flush_lsn = prepared.flush_lsn; + let segno = self.segno(flush_lsn); + + // We're going to backup bytes from the start of the segment up to flush_lsn. + let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size); + + let local_path = self.local_prefix.join(self.local_segment_name(segno)); + let remote_path = prepared.remote_path(&self.remote_timeline_path); + + // Upload first `backup_bytes` bytes of the segment to the remote storage. + wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?; + PARTIAL_BACKUP_UPLOADED_BYTES.inc_by(backup_bytes as u64); + + // We uploaded the segment, now let's verify that the data is still actual. + // If the term changed, we cannot guarantee the validity of the uploaded data. + // If the term is the same, we know the data is not corrupted. 
+        let sk_info = self.tli.get_safekeeper_info(&self.conf).await;
+        if sk_info.last_log_term != prepared.term {
+            anyhow::bail!("term changed during upload");
+        }
+        assert!(prepared.commit_lsn <= Lsn(sk_info.commit_lsn));
+        assert!(prepared.flush_lsn <= Lsn(sk_info.flush_lsn));
+
+        Ok(())
+    }
+
+    /// Write new state to disk. If in-memory and on-disk states diverged, returns an error.
+    async fn commit_state(&mut self, new_state: State) -> anyhow::Result<()> {
+        self.tli
+            .map_control_file(|cf| {
+                if cf.partial_backup != self.state {
+                    let memory = self.state.clone();
+                    self.state = cf.partial_backup.clone();
+                    anyhow::bail!(
+                        "partial backup state diverged, memory={:?}, disk={:?}",
+                        memory,
+                        cf.partial_backup
+                    );
+                }
+
+                cf.partial_backup = new_state.clone();
+                Ok(())
+            })
+            .await?;
+        // update in-memory state
+        self.state = new_state;
+        Ok(())
+    }
+
+    /// Upload the latest version of the partial segment and garbage collect older versions.
+    #[instrument(name = "upload", skip_all, fields(name = %prepared.name))]
+    async fn do_upload(&mut self, prepared: &PartialRemoteSegment) -> anyhow::Result<()> {
+        let _timer = MISC_OPERATION_SECONDS
+            .with_label_values(&["partial_do_upload"])
+            .start_timer();
+        info!("starting upload {:?}", prepared);
+
+        let state_0 = self.state.clone();
+        let state_1 = {
+            let mut state = state_0.clone();
+            state.segments.push(prepared.clone());
+            state
+        };
+
+        // we're going to upload a new segment, so write it to disk first to allow GC later
+        self.commit_state(state_1).await?;
+
+        self.upload_segment(prepared.clone()).await?;
+
+        let state_2 = {
+            let mut state = state_0.clone();
+            for seg in state.segments.iter_mut() {
+                seg.status = UploadStatus::Deleting;
+            }
+            let mut actual_remote_segment = prepared.clone();
+            actual_remote_segment.status = UploadStatus::Uploaded;
+            state.segments.push(actual_remote_segment);
+            state
+        };
+
+        // we've uploaded the new segment and it's now the actual one; all other segments should be GC-ed
+        self.commit_state(state_2).await?;
+        self.gc().await?;
+
+        Ok(())
+    }
+
+    // Prepend the remote prefix to the given segment names and delete them from the
+    // remote storage.
+    async fn delete_segments(&self, segments_to_delete: &Vec<String>) -> anyhow::Result<()> {
+        info!("deleting objects: {:?}", segments_to_delete);
+        let mut objects_to_delete = vec![];
+        for seg in segments_to_delete.iter() {
+            let remote_path = self.remote_timeline_path.join(seg);
+            objects_to_delete.push(remote_path);
+        }
+        wal_backup::delete_objects(&objects_to_delete).await
+    }
+
+    /// Delete all non-Uploaded segments from the remote storage. There should be only one
+    /// Uploaded segment at a time.
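+    /// (These are segments left in InProgress or Deleting status by earlier uploads.)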
+    #[instrument(name = "gc", skip_all)]
+    async fn gc(&mut self) -> anyhow::Result<()> {
+        let mut segments_to_delete = vec![];
+
+        let new_segments: Vec<PartialRemoteSegment> = self
+            .state
+            .segments
+            .iter()
+            .filter_map(|seg| {
+                if seg.status == UploadStatus::Uploaded {
+                    Some(seg.clone())
+                } else {
+                    segments_to_delete.push(seg.name.clone());
+                    None
+                }
+            })
+            .collect();
+
+        if new_segments.len() == 1 {
+            // we have an uploaded segment, it must not be deleted from remote storage
+            segments_to_delete.retain(|name| name != &new_segments[0].name);
+        } else {
+            // there should always be zero or one uploaded segment
+            assert!(
+                new_segments.is_empty(),
+                "too many uploaded segments: {:?}",
+                new_segments
+            );
+        }
+
+        // execute the deletion
+        self.delete_segments(&segments_to_delete).await?;
+
+        // now we can update the state on disk
+        let new_state = {
+            let mut state = self.state.clone();
+            state.segments = new_segments;
+            state
+        };
+        self.commit_state(new_state).await?;
+
+        Ok(())
+    }
+
+    /// Remove uploaded segment(s) from the state and remote storage. Intended for
+    /// manual intervention, not normally needed.
+    /// Returns the list of segments which potentially existed in the remote storage.
+    pub async fn reset(&mut self) -> anyhow::Result<Vec<String>> {
+        let segments_to_delete = self
+            .state
+            .segments
+            .iter()
+            .map(|seg| seg.name.clone())
+            .collect();
+
+        // First reset cfile state, and only then the objects themselves. If the
+        // latter fails we might leave some garbage behind; that's ok for this
+        // single time usage.
+        let new_state = State { segments: vec![] };
+        self.commit_state(new_state).await?;
+
+        self.delete_segments(&segments_to_delete).await?;
+        Ok(segments_to_delete)
+    }
+}
+
+/// Check whether the partial backup task still needs to run, i.e. whether the
+/// latest state isn't fully uploaded yet.
+pub(crate) fn needs_uploading(
+    state: &StateSnapshot,
+    uploaded: &Option<PartialRemoteSegment>,
+) -> bool {
+    match uploaded {
+        Some(uploaded) => {
+            uploaded.status != UploadStatus::Uploaded
+                || uploaded.flush_lsn != state.flush_lsn
+                || uploaded.commit_lsn != state.commit_lsn
+                || uploaded.term != state.last_log_term
+        }
+        None => true,
+    }
+}
+
+/// Main task for partial backup. It waits for the flush_lsn to change and then uploads the
+/// partial segment to the remote storage. It also does garbage collection of old segments.
+///
+/// When there is nothing more to do and the last segment was successfully uploaded, the task
+/// returns PartialRemoteSegment, to signal readiness for offloading the timeline.
+#[instrument(name = "partial_backup", skip_all, fields(ttid = %tli.ttid))]
+pub async fn main_task(
+    tli: WalResidentTimeline,
+    conf: SafeKeeperConf,
+    limiter: RateLimiter,
+    cancel: CancellationToken,
+) -> Option<PartialRemoteSegment> {
+    debug!("started");
+    let await_duration = conf.partial_backup_timeout;
+    let mut first_iteration = true;
+
+    let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
+    let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
+
+    let mut backup = PartialBackup::new(tli, conf).await;
+
+    debug!("state: {:?}", backup.state);
+
+    // The general idea is that each safekeeper keeps only one partial segment
+    // both in remote storage and in local state. If this is not true, something
+    // went wrong.
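+    // Cap on control file segment entries before an extra gc pass is forced
+    // (see the check at the top of the loop below).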
+    const MAX_SIMULTANEOUS_SEGMENTS: usize = 10;
+
+    'outer: loop {
+        if backup.state.segments.len() > MAX_SIMULTANEOUS_SEGMENTS {
+            warn!(
+                "too many segments in control_file state, running gc: {}",
+                backup.state.segments.len()
+            );
+
+            backup.gc().await.unwrap_or_else(|e| {
+                error!("failed to run gc: {:#}", e);
+            });
+        }
+
+        // wait until we have something to upload
+        let uploaded_segment = backup.state.uploaded_segment();
+        if let Some(seg) = &uploaded_segment {
+            // check if the uploaded segment matches the current state
+            if flush_lsn_rx.borrow().lsn == seg.flush_lsn
+                && *commit_lsn_rx.borrow() == seg.commit_lsn
+                && flush_lsn_rx.borrow().term == seg.term
+            {
+                // we have nothing to do, the last segment is already uploaded
+                debug!(
+                    "exiting, uploaded up to term={} flush_lsn={} commit_lsn={}",
+                    seg.term, seg.flush_lsn, seg.commit_lsn
+                );
+                return Some(seg.clone());
+            }
+        }
+
+        // if we don't have any data yet (zero LSNs), wait for something to arrive
+        while flush_lsn_rx.borrow().lsn == Lsn(0) {
+            tokio::select! {
+                _ = backup.tli.cancel.cancelled() => {
+                    info!("timeline canceled");
+                    return None;
+                }
+                _ = cancel.cancelled() => {
+                    info!("task canceled");
+                    return None;
+                }
+                _ = flush_lsn_rx.changed() => {}
+            }
+        }
+
+        // smooth the load after restart by sleeping for a random time;
+        // if this is not the first iteration, we will wait for the full await_duration
+        let await_duration = if first_iteration {
+            first_iteration = false;
+            rand_duration(&await_duration)
+        } else {
+            await_duration
+        };
+
+        // fix the segno and wait some time to prevent reuploading the same segment too often
+        let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn);
+        let timeout = tokio::time::sleep(await_duration);
+        tokio::pin!(timeout);
+        let mut timeout_expired = false;
+
+        // wait until the timeout expires OR the segno changes
+        'inner: loop {
+            tokio::select! {
+                _ = backup.tli.cancel.cancelled() => {
+                    info!("timeline canceled");
+                    return None;
+                }
+                _ = cancel.cancelled() => {
+                    info!("task canceled");
+                    return None;
+                }
+                _ = commit_lsn_rx.changed() => {}
+                _ = flush_lsn_rx.changed() => {
+                    let segno = backup.segno(flush_lsn_rx.borrow().lsn);
+                    if segno != pending_segno {
+                        // previous segment is no longer partial, aborting the wait
+                        break 'inner;
+                    }
+                }
+                _ = &mut timeout => {
+                    // timeout expired, now we are ready for upload
+                    timeout_expired = true;
+                    break 'inner;
+                }
+            }
+        }
+
+        if !timeout_expired {
+            // likely segno has changed, let's try again in the next iteration
+            continue 'outer;
+        }
+
+        // limit concurrent uploads
+        let _upload_permit = tokio::select! {
+            acq = limiter.acquire_partial_backup() => acq,
+            _ = cancel.cancelled() => {
+                info!("task canceled");
+                return None;
+            }
+        };
+
+        let prepared = backup.prepare_upload().await;
+        if let Some(seg) = &uploaded_segment {
+            if seg.eq_without_status(&prepared) {
+                // we already uploaded this segment, nothing to do
+                continue 'outer;
+            }
+        }
+
+        match backup.do_upload(&prepared).await {
+            Ok(()) => {
+                debug!(
+                    "uploaded {} up to flush_lsn {}",
+                    prepared.name, prepared.flush_lsn
+                );
+                PARTIAL_BACKUP_UPLOADS.with_label_values(&["ok"]).inc();
+            }
+            Err(e) => {
+                info!("failed to upload {}: {:#}", prepared.name, e);
+                PARTIAL_BACKUP_UPLOADS.with_label_values(&["error"]).inc();
+            }
+        }
+    }
+}
diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs
index bceaad1e16..1ab54d4cce 100644
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -4,9 +4,10 @@
 //!
 use anyhow::{Context, Result};
 use postgres_backend::QueryError;
-use std::{future, time::Duration};
+use std::time::Duration;
 use tokio::net::TcpStream;
 use tokio_io_timeout::TimeoutReader;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{auth::Scope, measured_stream::MeasuredStream};
@@ -16,6 +17,7 @@ use crate::SafeKeeperConf;
 use postgres_backend::{AuthType, PostgresBackend};
 
 /// Accept incoming TCP connections and spawn them into a background thread.
+///
 /// allowed_auth_scope is either SafekeeperData (wide JWT tokens giving access
 /// to any tenant are allowed) or Tenant (only tokens giving access to specific
 /// tenant are allowed). Doesn't matter if auth is disabled in conf.
@@ -42,7 +44,7 @@ pub async fn task_main(
                 error!("connection handler exited: {}", err);
             }
         }
-        .instrument(info_span!("", cid = %conn_id, ttid = field::Empty)),
+        .instrument(info_span!("", cid = %conn_id, ttid = field::Empty, application_name = field::Empty)),
         );
     }
 }
@@ -68,7 +70,7 @@ async fn handle_socket(
     // is not Unpin, and all pgbackend/framed/tokio dependencies require stream
     // to be Unpin. Which is reasonable, as indeed something like TimeoutReader
    // shouldn't be moved.
-    tokio::pin!(socket);
+    let socket = std::pin::pin!(socket);
 
     let traffic_metrics = TrafficMetrics::new();
     if let Some(current_az) = conf.availability_zone.as_deref() {
@@ -100,7 +102,7 @@ async fn handle_socket(
     // libpq protocol between safekeeper and walproposer / pageserver
     // We don't use shutdown.
     pgbackend
-        .run(&mut conn_handler, future::pending::<()>)
+        .run(&mut conn_handler, &CancellationToken::new())
         .await
 }
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index ed6190042a..89c2e98a94 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -15,16 +15,20 @@ use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogF
 use postgres_ffi::{dispatch_pgversion, XLogSegNo, PG_TLI};
 use remote_storage::RemotePath;
 use std::cmp::{max, min};
+use std::future::Future;
 use std::io::{self, SeekFrom};
 use std::pin::Pin;
 use tokio::fs::{self, remove_file, File, OpenOptions};
 use tokio::io::{AsyncRead, AsyncWriteExt};
 use tokio::io::{AsyncReadExt, AsyncSeekExt};
 use tracing::*;
+use utils::crashsafe::durable_rename;
 
-use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS};
+use crate::metrics::{
+    time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS,
+};
 use crate::state::TimelinePersistentState;
-use crate::wal_backup::read_object;
+use crate::wal_backup::{read_object, remote_timeline_path};
 use crate::SafeKeeperConf;
 use postgres_ffi::waldecoder::WalStreamDecoder;
 use postgres_ffi::XLogFileName;
@@ -32,19 +36,29 @@ use postgres_ffi::XLOG_BLCKSZ;
 use pq_proto::SystemId;
 use utils::{id::TenantTimelineId, lsn::Lsn};
 
-#[async_trait::async_trait]
 pub trait Storage {
+    // Last written LSN.
+    fn write_lsn(&self) -> Lsn;
     /// LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn;
 
+    /// Initialize segment by creating proper long header at the beginning of
+    /// the segment and short header at the page of given LSN. This is only used
+    /// for timeline initialization because compute will stream data only since
+    /// init_lsn. Other segment headers are included in compute stream.
+    fn initialize_first_segment(
+        &mut self,
+        init_lsn: Lsn,
+    ) -> impl Future<Output = Result<()>> + Send;
+
     /// Write piece of WAL from buf to disk, but not necessarily sync it.
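+    /// (Like the other methods below, this now returns `impl Future` instead of
+    /// being an `async fn`, since the trait no longer uses #[async_trait].)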
-    async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>;
+    fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> impl Future<Output = Result<()>> + Send;
 
     /// Truncate WAL at specified LSN, which must be the end of WAL record.
-    async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>;
+    fn truncate_wal(&mut self, end_pos: Lsn) -> impl Future<Output = Result<()>> + Send;
 
     /// Durably store WAL on disk, up to the last written WAL record.
-    async fn flush_wal(&mut self) -> Result<()>;
+    fn flush_wal(&mut self) -> impl Future<Output = Result<()>> + Send;
 
     /// Remove all segments <= given segno. Returns function doing that as we
     /// want to perform it without timeline lock.
@@ -77,6 +91,8 @@ pub struct PhysicalStorage {
     /// Size of WAL segment in bytes.
     wal_seg_size: usize,
 
+    pg_version: u32,
+    system_id: u64,
 
     /// Written to disk, but possibly still in the cache and not fully persisted.
     /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record.
@@ -168,6 +184,8 @@ impl PhysicalStorage {
             timeline_dir,
             conf: conf.clone(),
             wal_seg_size,
+            pg_version: state.server.pg_version,
+            system_id: state.server.system_id,
             write_lsn,
             write_record_lsn: write_lsn,
             flush_record_lsn: flush_lsn,
@@ -196,20 +214,11 @@ impl PhysicalStorage {
         Ok(())
     }
 
-    /// Call fsync if config requires so.
-    async fn fsync_file(&mut self, file: &File) -> Result<()> {
-        if !self.conf.no_sync {
-            self.metrics
-                .observe_flush_seconds(time_io_closure(file.sync_all()).await?);
-        }
-        Ok(())
-    }
-
     /// Open or create WAL segment file. Caller must call seek to the wanted position.
     /// Returns `file` and `is_partial`.
     async fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> {
         let (wal_file_path, wal_file_partial_path) =
-            wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
+            wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size);
 
         // Try to open already completed segment
         if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path).await {
@@ -223,15 +232,30 @@ impl PhysicalStorage {
             Ok((file, true))
         } else {
             // Create and fill new partial file
-            let mut file = OpenOptions::new()
-                .create(true)
-                .write(true)
-                .open(&wal_file_partial_path)
+            //
+            // We're using fdatasync during WAL writing, so file size must not
+            // change; to this end it is filled with zeros here. To avoid using
+            // a half-initialized segment, first create it under a tmp filename
+            // and then rename.
+            let tmp_path = self.timeline_dir.join("waltmp");
+            let mut file = File::create(&tmp_path)
                 .await
-                .with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?;
+                .with_context(|| format!("Failed to open tmp wal file {:?}", &tmp_path))?;
 
             write_zeroes(&mut file, self.wal_seg_size).await?;
-            self.fsync_file(&file).await?;
+
+            // Note: this doesn't get into the observe_flush_seconds metric. But
+            // segment init should be a separate metric, if any.
+            if let Err(e) =
+                durable_rename(&tmp_path, &wal_file_partial_path, !self.conf.no_sync).await
+            {
+                // Probably the rename succeeded, but fsync of it failed. Remove
+                // the file then to avoid using it.
+ remove_file(wal_file_partial_path) + .await + .or_else(utils::fs_ext::ignore_not_found)?; + return Err(e.into()); + } Ok((file, true)) } } @@ -259,7 +283,7 @@ impl PhysicalStorage { // Rename partial file to completed file let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size); fs::rename(wal_file_partial_path, wal_file_path).await?; } else { // otherwise, file can be reused later @@ -306,13 +330,34 @@ impl PhysicalStorage { } } -#[async_trait::async_trait] impl Storage for PhysicalStorage { + // Last written LSN. + fn write_lsn(&self) -> Lsn { + self.write_lsn + } /// flush_lsn returns LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn { self.flush_record_lsn } + async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()> { + let _timer = WAL_STORAGE_OPERATION_SECONDS + .with_label_values(&["initialize_first_segment"]) + .start_timer(); + + let segno = init_lsn.segment_number(self.wal_seg_size); + let (mut file, _) = self.open_or_create(segno).await?; + let major_pg_version = self.pg_version / 10000; + let wal_seg = + postgres_ffi::generate_wal_segment(segno, self.system_id, major_pg_version, init_lsn)?; + file.seek(SeekFrom::Start(0)).await?; + file.write_all(&wal_seg).await?; + file.flush().await?; + info!("initialized segno {} at lsn {}", segno, init_lsn); + // note: file is *not* fsynced + Ok(()) + } + /// Write WAL to disk. async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { // Disallow any non-sequential writes, which can result in gaps or overwrites. @@ -391,6 +436,10 @@ impl Storage for PhysicalStorage { /// Truncate written WAL by removing all WAL segments after the given LSN. /// end_pos must point to the end of the WAL record. 
 async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
+        let _timer = WAL_STORAGE_OPERATION_SECONDS
+            .with_label_values(&["truncate_wal"])
+            .start_timer();
+
         // Streaming must not create a hole, so truncate cannot be called on non-written lsn
         if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
             bail!(
@@ -430,7 +479,7 @@
         if !is_partial {
             // Make segment partial once again
             let (wal_file_path, wal_file_partial_path) =
-                wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
+                wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size);
             fs::rename(wal_file_path, wal_file_partial_path).await?;
         }
 
@@ -466,6 +515,10 @@ async fn remove_segments_from_disk(
     wal_seg_size: usize,
     remove_predicate: impl Fn(XLogSegNo) -> bool,
 ) -> Result<()> {
+    let _timer = WAL_STORAGE_OPERATION_SECONDS
+        .with_label_values(&["remove_segments_from_disk"])
+        .start_timer();
+
     let mut n_removed = 0;
     let mut min_removed = u64::MAX;
     let mut max_removed = u64::MIN;
@@ -501,7 +554,7 @@ async fn remove_segments_from_disk(
 }
 
 pub struct WalReader {
-    workdir: Utf8PathBuf,
+    remote_path: RemotePath,
     timeline_dir: Utf8PathBuf,
     wal_seg_size: usize,
     pos: Lsn,
@@ -523,7 +576,7 @@ pub struct WalReader {
 
 impl WalReader {
     pub fn new(
-        workdir: Utf8PathBuf,
+        ttid: &TenantTimelineId,
         timeline_dir: Utf8PathBuf,
         state: &TimelinePersistentState,
         start_pos: Lsn,
@@ -551,7 +604,7 @@ impl WalReader {
         }
 
         Ok(Self {
-            workdir,
+            remote_path: remote_timeline_path(ttid)?,
             timeline_dir,
             wal_seg_size: state.server.wal_seg_size as usize,
             pos: start_pos,
@@ -649,13 +702,12 @@ impl WalReader {
         let xlogoff = self.pos.segment_offset(self.wal_seg_size);
         let segno = self.pos.segment_number(self.wal_seg_size);
         let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size);
-        let wal_file_path = self.timeline_dir.join(wal_file_name);
 
         // Try to open local file, if we may have WAL locally
         if self.pos >= self.local_start_lsn {
-            let res = Self::open_wal_file(&wal_file_path).await;
+            let res = open_wal_file(&self.timeline_dir, segno, self.wal_seg_size).await;
             match res {
-                Ok(mut file) => {
+                Ok((mut file, _)) => {
                     file.seek(SeekFrom::Start(xlogoff as u64)).await?;
                     return Ok(Box::pin(file));
                 }
@@ -677,40 +729,12 @@ impl WalReader {
 
         // Try to open remote file, if remote reads are enabled
         if self.enable_remote_read {
-            let remote_wal_file_path = wal_file_path
-                .strip_prefix(&self.workdir)
-                .context("Failed to strip workdir prefix")
-                .and_then(RemotePath::new)
-                .with_context(|| {
-                    format!(
-                        "Failed to resolve remote part of path {:?} for base {:?}",
-                        wal_file_path, self.workdir,
-                    )
-                })?;
+            let remote_wal_file_path = self.remote_path.join(&wal_file_name);
             return read_object(&remote_wal_file_path, xlogoff as u64).await;
         }
 
         bail!("WAL segment is not found")
     }
-
-    /// Helper function for opening a wal file.
-    async fn open_wal_file(wal_file_path: &Utf8Path) -> Result<tokio::fs::File> {
-        // First try to open the .partial file.
-        let mut partial_path = wal_file_path.to_owned();
-        partial_path.set_extension("partial");
-        if let Ok(opened_file) = tokio::fs::File::open(&partial_path).await {
-            return Ok(opened_file);
-        }
-
-        // If that failed, try it without the .partial extension.
-        tokio::fs::File::open(&wal_file_path)
-            .await
-            .with_context(|| format!("Failed to open WAL file {:?}", wal_file_path))
-            .map_err(|e| {
-                warn!("{}", e);
-                e
-            })
-    }
 }
 
 /// Zero block for filling created WAL segments.
@@ -718,6 +742,11 @@ const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
 
 /// Helper for filling file with zeroes.
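+/// Writes whole XLOG_BLCKSZ-sized zero blocks first, then the remainder.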
 async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
+    fail::fail_point!("sk-write-zeroes", |_| {
+        info!("write_zeroes hit failpoint");
+        Err(anyhow::anyhow!("failpoint: sk-write-zeroes"))
+    });
+
     while count >= XLOG_BLCKSZ {
         file.write_all(ZERO_BLOCK).await?;
         count -= XLOG_BLCKSZ;
@@ -727,14 +756,42 @@ async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
     Ok(())
 }
 
+/// Helper function for opening WAL segment `segno` in `timeline_dir`. Returns the file
+/// and whether it is .partial.
+pub(crate) async fn open_wal_file(
+    timeline_dir: &Utf8Path,
+    segno: XLogSegNo,
+    wal_seg_size: usize,
+) -> Result<(tokio::fs::File, bool)> {
+    let (wal_file_path, wal_file_partial_path) = wal_file_paths(timeline_dir, segno, wal_seg_size);
+
+    // First try to open the .partial file.
+    if let Ok(opened_file) = tokio::fs::File::open(&wal_file_partial_path).await {
+        return Ok((opened_file, true));
+    }
+
+    // If that failed, try it without the .partial extension.
+    let pf = tokio::fs::File::open(&wal_file_path)
+        .await
+        .with_context(|| format!("failed to open WAL file {:#}", wal_file_path))
+        .map_err(|e| {
+            warn!("{}", e);
+            e
+        })?;
+
+    Ok((pf, false))
+}
+
 /// Helper returning full path to WAL segment file and its .partial brother.
 pub fn wal_file_paths(
     timeline_dir: &Utf8Path,
     segno: XLogSegNo,
     wal_seg_size: usize,
-) -> Result<(Utf8PathBuf, Utf8PathBuf)> {
+) -> (Utf8PathBuf, Utf8PathBuf) {
     let wal_file_name = XLogFileName(PG_TLI, segno, wal_seg_size);
     let wal_file_path = timeline_dir.join(wal_file_name.clone());
     let wal_file_partial_path = timeline_dir.join(wal_file_name + ".partial");
-    Ok((wal_file_path, wal_file_partial_path))
+    (wal_file_path, wal_file_partial_path)
 }
diff --git a/safekeeper/tests/misc_test.rs b/safekeeper/tests/misc_test.rs
new file mode 100644
index 0000000000..8e5b17a143
--- /dev/null
+++ b/safekeeper/tests/misc_test.rs
@@ -0,0 +1,155 @@
+use std::sync::Arc;
+
+use tracing::{info, warn};
+use utils::lsn::Lsn;
+
+use crate::walproposer_sim::{
+    log::{init_logger, init_tracing_logger},
+    simulation::{generate_network_opts, generate_schedule, Schedule, TestAction, TestConfig},
+};
+
+pub mod walproposer_sim;
+
+// Test that simulation supports restarting (crashing) safekeepers.
+#[test]
+fn crash_safekeeper() {
+    let clock = init_logger();
+    let config = TestConfig::new(Some(clock));
+    let test = config.start(1337);
+
+    let lsn = test.sync_safekeepers().unwrap();
+    assert_eq!(lsn, Lsn(0));
+    info!("Successfully synced empty safekeepers at 0/0");
+
+    let mut wp = test.launch_walproposer(lsn);
+
+    // Write some WAL and crash safekeeper 0 without waiting for replication.
+    test.poll_for_duration(30);
+    wp.write_tx(3);
+    test.servers[0].restart();
+
+    // Wait some time, so that walproposer can reconnect.
+    test.poll_for_duration(2000);
+}
+
+// Test that walproposer can be crashed (stopped).
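+// After the stop, sync_safekeepers should still succeed and report the flushed LSN.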
+#[test]
+fn test_simple_restart() {
+    let clock = init_logger();
+    let config = TestConfig::new(Some(clock));
+    let test = config.start(1337);
+
+    let lsn = test.sync_safekeepers().unwrap();
+    assert_eq!(lsn, Lsn(0));
+    info!("Successfully synced empty safekeepers at 0/0");
+
+    let mut wp = test.launch_walproposer(lsn);
+
+    test.poll_for_duration(30);
+    wp.write_tx(3);
+    test.poll_for_duration(100);
+
+    wp.stop();
+    drop(wp);
+
+    let lsn = test.sync_safekeepers().unwrap();
+    info!("Successfully synced safekeepers at {}", lsn);
+}
+
+// Test running a simple schedule, restarting everything several times.
+#[test]
+fn test_simple_schedule() -> anyhow::Result<()> {
+    let clock = init_logger();
+    let mut config = TestConfig::new(Some(clock));
+    config.network.keepalive_timeout = Some(100);
+    let test = config.start(1337);
+
+    let schedule: Schedule = vec![
+        (0, TestAction::RestartWalProposer),
+        (50, TestAction::WriteTx(5)),
+        (100, TestAction::RestartSafekeeper(0)),
+        (100, TestAction::WriteTx(5)),
+        (110, TestAction::RestartSafekeeper(1)),
+        (110, TestAction::WriteTx(5)),
+        (120, TestAction::RestartSafekeeper(2)),
+        (120, TestAction::WriteTx(5)),
+        (201, TestAction::RestartWalProposer),
+        (251, TestAction::RestartSafekeeper(0)),
+        (251, TestAction::RestartSafekeeper(1)),
+        (251, TestAction::RestartSafekeeper(2)),
+        (251, TestAction::WriteTx(5)),
+        (255, TestAction::WriteTx(5)),
+        (1000, TestAction::WriteTx(5)),
+    ];
+
+    test.run_schedule(&schedule)?;
+    info!("Test finished, stopping all threads");
+    test.world.deallocate();
+
+    Ok(())
+}
+
+// Test that simulation can process 10^4 transactions.
+#[test]
+fn test_many_tx() -> anyhow::Result<()> {
+    let clock = init_logger();
+    let config = TestConfig::new(Some(clock));
+    let test = config.start(1337);
+
+    let mut schedule: Schedule = vec![];
+    for i in 0..100 {
+        schedule.push((i * 10, TestAction::WriteTx(100)));
+    }
+
+    test.run_schedule(&schedule)?;
+    info!("Test finished, stopping all threads");
+    test.world.stop_all();
+
+    let events = test.world.take_events();
+    info!("Events: {:?}", events);
+    let last_commit_lsn = events
+        .iter()
+        .filter_map(|event| {
+            if event.data.starts_with("commit_lsn;") {
+                let lsn: u64 = event.data.split(';').nth(1).unwrap().parse().unwrap();
+                return Some(lsn);
+            }
+            None
+        })
+        .last()
+        .unwrap();
+
+    let initdb_lsn = 21623024;
+    let diff = last_commit_lsn - initdb_lsn;
+    info!("Last commit lsn: {}, diff: {}", last_commit_lsn, diff);
+    // each tx is at least 8 bytes; we write 100 txs per schedule entry, 100 entries total
+    assert!(diff > 100 * 100 * 8);
+    Ok(())
+}
+
+// Checks that we don't have nasty circular dependencies that would prevent Arc from
+// deallocating. This test doesn't really assert anything; you need to run it manually
+// to check if there is any issue.
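+// (If all references are released, the strong count logged after deallocate() should drop.)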
+#[test]
+fn test_res_dealloc() -> anyhow::Result<()> {
+    let clock = init_tracing_logger(true);
+    let mut config = TestConfig::new(Some(clock));
+
+    let seed = 123456;
+    config.network = generate_network_opts(seed);
+    let test = config.start(seed);
+    warn!("Running test with seed {}", seed);
+
+    let schedule = generate_schedule(seed);
+    info!("schedule: {:?}", schedule);
+    test.run_schedule(&schedule).unwrap();
+    test.world.stop_all();
+
+    let world = test.world.clone();
+    drop(test);
+    info!("world strong count: {}", Arc::strong_count(&world));
+    world.deallocate();
+    info!("world strong count: {}", Arc::strong_count(&world));
+
+    Ok(())
+}
diff --git a/safekeeper/tests/random_test.rs b/safekeeper/tests/random_test.rs
new file mode 100644
index 0000000000..7bdee35cd7
--- /dev/null
+++ b/safekeeper/tests/random_test.rs
@@ -0,0 +1,56 @@
+use rand::Rng;
+use tracing::{info, warn};
+
+use crate::walproposer_sim::{
+    log::{init_logger, init_tracing_logger},
+    simulation::{generate_network_opts, generate_schedule, TestConfig},
+    simulation_logs::validate_events,
+};
+
+pub mod walproposer_sim;
+
+// Generates 2000 random seeds and runs a schedule for each of them.
+// If you see this test fail, please report the last seed to the
+// @safekeeper team.
+#[test]
+fn test_random_schedules() -> anyhow::Result<()> {
+    let clock = init_logger();
+    let mut config = TestConfig::new(Some(clock));
+
+    for _ in 0..2000 {
+        let seed: u64 = rand::thread_rng().gen();
+        config.network = generate_network_opts(seed);
+
+        let test = config.start(seed);
+        warn!("Running test with seed {}", seed);
+
+        let schedule = generate_schedule(seed);
+        test.run_schedule(&schedule).unwrap();
+        validate_events(test.world.take_events());
+        test.world.deallocate();
+    }
+
+    Ok(())
+}
+
+// Once you've found a seed that fails, you can insert it here and run the
+// test to see the full debug output.
+#[test]
+fn test_one_schedule() -> anyhow::Result<()> {
+    let clock = init_tracing_logger(true);
+    let mut config = TestConfig::new(Some(clock));
+
+    let seed = 11047466935058776390;
+    config.network = generate_network_opts(seed);
+    info!("network: {:?}", config.network);
+    let test = config.start(seed);
+    warn!("Running test with seed {}", seed);
+
+    let schedule = generate_schedule(seed);
+    info!("schedule: {:?}", schedule);
+    test.run_schedule(&schedule).unwrap();
+    validate_events(test.world.take_events());
+    test.world.deallocate();
+
+    Ok(())
+}
diff --git a/safekeeper/tests/simple_test.rs b/safekeeper/tests/simple_test.rs
new file mode 100644
index 0000000000..0be9d0deef
--- /dev/null
+++ b/safekeeper/tests/simple_test.rs
@@ -0,0 +1,45 @@
+use tracing::info;
+use utils::lsn::Lsn;
+
+use crate::walproposer_sim::{log::init_logger, simulation::TestConfig};
+
+pub mod walproposer_sim;
+
+// Check that first start of sync_safekeepers() returns 0/0 on empty safekeepers.
+#[test]
+fn sync_empty_safekeepers() {
+    let clock = init_logger();
+    let config = TestConfig::new(Some(clock));
+    let test = config.start(1337);
+
+    let lsn = test.sync_safekeepers().unwrap();
+    assert_eq!(lsn, Lsn(0));
+    info!("Successfully synced empty safekeepers at 0/0");
+
+    let lsn = test.sync_safekeepers().unwrap();
+    assert_eq!(lsn, Lsn(0));
+    info!("Successfully synced (again) empty safekeepers at 0/0");
+}
+
+// Check that there are no panics when we are writing and streaming WAL to safekeepers.
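+// (Writes 100 single-transaction batches, polling the simulation briefly between them.)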
+#[test]
+fn run_walproposer_generate_wal() {
+    let clock = init_logger();
+    let config = TestConfig::new(Some(clock));
+    let test = config.start(1337);
+
+    let lsn = test.sync_safekeepers().unwrap();
+    assert_eq!(lsn, Lsn(0));
+    info!("Successfully synced empty safekeepers at 0/0");
+
+    let mut wp = test.launch_walproposer(lsn);
+
+    // wait for walproposer to start
+    test.poll_for_duration(30);
+
+    // just write some WAL
+    for _ in 0..100 {
+        wp.write_tx(1);
+        test.poll_for_duration(5);
+    }
+}
diff --git a/safekeeper/tests/walproposer_sim/block_storage.rs b/safekeeper/tests/walproposer_sim/block_storage.rs
new file mode 100644
index 0000000000..468c02ad2f
--- /dev/null
+++ b/safekeeper/tests/walproposer_sim/block_storage.rs
@@ -0,0 +1,57 @@
+use std::collections::HashMap;
+
+const BLOCK_SIZE: usize = 8192;
+
+/// A simple in-memory implementation of block storage. Can be used to implement external
+/// storage in tests.
+pub struct BlockStorage {
+    blocks: HashMap<u64, [u8; BLOCK_SIZE]>,
+}
+
+impl Default for BlockStorage {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl BlockStorage {
+    pub fn new() -> Self {
+        BlockStorage {
+            blocks: HashMap::new(),
+        }
+    }
+
+    pub fn read(&self, pos: u64, buf: &mut [u8]) {
+        let mut buf_offset = 0;
+        let mut storage_pos = pos;
+        while buf_offset < buf.len() {
+            let block_id = storage_pos / BLOCK_SIZE as u64;
+            let block = self.blocks.get(&block_id).unwrap_or(&[0; BLOCK_SIZE]);
+            let block_offset = storage_pos % BLOCK_SIZE as u64;
+            let block_len = BLOCK_SIZE as u64 - block_offset;
+            let buf_len = buf.len() - buf_offset;
+            let copy_len = std::cmp::min(block_len as usize, buf_len);
+            buf[buf_offset..buf_offset + copy_len]
+                .copy_from_slice(&block[block_offset as usize..block_offset as usize + copy_len]);
+            buf_offset += copy_len;
+            storage_pos += copy_len as u64;
+        }
+    }
+
+    pub fn write(&mut self, pos: u64, buf: &[u8]) {
+        let mut buf_offset = 0;
+        let mut storage_pos = pos;
+        while buf_offset < buf.len() {
+            let block_id = storage_pos / BLOCK_SIZE as u64;
+            let block = self.blocks.entry(block_id).or_insert([0; BLOCK_SIZE]);
+            let block_offset = storage_pos % BLOCK_SIZE as u64;
+            let block_len = BLOCK_SIZE as u64 - block_offset;
+            let buf_len = buf.len() - buf_offset;
+            let copy_len = std::cmp::min(block_len as usize, buf_len);
+            block[block_offset as usize..block_offset as usize + copy_len]
+                .copy_from_slice(&buf[buf_offset..buf_offset + copy_len]);
+            buf_offset += copy_len;
+            storage_pos += copy_len as u64;
+        }
+    }
+}
diff --git a/safekeeper/tests/walproposer_sim/log.rs b/safekeeper/tests/walproposer_sim/log.rs
new file mode 100644
index 0000000000..870f30de4f
--- /dev/null
+++ b/safekeeper/tests/walproposer_sim/log.rs
@@ -0,0 +1,77 @@
+use std::{fmt, sync::Arc};
+
+use desim::time::Timing;
+use once_cell::sync::OnceCell;
+use parking_lot::Mutex;
+use tracing_subscriber::fmt::{format::Writer, time::FormatTime};
+
+/// SimClock can be plugged into tracing logger to print simulation time.
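+/// The inner clock pointer is installed later via set_clock(), once a World exists.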
+#[derive(Clone)]
+pub struct SimClock {
+    clock_ptr: Arc<Mutex<Option<Arc<Timing>>>>,
+}
+
+impl Default for SimClock {
+    fn default() -> Self {
+        SimClock {
+            clock_ptr: Arc::new(Mutex::new(None)),
+        }
+    }
+}
+
+impl SimClock {
+    pub fn set_clock(&self, clock: Arc<Timing>) {
+        *self.clock_ptr.lock() = Some(clock);
+    }
+}
+
+impl FormatTime for SimClock {
+    fn format_time(&self, w: &mut Writer<'_>) -> fmt::Result {
+        let clock = self.clock_ptr.lock();
+
+        if let Some(clock) = clock.as_ref() {
+            let now = clock.now();
+            write!(w, "[{}]", now)
+        } else {
+            write!(w, "[?]")
+        }
+    }
+}
+
+static LOGGING_DONE: OnceCell<SimClock> = OnceCell::new();
+
+/// Returns a ptr to the clock attached to the tracing logger, so it can be updated
+/// when the world is (re)created.
+pub fn init_tracing_logger(debug_enabled: bool) -> SimClock {
+    LOGGING_DONE
+        .get_or_init(|| {
+            let clock = SimClock::default();
+            let base_logger = tracing_subscriber::fmt()
+                .with_target(false)
+                // prefix log lines with the simulated time timestamp
+                .with_timer(clock.clone())
+                // .with_ansi(true) TODO
+                .with_max_level(match debug_enabled {
+                    true => tracing::Level::DEBUG,
+                    false => tracing::Level::WARN,
+                })
+                .with_writer(std::io::stdout);
+            base_logger.init();
+
+            // logging::replace_panic_hook_with_tracing_panic_hook().forget();
+
+            if !debug_enabled {
+                std::panic::set_hook(Box::new(|_| {}));
+            }
+
+            clock
+        })
+        .clone()
+}
+
+pub fn init_logger() -> SimClock {
+    // The RUST_TRACEBACK env var controls whether we print all logs or only warnings.
+    let debug_enabled = std::env::var("RUST_TRACEBACK").is_ok();
+
+    init_tracing_logger(debug_enabled)
+}
diff --git a/safekeeper/tests/walproposer_sim/mod.rs b/safekeeper/tests/walproposer_sim/mod.rs
new file mode 100644
index 0000000000..ec560dcb3b
--- /dev/null
+++ b/safekeeper/tests/walproposer_sim/mod.rs
@@ -0,0 +1,8 @@
+pub mod block_storage;
+pub mod log;
+pub mod safekeeper;
+pub mod safekeeper_disk;
+pub mod simulation;
+pub mod simulation_logs;
+pub mod walproposer_api;
+pub mod walproposer_disk;
diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs
new file mode 100644
index 0000000000..771d905c90
--- /dev/null
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -0,0 +1,428 @@
+//! Safekeeper communication endpoint to WAL proposer (compute node).
+//! Gets messages from the network, passes them down to consensus module and
+//! sends replies back.
+
+use std::{collections::HashMap, sync::Arc, time::Duration};
+
+use anyhow::{bail, Result};
+use bytes::{Bytes, BytesMut};
+use camino::Utf8PathBuf;
+use desim::{
+    executor::{self, PollSome},
+    network::TCP,
+    node_os::NodeOs,
+    proto::{AnyMessage, NetEvent, NodeEvent},
+};
+use hyper::Uri;
+use safekeeper::{
+    safekeeper::{ProposerAcceptorMessage, SafeKeeper, ServerInfo, UNKNOWN_SERVER_VERSION},
+    state::{TimelinePersistentState, TimelineState},
+    timeline::TimelineError,
+    wal_storage::Storage,
+    SafeKeeperConf,
+};
+use tracing::{debug, info_span, warn};
+use utils::{
+    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
+    lsn::Lsn,
+};
+
+use super::safekeeper_disk::{DiskStateStorage, DiskWALStorage, SafekeeperDisk, TimelineDisk};
+
+struct SharedState {
+    sk: SafeKeeper<DiskStateStorage, DiskWALStorage>,
+    disk: Arc<TimelineDisk>,
+}
+
+struct GlobalMap {
+    timelines: HashMap<TenantTimelineId, SharedState>,
+    conf: SafeKeeperConf,
+    disk: Arc<SafekeeperDisk>,
+}
+
+impl GlobalMap {
+    /// Restores global state from disk.
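+    /// Fails if any stored timeline has an uninitialized wal_seg_size or pg_version,
+    /// or a commit_lsn below its local_start_lsn.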
+    fn new(disk: Arc<SafekeeperDisk>, conf: SafeKeeperConf) -> Result<Self> {
+        let mut timelines = HashMap::new();
+
+        for (&ttid, disk) in disk.timelines.lock().iter() {
+            debug!("loading timeline {}", ttid);
+            let state = disk.state.lock().clone();
+
+            if state.server.wal_seg_size == 0 {
+                bail!(TimelineError::UninitializedWalSegSize(ttid));
+            }
+
+            if state.server.pg_version == UNKNOWN_SERVER_VERSION {
+                bail!(TimelineError::UninitialinzedPgVersion(ttid));
+            }
+
+            if state.commit_lsn < state.local_start_lsn {
+                bail!(
+                    "commit_lsn {} is lower than local_start_lsn {}",
+                    state.commit_lsn,
+                    state.local_start_lsn
+                );
+            }
+
+            let control_store = DiskStateStorage::new(disk.clone());
+            let wal_store = DiskWALStorage::new(disk.clone(), &control_store)?;
+
+            let sk = SafeKeeper::new(TimelineState::new(control_store), wal_store, conf.my_id)?;
+            timelines.insert(
+                ttid,
+                SharedState {
+                    sk,
+                    disk: disk.clone(),
+                },
+            );
+        }
+
+        Ok(Self {
+            timelines,
+            conf,
+            disk,
+        })
+    }
+
+    fn create(&mut self, ttid: TenantTimelineId, server_info: ServerInfo) -> Result<()> {
+        if self.timelines.contains_key(&ttid) {
+            bail!("timeline {} already exists", ttid);
+        }
+
+        debug!("creating new timeline {}", ttid);
+
+        let commit_lsn = Lsn::INVALID;
+        let local_start_lsn = Lsn::INVALID;
+
+        let state =
+            TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);
+
+        if state.server.wal_seg_size == 0 {
+            bail!(TimelineError::UninitializedWalSegSize(ttid));
+        }
+
+        if state.server.pg_version == UNKNOWN_SERVER_VERSION {
+            bail!(TimelineError::UninitialinzedPgVersion(ttid));
+        }
+
+        if state.commit_lsn < state.local_start_lsn {
+            bail!(
+                "commit_lsn {} is lower than local_start_lsn {}",
+                state.commit_lsn,
+                state.local_start_lsn
+            );
+        }
+
+        let disk_timeline = self.disk.put_state(&ttid, state);
+        let control_store = DiskStateStorage::new(disk_timeline.clone());
+        let wal_store = DiskWALStorage::new(disk_timeline.clone(), &control_store)?;
+
+        let sk = SafeKeeper::new(
+            TimelineState::new(control_store),
+            wal_store,
+            self.conf.my_id,
+        )?;
+
+        self.timelines.insert(
+            ttid,
+            SharedState {
+                sk,
+                disk: disk_timeline,
+            },
+        );
+        Ok(())
+    }
+
+    fn get(&mut self, ttid: &TenantTimelineId) -> &mut SharedState {
+        self.timelines.get_mut(ttid).expect("timeline must exist")
+    }
+
+    fn has_tli(&self, ttid: &TenantTimelineId) -> bool {
+        self.timelines.contains_key(ttid)
+    }
+}
+
+/// State of a single connection to walproposer.
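+/// Each connection owns a current-thread tokio runtime so that this synchronous
+/// simulation code can block_on the async SafeKeeper API (see process_sk_msg).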
+struct ConnState {
+    tcp: TCP,
+
+    greeting: bool,
+    ttid: TenantTimelineId,
+    flush_pending: bool,
+
+    runtime: tokio::runtime::Runtime,
+}
+
+pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
+    let _enter = info_span!("safekeeper", id = os.id()).entered();
+    debug!("started server");
+    os.log_event("started;safekeeper".to_owned());
+    let conf = SafeKeeperConf {
+        workdir: Utf8PathBuf::from("."),
+        my_id: NodeId(os.id() as u64),
+        listen_pg_addr: String::new(),
+        listen_http_addr: String::new(),
+        no_sync: false,
+        broker_endpoint: "/".parse::<Uri>().unwrap(),
+        broker_keepalive_interval: Duration::from_secs(0),
+        heartbeat_timeout: Duration::from_secs(0),
+        remote_storage: None,
+        max_offloader_lag_bytes: 0,
+        wal_backup_enabled: false,
+        listen_pg_addr_tenant_only: None,
+        advertise_pg_addr: None,
+        availability_zone: None,
+        peer_recovery_enabled: false,
+        backup_parallel_jobs: 0,
+        pg_auth: None,
+        pg_tenant_only_auth: None,
+        http_auth: None,
+        sk_auth_token: None,
+        current_thread_runtime: false,
+        walsenders_keep_horizon: false,
+        partial_backup_timeout: Duration::from_secs(0),
+        disable_periodic_broker_push: false,
+        enable_offload: false,
+        delete_offloaded_wal: false,
+        control_file_save_interval: Duration::from_secs(1),
+        partial_backup_concurrency: 1,
+        eviction_min_resident: Duration::ZERO,
+    };
+
+    let mut global = GlobalMap::new(disk, conf.clone())?;
+    let mut conns: HashMap<usize, ConnState> = HashMap::new();
+
+    for (&_ttid, shared_state) in global.timelines.iter_mut() {
+        let flush_lsn = shared_state.sk.wal_store.flush_lsn();
+        let commit_lsn = shared_state.sk.state.commit_lsn;
+        os.log_event(format!("tli_loaded;{};{}", flush_lsn.0, commit_lsn.0));
+    }
+
+    let node_events = os.node_events();
+    let mut epoll_vec: Vec<Box<dyn PollSome>> = vec![];
+    let mut epoll_idx: Vec<usize> = vec![];
+
+    // TODO: batch events processing (multiple events per tick)
+    loop {
+        epoll_vec.clear();
+        epoll_idx.clear();
+
+        // node events channel
+        epoll_vec.push(Box::new(node_events.clone()));
+        epoll_idx.push(0);
+
+        // tcp connections
+        for conn in conns.values() {
+            epoll_vec.push(Box::new(conn.tcp.recv_chan()));
+            epoll_idx.push(conn.tcp.connection_id());
+        }
+
+        // waiting for the next message
+        let index = executor::epoll_chans(&epoll_vec, -1).unwrap();
+
+        if index == 0 {
+            // got a new connection
+            match node_events.must_recv() {
+                NodeEvent::Accept(tcp) => {
+                    conns.insert(
+                        tcp.connection_id(),
+                        ConnState {
+                            tcp,
+                            greeting: false,
+                            ttid: TenantTimelineId::empty(),
+                            flush_pending: false,
+                            runtime: tokio::runtime::Builder::new_current_thread().build()?,
+                        },
+                    );
+                }
+                NodeEvent::Internal(_) => unreachable!(),
+            }
+            continue;
+        }
+
+        let connection_id = epoll_idx[index];
+        let conn = conns.get_mut(&connection_id).unwrap();
+        let mut next_event = Some(conn.tcp.recv_chan().must_recv());
+
+        loop {
+            let event = match next_event {
+                Some(event) => event,
+                None => break,
+            };
+
+            match event {
+                NetEvent::Message(msg) => {
+                    let res = conn.process_any(msg, &mut global);
+                    if res.is_err() {
+                        let e = res.unwrap_err();
+                        let estr = e.to_string();
+                        if !estr.contains("finished processing START_REPLICATION") {
+                            warn!("conn {:?} error: {:?}", connection_id, e);
+                            panic!("unexpected error at safekeeper: {:#}", e);
+                        }
+                        conns.remove(&connection_id);
+                        break;
+                    }
+                }
+                NetEvent::Closed => {
+                    // TODO: remove from conns?
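+                    // (Left as-is in the simulation; entries are pruned in the
+                    // flush pass below once flushing a connection fails.)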
+                }
+            }
+
+            next_event = conn.tcp.recv_chan().try_recv();
+        }
+
+        conns.retain(|_, conn| {
+            let res = conn.flush(&mut global);
+            if res.is_err() {
+                debug!("conn {:?} error: {:?}", conn.tcp, res);
+            }
+            res.is_ok()
+        });
+    }
+}
+
+impl ConnState {
+    /// Process a message from the network. It can be a START_REPLICATION request or a
+    /// valid ProposerAcceptorMessage.
+    fn process_any(&mut self, any: AnyMessage, global: &mut GlobalMap) -> Result<()> {
+        if let AnyMessage::Bytes(copy_data) = any {
+            let repl_prefix = b"START_REPLICATION ";
+            if !self.greeting && copy_data.starts_with(repl_prefix) {
+                self.process_start_replication(copy_data.slice(repl_prefix.len()..), global)?;
+                bail!("finished processing START_REPLICATION")
+            }
+
+            let msg = ProposerAcceptorMessage::parse(copy_data)?;
+            debug!("got msg: {:?}", msg);
+            self.process(msg, global)
+        } else {
+            bail!("unexpected message, expected AnyMessage::Bytes");
+        }
+    }
+
+    /// Process START_REPLICATION request.
+    fn process_start_replication(
+        &mut self,
+        copy_data: Bytes,
+        global: &mut GlobalMap,
+    ) -> Result<()> {
+        // format is "<tenant_id> <timeline_id> <start_lsn> <end_lsn>"
+        let str = String::from_utf8(copy_data.to_vec())?;
+
+        let mut parts = str.split(' ');
+        let tenant_id = parts.next().unwrap().parse::<TenantId>()?;
+        let timeline_id = parts.next().unwrap().parse::<TimelineId>()?;
+        let start_lsn = parts.next().unwrap().parse::<Lsn>()?;
+        let end_lsn = parts.next().unwrap().parse::<Lsn>()?;
+
+        let ttid = TenantTimelineId::new(tenant_id, timeline_id);
+        let shared_state = global.get(&ttid);
+
+        // read bytes from start_lsn to end_lsn
+        let mut buf = vec![0; (end_lsn - start_lsn) as usize];
+        shared_state.disk.wal.lock().read(start_lsn.0, &mut buf);
+
+        // send bytes to the client
+        self.tcp.send(AnyMessage::Bytes(Bytes::from(buf)));
+        Ok(())
+    }
+
+    /// Get or create a timeline.
+    fn init_timeline(
+        &mut self,
+        ttid: TenantTimelineId,
+        server_info: ServerInfo,
+        global: &mut GlobalMap,
+    ) -> Result<()> {
+        self.ttid = ttid;
+        if global.has_tli(&ttid) {
+            return Ok(());
+        }
+
+        global.create(ttid, server_info)
+    }
+
+    /// Process a ProposerAcceptorMessage.
+    fn process(&mut self, msg: ProposerAcceptorMessage, global: &mut GlobalMap) -> Result<()> {
+        if !self.greeting {
+            self.greeting = true;
+
+            match msg {
+                ProposerAcceptorMessage::Greeting(ref greeting) => {
+                    tracing::info!(
+                        "start handshake with walproposer {:?} {:?}",
+                        self.tcp,
+                        greeting
+                    );
+                    let server_info = ServerInfo {
+                        pg_version: greeting.pg_version,
+                        system_id: greeting.system_id,
+                        wal_seg_size: greeting.wal_seg_size,
+                    };
+                    let ttid = TenantTimelineId::new(greeting.tenant_id, greeting.timeline_id);
+                    self.init_timeline(ttid, server_info, global)?
+                }
+                _ => {
+                    bail!("unexpected message {msg:?} instead of greeting");
+                }
+            }
+        }
+
+        let tli = global.get(&self.ttid);
+
+        match msg {
+            ProposerAcceptorMessage::AppendRequest(append_request) => {
+                self.flush_pending = true;
+                self.process_sk_msg(
+                    tli,
+                    &ProposerAcceptorMessage::NoFlushAppendRequest(append_request),
+                )?;
+            }
+            other => {
+                self.process_sk_msg(tli, &other)?;
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Process FlushWAL if needed.
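+    /// A no-op unless a preceding AppendRequest set flush_pending.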
+    fn flush(&mut self, global: &mut GlobalMap) -> Result<()> {
+        // TODO: try to add extra flushes in simulation, to verify that extra flushes don't break anything
+        if !self.flush_pending {
+            return Ok(());
+        }
+        self.flush_pending = false;
+        let shared_state = global.get(&self.ttid);
+        self.process_sk_msg(shared_state, &ProposerAcceptorMessage::FlushWAL)
+    }
+
+    /// Make the safekeeper process a message and send a reply over TCP.
+    fn process_sk_msg(
+        &mut self,
+        shared_state: &mut SharedState,
+        msg: &ProposerAcceptorMessage,
+    ) -> Result<()> {
+        let mut reply = self.runtime.block_on(shared_state.sk.process_msg(msg))?;
+        if let Some(reply) = &mut reply {
+            // TODO: if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn
+
+            let mut buf = BytesMut::with_capacity(128);
+            reply.serialize(&mut buf)?;
+
+            self.tcp.send(AnyMessage::Bytes(buf.into()));
+        }
+        Ok(())
+    }
+}
+
+impl Drop for ConnState {
+    fn drop(&mut self) {
+        debug!("dropping conn: {:?}", self.tcp);
+        if !std::thread::panicking() {
+            self.tcp.close();
+        }
+        // TODO: clean up non-fsynced WAL
+    }
+}
diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
new file mode 100644
index 0000000000..b854754ecf
--- /dev/null
+++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
@@ -0,0 +1,284 @@
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use parking_lot::Mutex;
+use safekeeper::state::TimelinePersistentState;
+use utils::id::TenantTimelineId;
+
+use super::block_storage::BlockStorage;
+
+use std::{ops::Deref, time::Instant};
+
+use anyhow::Result;
+use bytes::{Buf, BytesMut};
+use futures::future::BoxFuture;
+use postgres_ffi::{waldecoder::WalStreamDecoder, XLogSegNo};
+use safekeeper::{control_file, metrics::WalStorageMetrics, wal_storage};
+use tracing::{debug, info};
+use utils::lsn::Lsn;
+
+/// All safekeeper state that is usually saved to disk.
+pub struct SafekeeperDisk {
+    pub timelines: Mutex<HashMap<TenantTimelineId, Arc<TimelineDisk>>>,
+}
+
+impl Default for SafekeeperDisk {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SafekeeperDisk {
+    pub fn new() -> Self {
+        SafekeeperDisk {
+            timelines: Mutex::new(HashMap::new()),
+        }
+    }
+
+    pub fn put_state(
+        &self,
+        ttid: &TenantTimelineId,
+        state: TimelinePersistentState,
+    ) -> Arc<TimelineDisk> {
+        self.timelines
+            .lock()
+            .entry(*ttid)
+            .and_modify(|e| {
+                let mut mu = e.state.lock();
+                *mu = state.clone();
+            })
+            .or_insert_with(|| {
+                Arc::new(TimelineDisk {
+                    state: Mutex::new(state),
+                    wal: Mutex::new(BlockStorage::new()),
+                })
+            })
+            .clone()
+    }
+}
+
+/// Control file state and WAL storage.
+pub struct TimelineDisk {
+    pub state: Mutex<TimelinePersistentState>,
+    pub wal: Mutex<BlockStorage>,
+}
+
+/// Implementation of the `control_file::Storage` trait.
+pub struct DiskStateStorage {
+    persisted_state: TimelinePersistentState,
+    disk: Arc<TimelineDisk>,
+    last_persist_at: Instant,
+}
+
+impl DiskStateStorage {
+    pub fn new(disk: Arc<TimelineDisk>) -> Self {
+        let guard = disk.state.lock();
+        let state = guard.clone();
+        drop(guard);
+        DiskStateStorage {
+            persisted_state: state,
+            disk,
+            last_persist_at: Instant::now(),
+        }
+    }
+}
+
+impl control_file::Storage for DiskStateStorage {
+    /// Persist safekeeper state on disk and update internal state.
+    async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> {
+        self.persisted_state = s.clone();
+        *self.disk.state.lock() = s.clone();
+        Ok(())
+    }
+
+    /// Timestamp of last persist.
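+    /// (Simulation stub: set once at creation and never updated by persist().)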
+    fn last_persist_at(&self) -> Instant {
+        // TODO: don't rely on it in tests
+        self.last_persist_at
+    }
+}
+
+impl Deref for DiskStateStorage {
+    type Target = TimelinePersistentState;
+
+    fn deref(&self) -> &Self::Target {
+        &self.persisted_state
+    }
+}
+
+/// Implementation of the `wal_storage::Storage` trait.
+pub struct DiskWALStorage {
+    /// Written to disk, but possibly still in the cache and not fully persisted.
+    /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record.
+    write_lsn: Lsn,
+
+    /// The LSN of the last WAL record written to disk. Still can be not fully flushed.
+    write_record_lsn: Lsn,
+
+    /// The LSN of the last WAL record flushed to disk.
+    flush_record_lsn: Lsn,
+
+    /// Decoder is required for detecting boundaries of WAL records.
+    decoder: WalStreamDecoder,
+
+    /// Bytes of WAL records that are not yet written to disk.
+    unflushed_bytes: BytesMut,
+
+    /// Contains BlockStorage for WAL.
+    disk: Arc<TimelineDisk>,
+}
+
+impl DiskWALStorage {
+    pub fn new(disk: Arc<TimelineDisk>, state: &TimelinePersistentState) -> Result<Self> {
+        let write_lsn = if state.commit_lsn == Lsn(0) {
+            Lsn(0)
+        } else {
+            Self::find_end_of_wal(disk.clone(), state.commit_lsn)?
+        };
+
+        let flush_lsn = write_lsn;
+        Ok(DiskWALStorage {
+            write_lsn,
+            write_record_lsn: flush_lsn,
+            flush_record_lsn: flush_lsn,
+            decoder: WalStreamDecoder::new(flush_lsn, 16),
+            unflushed_bytes: BytesMut::new(),
+            disk,
+        })
+    }
+
+    fn find_end_of_wal(disk: Arc<TimelineDisk>, start_lsn: Lsn) -> Result<Lsn> {
+        let mut buf = [0; 8192];
+        let mut pos = start_lsn.0;
+        let mut decoder = WalStreamDecoder::new(start_lsn, 16);
+        let mut result = start_lsn;
+        loop {
+            disk.wal.lock().read(pos, &mut buf);
+            pos += buf.len() as u64;
+            decoder.feed_bytes(&buf);
+
+            loop {
+                match decoder.poll_decode() {
+                    Ok(Some(record)) => result = record.0,
+                    Err(e) => {
+                        debug!(
+                            "find_end_of_wal reached end at {:?}, decode error: {:?}",
+                            result, e
+                        );
+                        return Ok(result);
+                    }
+                    Ok(None) => break, // need more data
+                }
+            }
+        }
+    }
+}
+
+impl wal_storage::Storage for DiskWALStorage {
+    // Last written LSN.
+    fn write_lsn(&self) -> Lsn {
+        self.write_lsn
+    }
+
+    /// LSN of last durably stored WAL record.
+    fn flush_lsn(&self) -> Lsn {
+        self.flush_record_lsn
+    }
+
+    async fn initialize_first_segment(&mut self, _init_lsn: Lsn) -> Result<()> {
+        Ok(())
+    }
+
+    /// Write piece of WAL from buf to disk, but not necessarily sync it.
+    async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
+        if self.write_lsn != startpos {
+            panic!("write_wal called with wrong startpos");
+        }
+
+        self.unflushed_bytes.extend_from_slice(buf);
+        self.write_lsn += buf.len() as u64;
+
+        if self.decoder.available() != startpos {
+            info!(
+                "restart decoder from {} to {}",
+                self.decoder.available(),
+                startpos,
+            );
+            self.decoder = WalStreamDecoder::new(startpos, 16);
+        }
+        self.decoder.feed_bytes(buf);
+        loop {
+            match self.decoder.poll_decode()? {
+                None => break, // no full record yet
+                Some((lsn, _rec)) => {
+                    self.write_record_lsn = lsn;
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Truncate WAL at specified LSN, which must be the end of WAL record.
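+    /// Everything between end_pos and the old write position is zeroed out on the
+    /// simulated disk.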
+    async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
+        if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
+            panic!(
+                "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}",
+                self.write_lsn, end_pos
+            );
+        }
+
+        self.flush_wal().await?;
+
+        // write zeroes to disk from end_pos until self.write_lsn
+        let buf = [0; 8192];
+        let mut pos = end_pos.0;
+        while pos < self.write_lsn.0 {
+            self.disk.wal.lock().write(pos, &buf);
+            pos += buf.len() as u64;
+        }
+
+        self.write_lsn = end_pos;
+        self.write_record_lsn = end_pos;
+        self.flush_record_lsn = end_pos;
+        self.unflushed_bytes.clear();
+        self.decoder = WalStreamDecoder::new(end_pos, 16);
+
+        Ok(())
+    }
+
+    /// Durably store WAL on disk, up to the last written WAL record.
+    async fn flush_wal(&mut self) -> Result<()> {
+        if self.flush_record_lsn == self.write_record_lsn {
+            // no need to do extra flush
+            return Ok(());
+        }
+
+        let num_bytes = self.write_record_lsn.0 - self.flush_record_lsn.0;
+
+        self.disk.wal.lock().write(
+            self.flush_record_lsn.0,
+            &self.unflushed_bytes[..num_bytes as usize],
+        );
+        self.unflushed_bytes.advance(num_bytes as usize);
+        self.flush_record_lsn = self.write_record_lsn;
+
+        Ok(())
+    }
+
+    /// Remove all segments <= given segno. Returns function doing that as we
+    /// want to perform it without timeline lock.
+    fn remove_up_to(&self, _segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>> {
+        Box::pin(async move { Ok(()) })
+    }
+
+    /// Release resources associated with the storage -- technically, close FDs.
+    /// Currently we don't remove timelines until restart (#3146), so need to
+    /// spare descriptors. This would be useful for temporary tli detach as
+    /// well.
+    fn close(&mut self) {}
+
+    /// Get metrics for this timeline.
+    fn get_metrics(&self) -> WalStorageMetrics {
+        WalStorageMetrics::default()
+    }
+}
diff --git a/safekeeper/tests/walproposer_sim/simulation.rs b/safekeeper/tests/walproposer_sim/simulation.rs
new file mode 100644
index 0000000000..0d7aaf517b
--- /dev/null
+++ b/safekeeper/tests/walproposer_sim/simulation.rs
@@ -0,0 +1,436 @@
+use std::{cell::Cell, str::FromStr, sync::Arc};
+
+use crate::walproposer_sim::{safekeeper::run_server, walproposer_api::SimulationApi};
+use desim::{
+    executor::{self, ExternalHandle},
+    node_os::NodeOs,
+    options::{Delay, NetworkOptions},
+    proto::{AnyMessage, NodeEvent},
+    world::Node,
+    world::World,
+};
+use rand::{Rng, SeedableRng};
+use tracing::{debug, info_span, warn};
+use utils::{id::TenantTimelineId, lsn::Lsn};
+use walproposer::walproposer::{Config, Wrapper};
+
+use super::{
+    log::SimClock, safekeeper_disk::SafekeeperDisk, walproposer_api,
+    walproposer_disk::DiskWalProposer,
+};
+
+/// Simulated safekeeper node.
+pub struct SafekeeperNode {
+    pub node: Arc<Node>,
+    pub id: u32,
+    pub disk: Arc<SafekeeperDisk>,
+    pub thread: Cell<ExternalHandle>,
+}
+
+impl SafekeeperNode {
+    /// Create and start a safekeeper at the specified Node.
+    pub fn new(node: Arc<Node>) -> Self {
+        let disk = Arc::new(SafekeeperDisk::new());
+        let thread = Cell::new(SafekeeperNode::launch(disk.clone(), node.clone()));
+
+        Self {
+            id: node.id,
+            node,
+            disk,
+            thread,
+        }
+    }
+
+    fn launch(disk: Arc<SafekeeperDisk>, node: Arc<Node>) -> ExternalHandle {
+        // start the server thread
+        node.launch(move |os| {
+            run_server(os, disk).expect("server should finish without errors");
+        })
+    }
+
+    /// Restart the safekeeper.
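+    /// The replacement thread is launched before the old one is crash-stopped.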
+    pub fn restart(&self) {
+        let new_thread = SafekeeperNode::launch(self.disk.clone(), self.node.clone());
+        let old_thread = self.thread.replace(new_thread);
+        old_thread.crash_stop();
+    }
+}
+
+/// Simulated walproposer node.
+pub struct WalProposer {
+    thread: ExternalHandle,
+    node: Arc<Node>,
+    disk: Arc<DiskWalProposer>,
+    sync_safekeepers: bool,
+}
+
+impl WalProposer {
+    /// Generic start function for both modes.
+    fn start(
+        os: NodeOs,
+        disk: Arc<DiskWalProposer>,
+        ttid: TenantTimelineId,
+        addrs: Vec<String>,
+        lsn: Option<Lsn>,
+    ) {
+        let sync_safekeepers = lsn.is_none();
+
+        let _enter = if sync_safekeepers {
+            info_span!("sync", started = executor::now()).entered()
+        } else {
+            info_span!("walproposer", started = executor::now()).entered()
+        };
+
+        os.log_event(format!("started;walproposer;{}", sync_safekeepers as i32));
+
+        let config = Config {
+            ttid,
+            safekeepers_list: addrs,
+            safekeeper_reconnect_timeout: 1000,
+            safekeeper_connection_timeout: 5000,
+            sync_safekeepers,
+        };
+        let args = walproposer_api::Args {
+            os,
+            config: config.clone(),
+            disk,
+            redo_start_lsn: lsn,
+        };
+        let api = SimulationApi::new(args);
+        let wp = Wrapper::new(Box::new(api), config);
+        wp.start();
+    }
+
+    /// Start walproposer in sync_safekeepers mode.
+    pub fn launch_sync(ttid: TenantTimelineId, addrs: Vec<String>, node: Arc<Node>) -> Self {
+        debug!("sync_safekeepers started at node {}", node.id);
+        let disk = DiskWalProposer::new();
+        let disk_wp = disk.clone();
+
+        // start the client thread
+        let handle = node.launch(move |os| {
+            WalProposer::start(os, disk_wp, ttid, addrs, None);
+        });
+
+        Self {
+            thread: handle,
+            node,
+            disk,
+            sync_safekeepers: true,
+        }
+    }
+
+    /// Start walproposer in normal mode.
+    pub fn launch_walproposer(
+        ttid: TenantTimelineId,
+        addrs: Vec<String>,
+        node: Arc<Node>,
+        lsn: Lsn,
+    ) -> Self {
+        debug!("walproposer started at node {}", node.id);
+        let disk = DiskWalProposer::new();
+        disk.lock().reset_to(lsn);
+        let disk_wp = disk.clone();
+
+        // start the client thread
+        let handle = node.launch(move |os| {
+            WalProposer::start(os, disk_wp, ttid, addrs, Some(lsn));
+        });
+
+        Self {
+            thread: handle,
+            node,
+            disk,
+            sync_safekeepers: false,
+        }
+    }
+
+    pub fn write_tx(&mut self, cnt: usize) {
+        let start_lsn = self.disk.lock().flush_rec_ptr();
+
+        for _ in 0..cnt {
+            self.disk
+                .lock()
+                .insert_logical_message("prefix", b"message")
+                .expect("failed to generate logical message");
+        }
+
+        let end_lsn = self.disk.lock().flush_rec_ptr();
+
+        // log event
+        self.node
+            .log_event(format!("write_wal;{};{};{}", start_lsn.0, end_lsn.0, cnt));
+
+        // now we need to set "Latch" in walproposer
+        self.node
+            .node_events()
+            .send(NodeEvent::Internal(AnyMessage::Just32(0)));
+    }
+
+    pub fn stop(&self) {
+        self.thread.crash_stop();
+    }
+}
+
+/// Holds basic simulation settings, such as network options.
+pub struct TestConfig {
+    pub network: NetworkOptions,
+    pub timeout: u64,
+    pub clock: Option<SimClock>,
+}
+
+impl TestConfig {
+    /// Create a new TestConfig with default settings.
+    pub fn new(clock: Option<SimClock>) -> Self {
+        Self {
+            network: NetworkOptions {
+                keepalive_timeout: Some(2000),
+                connect_delay: Delay {
+                    min: 1,
+                    max: 5,
+                    fail_prob: 0.0,
+                },
+                send_delay: Delay {
+                    min: 1,
+                    max: 5,
+                    fail_prob: 0.0,
+                },
+            },
+            timeout: 1_000 * 10,
+            clock,
+        }
+    }
+
+    /// Start a new simulation with the specified seed.
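+    /// Creates a World with three safekeeper nodes and a fresh ttid.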
+    pub fn start(&self, seed: u64) -> Test {
+        let world = Arc::new(World::new(seed, Arc::new(self.network.clone())));
+
+        if let Some(clock) = &self.clock {
+            clock.set_clock(world.clock());
+        }
+
+        let servers = [
+            SafekeeperNode::new(world.new_node()),
+            SafekeeperNode::new(world.new_node()),
+            SafekeeperNode::new(world.new_node()),
+        ];
+
+        let server_ids = [servers[0].id, servers[1].id, servers[2].id];
+        let safekeepers_addrs = server_ids.map(|id| format!("node:{}", id)).to_vec();
+
+        let ttid = TenantTimelineId::generate();
+
+        Test {
+            world,
+            servers,
+            sk_list: safekeepers_addrs,
+            ttid,
+            timeout: self.timeout,
+        }
+    }
+}
+
+/// Holds simulation state.
+pub struct Test {
+    pub world: Arc<World>,
+    pub servers: [SafekeeperNode; 3],
+    pub sk_list: Vec<String>,
+    pub ttid: TenantTimelineId,
+    pub timeout: u64,
+}
+
+impl Test {
+    /// Start a sync_safekeepers thread and wait for it to finish.
+    pub fn sync_safekeepers(&self) -> anyhow::Result<Lsn> {
+        let wp = self.launch_sync_safekeepers();
+
+        // poll until exit or timeout
+        let time_limit = self.timeout;
+        while self.world.step() && self.world.now() < time_limit && !wp.thread.is_finished() {}
+
+        if !wp.thread.is_finished() {
+            anyhow::bail!("sync_safekeepers did not finish: timeout or stuck simulation");
+        }
+
+        let res = wp.thread.result();
+        if res.0 != 0 {
+            anyhow::bail!("non-zero exitcode: {:?}", res);
+        }
+        let lsn = Lsn::from_str(&res.1)?;
+        Ok(lsn)
+    }
+
+    /// Spawn a new sync_safekeepers thread.
+    pub fn launch_sync_safekeepers(&self) -> WalProposer {
+        WalProposer::launch_sync(self.ttid, self.sk_list.clone(), self.world.new_node())
+    }
+
+    /// Spawn a new walproposer thread.
+    pub fn launch_walproposer(&self, lsn: Lsn) -> WalProposer {
+        let lsn = if lsn.0 == 0 {
+            // usual LSN after basebackup
+            Lsn(21623024)
+        } else {
+            lsn
+        };
+
+        WalProposer::launch_walproposer(self.ttid, self.sk_list.clone(), self.world.new_node(), lsn)
+    }
+
+    /// Execute the simulation for the specified duration.
+    pub fn poll_for_duration(&self, duration: u64) {
+        let time_limit = std::cmp::min(self.world.now() + duration, self.timeout);
+        while self.world.step() && self.world.now() < time_limit {}
+    }
+
+    /// Execute the simulation together with events defined in some schedule.
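+    /// A `Schedule` is a list of `(time_in_ms, TestAction)` pairs, assumed to
+    /// be sorted by time, e.g.
+    /// `vec![(0, TestAction::WriteTx(5)), (100, TestAction::RestartWalProposer)]`.
+    /// Events whose timestamp has already passed are skipped with a warning.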
+ pub fn run_schedule(&self, schedule: &Schedule) -> anyhow::Result<()> { + // scheduling empty events so that world will stop in those points + { + let clock = self.world.clock(); + + let now = self.world.now(); + for (time, _) in schedule { + if *time < now { + continue; + } + clock.schedule_fake(*time - now); + } + } + + let mut wp = self.launch_sync_safekeepers(); + + let mut skipped_tx = 0; + let mut started_tx = 0; + + let mut schedule_ptr = 0; + + loop { + if wp.sync_safekeepers && wp.thread.is_finished() { + let res = wp.thread.result(); + if res.0 != 0 { + warn!("sync non-zero exitcode: {:?}", res); + debug!("restarting sync_safekeepers"); + // restart the sync_safekeepers + wp = self.launch_sync_safekeepers(); + continue; + } + let lsn = Lsn::from_str(&res.1)?; + debug!("sync_safekeepers finished at LSN {}", lsn); + wp = self.launch_walproposer(lsn); + debug!("walproposer started at thread {}", wp.thread.id()); + } + + let now = self.world.now(); + while schedule_ptr < schedule.len() && schedule[schedule_ptr].0 <= now { + if now != schedule[schedule_ptr].0 { + warn!("skipped event {:?} at {}", schedule[schedule_ptr], now); + } + + let action = &schedule[schedule_ptr].1; + match action { + TestAction::WriteTx(size) => { + if !wp.sync_safekeepers && !wp.thread.is_finished() { + started_tx += *size; + wp.write_tx(*size); + debug!("written {} transactions", size); + } else { + skipped_tx += size; + debug!("skipped {} transactions", size); + } + } + TestAction::RestartSafekeeper(id) => { + debug!("restarting safekeeper {}", id); + self.servers[*id].restart(); + } + TestAction::RestartWalProposer => { + debug!("restarting sync_safekeepers"); + wp.stop(); + wp = self.launch_sync_safekeepers(); + } + } + schedule_ptr += 1; + } + + if schedule_ptr == schedule.len() { + break; + } + let next_event_time = schedule[schedule_ptr].0; + + // poll until the next event + if wp.thread.is_finished() { + while self.world.step() && self.world.now() < next_event_time {} + } else { + while self.world.step() + && self.world.now() < next_event_time + && !wp.thread.is_finished() + {} + } + } + + debug!( + "finished schedule, total steps: {}", + self.world.get_thread_step_count() + ); + debug!("skipped_tx: {}", skipped_tx); + debug!("started_tx: {}", started_tx); + + Ok(()) + } +} + +#[derive(Debug, Clone)] +pub enum TestAction { + WriteTx(usize), + RestartSafekeeper(usize), + RestartWalProposer, +} + +pub type Schedule = Vec<(u64, TestAction)>; + +pub fn generate_schedule(seed: u64) -> Schedule { + let mut rng = rand::rngs::StdRng::seed_from_u64(seed); + let mut schedule = Vec::new(); + let mut time = 0; + + let cnt = rng.gen_range(1..100); + + for _ in 0..cnt { + time += rng.gen_range(0..500); + let action = match rng.gen_range(0..3) { + 0 => TestAction::WriteTx(rng.gen_range(1..10)), + 1 => TestAction::RestartSafekeeper(rng.gen_range(0..3)), + 2 => TestAction::RestartWalProposer, + _ => unreachable!(), + }; + schedule.push((time, action)); + } + + schedule +} + +pub fn generate_network_opts(seed: u64) -> NetworkOptions { + let mut rng = rand::rngs::StdRng::seed_from_u64(seed); + + let timeout = rng.gen_range(100..2000); + let max_delay = rng.gen_range(1..2 * timeout); + let min_delay = rng.gen_range(1..=max_delay); + + let max_fail_prob = rng.gen_range(0.0..0.9); + let connect_fail_prob = rng.gen_range(0.0..max_fail_prob); + let send_fail_prob = rng.gen_range(0.0..connect_fail_prob); + + NetworkOptions { + keepalive_timeout: Some(timeout), + connect_delay: Delay { + min: min_delay, + max: max_delay, + 
fail_prob: connect_fail_prob,
+        },
+        send_delay: Delay {
+            min: min_delay,
+            max: max_delay,
+            fail_prob: send_fail_prob,
+        },
+    }
+}
diff --git a/safekeeper/tests/walproposer_sim/simulation_logs.rs b/safekeeper/tests/walproposer_sim/simulation_logs.rs
new file mode 100644
index 0000000000..38885e5dd0
--- /dev/null
+++ b/safekeeper/tests/walproposer_sim/simulation_logs.rs
@@ -0,0 +1,187 @@
+use desim::proto::SimEvent;
+use tracing::debug;
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+enum NodeKind {
+    Unknown,
+    Safekeeper,
+    WalProposer,
+}
+
+impl Default for NodeKind {
+    fn default() -> Self {
+        Self::Unknown
+    }
+}
+
+/// Simulation state of walproposer/safekeeper, derived from the simulation logs.
+#[derive(Clone, Debug, Default)]
+struct NodeInfo {
+    kind: NodeKind,
+
+    // walproposer
+    is_sync: bool,
+    term: u64,
+    epoch_lsn: u64,
+
+    // safekeeper
+    commit_lsn: u64,
+    flush_lsn: u64,
+}
+
+impl NodeInfo {
+    fn init_kind(&mut self, kind: NodeKind) {
+        if self.kind == NodeKind::Unknown {
+            self.kind = kind;
+        } else {
+            assert!(self.kind == kind);
+        }
+    }
+
+    fn started(&mut self, data: &str) {
+        let mut parts = data.split(';');
+        assert!(parts.next().unwrap() == "started");
+        match parts.next().unwrap() {
+            "safekeeper" => {
+                self.init_kind(NodeKind::Safekeeper);
+            }
+            "walproposer" => {
+                self.init_kind(NodeKind::WalProposer);
+                let is_sync: u8 = parts.next().unwrap().parse().unwrap();
+                self.is_sync = is_sync != 0;
+            }
+            _ => unreachable!(),
+        }
+    }
+}
+
+/// Global state of the simulation, derived from the simulation logs.
+#[derive(Debug, Default)]
+struct GlobalState {
+    nodes: Vec<NodeInfo>,
+    commit_lsn: u64,
+    write_lsn: u64,
+    max_write_lsn: u64,
+
+    written_wal: u64,
+    written_records: u64,
+}
+
+impl GlobalState {
+    fn new() -> Self {
+        Default::default()
+    }
+
+    fn get(&mut self, id: u32) -> &mut NodeInfo {
+        let id = id as usize;
+        if id >= self.nodes.len() {
+            self.nodes.resize(id + 1, NodeInfo::default());
+        }
+        &mut self.nodes[id]
+    }
+}
+
+/// Try to find inconsistencies in the simulation log.
+pub fn validate_events(events: Vec<SimEvent>) {
+    const INITDB_LSN: u64 = 21623024;
+
+    let hook = std::panic::take_hook();
+    scopeguard::defer_on_success!
{ + std::panic::set_hook(hook); + }; + + let mut state = GlobalState::new(); + state.max_write_lsn = INITDB_LSN; + + for event in events { + debug!("{:?}", event); + + let node = state.get(event.node); + if event.data.starts_with("started;") { + node.started(&event.data); + continue; + } + assert!(node.kind != NodeKind::Unknown); + + // drop reference to unlock state + let mut node = node.clone(); + + let mut parts = event.data.split(';'); + match node.kind { + NodeKind::Safekeeper => match parts.next().unwrap() { + "tli_loaded" => { + let flush_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let commit_lsn: u64 = parts.next().unwrap().parse().unwrap(); + node.flush_lsn = flush_lsn; + node.commit_lsn = commit_lsn; + } + _ => unreachable!(), + }, + NodeKind::WalProposer => { + match parts.next().unwrap() { + "prop_elected" => { + let prop_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let prop_term: u64 = parts.next().unwrap().parse().unwrap(); + let prev_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let prev_term: u64 = parts.next().unwrap().parse().unwrap(); + + assert!(prop_lsn >= prev_lsn); + assert!(prop_term >= prev_term); + + assert!(prop_lsn >= state.commit_lsn); + + if prop_lsn > state.write_lsn { + assert!(prop_lsn <= state.max_write_lsn); + debug!( + "moving write_lsn up from {} to {}", + state.write_lsn, prop_lsn + ); + state.write_lsn = prop_lsn; + } + if prop_lsn < state.write_lsn { + debug!( + "moving write_lsn down from {} to {}", + state.write_lsn, prop_lsn + ); + state.write_lsn = prop_lsn; + } + + node.epoch_lsn = prop_lsn; + node.term = prop_term; + } + "write_wal" => { + assert!(!node.is_sync); + let start_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let end_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let cnt: u64 = parts.next().unwrap().parse().unwrap(); + + let size = end_lsn - start_lsn; + state.written_wal += size; + state.written_records += cnt; + + // TODO: If we allow writing WAL before winning the election + + assert!(start_lsn >= state.commit_lsn); + assert!(end_lsn >= start_lsn); + // assert!(start_lsn == state.write_lsn); + state.write_lsn = end_lsn; + + if end_lsn > state.max_write_lsn { + state.max_write_lsn = end_lsn; + } + } + "commit_lsn" => { + let lsn: u64 = parts.next().unwrap().parse().unwrap(); + assert!(lsn >= state.commit_lsn); + state.commit_lsn = lsn; + } + _ => unreachable!(), + } + } + _ => unreachable!(), + } + + // update the node in the state struct + *state.get(event.node) = node; + } +} diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs new file mode 100644 index 0000000000..5578c94cf6 --- /dev/null +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -0,0 +1,673 @@ +use std::{ + cell::{RefCell, RefMut, UnsafeCell}, + ffi::CStr, + sync::Arc, +}; + +use bytes::Bytes; +use desim::{ + executor::{self, PollSome}, + network::TCP, + node_os::NodeOs, + proto::{AnyMessage, NetEvent, NodeEvent}, + world::NodeId, +}; +use tracing::debug; +use utils::lsn::Lsn; +use walproposer::{ + api_bindings::Level, + bindings::{ + NeonWALReadResult, SafekeeperStateDesiredEvents, WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, + }, + walproposer::{ApiImpl, Config}, +}; + +use super::walproposer_disk::DiskWalProposer; + +/// Special state for each wp->sk connection. +struct SafekeeperConn { + host: String, + port: String, + node_id: NodeId, + // socket is Some(..) 
means the connection is established
+    socket: Option<TCP>,
+    // connection is in progress
+    is_connecting: bool,
+    // START_WAL_PUSH is in progress
+    is_start_wal_push: bool,
+    // pointer to Safekeeper in walproposer for callbacks
+    raw_ptr: *mut walproposer::bindings::Safekeeper,
+}
+
+impl SafekeeperConn {
+    pub fn new(host: String, port: String) -> Self {
+        // port number is the same as NodeId
+        let port_num = port.parse::<u32>().unwrap();
+        Self {
+            host,
+            port,
+            node_id: port_num,
+            socket: None,
+            is_connecting: false,
+            is_start_wal_push: false,
+            raw_ptr: std::ptr::null_mut(),
+        }
+    }
+}
+
+/// Simulation version of a postgres WaitEventSet. At pos 0 there is always
+/// a special NodeEvents channel, which is used as a latch.
+struct EventSet {
+    os: NodeOs,
+    // all pollable channels, 0 is always NodeEvent channel
+    chans: Vec<Box<dyn PollSome>>,
+    // 0 is always nullptr
+    sk_ptrs: Vec<*mut walproposer::bindings::Safekeeper>,
+    // event mask for each channel
+    masks: Vec<u32>,
+}
+
+impl EventSet {
+    pub fn new(os: NodeOs) -> Self {
+        let node_events = os.node_events();
+        Self {
+            os,
+            chans: vec![Box::new(node_events)],
+            sk_ptrs: vec![std::ptr::null_mut()],
+            masks: vec![WL_SOCKET_READABLE],
+        }
+    }
+
+    /// Leaves all readable channels at the beginning of the array.
+    fn sort_readable(&mut self) -> usize {
+        let mut cnt = 1;
+        for i in 1..self.chans.len() {
+            if self.masks[i] & WL_SOCKET_READABLE != 0 {
+                self.chans.swap(i, cnt);
+                self.sk_ptrs.swap(i, cnt);
+                self.masks.swap(i, cnt);
+                cnt += 1;
+            }
+        }
+        cnt
+    }
+
+    fn update_event_set(&mut self, conn: &SafekeeperConn, event_mask: u32) {
+        let index = self
+            .sk_ptrs
+            .iter()
+            .position(|&ptr| ptr == conn.raw_ptr)
+            .expect("safekeeper should exist in event set");
+        self.masks[index] = event_mask;
+    }
+
+    fn add_safekeeper(&mut self, sk: &SafekeeperConn, event_mask: u32) {
+        for ptr in self.sk_ptrs.iter() {
+            assert!(*ptr != sk.raw_ptr);
+        }
+
+        self.chans.push(Box::new(
+            sk.socket
+                .as_ref()
+                .expect("socket should not be closed")
+                .recv_chan(),
+        ));
+        self.sk_ptrs.push(sk.raw_ptr);
+        self.masks.push(event_mask);
+    }
+
+    fn remove_safekeeper(&mut self, sk: &SafekeeperConn) {
+        let index = self.sk_ptrs.iter().position(|&ptr| ptr == sk.raw_ptr);
+        if index.is_none() {
+            debug!("remove_safekeeper: sk={:?} not found", sk.raw_ptr);
+            return;
+        }
+        let index = index.unwrap();
+
+        self.chans.remove(index);
+        self.sk_ptrs.remove(index);
+        self.masks.remove(index);
+
+        // to simulate the actual behaviour
+        self.refresh_event_set();
+    }
+
+    /// Updates all masks to match the result of a SafekeeperStateDesiredEvents.
+    fn refresh_event_set(&mut self) {
+        for (i, mask) in self.masks.iter_mut().enumerate() {
+            if i == 0 {
+                continue;
+            }
+
+            let mut mask_sk: u32 = 0;
+            let mut mask_nwr: u32 = 0;
+            unsafe { SafekeeperStateDesiredEvents(self.sk_ptrs[i], &mut mask_sk, &mut mask_nwr) };
+
+            if mask_sk != *mask {
+                debug!(
+                    "refresh_event_set: sk={:?}, old_mask={:#b}, new_mask={:#b}",
+                    self.sk_ptrs[i], *mask, mask_sk
+                );
+                *mask = mask_sk;
+            }
+        }
+    }
+
+    /// Wait for events on all channels.
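+    /// Writeable sockets are reported immediately without polling, since
+    /// simulated sends never block. Otherwise the readable channels are
+    /// polled: index 0 maps to `WaitResult::Latch`, any other index to a
+    /// readable `Network` event, and `None` from `epoll_chans` to `Timeout`.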
+    fn wait(&mut self, timeout_millis: i64) -> walproposer::walproposer::WaitResult {
+        // all channels are always writeable
+        for (i, mask) in self.masks.iter().enumerate() {
+            if *mask & WL_SOCKET_WRITEABLE != 0 {
+                return walproposer::walproposer::WaitResult::Network(
+                    self.sk_ptrs[i],
+                    WL_SOCKET_WRITEABLE,
+                );
+            }
+        }
+
+        let cnt = self.sort_readable();
+
+        let slice = &self.chans[0..cnt];
+        match executor::epoll_chans(slice, timeout_millis) {
+            None => walproposer::walproposer::WaitResult::Timeout,
+            Some(0) => {
+                let msg = self.os.node_events().must_recv();
+                match msg {
+                    NodeEvent::Internal(AnyMessage::Just32(0)) => {
+                        // got a notification about new WAL available
+                    }
+                    NodeEvent::Internal(_) => unreachable!(),
+                    NodeEvent::Accept(_) => unreachable!(),
+                }
+                walproposer::walproposer::WaitResult::Latch
+            }
+            Some(index) => walproposer::walproposer::WaitResult::Network(
+                self.sk_ptrs[index],
+                WL_SOCKET_READABLE,
+            ),
+        }
+    }
+}
+
+/// This struct handles all calls from walproposer into walproposer_api.
+pub struct SimulationApi {
+    os: NodeOs,
+    safekeepers: RefCell<Vec<SafekeeperConn>>,
+    disk: Arc<DiskWalProposer>,
+    redo_start_lsn: Option<Lsn>,
+    last_logged_commit_lsn: u64,
+    shmem: UnsafeCell<walproposer::bindings::WalproposerShmemState>,
+    config: Config,
+    event_set: RefCell<Option<EventSet>>,
+}
+
+pub struct Args {
+    pub os: NodeOs,
+    pub config: Config,
+    pub disk: Arc<DiskWalProposer>,
+    pub redo_start_lsn: Option<Lsn>,
+}
+
+impl SimulationApi {
+    pub fn new(args: Args) -> Self {
+        // initialize connection state for each safekeeper
+        let sk_conns = args
+            .config
+            .safekeepers_list
+            .iter()
+            .map(|s| {
+                SafekeeperConn::new(
+                    s.split(':').next().unwrap().to_string(),
+                    s.split(':').nth(1).unwrap().to_string(),
+                )
+            })
+            .collect::<Vec<_>>();
+
+        Self {
+            os: args.os,
+            safekeepers: RefCell::new(sk_conns),
+            disk: args.disk,
+            redo_start_lsn: args.redo_start_lsn,
+            last_logged_commit_lsn: 0,
+            shmem: UnsafeCell::new(walproposer::api_bindings::empty_shmem()),
+            config: args.config,
+            event_set: RefCell::new(None),
+        }
+    }
+
+    /// Get SafekeeperConn for the given Safekeeper.
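+    /// The lookup key is the port string, which in this simulation doubles
+    /// as the node id; panics if no connection state exists for that port.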
+ fn get_conn(&self, sk: &mut walproposer::bindings::Safekeeper) -> RefMut<'_, SafekeeperConn> { + let sk_port = unsafe { CStr::from_ptr(sk.port).to_str().unwrap() }; + let state = self.safekeepers.borrow_mut(); + RefMut::map(state, |v| { + v.iter_mut() + .find(|conn| conn.port == sk_port) + .expect("safekeeper conn not found by port") + }) + } +} + +impl ApiImpl for SimulationApi { + fn get_current_timestamp(&self) -> i64 { + debug!("get_current_timestamp"); + // PG TimestampTZ is microseconds, but simulation unit is assumed to be + // milliseconds, so add 10^3 + self.os.now() as i64 * 1000 + } + + fn update_donor(&self, donor: &mut walproposer::bindings::Safekeeper, donor_lsn: u64) { + let mut shmem = unsafe { *self.get_shmem_state() }; + shmem.propEpochStartLsn.value = donor_lsn; + shmem.donor_conninfo = donor.conninfo; + } + + fn conn_status( + &self, + _: &mut walproposer::bindings::Safekeeper, + ) -> walproposer::bindings::WalProposerConnStatusType { + debug!("conn_status"); + // break the connection with a 10% chance + if self.os.random(100) < 10 { + walproposer::bindings::WalProposerConnStatusType_WP_CONNECTION_BAD + } else { + walproposer::bindings::WalProposerConnStatusType_WP_CONNECTION_OK + } + } + + fn conn_connect_start(&self, sk: &mut walproposer::bindings::Safekeeper) { + debug!("conn_connect_start"); + let mut conn = self.get_conn(sk); + + assert!(conn.socket.is_none()); + let socket = self.os.open_tcp(conn.node_id); + conn.socket = Some(socket); + conn.raw_ptr = sk; + conn.is_connecting = true; + } + + fn conn_connect_poll( + &self, + _: &mut walproposer::bindings::Safekeeper, + ) -> walproposer::bindings::WalProposerConnectPollStatusType { + debug!("conn_connect_poll"); + // TODO: break the connection here + walproposer::bindings::WalProposerConnectPollStatusType_WP_CONN_POLLING_OK + } + + fn conn_send_query(&self, sk: &mut walproposer::bindings::Safekeeper, query: &str) -> bool { + debug!("conn_send_query: {}", query); + self.get_conn(sk).is_start_wal_push = true; + true + } + + fn conn_get_query_result( + &self, + _: &mut walproposer::bindings::Safekeeper, + ) -> walproposer::bindings::WalProposerExecStatusType { + debug!("conn_get_query_result"); + // TODO: break the connection here + walproposer::bindings::WalProposerExecStatusType_WP_EXEC_SUCCESS_COPYBOTH + } + + fn conn_async_read( + &self, + sk: &mut walproposer::bindings::Safekeeper, + vec: &mut Vec, + ) -> walproposer::bindings::PGAsyncReadResult { + debug!("conn_async_read"); + let mut conn = self.get_conn(sk); + + let socket = if let Some(socket) = conn.socket.as_mut() { + socket + } else { + // socket is already closed + return walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_FAIL; + }; + + let msg = socket.recv_chan().try_recv(); + + match msg { + None => { + // no message is ready + walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_TRY_AGAIN + } + Some(NetEvent::Closed) => { + // connection is closed + debug!("conn_async_read: connection is closed"); + conn.socket = None; + walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_FAIL + } + Some(NetEvent::Message(msg)) => { + // got a message + let b = match msg { + desim::proto::AnyMessage::Bytes(b) => b, + _ => unreachable!(), + }; + vec.extend_from_slice(&b); + walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS + } + } + } + + fn conn_blocking_write(&self, sk: &mut walproposer::bindings::Safekeeper, buf: &[u8]) -> bool { + let mut conn = self.get_conn(sk); + debug!("conn_blocking_write to {}: {:?}", conn.node_id, buf); + let socket = 
conn.socket.as_mut().unwrap(); + socket.send(desim::proto::AnyMessage::Bytes(Bytes::copy_from_slice(buf))); + true + } + + fn conn_async_write( + &self, + sk: &mut walproposer::bindings::Safekeeper, + buf: &[u8], + ) -> walproposer::bindings::PGAsyncWriteResult { + let mut conn = self.get_conn(sk); + debug!("conn_async_write to {}: {:?}", conn.node_id, buf); + if let Some(socket) = conn.socket.as_mut() { + socket.send(desim::proto::AnyMessage::Bytes(Bytes::copy_from_slice(buf))); + } else { + // connection is already closed + debug!("conn_async_write: writing to a closed socket!"); + // TODO: maybe we should return error here? + } + walproposer::bindings::PGAsyncWriteResult_PG_ASYNC_WRITE_SUCCESS + } + + fn wal_reader_allocate(&self, _: &mut walproposer::bindings::Safekeeper) -> NeonWALReadResult { + debug!("wal_reader_allocate"); + walproposer::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS + } + + fn wal_read( + &self, + _sk: &mut walproposer::bindings::Safekeeper, + buf: &mut [u8], + startpos: u64, + ) -> NeonWALReadResult { + self.disk.lock().read(startpos, buf); + walproposer::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS + } + + fn init_event_set(&self, _: &mut walproposer::bindings::WalProposer) { + debug!("init_event_set"); + let new_event_set = EventSet::new(self.os.clone()); + let old_event_set = self.event_set.replace(Some(new_event_set)); + assert!(old_event_set.is_none()); + } + + fn update_event_set(&self, sk: &mut walproposer::bindings::Safekeeper, event_mask: u32) { + debug!( + "update_event_set, sk={:?}, events_mask={:#b}", + sk as *mut walproposer::bindings::Safekeeper, event_mask + ); + let conn = self.get_conn(sk); + + self.event_set + .borrow_mut() + .as_mut() + .unwrap() + .update_event_set(&conn, event_mask); + } + + fn add_safekeeper_event_set( + &self, + sk: &mut walproposer::bindings::Safekeeper, + event_mask: u32, + ) { + debug!( + "add_safekeeper_event_set, sk={:?}, events_mask={:#b}", + sk as *mut walproposer::bindings::Safekeeper, event_mask + ); + + self.event_set + .borrow_mut() + .as_mut() + .unwrap() + .add_safekeeper(&self.get_conn(sk), event_mask); + } + + fn rm_safekeeper_event_set(&self, sk: &mut walproposer::bindings::Safekeeper) { + debug!( + "rm_safekeeper_event_set, sk={:?}", + sk as *mut walproposer::bindings::Safekeeper, + ); + + self.event_set + .borrow_mut() + .as_mut() + .unwrap() + .remove_safekeeper(&self.get_conn(sk)); + } + + fn active_state_update_event_set(&self, sk: &mut walproposer::bindings::Safekeeper) { + debug!("active_state_update_event_set"); + + assert!(sk.state == walproposer::bindings::SafekeeperState_SS_ACTIVE); + self.event_set + .borrow_mut() + .as_mut() + .unwrap() + .refresh_event_set(); + } + + fn wal_reader_events(&self, _sk: &mut walproposer::bindings::Safekeeper) -> u32 { + 0 + } + + fn wait_event_set( + &self, + _: &mut walproposer::bindings::WalProposer, + timeout_millis: i64, + ) -> walproposer::walproposer::WaitResult { + // TODO: handle multiple stages as part of the simulation (e.g. 
connect, start_wal_push, etc) + let mut conns = self.safekeepers.borrow_mut(); + for conn in conns.iter_mut() { + if conn.socket.is_some() && conn.is_connecting { + conn.is_connecting = false; + debug!("wait_event_set, connecting to {}:{}", conn.host, conn.port); + return walproposer::walproposer::WaitResult::Network( + conn.raw_ptr, + WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE, + ); + } + if conn.socket.is_some() && conn.is_start_wal_push { + conn.is_start_wal_push = false; + debug!( + "wait_event_set, start wal push to {}:{}", + conn.host, conn.port + ); + return walproposer::walproposer::WaitResult::Network( + conn.raw_ptr, + WL_SOCKET_READABLE, + ); + } + } + drop(conns); + + let res = self + .event_set + .borrow_mut() + .as_mut() + .unwrap() + .wait(timeout_millis); + + debug!( + "wait_event_set, timeout_millis={}, res={:?}", + timeout_millis, res, + ); + res + } + + fn strong_random(&self, buf: &mut [u8]) -> bool { + debug!("strong_random"); + buf.fill(0); + true + } + + fn finish_sync_safekeepers(&self, lsn: u64) { + debug!("finish_sync_safekeepers, lsn={}", lsn); + executor::exit(0, Lsn(lsn).to_string()); + } + + fn log_internal(&self, _wp: &mut walproposer::bindings::WalProposer, level: Level, msg: &str) { + debug!("wp_log[{}] {}", level, msg); + if level == Level::Fatal || level == Level::Panic { + if msg.contains("rejects our connection request with term") { + // collected quorum with lower term, then got rejected by next connected safekeeper + executor::exit(1, msg.to_owned()); + } + if msg.contains("collected propEpochStartLsn") && msg.contains(", but basebackup LSN ") + { + // sync-safekeepers collected wrong quorum, walproposer collected another quorum + executor::exit(1, msg.to_owned()); + } + if msg.contains("failed to download WAL for logical replicaiton") { + // Recovery connection broken and recovery was failed + executor::exit(1, msg.to_owned()); + } + if msg.contains("missing majority of votes, collected") { + // Voting bug when safekeeper disconnects after voting + executor::exit(1, msg.to_owned()); + } + panic!("unknown FATAL error from walproposer: {}", msg); + } + } + + fn after_election(&self, wp: &mut walproposer::bindings::WalProposer) { + let prop_lsn = wp.propEpochStartLsn; + let prop_term = wp.propTerm; + + let mut prev_lsn: u64 = 0; + let mut prev_term: u64 = 0; + + unsafe { + let history = wp.propTermHistory.entries; + let len = wp.propTermHistory.n_entries as usize; + if len > 1 { + let entry = *history.wrapping_add(len - 2); + prev_lsn = entry.lsn; + prev_term = entry.term; + } + } + + let msg = format!( + "prop_elected;{};{};{};{}", + prop_lsn, prop_term, prev_lsn, prev_term + ); + + debug!(msg); + self.os.log_event(msg); + } + + fn get_redo_start_lsn(&self) -> u64 { + debug!("get_redo_start_lsn -> {:?}", self.redo_start_lsn); + self.redo_start_lsn.expect("redo_start_lsn is not set").0 + } + + fn get_shmem_state(&self) -> *mut walproposer::bindings::WalproposerShmemState { + self.shmem.get() + } + + fn start_streaming( + &self, + startpos: u64, + callback: &walproposer::walproposer::StreamingCallback, + ) { + let disk = &self.disk; + let disk_lsn = disk.lock().flush_rec_ptr().0; + debug!("start_streaming at {} (disk_lsn={})", startpos, disk_lsn); + if startpos < disk_lsn { + debug!("startpos < disk_lsn, it means we wrote some transaction even before streaming started"); + } + assert!(startpos <= disk_lsn); + let mut broadcasted = Lsn(startpos); + + loop { + let available = disk.lock().flush_rec_ptr(); + assert!(available >= broadcasted); + 
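// hand walproposer the WAL that accumulated since the last
+            // iteration, then yield back to it via poll(); the loop never
+            // exits on its own -- the thread is stopped from the outside
+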
callback.broadcast(broadcasted, available); + broadcasted = available; + callback.poll(); + } + } + + fn process_safekeeper_feedback( + &mut self, + wp: &mut walproposer::bindings::WalProposer, + _sk: &mut walproposer::bindings::Safekeeper, + ) { + debug!("process_safekeeper_feedback, commit_lsn={}", wp.commitLsn); + if wp.commitLsn > self.last_logged_commit_lsn { + self.os.log_event(format!("commit_lsn;{}", wp.commitLsn)); + self.last_logged_commit_lsn = wp.commitLsn; + } + } + + fn get_flush_rec_ptr(&self) -> u64 { + let lsn = self.disk.lock().flush_rec_ptr(); + debug!("get_flush_rec_ptr: {}", lsn); + lsn.0 + } + + fn recovery_download( + &self, + wp: &mut walproposer::bindings::WalProposer, + sk: &mut walproposer::bindings::Safekeeper, + ) -> bool { + let mut startpos = wp.truncateLsn; + let endpos = wp.propEpochStartLsn; + + if startpos == endpos { + debug!("recovery_download: nothing to download"); + return true; + } + + debug!("recovery_download from {} to {}", startpos, endpos,); + + let replication_prompt = format!( + "START_REPLICATION {} {} {} {}", + self.config.ttid.tenant_id, self.config.ttid.timeline_id, startpos, endpos, + ); + let async_conn = self.get_conn(sk); + + let conn = self.os.open_tcp(async_conn.node_id); + conn.send(desim::proto::AnyMessage::Bytes(replication_prompt.into())); + + let chan = conn.recv_chan(); + while startpos < endpos { + let event = chan.recv(); + match event { + NetEvent::Closed => { + debug!("connection closed in recovery"); + break; + } + NetEvent::Message(AnyMessage::Bytes(b)) => { + debug!("got recovery bytes from safekeeper"); + self.disk.lock().write(startpos, &b); + startpos += b.len() as u64; + } + NetEvent::Message(_) => unreachable!(), + } + } + + debug!("recovery finished at {}", startpos); + + startpos == endpos + } + + fn conn_finish(&self, sk: &mut walproposer::bindings::Safekeeper) { + let mut conn = self.get_conn(sk); + debug!("conn_finish to {}", conn.node_id); + if let Some(socket) = conn.socket.as_mut() { + socket.close(); + } else { + // connection is already closed + } + conn.socket = None; + } + + fn conn_error_message(&self, _sk: &mut walproposer::bindings::Safekeeper) -> String { + "connection is closed, probably".into() + } +} diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs new file mode 100644 index 0000000000..123cd6bad6 --- /dev/null +++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -0,0 +1,314 @@ +use std::{ffi::CString, sync::Arc}; + +use byteorder::{LittleEndian, WriteBytesExt}; +use crc32c::crc32c_append; +use parking_lot::{Mutex, MutexGuard}; +use postgres_ffi::{ + pg_constants::{ + RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE, XLP_LONG_HEADER, XLR_BLOCK_ID_DATA_LONG, + XLR_BLOCK_ID_DATA_SHORT, + }, + v16::{ + wal_craft_test_export::{XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC}, + xlog_utils::{ + XLogSegNoOffsetToRecPtr, XlLogicalMessage, XLOG_RECORD_CRC_OFFS, + XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, + XLP_FIRST_IS_CONTRECORD, + }, + XLogRecord, + }, + WAL_SEGMENT_SIZE, XLOG_BLCKSZ, +}; +use utils::lsn::Lsn; + +use super::block_storage::BlockStorage; + +/// Simulation implementation of walproposer WAL storage. 
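+/// All WAL lives in an in-memory `BlockStorage` behind a mutex. Records are
+/// hand-crafted `XLOG_LOGICAL_MESSAGE` records with real page headers and
+/// CRCs, so the generated stream is laid out like genuine PostgreSQL WAL.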
+pub struct DiskWalProposer {
+    state: Mutex<State>,
+}
+
+impl DiskWalProposer {
+    pub fn new() -> Arc<DiskWalProposer> {
+        Arc::new(DiskWalProposer {
+            state: Mutex::new(State {
+                internal_available_lsn: Lsn(0),
+                prev_lsn: Lsn(0),
+                disk: BlockStorage::new(),
+            }),
+        })
+    }
+
+    pub fn lock(&self) -> MutexGuard<'_, State> {
+        self.state.lock()
+    }
+}
+
+pub struct State {
+    // flush_lsn
+    internal_available_lsn: Lsn,
+    // needed for WAL generation
+    prev_lsn: Lsn,
+    // actual WAL storage
+    disk: BlockStorage,
+}
+
+impl State {
+    pub fn read(&self, pos: u64, buf: &mut [u8]) {
+        self.disk.read(pos, buf);
+        // TODO: fail on reading uninitialized data
+    }
+
+    pub fn write(&mut self, pos: u64, buf: &[u8]) {
+        self.disk.write(pos, buf);
+    }
+
+    /// Update the internal available LSN to the given value.
+    pub fn reset_to(&mut self, lsn: Lsn) {
+        self.internal_available_lsn = lsn;
+    }
+
+    /// Get current LSN.
+    pub fn flush_rec_ptr(&self) -> Lsn {
+        self.internal_available_lsn
+    }
+
+    /// Generate a new WAL record at the current LSN.
+    pub fn insert_logical_message(&mut self, prefix: &str, msg: &[u8]) -> anyhow::Result<()> {
+        let prefix_cstr = CString::new(prefix)?;
+        let prefix_bytes = prefix_cstr.as_bytes_with_nul();
+
+        let lm = XlLogicalMessage {
+            db_id: 0,
+            transactional: 0,
+            prefix_size: prefix_bytes.len() as ::std::os::raw::c_ulong,
+            message_size: msg.len() as ::std::os::raw::c_ulong,
+        };
+
+        let record_bytes = lm.encode();
+        let rdatas: Vec<&[u8]> = vec![&record_bytes, prefix_bytes, msg];
+        insert_wal_record(self, rdatas, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE)
+    }
+}
+
+fn insert_wal_record(
+    state: &mut State,
+    rdatas: Vec<&[u8]>,
+    rmid: u8,
+    info: u8,
+) -> anyhow::Result<()> {
+    // bytes right after the header, in the same rdata block
+    let mut scratch = Vec::new();
+    let mainrdata_len: usize = rdatas.iter().map(|rdata| rdata.len()).sum();
+
+    if mainrdata_len > 0 {
+        if mainrdata_len > 255 {
+            scratch.push(XLR_BLOCK_ID_DATA_LONG);
+            // TODO: verify endianness
+            let _ = scratch.write_u32::<LittleEndian>(mainrdata_len as u32);
+        } else {
+            scratch.push(XLR_BLOCK_ID_DATA_SHORT);
+            scratch.push(mainrdata_len as u8);
+        }
+    }
+
+    let total_len: u32 = (XLOG_SIZE_OF_XLOG_RECORD + scratch.len() + mainrdata_len) as u32;
+    let size = maxalign(total_len);
+    assert!(size as usize > XLOG_SIZE_OF_XLOG_RECORD);
+
+    let start_bytepos = recptr_to_bytepos(state.internal_available_lsn);
+    let end_bytepos = start_bytepos + size as u64;
+
+    let start_recptr = bytepos_to_recptr(start_bytepos);
+    let end_recptr = bytepos_to_recptr(end_bytepos);
+
+    assert!(recptr_to_bytepos(start_recptr) == start_bytepos);
+    assert!(recptr_to_bytepos(end_recptr) == end_bytepos);
+
+    let mut crc = crc32c_append(0, &scratch);
+    for rdata in &rdatas {
+        crc = crc32c_append(crc, rdata);
+    }
+
+    let mut header = XLogRecord {
+        xl_tot_len: total_len,
+        xl_xid: 0,
+        xl_prev: state.prev_lsn.0,
+        xl_info: info,
+        xl_rmid: rmid,
+        __bindgen_padding_0: [0u8; 2usize],
+        xl_crc: crc,
+    };
+
+    // now we have the header and can finish the crc
+    let header_bytes = header.encode()?;
+    let crc = crc32c_append(crc, &header_bytes[0..XLOG_RECORD_CRC_OFFS]);
+    header.xl_crc = crc;
+
+    let mut header_bytes = header.encode()?.to_vec();
+    assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_RECORD);
+
+    header_bytes.extend_from_slice(&scratch);
+
+    // finish rdatas
+    let mut rdatas = rdatas;
+    rdatas.insert(0, &header_bytes);
+
+    write_walrecord_to_disk(state, total_len as u64, rdatas, start_recptr, end_recptr)?;
+
+    state.internal_available_lsn = end_recptr;
+    state.prev_lsn = start_recptr;
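+    // the record we just wrote becomes xl_prev for the next insertion,
+    // mirroring the record chaining postgres itself maintains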
+    Ok(())
+}
+
+fn write_walrecord_to_disk(
+    state: &mut State,
+    total_len: u64,
+    rdatas: Vec<&[u8]>,
+    start: Lsn,
+    end: Lsn,
+) -> anyhow::Result<()> {
+    let mut curr_ptr = start;
+    let mut freespace = insert_freespace(curr_ptr);
+    let mut written: usize = 0;
+
+    assert!(freespace >= std::mem::size_of::<u32>());
+
+    for mut rdata in rdatas {
+        while rdata.len() >= freespace {
+            assert!(
+                curr_ptr.segment_offset(WAL_SEGMENT_SIZE) >= XLOG_SIZE_OF_XLOG_SHORT_PHD
+                    || freespace == 0
+            );
+
+            state.write(curr_ptr.0, &rdata[..freespace]);
+            rdata = &rdata[freespace..];
+            written += freespace;
+            curr_ptr = Lsn(curr_ptr.0 + freespace as u64);
+
+            let mut new_page = XLogPageHeaderData {
+                xlp_magic: XLOG_PAGE_MAGIC as u16,
+                xlp_info: XLP_BKP_REMOVABLE,
+                xlp_tli: 1,
+                xlp_pageaddr: curr_ptr.0,
+                xlp_rem_len: (total_len - written as u64) as u32,
+                ..Default::default() // Put 0 in padding fields.
+            };
+            if new_page.xlp_rem_len > 0 {
+                new_page.xlp_info |= XLP_FIRST_IS_CONTRECORD;
+            }
+
+            if curr_ptr.segment_offset(WAL_SEGMENT_SIZE) == 0 {
+                new_page.xlp_info |= XLP_LONG_HEADER;
+                let long_page = XLogLongPageHeaderData {
+                    std: new_page,
+                    xlp_sysid: 0,
+                    xlp_seg_size: WAL_SEGMENT_SIZE as u32,
+                    xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
+                };
+                let header_bytes = long_page.encode()?;
+                assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_LONG_PHD);
+                state.write(curr_ptr.0, &header_bytes);
+                curr_ptr = Lsn(curr_ptr.0 + header_bytes.len() as u64);
+            } else {
+                let header_bytes = new_page.encode()?;
+                assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_SHORT_PHD);
+                state.write(curr_ptr.0, &header_bytes);
+                curr_ptr = Lsn(curr_ptr.0 + header_bytes.len() as u64);
+            }
+            freespace = insert_freespace(curr_ptr);
+        }
+
+        assert!(
+            curr_ptr.segment_offset(WAL_SEGMENT_SIZE) >= XLOG_SIZE_OF_XLOG_SHORT_PHD
+                || rdata.is_empty()
+        );
+        state.write(curr_ptr.0, rdata);
+        curr_ptr = Lsn(curr_ptr.0 + rdata.len() as u64);
+        written += rdata.len();
+        freespace -= rdata.len();
+    }
+
+    assert!(written == total_len as usize);
+    curr_ptr.0 = maxalign(curr_ptr.0);
+    assert!(curr_ptr == end);
+    Ok(())
+}
+
+fn maxalign<T>(size: T) -> T
+where
+    T: std::ops::BitAnd<Output = T>
+        + std::ops::Add<Output = T>
+        + std::ops::Not<Output = T>
+        + From<u8>,
+{
+    (size + T::from(7)) & !T::from(7)
+}
+
+fn insert_freespace(ptr: Lsn) -> usize {
+    if ptr.block_offset() == 0 {
+        0
+    } else {
+        (XLOG_BLCKSZ as u64 - ptr.block_offset()) as usize
+    }
+}
+
+const XLP_BKP_REMOVABLE: u16 = 0x0004;
+const USABLE_BYTES_IN_PAGE: u64 = (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64;
+const USABLE_BYTES_IN_SEGMENT: u64 = ((WAL_SEGMENT_SIZE / XLOG_BLCKSZ) as u64
+    * USABLE_BYTES_IN_PAGE)
+    - (XLOG_SIZE_OF_XLOG_RECORD - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64;
+
+fn bytepos_to_recptr(bytepos: u64) -> Lsn {
+    let fullsegs = bytepos / USABLE_BYTES_IN_SEGMENT;
+    let mut bytesleft = bytepos % USABLE_BYTES_IN_SEGMENT;
+
+    let seg_offset = if bytesleft < (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64 {
+        // fits on first page of segment
+        bytesleft + XLOG_SIZE_OF_XLOG_SHORT_PHD as u64
+    } else {
+        // account for the first page on segment with long header
+        bytesleft -= (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64;
+        let fullpages = bytesleft / USABLE_BYTES_IN_PAGE;
+        bytesleft %= USABLE_BYTES_IN_PAGE;
+
+        XLOG_BLCKSZ as u64
+            + fullpages * XLOG_BLCKSZ as u64
+            + bytesleft
+            + XLOG_SIZE_OF_XLOG_SHORT_PHD as u64
+    };
+
+    Lsn(XLogSegNoOffsetToRecPtr(
+        fullsegs,
+        seg_offset as u32,
+        WAL_SEGMENT_SIZE,
+    ))
+}
+
+fn recptr_to_bytepos(ptr: Lsn) -> u64 {
+    let fullsegs = ptr.segment_number(WAL_SEGMENT_SIZE);
+    let
offset = ptr.segment_offset(WAL_SEGMENT_SIZE) as u64; + + let fullpages = offset / XLOG_BLCKSZ as u64; + let offset = offset % XLOG_BLCKSZ as u64; + + if fullpages == 0 { + fullsegs * USABLE_BYTES_IN_SEGMENT + + if offset > 0 { + assert!(offset >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64); + offset - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 + } else { + 0 + } + } else { + fullsegs * USABLE_BYTES_IN_SEGMENT + + (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64 + + (fullpages - 1) * USABLE_BYTES_IN_PAGE + + if offset > 0 { + assert!(offset >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64); + offset - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 + } else { + 0 + } + } +} diff --git a/scripts/benchmark_durations.py b/scripts/benchmark_durations.py index 7f05d72a03..4ca433679a 100755 --- a/scripts/benchmark_durations.py +++ b/scripts/benchmark_durations.py @@ -20,7 +20,7 @@ BENCHMARKS_DURATION_QUERY = """ FROM results WHERE started_at > CURRENT_DATE - INTERVAL '%s' day - AND parent_suite = 'test_runner.performance' + AND starts_with(parent_suite, 'test_runner.performance') AND status = 'passed' GROUP BY parent_suite, suite, name @@ -31,68 +31,76 @@ BENCHMARKS_DURATION_QUERY = """ # the total duration varies from 8 to 40 minutes. # We use some pre-collected durations as a fallback to have a better distribution. FALLBACK_DURATION = { - "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 62.144, - "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 90.941, - "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 26.053, - "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.67, - "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 14.497, - "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 18.852, - "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 26.572, - "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 6.259, - "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 21.206, - "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.474, - "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 11.262, - "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 94.225, - "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 68.159, - "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 76.719, - "test_runner/performance/test_compaction.py::test_compaction": 110.222, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 10.743, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.541, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.109, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.121, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.3, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.086, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 12.024, - 
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.14, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.375, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.075, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.147, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.321, - "test_runner/performance/test_copy.py::test_copy[neon]": 16.579, - "test_runner/performance/test_copy.py::test_copy[vanilla]": 10.094, - "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 590.157, - "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.102, - "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 8.677, - "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 31.079, - "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 38.119, - "test_runner/performance/test_layer_map.py::test_layer_map": 24.784, - "test_runner/performance/test_logical_replication.py::test_logical_replication": 117.707, - "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 21.194, - "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 59.068, - "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 73.235, - "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 82.586, - "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 106.536, - "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 98.753, - "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 6.975, - "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 3.69, - "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.529, - "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 64.522, - "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 40.964, - "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55, - "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189, - "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899, - "test_runner/performance/test_startup.py::test_startup_simple": 2.51, - "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245, - "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46, - "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 113.653, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 233.728, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 419.093, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 982.461, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 116.522, - 
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 115.583, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 155.282, - "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 26.704, - "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.088, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-13-30]": 400.15, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-6-30]": 372.521, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-13-30]": 420.017, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-6-30]": 373.769, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-13-30]": 678.742, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-6-30]": 512.135, + "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 58.036, + "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 22.104, + "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 126.073, + "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.759, + "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 6.885, + "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 8.758, + "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 18.275, + "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 9.533, + "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 12.09, + "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 35.145, + "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 22.28, + "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 66.353, + "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 75.487, + "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 54.142, + "test_runner/performance/test_compaction.py::test_compaction": 110.715, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 11.68, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.384, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.315, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.783, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.647, + 
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 17.04, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.01, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 11.902, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.077, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.4, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.33, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.434, + "test_runner/performance/test_copy.py::test_copy[neon]": 13.817, + "test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736, + "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735, + "test_runner/performance/test_gc_feedback.py::test_gc_feedback_with_snapshots": 575.735, + "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868, + "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393, + "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588, + "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 30.849, + "test_runner/performance/test_layer_map.py::test_layer_map": 39.378, + "test_runner/performance/test_lazy_startup.py::test_lazy_startup": 2848.938, + "test_runner/performance/test_logical_replication.py::test_logical_replication": 120.952, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 35.552, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 66.762, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 85.177, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 92.12, + "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 107.009, + "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 99.582, + "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 4.737, + "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 2.686, + "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.271, + "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 50.719, + "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 15.992, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.566, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 13.542, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.35, + "test_runner/performance/test_startup.py::test_startup_simple": 13.043, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 194.841, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 286.667, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 85.577, + 
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 297.626, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 646.187, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 989.776, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 125.638, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 123.554, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 190.083, + "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 21.016, + "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 23.028, } diff --git a/scripts/check_allowed_errors.sh b/scripts/check_allowed_errors.sh new file mode 100755 index 0000000000..87e52c1e64 --- /dev/null +++ b/scripts/check_allowed_errors.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -eu + +HELPER_DIR="$(dirname "${BASH_SOURCE[0]}")" +SCRIPT="test_runner/fixtures/pageserver/allowed_errors.py" + +# first run to understand all of the errors: +# +# example: ./scripts/check_allowed_errors.sh -i - < pageserver.log +# example: ./scripts/check_allowed_errors.sh -i pageserver.log +# +# then edit the test local allowed_errors to the +# test_runner/fixtures/pageserver/allowed_errors.py, then re-run to make sure +# they are handled. +# +# finally revert any local changes to allowed_errors.py. +poetry run python3 "$HELPER_DIR/../$SCRIPT" $* diff --git a/scripts/comment-test-report.js b/scripts/comment-test-report.js index 89befda71f..e8e0b3c23a 100755 --- a/scripts/comment-test-report.js +++ b/scripts/comment-test-report.js @@ -68,16 +68,29 @@ const parseReportJson = async ({ reportJsonUrl, fetch }) => { console.info(`Cannot get BUILD_TYPE and Postgres Version from test name: "${test.name}", defaulting to "release" and "14"`) buildType = "release" - pgVersion = "14" + pgVersion = "16" } pgVersions.add(pgVersion) + // We use `arch` as it is returned by GitHub Actions + // (RUNNER_ARCH env var): X86, X64, ARM, or ARM64 + // Ref https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables + let arch = "" + if (test.parameters.includes("'X64'")) { + arch = "x86-64" + } else if (test.parameters.includes("'ARM64'")) { + arch = "arm64" + } else { + arch = "unknown" + } + // Removing build type and PostgreSQL version from the test name to make it shorter const testName = test.name.replace(new RegExp(`${buildType}-pg${pgVersion}-?`), "").replace("[]", "") test.pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${testName}` test.pgVersion = pgVersion test.buildType = buildType + test.arch = arch if (test.status === "passed") { passedTests[pgVersion][testName].push(test) @@ -144,7 +157,7 @@ const reportSummary = async (params) => { const links = [] for (const test of tests) { const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}` - links.push(`[${test.buildType}](${allureLink})`) + links.push(`[${test.buildType}-${test.arch}](${allureLink})`) } summary += `- \`${testName}\`: ${links.join(", ")}\n` } @@ -175,7 +188,7 @@ const reportSummary = async (params) => { const links = [] for (const test of tests) { const allureLink = 
`${reportUrl}#suites/${test.parentUid}/${test.uid}/retries` - links.push(`[${test.buildType}](${allureLink})`) + links.push(`[${test.buildType}-${test.arch}](${allureLink})`) } summary += `- \`${testName}\`: ${links.join(", ")}\n` } @@ -188,7 +201,7 @@ const reportSummary = async (params) => { } const parseCoverageSummary = async ({ summaryJsonUrl, coverageUrl, fetch }) => { - let summary = `\n### Code coverage ([full report](${coverageUrl}))\n` + let summary = `\n### Code coverage* ([full report](${coverageUrl}))\n` const coverage = await (await fetch(summaryJsonUrl)).json() for (const covType of Object.keys(coverage).sort()) { @@ -198,7 +211,7 @@ const parseCoverageSummary = async ({ summaryJsonUrl, coverageUrl, fetch }) => { summary += `- \`${covType}s\`: \`${coverage[covType]["_summary"]}\`\n` } - + summary += "\n\\* collected from Rust tests only\n" summary += `\n___\n` return summary diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py deleted file mode 100755 index 980f343047..0000000000 --- a/scripts/export_import_between_pageservers.py +++ /dev/null @@ -1,736 +0,0 @@ -# -# Script to export tenants from one pageserver and import them into another page server. -# -# Outline of steps: -# 1. Get `(last_lsn, prev_lsn)` from old pageserver -# 2. Get `fullbackup` from old pageserver, which creates a basebackup tar file -# 3. This tar file might be missing relation files for empty relations, if the pageserver -# is old enough (we didn't always store those). So to recreate them, we start a local -# vanilla postgres on this basebackup and ask it what relations should exist, then touch -# any missing files and re-pack the tar. -# TODO This functionality is no longer needed, so we can delete it later if we don't -# end up using the same utils for the pg 15 upgrade. Not sure. -# 4. We import the patched basebackup into a new pageserver -# 5. We export again via fullbackup, now from the new pageserver and compare the returned -# tar file with the one we imported. This confirms that we imported everything that was -# exported, but doesn't guarantee correctness (what if we didn't **export** everything -# initially?) -# 6. We wait for the new pageserver's remote_consistent_lsn to catch up -# -# For more context on how to use this, see: -# https://www.notion.so/neondatabase/Storage-format-migration-9a8eba33ccf8417ea8cf50e6a0c542cf - -import argparse -import os -import shutil -import subprocess -import tempfile -import time -import uuid -from contextlib import closing -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, cast - -import psycopg2 -import requests -from psycopg2.extensions import connection as PgConnection -from psycopg2.extensions import parse_dsn - -############################################### -### client-side utils copied from test fixtures -############################################### - -Env = Dict[str, str] - -_global_counter = 0 - - -def global_counter() -> int: - """A really dumb global counter. - This is useful for giving output files a unique number, so if we run the - same command multiple times we can keep their output separate. - """ - global _global_counter - _global_counter += 1 - return _global_counter - - -def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: - """Run a process and capture its output - Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" - where "cmd" is the name of the program and NNN is an incrementing - counter. 
- If those files already exist, we will overwrite them. - Returns basepath for files with captured output. - """ - assert isinstance(cmd, list) - base = os.path.basename(cmd[0]) + "_{}".format(global_counter()) - basepath = os.path.join(capture_dir, base) - stdout_filename = basepath + ".stdout" - stderr_filename = basepath + ".stderr" - - with open(stdout_filename, "w") as stdout_f: - with open(stderr_filename, "w") as stderr_f: - print('(capturing output to "{}.stdout")'.format(base)) - subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) - - return basepath - - -class PgBin: - """A helper class for executing postgres binaries""" - - def __init__(self, log_dir: Path, pg_distrib_dir, pg_version): - self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin") - self.env = os.environ.copy() - self.env["LD_LIBRARY_PATH"] = os.path.join( - str(pg_distrib_dir), "v{}".format(pg_version), "lib" - ) - - def _fixpath(self, command: List[str]): - if "/" not in command[0]: - command[0] = os.path.join(self.pg_bin_path, command[0]) - - def _build_env(self, env_add: Optional[Env]) -> Env: - if env_add is None: - return self.env - env = self.env.copy() - env.update(env_add) - return env - - def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None): - """ - Run one of the postgres binaries. - The command should be in list form, e.g. ['pgbench', '-p', '55432'] - All the necessary environment variables will be set. - If the first argument (the command name) doesn't include a path (no '/' - characters present), then it will be edited to include the correct path. - If you want stdout/stderr captured to files, use `run_capture` instead. - """ - - self._fixpath(command) - print('Running command "{}"'.format(" ".join(command))) - env = self._build_env(env) - subprocess.run(command, env=env, cwd=cwd, check=True) - - def run_capture( - self, - command: List[str], - env: Optional[Env] = None, - cwd: Optional[str] = None, - **kwargs: Any, - ) -> str: - """ - Run one of the postgres binaries, with stderr and stdout redirected to a file. - This is just like `run`, but for chatty programs. Returns basepath for files - with captured output. - """ - - self._fixpath(command) - print('Running command "{}"'.format(" ".join(command))) - env = self._build_env(env) - return subprocess_capture( - str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs - ) - - -class PgProtocol: - """Reusable connection logic""" - - def __init__(self, **kwargs): - self.default_options = kwargs - - def conn_options(self, **kwargs): - conn_options = self.default_options.copy() - if "dsn" in kwargs: - conn_options.update(parse_dsn(kwargs["dsn"])) - conn_options.update(kwargs) - - # Individual statement timeout in seconds. 2 minutes should be - # enough for our tests, but if you need a longer, you can - # change it by calling "SET statement_timeout" after - # connecting. - conn_options["options"] = f"-cstatement_timeout=120s {conn_options.get('options', '')}" - - return conn_options - - # autocommit=True here by default because that's what we need most of the time - def connect(self, autocommit=True, **kwargs) -> PgConnection: - """ - Connect to the node. - Returns psycopg2's connection object. - This method passes all extra params to connstr. - """ - conn: PgConnection = psycopg2.connect(**self.conn_options(**kwargs)) - - # WARNING: this setting affects *all* tests! 
- conn.autocommit = autocommit - return conn - - def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]: - """ - Execute query against the node and return all rows. - This method passes all extra params to connstr. - """ - return self.safe_psql_many([query], **kwargs)[0] - - def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]: - """ - Execute queries against the node and return all rows. - This method passes all extra params to connstr. - """ - result: List[List[Any]] = [] - with closing(self.connect(**kwargs)) as conn: - with conn.cursor() as cur: - for query in queries: - print(f"Executing query: {query}") - cur.execute(query) - - if cur.description is None: - result.append([]) # query didn't return data - else: - result.append(cast(List[Any], cur.fetchall())) - return result - - -class VanillaPostgres(PgProtocol): - def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True): - super().__init__(host="localhost", port=port, dbname="postgres") - self.pgdatadir = pgdatadir - self.pg_bin = pg_bin - self.running = False - if init: - self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)]) - self.configure([f"port = {port}\n"]) - - def configure(self, options: List[str]): - """Append lines into postgresql.conf file.""" - assert not self.running - with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file: - conf_file.write("\n".join(options)) - - def start(self, log_path: Optional[str] = None): - assert not self.running - self.running = True - - log_path = log_path or os.path.join(self.pgdatadir, "pg.log") - - self.pg_bin.run_capture( - ["pg_ctl", "-w", "-D", str(self.pgdatadir), "-l", log_path, "start"] - ) - - def stop(self): - assert self.running - self.running = False - self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"]) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc, tb): - if self.running: - self.stop() - - -class NeonPageserverApiException(Exception): - pass - - -class NeonPageserverHttpClient(requests.Session): - def __init__(self, host, port): - super().__init__() - self.host = host - self.port = port - - def verbose_error(self, res: requests.Response): - try: - res.raise_for_status() - except requests.RequestException as e: - try: - msg = res.json()["msg"] - except: # noqa: E722 - msg = "" - raise NeonPageserverApiException(msg) from e - - def check_status(self): - self.get(f"http://{self.host}:{self.port}/v1/status").raise_for_status() - - def tenant_list(self): - res = self.get(f"http://{self.host}:{self.port}/v1/tenant") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists): - res = self.post( - f"http://{self.host}:{self.port}/v1/tenant", - json={"new_tenant_id": new_tenant_id.hex, "generation": 1}, - ) - - if res.status_code == 409: - if ok_if_exists: - print(f"could not create tenant: already exists for id {new_tenant_id}") - else: - res.raise_for_status() - elif res.status_code == 201: - print(f"created tenant {new_tenant_id}") - else: - self.verbose_error(res) - - return new_tenant_id - - def timeline_list(self, tenant_id: uuid.UUID): - res = self.get(f"http://{self.host}:{self.port}/v1/tenant/{tenant_id.hex}/timeline") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: - 
res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=true" - ) - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - -def lsn_to_hex(num: int) -> str: - """Convert lsn from int to standard hex notation.""" - return "{:X}/{:X}".format(num >> 32, num & 0xFFFFFFFF) - - -def lsn_from_hex(lsn_hex: str) -> int: - """Convert lsn from hex notation to int.""" - left, right = lsn_hex.split("/") - return (int(left, 16) << 32) + int(right, 16) - - -def remote_consistent_lsn( - pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID -) -> int: - detail = pageserver_http_client.timeline_detail(tenant, timeline) - - lsn_str = detail["remote_consistent_lsn"] - assert isinstance(lsn_str, str) - return lsn_from_hex(lsn_str) - - -def wait_for_upload( - pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID, - lsn: int, -): - """waits for local timeline upload up to specified lsn""" - for i in range(10): - current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) - if current_lsn >= lsn: - return - print( - "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1 - ) - ) - time.sleep(1) - - raise Exception( - "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn) - ) - ) - - -############## -# End of utils -############## - - -def pack_base(log_dir, restored_dir, output_tar): - """Create tar file from basebackup, being careful to produce relative filenames.""" - tmp_tar_name = "tmp.tar" - tmp_tar_path = os.path.join(restored_dir, tmp_tar_name) - cmd = ["tar", "-cf", tmp_tar_name] + os.listdir(restored_dir) - # We actually cd into the dir and call tar from there. If we call tar from - # outside we won't encode filenames as relative, and they won't parse well - # on import. - subprocess_capture(log_dir, cmd, cwd=restored_dir) - shutil.move(tmp_tar_path, output_tar) - - -def reconstruct_paths(log_dir, pg_bin, base_tar, port: int): - """Reconstruct what relation files should exist in the datadir by querying postgres.""" - with tempfile.TemporaryDirectory() as restored_dir: - # Unpack the base tar - subprocess_capture(log_dir, ["tar", "-xf", base_tar, "-C", restored_dir]) - - # Start a vanilla postgres from the given datadir and query it to find - # what relfiles should exist, but possibly don't. 
- with VanillaPostgres(Path(restored_dir), pg_bin, port, init=False) as vanilla_pg: - vanilla_pg.configure([f"port={port}"]) - vanilla_pg.start(log_path=os.path.join(log_dir, "tmp_pg.log")) - - # Create database based on template0 because we can't connect to template0 - query = "create database template0copy template template0" - vanilla_pg.safe_psql(query, user="cloud_admin") - vanilla_pg.safe_psql("CHECKPOINT", user="cloud_admin") - - # Get all databases - query = "select oid, datname from pg_database" - oid_dbname_pairs = vanilla_pg.safe_psql(query, user="cloud_admin") - template0_oid = [ - oid for (oid, database) in oid_dbname_pairs if database == "template0" - ][0] - - # Get rel paths for each database - for oid, database in oid_dbname_pairs: - if database == "template0": - # We can't connect to template0 - continue - - query = "select relname, pg_relation_filepath(oid) from pg_class" - result = vanilla_pg.safe_psql(query, user="cloud_admin", dbname=database) - for _relname, filepath in result: - if filepath is not None: - if database == "template0copy": - # Add all template0copy paths to template0 - prefix = f"base/{oid}/" - if filepath.startswith(prefix): - suffix = filepath[len(prefix) :] - yield f"base/{template0_oid}/{suffix}" - elif filepath.startswith("global"): - print(f"skipping {database} global file {filepath}") - else: - raise AssertionError - else: - yield filepath - - -def touch_missing_rels(log_dir, corrupt_tar, output_tar, paths): - """Add the appropriate empty files to a basebadkup tar.""" - with tempfile.TemporaryDirectory() as restored_dir: - # Unpack the base tar - subprocess_capture(log_dir, ["tar", "-xf", corrupt_tar, "-C", restored_dir]) - - # Touch files that don't exist - for path in paths: - absolute_path = os.path.join(restored_dir, path) - exists = os.path.exists(absolute_path) - if not exists: - print(f"File {absolute_path} didn't exist. Creating..") - Path(absolute_path).touch() - - # Repackage - pack_base(log_dir, restored_dir, output_tar) - - -# HACK This is a workaround for exporting from old pageservers that -# can't export empty relations. In this case we need to start -# a vanilla postgres from the exported datadir, and query it -# to see what empty relations are missing, and then create -# those empty files before importing. 
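# For illustration (hypothetical OIDs, not from a real run): a table that was created
# but never written to has a zero-length relation file such as base/16401/24576. The
# `pg_relation_filepath(oid)` query in reconstruct_paths() reports that path even when
# the file is absent from the exported basebackup, and touch_missing_rels() then creates
# the empty file so the re-packed tar imports cleanly.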
-def add_missing_rels(base_tar, output_tar, log_dir, pg_bin, tmp_pg_port: int): - reconstructed_paths = set(reconstruct_paths(log_dir, pg_bin, base_tar, tmp_pg_port)) - touch_missing_rels(log_dir, base_tar, output_tar, reconstructed_paths) - - -def get_rlsn(pageserver_connstr, tenant_id, timeline_id): - with closing(psycopg2.connect(pageserver_connstr)) as conn: - conn.autocommit = True - with conn.cursor() as cur: - cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}" - cur.execute(cmd) - res = cur.fetchone() - assert res is not None - prev_lsn = res[0] - last_lsn = res[1] - - return last_lsn, prev_lsn - - -def import_timeline( - args, - psql_path, - pageserver_connstr, - pageserver_http, - tenant_id, - timeline_id, - last_lsn, - prev_lsn, - tar_filename, - pg_version, -): - # Import timelines to new pageserver - import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn} {pg_version}" - full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """ - - stderr_filename2 = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr") - stdout_filename = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stdout") - - print(f"Running: {full_cmd}") - - with open(stdout_filename, "w") as stdout_f: - with open(stderr_filename2, "w") as stderr_f: - print(f"(capturing output to {stdout_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - subprocess.run( - full_cmd, - stdout=stdout_f, - stderr=stderr_f, - env=pg_bin._build_env(None), - shell=True, - check=True, - ) - - print("Done import") - - # Wait until pageserver persists the files - wait_for_upload( - pageserver_http, uuid.UUID(tenant_id), uuid.UUID(timeline_id), lsn_from_hex(last_lsn) - ) - - -def export_timeline( - args, - psql_path, - pageserver_connstr, - tenant_id, - timeline_id, - last_lsn, - prev_lsn, - tar_filename, - pg_version, -): - # Choose filenames - incomplete_filename = tar_filename + ".incomplete" - stderr_filename = os.path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr") - - # Construct export command - query = f"fullbackup {tenant_id} {timeline_id} {last_lsn} {prev_lsn}" - cmd = [psql_path, "--no-psqlrc", pageserver_connstr, "-c", query] - - # Run export command - print(f"Running: {cmd}") - with open(incomplete_filename, "w") as stdout_f: - with open(stderr_filename, "w") as stderr_f: - print(f"(capturing output to {incomplete_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - subprocess.run( - cmd, stdout=stdout_f, stderr=stderr_f, env=pg_bin._build_env(None), check=True - ) - - # Add missing rels - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin, args.tmp_pg_port) - - # Log more info - file_size = os.path.getsize(tar_filename) - print(f"Done export: {tar_filename}, size {file_size}") - - -def main(args: argparse.Namespace): - # any psql version will do here. 
use current DEFAULT_PG_VERSION = 15 - psql_path = str(Path(args.pg_distrib_dir) / "v15" / "bin" / "psql") - - old_pageserver_host = args.old_pageserver_host - new_pageserver_host = args.new_pageserver_host - - old_http_client = NeonPageserverHttpClient(old_pageserver_host, args.old_pageserver_http_port) - old_http_client.check_status() - old_pageserver_connstr = f"postgresql://{old_pageserver_host}:{args.old_pageserver_pg_port}" - - new_http_client = NeonPageserverHttpClient(new_pageserver_host, args.new_pageserver_http_port) - new_http_client.check_status() - new_pageserver_connstr = f"postgresql://{new_pageserver_host}:{args.new_pageserver_pg_port}" - - for tenant_id in args.tenants: - print(f"Tenant: {tenant_id}") - timelines = old_http_client.timeline_list(uuid.UUID(tenant_id)) - print(f"Timelines: {timelines}") - - # Create tenant in new pageserver - if args.only_import is False and not args.timelines: - new_http_client.tenant_create(uuid.UUID(tenant_id), args.ok_if_exists) - - for timeline in timelines: - # Skip timelines we don't need to export - if args.timelines and timeline["timeline_id"] not in args.timelines: - print(f"Skipping timeline {timeline['timeline_id']}") - continue - - # Choose filenames - tar_filename = os.path.join( - args.work_dir, f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar" - ) - - pg_version = timeline["pg_version"] - - # Export timeline from old pageserver - if args.only_import is False: - last_lsn, prev_lsn = get_rlsn( - old_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - ) - export_timeline( - args, - psql_path, - old_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - tar_filename, - pg_version, - ) - - # Import into new pageserver - import_timeline( - args, - psql_path, - new_pageserver_connstr, - new_http_client, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - tar_filename, - pg_version, - ) - - # Re-export and compare - re_export_filename = tar_filename + ".reexport" - export_timeline( - args, - psql_path, - new_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - re_export_filename, - pg_version, - ) - - # Check the size is the same - old_size = (os.path.getsize(tar_filename),) - new_size = (os.path.getsize(re_export_filename),) - if old_size != new_size: - raise AssertionError(f"Sizes don't match old: {old_size} new: {new_size}") - - -def non_zero_tcp_port(arg: Any): - port = int(arg) - if port < 1 or port > 65535: - raise argparse.ArgumentTypeError(f"invalid tcp port: {arg}") - return port - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--tenant-id", - dest="tenants", - required=True, - nargs="+", - help="Id of the tenant to migrate. You can pass multiple arguments", - ) - parser.add_argument( - "--timeline-id", - dest="timelines", - required=False, - nargs="+", - help="Id of the timeline to migrate. You can pass multiple arguments", - ) - parser.add_argument( - "--from-host", - dest="old_pageserver_host", - required=True, - help="Host of the pageserver to migrate data from", - ) - parser.add_argument( - "--from-http-port", - dest="old_pageserver_http_port", - required=False, - type=int, - default=9898, - help="HTTP port of the pageserver to migrate data from. 
Default: 9898", - ) - parser.add_argument( - "--from-pg-port", - dest="old_pageserver_pg_port", - required=False, - type=int, - default=6400, - help="pg port of the pageserver to migrate data from. Default: 6400", - ) - parser.add_argument( - "--to-host", - dest="new_pageserver_host", - required=True, - help="Host of the pageserver to migrate data to", - ) - parser.add_argument( - "--to-http-port", - dest="new_pageserver_http_port", - required=False, - default=9898, - type=int, - help="HTTP port of the pageserver to migrate data to. Default: 9898", - ) - parser.add_argument( - "--to-pg-port", - dest="new_pageserver_pg_port", - required=False, - default=6400, - type=int, - help="pg port of the pageserver to migrate data to. Default: 6400", - ) - parser.add_argument( - "--ignore-tenant-exists", - dest="ok_if_exists", - required=False, - help="Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.", - ) - parser.add_argument( - "--pg-distrib-dir", - dest="pg_distrib_dir", - required=False, - default="/usr/local/", - help="Path where postgres binaries are installed. Default: /usr/local/", - ) - parser.add_argument( - "--psql-path", - dest="psql_path", - required=False, - default="/usr/local/v14/bin/psql", - help="Path to the psql binary. Default: /usr/local/v14/bin/psql", - ) - parser.add_argument( - "--only-import", - dest="only_import", - required=False, - default=False, - action="store_true", - help="Skip export and tenant creation part", - ) - parser.add_argument( - "--work-dir", - dest="work_dir", - required=True, - default=False, - help="directory where temporary tar files are stored", - ) - parser.add_argument( - "--tmp-pg-port", - dest="tmp_pg_port", - required=False, - default=55439, - type=non_zero_tcp_port, - help="localhost port to use for temporary postgres instance", - ) - args = parser.parse_args() - main(args) diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index b07e4bea9b..919a9278a9 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -3,11 +3,13 @@ import argparse import json import logging +import os from collections import defaultdict -from typing import DefaultDict, Dict +from typing import Any, DefaultDict, Dict, Optional import psycopg2 import psycopg2.extras +import toml FLAKY_TESTS_QUERY = """ SELECT @@ -45,6 +47,36 @@ def main(args: argparse.Namespace): logging.error("cannot fetch flaky tests from the DB due to an error", exc) rows = [] + # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. 
not empty, not tokio-epoll-uring), + # use it to parametrize test name along with build_type and pg_version + # + # See test_runner/fixtures/parametrize.py for details + if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ( + "", + "tokio-epoll-uring", + ): + pageserver_virtual_file_io_engine_parameter = f"-{io_engine}" + else: + pageserver_virtual_file_io_engine_parameter = "" + + # re-use existing records of flaky tests from before parametrization by compaction_algorithm + def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]: + """Duplicated from parametrize.py""" + toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") + if toml_table is None: + return None + v = toml.loads(toml_table) + assert isinstance(v, dict) + return v + + pageserver_default_tenant_config_compaction_algorithm_parameter = "" + if ( + explicit_default := get_pageserver_default_tenant_config_compaction_algorithm() + ) is not None: + pageserver_default_tenant_config_compaction_algorithm_parameter = ( + f"-{explicit_default['kind']}" + ) + for row in rows: # We don't want to automatically rerun tests in a performance suite if row["parent_suite"] != "test_runner.regress": @@ -53,10 +85,10 @@ def main(args: argparse.Namespace): if row["name"].endswith("]"): parametrized_test = row["name"].replace( "[", - f"[{build_type}-pg{pg_version}-", + f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}-", ) else: - parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}]" + parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}]" res[row["parent_suite"]][row["suite"]][parametrized_test] = True diff --git a/scripts/generate_and_push_perf_report.sh b/scripts/generate_and_push_perf_report.sh index 9e03302b0f..178c570b13 100755 --- a/scripts/generate_and_push_perf_report.sh +++ b/scripts/generate_and_push_perf_report.sh @@ -8,17 +8,3 @@ SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) echo "Uploading perf report to neon pg" # ingest per test results data into neon backed postgres running in staging to build grafana reports on that data DATABASE_URL="$PERF_TEST_RESULT_CONNSTR" poetry run python "$SCRIPT_DIR"/ingest_perf_test_result.py --ingest "$REPORT_FROM" - -# Activate poetry's venv. Needed because git upload does not run in a project dir (it uses tmp to store the repository) -# so the problem occurs because poetry cannot find pyproject.toml in temp dir created by git upload -# shellcheck source=/dev/null -. 
"$(poetry env info --path)"/bin/activate - -echo "Uploading perf result to zenith-perf-data" -scripts/git-upload \ - --repo=https://"$VIP_VAP_ACCESS_TOKEN"@github.com/neondatabase/zenith-perf-data.git \ - --message="add performance test result for $GITHUB_SHA neon revision" \ - --branch=master \ - copy "$REPORT_FROM" "data/$REPORT_TO" `# COPY FROM TO_RELATIVE`\ - --merge \ - --run-cmd "python $SCRIPT_DIR/generate_perf_report_page.py --input-dir data/$REPORT_TO --out reports/$REPORT_TO.html" diff --git a/scripts/generate_perf_report_page.py b/scripts/generate_perf_report_page.py deleted file mode 100755 index b5b49bb600..0000000000 --- a/scripts/generate_perf_report_page.py +++ /dev/null @@ -1,219 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import json -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, cast - -from jinja2 import Template - -# skip 'input' columns. They are included in the header and just blow the table -EXCLUDE_COLUMNS = frozenset( - { - "scale", - "duration", - "number_of_clients", - "number_of_threads", - "init_start_timestamp", - "init_end_timestamp", - "run_start_timestamp", - "run_end_timestamp", - } -) - -KEY_EXCLUDE_FIELDS = frozenset( - { - "init_start_timestamp", - "init_end_timestamp", - "run_start_timestamp", - "run_end_timestamp", - } -) -NEGATIVE_COLOR = "negative" -POSITIVE_COLOR = "positive" -EPS = 1e-6 - - -@dataclass -class SuitRun: - revision: str - values: Dict[str, Any] - - -@dataclass -class SuitRuns: - platform: str - suit: str - common_columns: List[Tuple[str, str]] - value_columns: List[str] - runs: List[SuitRun] - - -@dataclass -class RowValue: - value: str - color: str - ratio: str - - -def get_columns(values: List[Dict[Any, Any]]) -> Tuple[List[Tuple[str, str]], List[str]]: - value_columns = [] - common_columns = [] - for item in values: - if item["name"] in KEY_EXCLUDE_FIELDS: - continue - if item["report"] != "test_param": - value_columns.append(cast(str, item["name"])) - else: - common_columns.append((cast(str, item["name"]), cast(str, item["value"]))) - value_columns.sort() - common_columns.sort(key=lambda x: x[0]) # sort by name - return common_columns, value_columns - - -def format_ratio(ratio: float, report: str) -> Tuple[str, str]: - color = "" - sign = "+" if ratio > 0 else "" - if abs(ratio) < 0.05: - return f" ({sign}{ratio:.2f})", color - - if report not in {"test_param", "higher_is_better", "lower_is_better"}: - raise ValueError(f"Unknown report type: {report}") - - if report == "test_param": - return f"{ratio:.2f}", color - - if ratio > 0: - if report == "higher_is_better": - color = POSITIVE_COLOR - elif report == "lower_is_better": - color = NEGATIVE_COLOR - elif ratio < 0: - if report == "higher_is_better": - color = NEGATIVE_COLOR - elif report == "lower_is_better": - color = POSITIVE_COLOR - - return f" ({sign}{ratio:.2f})", color - - -def extract_value(name: str, suit_run: SuitRun) -> Optional[Dict[str, Any]]: - for item in suit_run.values["data"]: - if item["name"] == name: - return cast(Dict[str, Any], item) - return None - - -def get_row_values( - columns: List[str], run_result: SuitRun, prev_result: Optional[SuitRun] -) -> List[RowValue]: - row_values = [] - for column in columns: - current_value = extract_value(column, run_result) - if current_value is None: - # should never happen - raise ValueError(f"{column} not found in {run_result.values}") - - value = current_value["value"] - if isinstance(value, float): - value = f"{value:.2f}" - - if prev_result 
is None: - row_values.append(RowValue(value, "", "")) - continue - - prev_value = extract_value(column, prev_result) - if prev_value is None: - # this might happen when new metric is added and there is no value for it in previous run - # let this be here, TODO add proper handling when this actually happens - raise ValueError(f"{column} not found in previous result") - # adding `EPS` to each term to avoid ZeroDivisionError when the denominator is zero - ratio = (float(value) + EPS) / (float(prev_value["value"]) + EPS) - 1 - ratio_display, color = format_ratio(ratio, current_value["report"]) - row_values.append(RowValue(value, color, ratio_display)) - return row_values - - -@dataclass -class SuiteRunTableRow: - revision: str - values: List[RowValue] - - -def prepare_rows_from_runs(value_columns: List[str], runs: List[SuitRun]) -> List[SuiteRunTableRow]: - rows = [] - prev_run = None - for run in runs: - rows.append( - SuiteRunTableRow( - revision=run.revision, values=get_row_values(value_columns, run, prev_run) - ) - ) - prev_run = run - - return rows - - -def main(args: argparse.Namespace) -> None: - input_dir = Path(args.input_dir) - grouped_runs: Dict[str, SuitRuns] = {} - # we have files in form: _.json - # fill them in the hashmap so we have grouped items for the - # same run configuration (scale, duration etc.) ordered by counter. - for item in sorted(input_dir.iterdir(), key=lambda x: int(x.name.split("_")[0])): - run_data = json.loads(item.read_text()) - revision = run_data["revision"] - - for suit_result in run_data["result"]: - key = "{}{}".format(run_data["platform"], suit_result["suit"]) - # pack total duration as a synthetic value - total_duration = suit_result["total_duration"] - suit_result["data"].append( - { - "name": "total_duration", - "value": total_duration, - "unit": "s", - "report": "lower_is_better", - } - ) - common_columns, value_columns = get_columns(suit_result["data"]) - - grouped_runs.setdefault( - key, - SuitRuns( - platform=run_data["platform"], - suit=suit_result["suit"], - common_columns=common_columns, - value_columns=value_columns, - runs=[], - ), - ) - - grouped_runs[key].runs.append(SuitRun(revision=revision, values=suit_result)) - context = {} - for result in grouped_runs.values(): - suit = result.suit - context[suit] = { - "common_columns": result.common_columns, - "value_columns": result.value_columns, - "platform": result.platform, - # reverse the order so newest results are on top of the table - "rows": reversed(prepare_rows_from_runs(result.value_columns, result.runs)), - } - - template = Template((Path(__file__).parent / "perf_report_template.html").read_text()) - - Path(args.out).write_text(template.render(context=context)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--input-dir", - dest="input_dir", - required=True, - help="Directory with jsons generated by the test suite", - ) - parser.add_argument("--out", required=True, help="Output html file path") - args = parser.parse_args() - main(args) diff --git a/scripts/git-upload b/scripts/git-upload deleted file mode 100755 index d56c0f8e94..0000000000 --- a/scripts/git-upload +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import os -import shlex -import shutil -import subprocess -import sys -import textwrap -from contextlib import contextmanager -from distutils.dir_util import copy_tree -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Optional - - -def absolute_path(path): - return 
Path(path).resolve() - - -def relative_path(path): - path = Path(path) - if path.is_absolute(): - raise Exception(f'path `{path}` must be relative!') - return path - - -@contextmanager -def chdir(cwd: Path): - old = os.getcwd() - os.chdir(cwd) - try: - yield cwd - finally: - os.chdir(old) - - -def run(cmd, *args, **kwargs): - print('$', ' '.join(cmd)) - subprocess.check_call(cmd, *args, **kwargs) - - -class GitRepo: - def __init__(self, url, branch: Optional[str] = None): - self.url = url - self.cwd = TemporaryDirectory() - self.branch = branch - - args = [ - 'git', - 'clone', - '--single-branch', - ] - if self.branch: - args.extend(['--branch', self.branch]) - - subprocess.check_call([ - *args, - str(url), - self.cwd.name, - ]) - - def is_dirty(self): - res = subprocess.check_output(['git', 'status', '--porcelain'], text=True).strip() - return bool(res) - - def update(self, message, action, branch=None): - with chdir(self.cwd.name): - if not branch: - cmd = ['git', 'branch', '--show-current'] - branch = subprocess.check_output(cmd, text=True).strip() - - # Run action in repo's directory - action() - - run(['git', 'add', '.']) - - if not self.is_dirty(): - print('No changes detected, quitting') - return - - git_with_user = [ - 'git', - '-c', - 'user.name=vipvap', - '-c', - 'user.email=vipvap@zenith.tech', - ] - run(git_with_user + [ - 'commit', - '--author="vipvap "', - f'--message={message}', - ]) - - for _ in range(5): - try: - run(['git', 'fetch', 'origin', branch]) - run(git_with_user + ['rebase', f'origin/{branch}']) - run(['git', 'push', 'origin', branch]) - return - - except subprocess.CalledProcessError as e: - print(f'failed to update branch `{branch}`: {e}', file=sys.stderr) - - raise Exception(f'failed to update branch `{branch}`') - - -def do_copy(args): - src = args.src - dst = args.dst - - if args.forbid_overwrite and dst.exists(): - raise FileExistsError(f"File exists: '{dst}'") - - if src.is_dir(): - if not args.merge: - shutil.rmtree(dst, ignore_errors=True) - # distutils is deprecated, but this is a temporary workaround before python version bump - # here we need dir_exists_ok=True from shutil.copytree which is available in python 3.8+ - copy_tree(str(src), str(dst)) - else: - shutil.copy(src, dst) - - if args.run_cmd: - run(shlex.split(args.run_cmd)) - - -def main(): - parser = argparse.ArgumentParser(description='Git upload tool') - parser.add_argument('--repo', type=str, metavar='URL', required=True, help='git repo url') - parser.add_argument('--message', type=str, metavar='TEXT', help='commit message') - parser.add_argument('--branch', type=str, metavar='TEXT', help='target git repo branch') - - commands = parser.add_subparsers(title='commands', dest='subparser_name') - - p_copy = commands.add_parser( - 'copy', - help='copy file into the repo', - formatter_class=argparse.RawTextHelpFormatter, - ) - p_copy.add_argument('src', type=absolute_path, help='source path') - p_copy.add_argument('dst', type=relative_path, help='relative dest path') - p_copy.add_argument('--forbid-overwrite', action='store_true', help='do not allow overwrites') - p_copy.add_argument( - '--merge', - action='store_true', - help='when copying a directory do not delete existing data, but add new files') - p_copy.add_argument('--run-cmd', - help=textwrap.dedent('''\ - run arbitrary cmd on top of copied files, - example usage is static content generation - based on current repository state\ - ''')) - - args = parser.parse_args() - - commands = { - 'copy': do_copy, - } - - action = 
commands.get(args.subparser_name) - if action: - message = args.message or 'update' - GitRepo(args.repo, args.branch).update(message, lambda: action(args)) - else: - parser.print_usage() - - -if __name__ == '__main__': - main() diff --git a/scripts/ingest_regress_test_result-new-format.py b/scripts/ingest_regress_test_result-new-format.py index cff1d9875f..40d7254e00 100644 --- a/scripts/ingest_regress_test_result-new-format.py +++ b/scripts/ingest_regress_test_result-new-format.py @@ -18,6 +18,7 @@ import psycopg2 from psycopg2.extras import execute_values CREATE_TABLE = """ +CREATE TYPE arch AS ENUM ('ARM64', 'X64', 'UNKNOWN'); CREATE TABLE IF NOT EXISTS results ( id BIGSERIAL PRIMARY KEY, parent_suite TEXT NOT NULL, @@ -28,6 +29,7 @@ CREATE TABLE IF NOT EXISTS results ( stopped_at TIMESTAMPTZ NOT NULL, duration INT NOT NULL, flaky BOOLEAN NOT NULL, + arch arch DEFAULT 'X64', build_type TEXT NOT NULL, pg_version INT NOT NULL, run_id BIGINT NOT NULL, @@ -35,7 +37,7 @@ CREATE TABLE IF NOT EXISTS results ( reference TEXT NOT NULL, revision CHAR(40) NOT NULL, raw JSONB COMPRESSION lz4 NOT NULL, - UNIQUE (parent_suite, suite, name, build_type, pg_version, started_at, stopped_at, run_id) + UNIQUE (parent_suite, suite, name, arch, build_type, pg_version, started_at, stopped_at, run_id) ); """ @@ -50,6 +52,7 @@ class Row: stopped_at: datetime duration: int flaky: bool + arch: str build_type: str pg_version: int run_id: int @@ -121,6 +124,14 @@ def ingest_test_result( raw.pop("labels") raw.pop("extra") + # All allure parameters are prefixed with "__", see test_runner/fixtures/parametrize.py + parameters = { + p["name"].removeprefix("__"): p["value"] + for p in test["parameters"] + if p["name"].startswith("__") + } + arch = parameters.get("arch", "UNKNOWN").strip("'") + build_type, pg_version, unparametrized_name = parse_test_name(test["name"]) labels = {label["name"]: label["value"] for label in test["labels"]} row = Row( @@ -132,6 +143,7 @@ def ingest_test_result( stopped_at=datetime.fromtimestamp(test["time"]["stop"] / 1000, tz=timezone.utc), duration=test["time"]["duration"], flaky=test["flaky"] or test["retriesStatusChange"], + arch=arch, build_type=build_type, pg_version=pg_version, run_id=run_id, diff --git a/scripts/ingest_regress_test_result.py b/scripts/ingest_regress_test_result.py deleted file mode 100644 index 39c1c02941..0000000000 --- a/scripts/ingest_regress_test_result.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import logging -import os -import re -import sys -from contextlib import contextmanager -from pathlib import Path - -import backoff -import psycopg2 - -CREATE_TABLE = """ -CREATE TABLE IF NOT EXISTS regress_test_results ( - id SERIAL PRIMARY KEY, - reference CHAR(255), - revision CHAR(40), - build_type CHAR(16), - data JSONB -) -""" - - -def err(msg): - print(f"error: {msg}") - sys.exit(1) - - -@contextmanager -def get_connection_cursor(): - connstr = os.getenv("DATABASE_URL") - if not connstr: - err("DATABASE_URL environment variable is not set") - - @backoff.on_exception(backoff.expo, psycopg2.OperationalError, max_time=150) - def connect(connstr): - conn = psycopg2.connect(connstr, connect_timeout=30) - conn.autocommit = True - return conn - - conn = connect(connstr) - try: - with conn.cursor() as cur: - yield cur - finally: - if conn is not None: - conn.close() - - -def create_table(cur): - cur.execute(CREATE_TABLE) - - -def ingest_regress_test_result( - cursor, reference: str, revision: str, build_type: str, data_file: Path -): - data 
= data_file.read_text() - # In the JSON report we can have lines related to LazyFixture with escaped double-quote - # It's hard to insert them into jsonb field as is, so replace \" with ' to make it easier for us - # - # "" -> "" - data = re.sub(r'("")', r"\g<1>'\g<2>'\g<3>", data) - values = ( - reference, - revision, - build_type, - data, - ) - cursor.execute( - """ - INSERT INTO regress_test_results ( - reference, - revision, - build_type, - data - ) VALUES (%s, %s, %s, %s) - """, - values, - ) - - -def main(): - parser = argparse.ArgumentParser( - description="Regress test result uploader. \ - Database connection string should be provided via DATABASE_URL environment variable", - ) - parser.add_argument("--initdb", action="store_true", help="Initialuze database") - parser.add_argument( - "--reference", type=str, required=True, help="git reference, for example refs/heads/main" - ) - parser.add_argument("--revision", type=str, required=True, help="git revision") - parser.add_argument( - "--build-type", type=str, required=True, help="build type: release, debug or remote" - ) - parser.add_argument( - "--ingest", type=Path, required=True, help="Path to regress test result file" - ) - - args = parser.parse_args() - with get_connection_cursor() as cur: - if args.initdb: - create_table(cur) - - if not args.ingest.exists(): - err(f"ingest path {args.ingest} does not exist") - - ingest_regress_test_result( - cur, - reference=args.reference, - revision=args.revision, - build_type=args.build_type, - data_file=args.ingest, - ) - - -if __name__ == "__main__": - logging.getLogger("backoff").addHandler(logging.StreamHandler()) - main() diff --git a/scripts/ps_ec2_setup_instance_store b/scripts/ps_ec2_setup_instance_store index 4cca3a9857..7c383e322f 100755 --- a/scripts/ps_ec2_setup_instance_store +++ b/scripts/ps_ec2_setup_instance_store @@ -40,11 +40,11 @@ To run your local neon.git build on the instance store volume, run the following commands from the top of the neon.git checkout # raise file descriptor limit of your shell and its child processes - sudo prlimit -p $$ --nofile=800000:800000 + sudo prlimit -p \$\$ --nofile=800000:800000 # test suite run export TEST_OUTPUT="$TEST_OUTPUT" - DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py + DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py # for interactive use export NEON_REPO_DIR="$NEON_REPO_DIR" diff --git a/scripts/sk_cleanup_tenants/script.py b/scripts/sk_cleanup_tenants/script.py index fa22433614..c20a4bb830 100644 --- a/scripts/sk_cleanup_tenants/script.py +++ b/scripts/sk_cleanup_tenants/script.py @@ -22,7 +22,7 @@ parser.add_argument("--safekeeper-host", required=True, type=str) args = parser.parse_args() access_key = os.getenv("CONSOLE_API_TOKEN") -endpoint: str = "https://console.stage.neon.tech/api" +endpoint: str = "https://console-stage.neon.build/api" trash_dir: Path = args.trash_dir dry_run: bool = args.dry_run diff --git a/scripts/sk_collect_dumps/readme.md b/scripts/sk_collect_dumps/readme.md index 7494a6cb78..5ae55e058b 100644 --- a/scripts/sk_collect_dumps/readme.md +++ b/scripts/sk_collect_dumps/readme.md @@ -3,7 +3,7 @@ 3. 
Issue admin token (add/remove .stage from url for staging/prod and setting proper API key): ``` # staging: -AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') +AUTH_TOKEN=$(curl https://console-stage.neon.build/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # prod: AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # check diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index d66cbefa45..1a6fb7fedf 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -147,6 +147,7 @@ async fn publish(client: Option, n_keys: u64) { http_connstr: "zenith-1-sk-1.local:7677".to_owned(), local_start_lsn: 0, availability_zone: None, + standby_horizon: 0, }; counter += 1; yield info; diff --git a/storage_broker/proto/broker.proto b/storage_broker/proto/broker.proto index 7d1b63d23f..a420fd9c66 100644 --- a/storage_broker/proto/broker.proto +++ b/storage_broker/proto/broker.proto @@ -42,6 +42,7 @@ message SafekeeperTimelineInfo { uint64 remote_consistent_lsn = 7; uint64 peer_horizon_lsn = 8; uint64 local_start_lsn = 9; + uint64 standby_horizon = 14; // A connection string to use for WAL receiving. string safekeeper_connstr = 10; // HTTP endpoint connection string @@ -105,4 +106,6 @@ message SafekeeperDiscoveryResponse { string safekeeper_connstr = 4; // Availability zone of a safekeeper. 
optional string availability_zone = 5; + // Replica apply LSN + uint64 standby_horizon = 6; } diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 4e5f8ed724..15acd0e49c 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -196,8 +196,13 @@ impl SubscriptionKey { /// Parse from FilterTenantTimelineId pub fn from_proto_filter_tenant_timeline_id( - f: &FilterTenantTimelineId, + opt: Option<&FilterTenantTimelineId>, ) -> Result<Self, Status> { + if opt.is_none() { + return Ok(SubscriptionKey::All); + } + + let f = opt.unwrap(); if !f.enabled { return Ok(SubscriptionKey::All); } @@ -534,10 +539,7 @@ impl BrokerService for Broker { .remote_addr() .expect("TCPConnectInfo inserted by handler"); let proto_filter = request.into_inner(); - let ttid_filter = proto_filter - .tenant_timeline_id - .as_ref() - .ok_or_else(|| Status::new(Code::InvalidArgument, "missing tenant_timeline_id"))?; + let ttid_filter = proto_filter.tenant_timeline_id.as_ref(); let sub_key = SubscriptionKey::from_proto_filter_tenant_timeline_id(ttid_filter)?; let types_set = proto_filter @@ -640,8 +642,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> { logging::replace_panic_hook_with_tracing_panic_hook().forget(); // initialize sentry if SENTRY_DSN is provided let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - info!("version: {GIT_VERSION}"); - info!("build_tag: {BUILD_TAG}"); + info!("version: {GIT_VERSION} build_tag: {BUILD_TAG}"); metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG); // On any shutdown signal, log receival and exit. @@ -734,6 +735,7 @@ mod tests { http_connstr: "neon-1-sk-1.local:7677".to_owned(), local_start_lsn: 0, availability_zone: None, + standby_horizon: 0, }) } diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml new file mode 100644 index 0000000000..ecaac04915 --- /dev/null +++ b/storage_controller/Cargo.toml @@ -0,0 +1,61 @@ +[package] +name = "storage_controller" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[[bin]] +name = "storage_controller" +path = "src/main.rs" + +[features] +default = [] +# Enables test-only APIs and behaviors +testing = [] + +[dependencies] +anyhow.workspace = true +aws-config.workspace = true +bytes.workspace = true +camino.workspace = true +chrono.workspace = true +clap.workspace = true +fail.workspace = true +futures.workspace = true +git-version.workspace = true +hex.workspace = true +hyper.workspace = true +humantime.workspace = true +itertools.workspace = true +lasso.workspace = true +once_cell.workspace = true +pageserver_api.workspace = true +pageserver_client.workspace = true +postgres_connection.workspace = true +rand.workspace = true +reqwest = { workspace = true, features = ["stream"] } +routerify.workspace = true +serde.workspace = true +serde_json.workspace = true +thiserror.workspace = true +tokio.workspace = true +tokio-util.workspace = true +tracing.workspace = true +measured.workspace = true +scopeguard.workspace = true +strum.workspace = true +strum_macros.workspace = true + +diesel = { version = "2.1.4", features = [ + "serde_json", + "postgres", + "r2d2", + "chrono", +] } +diesel_migrations = { version = "2.1.0" } +r2d2 = { version = "0.8.10" } + +utils = { path = "../libs/utils/" } +metrics = { path = "../libs/metrics/" } +control_plane = { path = "../control_plane" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/storage_controller/client/Cargo.toml 
b/storage_controller/client/Cargo.toml new file mode 100644 index 0000000000..e7a4264fd0 --- /dev/null +++ b/storage_controller/client/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "storage_controller_client" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +pageserver_api.workspace = true +pageserver_client.workspace = true +thiserror.workspace = true +reqwest.workspace = true +utils.workspace = true +serde.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } +tokio-postgres.workspace = true +tokio-stream.workspace = true +tokio.workspace = true +futures.workspace = true +tokio-util.workspace = true +anyhow.workspace = true +postgres.workspace = true +bytes.workspace = true diff --git a/storage_controller/client/src/control_api.rs b/storage_controller/client/src/control_api.rs new file mode 100644 index 0000000000..a981b5020e --- /dev/null +++ b/storage_controller/client/src/control_api.rs @@ -0,0 +1,62 @@ +use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; +use reqwest::{Method, Url}; +use serde::{de::DeserializeOwned, Serialize}; +use std::str::FromStr; + +pub struct Client { + base_url: Url, + jwt_token: Option<String>, + client: reqwest::Client, } + +impl Client { + pub fn new(base_url: Url, jwt_token: Option<String>) -> Self { + Self { + base_url, + jwt_token, + client: reqwest::ClientBuilder::new() + .build() + .expect("Failed to construct http client"), } } + + /// Simple HTTP request wrapper for calling into storage controller + pub async fn dispatch<RQ, RS>( + &self, + method: Method, + path: String, + body: Option<RQ>, + ) -> mgmt_api::Result<RS> + where + RQ: Serialize + Sized, + RS: DeserializeOwned + Sized, { + // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out + // for general purpose API access. + let url = Url::from_str(&format!( + "http://{}:{}/{path}", + self.base_url.host_str().unwrap(), + self.base_url.port().unwrap() )) + .unwrap(); + + let mut builder = self.client.request(method, url); + if let Some(body) = body { + builder = builder.json(&body) } + if let Some(jwt_token) = &self.jwt_token { + builder = builder.header( + reqwest::header::AUTHORIZATION, + format!("Bearer {jwt_token}"), ); } + + let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?; + let response = response.error_from_body().await?; + + response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody) } } diff --git a/storage_controller/client/src/lib.rs b/storage_controller/client/src/lib.rs new file mode 100644 index 0000000000..6d5e202942 --- /dev/null +++ b/storage_controller/client/src/lib.rs @@ -0,0 +1 @@ +pub mod control_api; diff --git a/storage_controller/migrations/.keep b/storage_controller/migrations/.keep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/storage_controller/migrations/00000000000000_diesel_initial_setup/down.sql b/storage_controller/migrations/00000000000000_diesel_initial_setup/down.sql new file mode 100644 index 0000000000..a9f5260911 --- /dev/null +++ b/storage_controller/migrations/00000000000000_diesel_initial_setup/down.sql @@ -0,0 +1,6 @@ +-- This file was automatically created by Diesel to setup helper functions +-- and other internal bookkeeping. This file is safe to edit, any future +-- changes will be added to existing projects as new migrations. 
+ +DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass); +DROP FUNCTION IF EXISTS diesel_set_updated_at(); diff --git a/storage_controller/migrations/00000000000000_diesel_initial_setup/up.sql b/storage_controller/migrations/00000000000000_diesel_initial_setup/up.sql new file mode 100644 index 0000000000..d68895b1a7 --- /dev/null +++ b/storage_controller/migrations/00000000000000_diesel_initial_setup/up.sql @@ -0,0 +1,36 @@ +-- This file was automatically created by Diesel to setup helper functions +-- and other internal bookkeeping. This file is safe to edit, any future +-- changes will be added to existing projects as new migrations. + + + + +-- Sets up a trigger for the given table to automatically set a column called +-- `updated_at` whenever the row is modified (unless `updated_at` was included +-- in the modified columns) +-- +-- # Example +-- +-- ```sql +-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW()); +-- +-- SELECT diesel_manage_updated_at('users'); +-- ``` +CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$ +BEGIN + EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s + FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl); +END; +$$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$ +BEGIN + IF ( + NEW IS DISTINCT FROM OLD AND + NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at + ) THEN + NEW.updated_at := current_timestamp; + END IF; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; diff --git a/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/down.sql b/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/down.sql new file mode 100644 index 0000000000..b875b91c00 --- /dev/null +++ b/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/down.sql @@ -0,0 +1 @@ +DROP TABLE tenant_shards; diff --git a/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/up.sql b/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/up.sql new file mode 100644 index 0000000000..2ffdae6287 --- /dev/null +++ b/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/up.sql @@ -0,0 +1,13 @@ +CREATE TABLE tenant_shards ( + tenant_id VARCHAR NOT NULL, + shard_number INTEGER NOT NULL, + shard_count INTEGER NOT NULL, + PRIMARY KEY(tenant_id, shard_number, shard_count), + shard_stripe_size INTEGER NOT NULL, + generation INTEGER NOT NULL, + generation_pageserver BIGINT NOT NULL, + placement_policy VARCHAR NOT NULL, + splitting SMALLINT NOT NULL, + -- config is JSON encoded, opaque to the database. 
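-- (for illustration only, since the controller deliberately treats this column as
-- opaque: the stored text might be '{}' for an all-default tenant, or a JSON-serialized
-- tenant config overriding individual pageserver settings; the exact shape is defined
-- by the caller, not by this schema)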
+ config TEXT NOT NULL +); \ No newline at end of file diff --git a/storage_controller/migrations/2024-01-07-212945_create_nodes/down.sql b/storage_controller/migrations/2024-01-07-212945_create_nodes/down.sql new file mode 100644 index 0000000000..ec303bc8cf --- /dev/null +++ b/storage_controller/migrations/2024-01-07-212945_create_nodes/down.sql @@ -0,0 +1 @@ +DROP TABLE nodes; diff --git a/storage_controller/migrations/2024-01-07-212945_create_nodes/up.sql b/storage_controller/migrations/2024-01-07-212945_create_nodes/up.sql new file mode 100644 index 0000000000..9be0880fa4 --- /dev/null +++ b/storage_controller/migrations/2024-01-07-212945_create_nodes/up.sql @@ -0,0 +1,10 @@ +CREATE TABLE nodes ( + node_id BIGINT PRIMARY KEY NOT NULL, + + scheduling_policy VARCHAR NOT NULL, + + listen_http_addr VARCHAR NOT NULL, + listen_http_port INTEGER NOT NULL, + listen_pg_addr VARCHAR NOT NULL, + listen_pg_port INTEGER NOT NULL +); \ No newline at end of file diff --git a/storage_controller/migrations/2024-02-29-094122_generations_null/down.sql b/storage_controller/migrations/2024-02-29-094122_generations_null/down.sql new file mode 100644 index 0000000000..503231f69d --- /dev/null +++ b/storage_controller/migrations/2024-02-29-094122_generations_null/down.sql @@ -0,0 +1,2 @@ +ALTER TABLE tenant_shards ALTER generation SET NOT NULL; +ALTER TABLE tenant_shards ALTER generation_pageserver SET NOT NULL; diff --git a/storage_controller/migrations/2024-02-29-094122_generations_null/up.sql b/storage_controller/migrations/2024-02-29-094122_generations_null/up.sql new file mode 100644 index 0000000000..7e1e3cfe90 --- /dev/null +++ b/storage_controller/migrations/2024-02-29-094122_generations_null/up.sql @@ -0,0 +1,4 @@ + + +ALTER TABLE tenant_shards ALTER generation DROP NOT NULL; +ALTER TABLE tenant_shards ALTER generation_pageserver DROP NOT NULL; \ No newline at end of file diff --git a/storage_controller/migrations/2024-03-18-184429_rename_policy/down.sql b/storage_controller/migrations/2024-03-18-184429_rename_policy/down.sql new file mode 100644 index 0000000000..897c7e0d01 --- /dev/null +++ b/storage_controller/migrations/2024-03-18-184429_rename_policy/down.sql @@ -0,0 +1,3 @@ + +UPDATE tenant_shards set placement_policy='{"Double": 1}' where placement_policy='{"Attached": 1}'; +UPDATE tenant_shards set placement_policy='"Single"' where placement_policy='{"Attached": 0}'; \ No newline at end of file diff --git a/storage_controller/migrations/2024-03-18-184429_rename_policy/up.sql b/storage_controller/migrations/2024-03-18-184429_rename_policy/up.sql new file mode 100644 index 0000000000..c898ac9aee --- /dev/null +++ b/storage_controller/migrations/2024-03-18-184429_rename_policy/up.sql @@ -0,0 +1,3 @@ + +UPDATE tenant_shards set placement_policy='{"Attached": 1}' where placement_policy='{"Double": 1}'; +UPDATE tenant_shards set placement_policy='{"Attached": 0}' where placement_policy='"Single"'; \ No newline at end of file diff --git a/storage_controller/migrations/2024-03-27-133204_tenant_policies/down.sql b/storage_controller/migrations/2024-03-27-133204_tenant_policies/down.sql new file mode 100644 index 0000000000..33c06dc03d --- /dev/null +++ b/storage_controller/migrations/2024-03-27-133204_tenant_policies/down.sql @@ -0,0 +1,3 @@ +-- This file should undo anything in `up.sql` + +ALTER TABLE tenant_shards drop scheduling_policy; \ No newline at end of file diff --git a/storage_controller/migrations/2024-03-27-133204_tenant_policies/up.sql 
b/storage_controller/migrations/2024-03-27-133204_tenant_policies/up.sql new file mode 100644 index 0000000000..aa00f0d2ca --- /dev/null +++ b/storage_controller/migrations/2024-03-27-133204_tenant_policies/up.sql @@ -0,0 +1,2 @@ + +ALTER TABLE tenant_shards add scheduling_policy VARCHAR NOT NULL DEFAULT '"Active"'; diff --git a/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql new file mode 100644 index 0000000000..1ecfc8786f --- /dev/null +++ b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/down.sql @@ -0,0 +1 @@ +DROP TABLE metadata_health; \ No newline at end of file diff --git a/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql new file mode 100644 index 0000000000..fa87eda119 --- /dev/null +++ b/storage_controller/migrations/2024-07-23-191537_create_metadata_health/up.sql @@ -0,0 +1,14 @@ +CREATE TABLE metadata_health ( + tenant_id VARCHAR NOT NULL, + shard_number INTEGER NOT NULL, + shard_count INTEGER NOT NULL, + PRIMARY KEY(tenant_id, shard_number, shard_count), + -- Rely on cascade behavior for delete + FOREIGN KEY(tenant_id, shard_number, shard_count) REFERENCES tenant_shards ON DELETE CASCADE, + healthy BOOLEAN NOT NULL DEFAULT TRUE, + last_scrubbed_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + + +INSERT INTO metadata_health(tenant_id, shard_number, shard_count) +SELECT tenant_id, shard_number, shard_count FROM tenant_shards; diff --git a/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql b/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql new file mode 100644 index 0000000000..53222c614e --- /dev/null +++ b/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql @@ -0,0 +1 @@ +DROP TABLE controllers; diff --git a/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql b/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql new file mode 100644 index 0000000000..90546948cb --- /dev/null +++ b/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql @@ -0,0 +1,5 @@ +CREATE TABLE controllers ( + address VARCHAR NOT NULL, + started_at TIMESTAMPTZ NOT NULL, + PRIMARY KEY(address, started_at) +); diff --git a/storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql b/storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql new file mode 100644 index 0000000000..9dfc750586 --- /dev/null +++ b/storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql @@ -0,0 +1,2 @@ +-- This file should undo anything in `up.sql` +DROP TABLE safekeepers; diff --git a/storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql b/storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql new file mode 100644 index 0000000000..c78716660f --- /dev/null +++ b/storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql @@ -0,0 +1,15 @@ +-- started out as a copy of cplane schema, removed the unnecessary columns. 
+CREATE TABLE safekeepers (
+    -- the surrogate identifier defined by control plane database sequence
+    id BIGINT PRIMARY KEY,
+    region_id TEXT NOT NULL,
+    version BIGINT NOT NULL,
+    -- the natural id on whatever cloud platform, not needed in storage controller
+    -- instance_id TEXT UNIQUE NOT NULL,
+    host TEXT NOT NULL,
+    port INTEGER NOT NULL,
+    active BOOLEAN NOT NULL DEFAULT false,
+    -- projects_count INTEGER NOT NULL DEFAULT 0,
+    http_port INTEGER NOT NULL,
+    availability_zone_id TEXT NOT NULL
+);
diff --git a/storage_controller/migrations/2024-08-23-170149_tenant_id_index/down.sql b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/down.sql
new file mode 100644
index 0000000000..518c747100
--- /dev/null
+++ b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/down.sql
@@ -0,0 +1,2 @@
+-- This file should undo anything in `up.sql`
+DROP INDEX tenant_shards_tenant_id;
\ No newline at end of file
diff --git a/storage_controller/migrations/2024-08-23-170149_tenant_id_index/up.sql b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/up.sql
new file mode 100644
index 0000000000..dd6b37781a
--- /dev/null
+++ b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/up.sql
@@ -0,0 +1,2 @@
+-- Your SQL goes here
+CREATE INDEX tenant_shards_tenant_id ON tenant_shards (tenant_id);
\ No newline at end of file
diff --git a/storage_controller/migrations/2024-08-27-184400_pageserver_az/down.sql b/storage_controller/migrations/2024-08-27-184400_pageserver_az/down.sql
new file mode 100644
index 0000000000..22df81c83c
--- /dev/null
+++ b/storage_controller/migrations/2024-08-27-184400_pageserver_az/down.sql
@@ -0,0 +1 @@
+ALTER TABLE nodes DROP availability_zone_id;
diff --git a/storage_controller/migrations/2024-08-27-184400_pageserver_az/up.sql b/storage_controller/migrations/2024-08-27-184400_pageserver_az/up.sql
new file mode 100644
index 0000000000..7112f92bf2
--- /dev/null
+++ b/storage_controller/migrations/2024-08-27-184400_pageserver_az/up.sql
@@ -0,0 +1 @@
+ALTER TABLE nodes ADD availability_zone_id VARCHAR;
diff --git a/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql
new file mode 100644
index 0000000000..4fcb928533
--- /dev/null
+++ b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql
@@ -0,0 +1 @@
+ALTER TABLE nodes ALTER availability_zone_id DROP NOT NULL;
diff --git a/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql
new file mode 100644
index 0000000000..c5b4534087
--- /dev/null
+++ b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql
@@ -0,0 +1 @@
+ALTER TABLE nodes ALTER availability_zone_id SET NOT NULL;
diff --git a/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql
new file mode 100644
index 0000000000..127972a2e4
--- /dev/null
+++ b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql
@@ -0,0 +1 @@
+ALTER TABLE tenant_shards DROP preferred_az_id;
diff --git a/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql
new file mode 100644
index 0000000000..641a54feb2
--- /dev/null
+++ b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql
@@ -0,0 +1 @@
+ALTER TABLE tenant_shards ADD preferred_az_id VARCHAR;
diff --git a/storage_controller/src/auth.rs b/storage_controller/src/auth.rs
new file mode 100644
index 0000000000..ef47abf8c7
--- /dev/null
+++ b/storage_controller/src/auth.rs
@@ -0,0 +1,9 @@
+use utils::auth::{AuthError, Claims, Scope};
+
+pub fn check_permission(claims: &Claims, required_scope: Scope) -> Result<(), AuthError> {
+    if claims.scope != required_scope {
+        return Err(AuthError("Scope mismatch. Permission denied".into()));
+    }
+
+    Ok(())
+}
diff --git a/storage_controller/src/background_node_operations.rs b/storage_controller/src/background_node_operations.rs
new file mode 100644
index 0000000000..6f1355eb68
--- /dev/null
+++ b/storage_controller/src/background_node_operations.rs
@@ -0,0 +1,59 @@
+use std::{borrow::Cow, fmt::Debug, fmt::Display};
+
+use tokio_util::sync::CancellationToken;
+use utils::id::NodeId;
+
+pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 32;
+
+#[derive(Copy, Clone)]
+pub(crate) struct Drain {
+    pub(crate) node_id: NodeId,
+}
+
+#[derive(Copy, Clone)]
+pub(crate) struct Fill {
+    pub(crate) node_id: NodeId,
+}
+
+#[derive(Copy, Clone)]
+pub(crate) enum Operation {
+    Drain(Drain),
+    Fill(Fill),
+}
+
+#[derive(Debug, thiserror::Error)]
+pub(crate) enum OperationError {
+    #[error("Node state changed during operation: {0}")]
+    NodeStateChanged(Cow<'static, str>),
+    #[error("Operation finalize error: {0}")]
+    FinalizeError(Cow<'static, str>),
+    #[error("Operation cancelled")]
+    Cancelled,
+}
+
+pub(crate) struct OperationHandler {
+    pub(crate) operation: Operation,
+    #[allow(unused)]
+    pub(crate) cancel: CancellationToken,
+}
+
+impl Display for Drain {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "drain {}", self.node_id)
+    }
+}
+
+impl Display for Fill {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "fill {}", self.node_id)
+    }
+}
+
+impl Display for Operation {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match self {
+            Operation::Drain(op) => write!(f, "{op}"),
+            Operation::Fill(op) => write!(f, "{op}"),
+        }
+    }
+}
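For orientation, the Display impls above render operations as short log fragments. A hedged example, assuming NodeId's Display prints the bare integer (its Display impl is not shown in this diff):

use utils::id::NodeId;

fn main() {
    let op = Operation::Drain(Drain { node_id: NodeId(7) });
    // Rendered through the Display impls defined above.
    assert_eq!(format!("{op}"), "drain 7");
}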
diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs
new file mode 100644
index 0000000000..c46539485c
--- /dev/null
+++ b/storage_controller/src/compute_hook.rs
@@ -0,0 +1,697 @@
+use std::sync::Arc;
+use std::{collections::HashMap, time::Duration};
+
+use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
+use control_plane::local_env::LocalEnv;
+use futures::StreamExt;
+use hyper::StatusCode;
+use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId};
+use postgres_connection::parse_host_port;
+use serde::{Deserialize, Serialize};
+use tokio_util::sync::CancellationToken;
+use tracing::{info_span, Instrument};
+use utils::{
+    backoff::{self},
+    id::{NodeId, TenantId},
+};
+
+use crate::service::Config;
+
+const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
+
+const NOTIFY_REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
+
+pub(crate) const API_CONCURRENCY: usize = 32;
+
+struct UnshardedComputeHookTenant {
+    // Which node is this tenant attached to
+    node_id: NodeId,
+
+    // Must hold this lock to send a notification.
+    send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
+}
+struct ShardedComputeHookTenant {
+    stripe_size: ShardStripeSize,
+    shard_count: ShardCount,
+    shards: Vec<(ShardNumber, NodeId)>,
+
+    // Must hold this lock to send a notification. The contents represent
+    // the last successfully sent notification, and are used to coalesce multiple
+    // updates by only sending when there is a change since our last successful send.
+    send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
+}
+
+enum ComputeHookTenant {
+    Unsharded(UnshardedComputeHookTenant),
+    Sharded(ShardedComputeHookTenant),
+}
+
+impl ComputeHookTenant {
+    /// Construct with at least one shard's information
+    fn new(tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize, node_id: NodeId) -> Self {
+        if tenant_shard_id.shard_count.count() > 1 {
+            Self::Sharded(ShardedComputeHookTenant {
+                shards: vec![(tenant_shard_id.shard_number, node_id)],
+                stripe_size,
+                shard_count: tenant_shard_id.shard_count,
+                send_lock: Arc::default(),
+            })
+        } else {
+            Self::Unsharded(UnshardedComputeHookTenant {
+                node_id,
+                send_lock: Arc::default(),
+            })
+        }
+    }
+
+    fn get_send_lock(&self) -> &Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>> {
+        match self {
+            Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock,
+            Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock,
+        }
+    }
+
+    /// Set one shard's location. If stripe size or shard count have changed, Self is reset
+    /// and drops existing content.
+    fn update(
+        &mut self,
+        tenant_shard_id: TenantShardId,
+        stripe_size: ShardStripeSize,
+        node_id: NodeId,
+    ) {
+        match self {
+            Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => {
+                unsharded_tenant.node_id = node_id
+            }
+            Self::Sharded(sharded_tenant)
+                if sharded_tenant.stripe_size == stripe_size
+                    && sharded_tenant.shard_count == tenant_shard_id.shard_count =>
+            {
+                if let Some(existing) = sharded_tenant
+                    .shards
+                    .iter()
+                    .position(|s| s.0 == tenant_shard_id.shard_number)
+                {
+                    sharded_tenant.shards.get_mut(existing).unwrap().1 = node_id;
+                } else {
+                    sharded_tenant
+                        .shards
+                        .push((tenant_shard_id.shard_number, node_id));
+                    sharded_tenant.shards.sort_by_key(|s| s.0)
+                }
+            }
+            _ => {
+                // Shard count changed: reset struct.
+                *self = Self::new(tenant_shard_id, stripe_size, node_id);
+            }
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
+struct ComputeHookNotifyRequestShard {
+    node_id: NodeId,
+    shard_number: ShardNumber,
+}
+
+/// Request body that we send to the control plane to notify it of where a tenant is attached
+#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
+struct ComputeHookNotifyRequest {
+    tenant_id: TenantId,
+    stripe_size: Option<ShardStripeSize>,
+    shards: Vec<ComputeHookNotifyRequestShard>,
+}
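For concreteness, here is a hedged sketch of the JSON this request type serializes to. The exact renderings of TenantId, NodeId, ShardNumber, and ShardStripeSize under serde are assumptions (their Serialize impls are not shown in this diff), and serde_json is assumed to be available:

fn main() {
    let request = ComputeHookNotifyRequest {
        tenant_id: TenantId::generate(),
        stripe_size: Some(ShardStripeSize(32768)),
        shards: vec![
            ComputeHookNotifyRequestShard { node_id: NodeId(1), shard_number: ShardNumber(0) },
            ComputeHookNotifyRequestShard { node_id: NodeId(2), shard_number: ShardNumber(1) },
        ],
    };
    println!("{}", serde_json::to_string(&request).unwrap());
    // Plausible output, assuming ids serialize as shown:
    // {"tenant_id":"<32 hex chars>","stripe_size":32768,
    //  "shards":[{"node_id":1,"shard_number":0},{"node_id":2,"shard_number":1}]}
}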
+
+/// Error type for attempts to call into the control plane compute notification hook
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum NotifyError {
+    // Request was not sent successfully, e.g. transport error
+    #[error("Sending request: {0}")]
+    Request(#[from] reqwest::Error),
+    // Request could not be serviced right now due to ongoing Operation in control plane, but should be possible soon.
+    #[error("Control plane tenant busy")]
+    Busy,
+    // Explicit 429 response asking us to retry less frequently
+    #[error("Control plane overloaded")]
+    SlowDown,
+    // A 503 response indicates the control plane can't handle the request right now
+    #[error("Control plane unavailable (status {0})")]
+    Unavailable(StatusCode),
+    // API returned unexpected non-success status. We will retry, but log a warning.
+    #[error("Control plane returned unexpected status {0}")]
+    Unexpected(StatusCode),
+    // We shut down while sending
+    #[error("Shutting down")]
+    ShuttingDown,
+    // A response indicates we will never succeed, such as 400 or 404
+    #[error("Non-retryable error {0}")]
+    Fatal(StatusCode),
+
+    #[error("neon_local error: {0}")]
+    NeonLocal(anyhow::Error),
+}
+
+enum MaybeSendResult {
+    // Please send this request while holding the lock, and if you succeed then write
+    // the request into the lock.
+    Transmit(
+        (
+            ComputeHookNotifyRequest,
+            tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>,
+        ),
+    ),
+    // Something requires sending, but you must wait for a current sender then call again
+    AwaitLock(Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>),
+    // Nothing requires sending
+    Noop,
+}
+
+impl ComputeHookTenant {
+    fn maybe_send(
+        &self,
+        tenant_id: TenantId,
+        lock: Option<tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>>,
+    ) -> MaybeSendResult {
+        let locked = match lock {
+            Some(already_locked) => already_locked,
+            None => {
+                // Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock.
+                let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else {
+                    return MaybeSendResult::AwaitLock(self.get_send_lock().clone());
+                };
+                locked
+            }
+        };
+
+        let request = match self {
+            Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest {
+                tenant_id,
+                shards: vec![ComputeHookNotifyRequestShard {
+                    shard_number: ShardNumber(0),
+                    node_id: unsharded_tenant.node_id,
+                }],
+                stripe_size: None,
+            }),
+            Self::Sharded(sharded_tenant)
+                if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize =>
+            {
+                Some(ComputeHookNotifyRequest {
+                    tenant_id,
+                    shards: sharded_tenant
+                        .shards
+                        .iter()
+                        .map(|(shard_number, node_id)| ComputeHookNotifyRequestShard {
+                            shard_number: *shard_number,
+                            node_id: *node_id,
+                        })
+                        .collect(),
+                    stripe_size: Some(sharded_tenant.stripe_size),
+                })
+            }
+            Self::Sharded(sharded_tenant) => {
+                // Sharded tenant doesn't yet have information for all its shards
+
+                tracing::info!(
+                    "ComputeHookTenant::maybe_send: not enough shards ({}/{})",
+                    sharded_tenant.shards.len(),
+                    sharded_tenant.shard_count.count()
+                );
+                None
+            }
+        };
+
+        match request {
+            None => {
+                // Not yet ready to emit a notification
+                tracing::info!("Tenant isn't yet ready to emit a notification");
+                MaybeSendResult::Noop
+            }
+            Some(request) if Some(&request) == locked.as_ref() => {
+                // No change from the last value successfully sent
+                MaybeSendResult::Noop
+            }
+            Some(request) => MaybeSendResult::Transmit((request, locked)),
+        }
+    }
+}
+
+/// The compute hook is a destination for notifications about changes to tenant:pageserver
+/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures
+/// the compute connection string.
+pub(super) struct ComputeHook {
+    config: Config,
+    state: std::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
+    authorization_header: Option<String>,
+
+    // Concurrency limiter, so that we do not overload the cloud control plane when updating
+    // large numbers of tenants (e.g. when failing over after a node failure)
+    api_concurrency: tokio::sync::Semaphore,
+
+    // This lock is only used in testing environments, to serialize calls into neon_local
+    neon_local_lock: tokio::sync::Mutex<()>,
+
+    // We share a client across all notifications to enable connection re-use etc when
+    // sending large numbers of notifications
+    client: reqwest::Client,
+}
+
+impl ComputeHook {
+    pub(super) fn new(config: Config) -> Self {
+        let authorization_header = config
+            .control_plane_jwt_token
+            .clone()
+            .map(|jwt| format!("Bearer {}", jwt));
+
+        let client = reqwest::ClientBuilder::new()
+            .timeout(NOTIFY_REQUEST_TIMEOUT)
+            .build()
+            .expect("Failed to construct HTTP client");
+
+        Self {
+            state: Default::default(),
+            config,
+            authorization_header,
+            neon_local_lock: Default::default(),
+            api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY),
+            client,
+        }
+    }
+
+    /// For test environments: use neon_local's LocalEnv to update compute
+    async fn do_notify_local(
+        &self,
+        reconfigure_request: &ComputeHookNotifyRequest,
+    ) -> Result<(), NotifyError> {
+        // neon_local updates are not safe to call concurrently, use a lock to serialize
+        // all calls to this function
+        let _locked = self.neon_local_lock.lock().await;
+
+        let Some(repo_dir) = self.config.neon_local_repo_dir.as_deref() else {
+            tracing::warn!(
+                "neon_local_repo_dir not set, likely a bug in neon_local; skipping compute update"
+            );
+            return Ok(());
+        };
+        let env = match LocalEnv::load_config(repo_dir) {
+            Ok(e) => e,
+            Err(e) => {
+                tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})");
+                return Ok(());
+            }
+        };
+        let cplane =
+            ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane");
+        let ComputeHookNotifyRequest {
+            tenant_id,
+            shards,
+            stripe_size,
+        } = reconfigure_request;
+
+        let compute_pageservers = shards
+            .iter()
+            .map(|shard| {
+                let ps_conf = env
+                    .get_pageserver_conf(shard.node_id)
+                    .expect("Unknown pageserver");
+                let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr)
+                    .expect("Unable to parse listen_pg_addr");
+                (pg_host, pg_port.unwrap_or(5432))
+            })
+            .collect::<Vec<_>>();
+
+        for (endpoint_name, endpoint) in &cplane.endpoints {
+            if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running {
+                tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
+                endpoint
+                    .reconfigure(compute_pageservers.clone(), *stripe_size, None)
+                    .await
+                    .map_err(NotifyError::NeonLocal)?;
+            }
+        }
+
+        Ok(())
+    }
+
+    async fn do_notify_iteration(
+        &self,
+        url: &String,
+        reconfigure_request: &ComputeHookNotifyRequest,
+        cancel: &CancellationToken,
+    ) -> Result<(), NotifyError> {
+        let req = self.client.request(reqwest::Method::PUT, url);
+        let req = if let Some(value) = &self.authorization_header {
+            req.header(reqwest::header::AUTHORIZATION, value)
+        } else {
+            req
+        };
+
+        tracing::info!(
+            "Sending notify request to {} ({:?})",
+            url,
+            reconfigure_request
+        );
+        let send_result = req.json(&reconfigure_request).send().await;
+        let response = match send_result {
+            Ok(r) => r,
+            Err(e) => return Err(e.into()),
+        };
+
+        // Treat all 2xx responses as success
+        if response.status() >= reqwest::StatusCode::OK
+            && response.status() < reqwest::StatusCode::MULTIPLE_CHOICES
+        {
+            if response.status() != reqwest::StatusCode::OK {
+                // Non-200 2xx response: it doesn't make sense to retry, but this is unexpected, so
+                // log a warning.
+                tracing::warn!(
+                    "Unexpected 2xx response code {} from control plane",
+                    response.status()
+                );
+            }
+
+            return Ok(());
+        }
+
+        // Error response codes
+        match response.status() {
+            reqwest::StatusCode::TOO_MANY_REQUESTS => {
+                // TODO: 429 handling should be global: set some state visible to other requests
+                // so that they will delay before starting, rather than all notifications trying
+                // once before backing off.
+                tokio::time::timeout(SLOWDOWN_DELAY, cancel.cancelled())
+                    .await
+                    .ok();
+                Err(NotifyError::SlowDown)
+            }
+            reqwest::StatusCode::LOCKED => {
+                // We consider this fatal, because it's possible that the operation blocking the control
+                // plane is also the one that is waiting for this reconcile. We should let the reconciler
+                // calling this hook fail, to give control plane a chance to un-lock.
+                tracing::info!("Control plane reports tenant is locked, dropping out of notify");
+                Err(NotifyError::Busy)
+            }
+            reqwest::StatusCode::SERVICE_UNAVAILABLE => {
+                Err(NotifyError::Unavailable(StatusCode::SERVICE_UNAVAILABLE))
+            }
+            reqwest::StatusCode::GATEWAY_TIMEOUT => {
+                Err(NotifyError::Unavailable(StatusCode::GATEWAY_TIMEOUT))
+            }
+            reqwest::StatusCode::BAD_GATEWAY => {
+                Err(NotifyError::Unavailable(StatusCode::BAD_GATEWAY))
+            }
+
+            reqwest::StatusCode::BAD_REQUEST => Err(NotifyError::Fatal(StatusCode::BAD_REQUEST)),
+            reqwest::StatusCode::UNAUTHORIZED => Err(NotifyError::Fatal(StatusCode::UNAUTHORIZED)),
+            reqwest::StatusCode::FORBIDDEN => Err(NotifyError::Fatal(StatusCode::FORBIDDEN)),
+            status => Err(NotifyError::Unexpected(
+                hyper::StatusCode::from_u16(status.as_u16())
+                    .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR),
+            )),
+        }
+    }
+
+    async fn do_notify(
+        &self,
+        url: &String,
+        reconfigure_request: &ComputeHookNotifyRequest,
+        cancel: &CancellationToken,
+    ) -> Result<(), NotifyError> {
+        // We hold these semaphore units across all retries, rather than only across each
+        // HTTP request: this is to preserve fairness and avoid a situation where a retry might
+        // time out waiting for a semaphore.
+        let _units = self
+            .api_concurrency
+            .acquire()
+            .await
+            // Interpret closed semaphore as shutdown
+            .map_err(|_| NotifyError::ShuttingDown)?;
+
+        backoff::retry(
+            || self.do_notify_iteration(url, reconfigure_request, cancel),
+            |e| {
+                matches!(
+                    e,
+                    NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy
+                )
+            },
+            3,
+            10,
+            "Send compute notification",
+            cancel,
+        )
+        .await
+        .ok_or_else(|| NotifyError::ShuttingDown)
+        .and_then(|x| x)
+    }
+
+    /// Synchronous phase: update the per-tenant state for the next intended notification
+    fn notify_prepare(
+        &self,
+        tenant_shard_id: TenantShardId,
+        node_id: NodeId,
+        stripe_size: ShardStripeSize,
+    ) -> MaybeSendResult {
+        let mut state_locked = self.state.lock().unwrap();
+
+        use std::collections::hash_map::Entry;
+        let tenant = match state_locked.entry(tenant_shard_id.tenant_id) {
+            Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
+                tenant_shard_id,
+                stripe_size,
+                node_id,
+            )),
+            Entry::Occupied(e) => {
+                let tenant = e.into_mut();
+                tenant.update(tenant_shard_id, stripe_size, node_id);
+                tenant
+            }
+        };
+        tenant.maybe_send(tenant_shard_id.tenant_id, None)
+    }
+
+    async fn notify_execute(
+        &self,
+        maybe_send_result: MaybeSendResult,
+        tenant_shard_id: TenantShardId,
+        cancel: &CancellationToken,
+    ) -> Result<(), NotifyError> {
+        // Process result: we may get an update to send, or we may have to wait for a lock
+        // before trying again.
+        let (request, mut send_lock_guard) = match maybe_send_result {
+            MaybeSendResult::Noop => {
+                return Ok(());
+            }
+            MaybeSendResult::AwaitLock(send_lock) => {
+                let send_locked = tokio::select! {
+                    guard = send_lock.lock_owned() => {guard},
+                    _ = cancel.cancelled() => {
+                        return Err(NotifyError::ShuttingDown)
+                    }
+                };
+
+                // Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here
+                // we have acquired the send lock and take `[Self::state]` lock. This is safe because maybe_send only uses
+                // try_lock.
+                let state_locked = self.state.lock().unwrap();
+                let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else {
+                    return Ok(());
+                };
+                match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) {
+                    MaybeSendResult::AwaitLock(_) => {
+                        unreachable!("We supplied lock guard")
+                    }
+                    MaybeSendResult::Noop => {
+                        return Ok(());
+                    }
+                    MaybeSendResult::Transmit((request, lock)) => (request, lock),
+                }
+            }
+            MaybeSendResult::Transmit((request, lock)) => (request, lock),
+        };
+
+        let result = if let Some(notify_url) = &self.config.compute_hook_url {
+            self.do_notify(notify_url, &request, cancel).await
+        } else {
+            self.do_notify_local(&request).await.map_err(|e| {
+                // This path is for testing only, so munge the error into our prod-style error type.
+                tracing::error!("neon_local notification hook failed: {e}");
+                NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
+            })
+        };
+
+        if result.is_ok() {
+            // Before dropping the send lock, stash the request we just sent so that
+            // subsequent callers can avoid redundantly re-sending the same thing.
+            *send_lock_guard = Some(request);
+        }
+        result
+    }
+
+    /// Infallible synchronous fire-and-forget version of notify(), that sends its results to
+    /// a channel. Something should consume the channel and arrange to try notifying again
+    /// if something failed.
+    pub(super) fn notify_background(
+        self: &Arc<Self>,
+        notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>,
+        result_tx: tokio::sync::mpsc::Sender<Result<(), (TenantShardId, NotifyError)>>,
+        cancel: &CancellationToken,
+    ) {
+        let mut maybe_sends = Vec::new();
+        for (tenant_shard_id, node_id, stripe_size) in notifications {
+            let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size);
+            maybe_sends.push((tenant_shard_id, maybe_send_result))
+        }
+
+        let this = self.clone();
+        let cancel = cancel.clone();
+
+        tokio::task::spawn(async move {
+            // Construct an async stream of futures to invoke the compute notify function: we do this
+            // in order to subsequently use .buffered() on the stream to execute with bounded parallelism. The
+            // ComputeHook semaphore already limits concurrency, but this way we avoid constructing+polling lots of futures which
+            // would mostly just be waiting on that semaphore.
+            let mut stream = futures::stream::iter(maybe_sends)
+                .map(|(tenant_shard_id, maybe_send_result)| {
+                    let this = this.clone();
+                    let cancel = cancel.clone();
+
+                    async move {
+                        this
+                            .notify_execute(maybe_send_result, tenant_shard_id, &cancel)
+                            .await.map_err(|e| (tenant_shard_id, e))
+                    }.instrument(info_span!(
+                        "notify_background", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()
+                    ))
+                })
+                .buffered(API_CONCURRENCY);
+
+            loop {
+                tokio::select! {
+                    next = stream.next() => {
+                        match next {
+                            Some(r) => {
+                                result_tx.send(r).await.ok();
+                            },
+                            None => {
+                                tracing::info!("Finished sending background compute notifications");
+                                break;
+                            }
+                        }
+                    },
+                    _ = cancel.cancelled() => {
+                        tracing::info!("Shutdown while running background compute notifications");
+                        break;
+                    }
+                };
+            }
+        });
+    }
+
+    /// Call this to notify the compute (postgres) tier of new pageservers to use
+    /// for a tenant. notify() is called by each shard individually, and this function
+    /// will decide whether an update to the tenant is sent. An update is sent on the
+    /// condition that:
+    /// - We know a pageserver for every shard.
+    /// - All the shards have the same shard_count (i.e. we are not mid-split)
+    ///
+    /// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler
+    /// that is cancelled.
+    ///
+    /// This function is fallible, including in the case that the control plane is transiently
+    /// unavailable. A limited number of retries are done internally to efficiently hide short unavailability
+    /// periods, but we don't retry forever. The **caller** is responsible for handling failures and
+    /// ensuring that they eventually call again to ensure that the compute is eventually notified of
+    /// the proper pageserver nodes for a tenant.
+    #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))]
+    pub(super) async fn notify(
+        &self,
+        tenant_shard_id: TenantShardId,
+        node_id: NodeId,
+        stripe_size: ShardStripeSize,
+        cancel: &CancellationToken,
+    ) -> Result<(), NotifyError> {
+        let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size);
+        self.notify_execute(maybe_send_result, tenant_shard_id, cancel)
+            .await
+    }
+}
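The .buffered() construction in notify_background above is the key concurrency idea: a stream of futures executes with bounded parallelism while outputs keep their input order. A minimal, self-contained sketch of the same pattern (toy values, unrelated to the types in this file):

use futures::{stream, StreamExt};

#[tokio::main]
async fn main() {
    let results: Vec<u32> = stream::iter(0..8u32)
        .map(|i| async move { i * 2 }) // each element becomes a future
        .buffered(3) // at most 3 futures are polled concurrently
        .collect()
        .await;
    assert_eq!(results, vec![0, 2, 4, 6, 8, 10, 12, 14]);
}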
+
+#[cfg(test)]
+pub(crate) mod tests {
+    use pageserver_api::shard::{ShardCount, ShardNumber};
+    use utils::id::TenantId;
+
+    use super::*;
+
+    #[test]
+    fn tenant_updates() -> anyhow::Result<()> {
+        let tenant_id = TenantId::generate();
+        let mut tenant_state = ComputeHookTenant::new(
+            TenantShardId {
+                tenant_id,
+                shard_count: ShardCount::new(0),
+                shard_number: ShardNumber(0),
+            },
+            ShardStripeSize(12345),
+            NodeId(1),
+        );
+
+        // An unsharded tenant is always ready to emit a notification, but won't
+        // send the same one twice
+        let send_result = tenant_state.maybe_send(tenant_id, None);
+        let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
+            anyhow::bail!("Wrong send result");
+        };
+        assert_eq!(request.shards.len(), 1);
+        assert!(request.stripe_size.is_none());
+
+        // Simulate successful send
+        *guard = Some(request);
+        drop(guard);
+
+        // Try asking again: this should be a no-op
+        let send_result = tenant_state.maybe_send(tenant_id, None);
+        assert!(matches!(send_result, MaybeSendResult::Noop));
+
+        // Writing the first shard of a multi-sharded situation (i.e. in a split)
+        // resets the tenant state and puts it in a non-notifying state (need to
+        // see all shards)
+        tenant_state.update(
+            TenantShardId {
+                tenant_id,
+                shard_count: ShardCount::new(2),
+                shard_number: ShardNumber(1),
+            },
+            ShardStripeSize(32768),
+            NodeId(1),
+        );
+        assert!(matches!(
+            tenant_state.maybe_send(tenant_id, None),
+            MaybeSendResult::Noop
+        ));
+
+        // Writing the second shard makes it ready to notify
+        tenant_state.update(
+            TenantShardId {
+                tenant_id,
+                shard_count: ShardCount::new(2),
+                shard_number: ShardNumber(0),
+            },
+            ShardStripeSize(32768),
+            NodeId(1),
+        );
+
+        let send_result = tenant_state.maybe_send(tenant_id, None);
+        let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
+            anyhow::bail!("Wrong send result");
+        };
+        assert_eq!(request.shards.len(), 2);
+        assert_eq!(request.stripe_size, Some(ShardStripeSize(32768)));
+
+        // Simulate successful send
+        *guard = Some(request);
+        drop(guard);
+
+        Ok(())
+    }
+}
diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs
new file mode 100644
index 0000000000..dea1f04649
--- /dev/null
+++ b/storage_controller/src/drain_utils.rs
@@ -0,0 +1,225 @@
+use std::{
+    collections::{BTreeMap, HashMap},
+    sync::Arc,
+};
+
+use pageserver_api::controller_api::NodeSchedulingPolicy;
+use utils::{id::NodeId, shard::TenantShardId};
+
+use crate::{
+    background_node_operations::OperationError, node::Node, scheduler::Scheduler,
+    tenant_shard::TenantShard,
+};
+
+pub(crate) struct TenantShardIterator<F> {
+    tenants_accessor: F,
+    inspected_all_shards: bool,
+    last_inspected_shard: Option<TenantShardId>,
+}
+
+/// A simple iterator which can be used in tandem with [`crate::service::Service`]
+/// to iterate over all known tenant shard ids without holding the lock on the
+/// service state at all times.
+impl<F> TenantShardIterator<F>
+where
+    F: Fn(Option<TenantShardId>) -> Option<TenantShardId>,
+{
+    pub(crate) fn new(tenants_accessor: F) -> Self {
+        Self {
+            tenants_accessor,
+            inspected_all_shards: false,
+            last_inspected_shard: None,
+        }
+    }
+
+    /// Returns the next tenant shard id if one exists
+    pub(crate) fn next(&mut self) -> Option<TenantShardId> {
+        if self.inspected_all_shards {
+            return None;
+        }
+
+        match (self.tenants_accessor)(self.last_inspected_shard) {
+            Some(tid) => {
+                self.last_inspected_shard = Some(tid);
+                Some(tid)
+            }
+            None => {
+                self.inspected_all_shards = true;
+                None
+            }
+        }
+    }
+
+    /// Returns true when the end of the iterator is reached and false otherwise
+    pub(crate) fn finished(&self) -> bool {
+        self.inspected_all_shards
+    }
+}
+
+/// Check that the state of the node being drained is as expected:
+/// node is present in memory and scheduling policy is set to [`NodeSchedulingPolicy::Draining`]
+pub(crate) fn validate_node_state(
+    node_id: &NodeId,
+    nodes: Arc<HashMap<NodeId, Node>>,
+) -> Result<(), OperationError> {
+    let node = nodes.get(node_id).ok_or(OperationError::NodeStateChanged(
+        format!("node {} was removed", node_id).into(),
+    ))?;
+
+    let current_policy = node.get_scheduling();
+    if !matches!(current_policy, NodeSchedulingPolicy::Draining) {
+        // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think
+        // about it
+        return Err(OperationError::NodeStateChanged(
+            format!("node {} changed state to {:?}", node_id, current_policy).into(),
+        ));
+    }
+
+    Ok(())
+}
+
+/// Struct that houses a few utility methods for draining pageserver nodes
+pub(crate) struct TenantShardDrain {
+    pub(crate) drained_node: NodeId,
+    pub(crate) tenant_shard_id: TenantShardId,
+}
+
+impl TenantShardDrain {
+    /// Check if the tenant shard under question is eligible for draining:
+    /// its primary attachment is on the node being drained
+    pub(crate) fn tenant_shard_eligible_for_drain(
+        &self,
+        tenants: &BTreeMap<TenantShardId, TenantShard>,
+        scheduler: &Scheduler,
+    ) -> Option<NodeId> {
+        let tenant_shard = tenants.get(&self.tenant_shard_id)?;
+
+        if *tenant_shard.intent.get_attached() != Some(self.drained_node) {
+            return None;
+        }
+
+        match scheduler.node_preferred(tenant_shard.intent.get_secondary()) {
+            Some(node) => Some(node),
+            None => {
+                tracing::warn!(
+                    tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
+                    "No eligible secondary while draining {}", self.drained_node
+                );
+
+                None
+            }
+        }
+    }
+
+    /// Attempt to reschedule the tenant shard under question to one of its secondary locations
+    /// Returns an Err when the operation should be aborted and Ok(None) when the tenant shard
+    /// should be skipped.
+    pub(crate) fn reschedule_to_secondary<'a>(
+        &self,
+        destination: NodeId,
+        tenants: &'a mut BTreeMap<TenantShardId, TenantShard>,
+        scheduler: &mut Scheduler,
+        nodes: &Arc<HashMap<NodeId, Node>>,
+    ) -> Result<Option<&'a mut TenantShard>, OperationError> {
+        let tenant_shard = match tenants.get_mut(&self.tenant_shard_id) {
+            Some(some) => some,
+            None => {
+                // Tenant shard was removed in the meantime.
+                // Skip to the next one, but don't fail the overall operation
+                return Ok(None);
+            }
+        };
+
+        if !nodes.contains_key(&destination) {
+            return Err(OperationError::NodeStateChanged(
+                format!("node {} was removed", destination).into(),
+            ));
+        }
+
+        if !tenant_shard.intent.get_secondary().contains(&destination) {
+            tracing::info!(
+                tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
+                "Secondary moved away from {destination} during drain"
+            );
+
+            return Ok(None);
+        }
+
+        match tenant_shard.reschedule_to_secondary(Some(destination), scheduler) {
+            Err(e) => {
+                tracing::warn!(
+                    tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
+                    "Scheduling error when draining pageserver {} : {}", self.drained_node, e
+                );
+
+                Ok(None)
+            }
+            Ok(()) => {
+                let scheduled_to = tenant_shard.intent.get_attached();
+                tracing::info!(
+                    tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
+                    "Rescheduled shard while draining node {}: {} -> {:?}",
+                    self.drained_node,
+                    self.drained_node,
+                    scheduled_to
+                );
+
+                Ok(Some(tenant_shard))
+            }
+        }
+    }
+}
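The three helpers above are meant to compose into a drain loop; the real loop lives in service.rs, outside this diff. A hedged sketch of the glue, written as if inside this module so the items above are in scope (the function name drain_step and the reconcile-spawning step are hypothetical):

fn drain_step(
    node_id: NodeId,
    tid_iter: &mut TenantShardIterator<impl Fn(Option<TenantShardId>) -> Option<TenantShardId>>,
    tenants: &mut BTreeMap<TenantShardId, TenantShard>,
    scheduler: &mut Scheduler,
    nodes: &Arc<HashMap<NodeId, Node>>,
) -> Result<(), OperationError> {
    while let Some(tid) = tid_iter.next() {
        // Abort the whole drain if the node vanished or left Draining state.
        validate_node_state(&node_id, nodes.clone())?;
        let drain = TenantShardDrain { drained_node: node_id, tenant_shard_id: tid };
        let Some(dest) = drain.tenant_shard_eligible_for_drain(tenants, scheduler) else {
            continue; // shard is not attached to the draining node, or has no secondary
        };
        if let Some(_shard) = drain.reschedule_to_secondary(dest, tenants, scheduler, nodes)? {
            // The caller would now spawn a reconcile for this shard.
        }
    }
    Ok(())
}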
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use utils::{
+        id::TenantId,
+        shard::{ShardCount, ShardNumber, TenantShardId},
+    };
+
+    use super::TenantShardIterator;
+
+    #[test]
+    fn test_tenant_shard_iterator() {
+        let tenant_id = TenantId::generate();
+        let shard_count = ShardCount(8);
+
+        let mut tenant_shards = Vec::default();
+        for i in 0..shard_count.0 {
+            tenant_shards.push((
+                TenantShardId {
+                    tenant_id,
+                    shard_number: ShardNumber(i),
+                    shard_count,
+                },
+                (),
+            ))
+        }
+
+        let tenant_shards = Arc::new(tenant_shards);
+
+        let mut tid_iter = TenantShardIterator::new({
+            let tenants = tenant_shards.clone();
+            move |last_inspected_shard: Option<TenantShardId>| {
+                let entry = match last_inspected_shard {
+                    Some(skip_past) => {
+                        let mut cursor = tenants.iter().skip_while(|(tid, _)| *tid != skip_past);
+                        cursor.nth(1)
+                    }
+                    None => tenants.first(),
+                };
+
+                entry.map(|(tid, _)| tid).copied()
+            }
+        });
+
+        let mut iterated_over = Vec::default();
+        while let Some(tid) = tid_iter.next() {
+            iterated_over.push((tid, ()));
+        }
+
+        assert_eq!(iterated_over, *tenant_shards);
+    }
+}
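The Heartbeater introduced in the next file follows the classic actor shape: requests flow in over an mpsc channel, and each request carries a oneshot sender for its reply. A minimal standalone version of that shape (toy types, not from this diff):

use tokio::sync::{mpsc, oneshot};

struct Ping {
    reply: oneshot::Sender<&'static str>,
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::unbounded_channel::<Ping>();
    // The task half: owns the state and serves requests one at a time.
    tokio::spawn(async move {
        while let Some(req) = rx.recv().await {
            req.reply.send("pong").ok();
        }
    });
    // The handle half: send a request, await the reply.
    let (reply_tx, reply_rx) = oneshot::channel();
    tx.send(Ping { reply: reply_tx }).unwrap();
    assert_eq!(reply_rx.await.unwrap(), "pong");
}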
diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs
new file mode 100644
index 0000000000..b7e66d33eb
--- /dev/null
+++ b/storage_controller/src/heartbeater.rs
@@ -0,0 +1,274 @@
+use futures::{stream::FuturesUnordered, StreamExt};
+use std::{
+    collections::HashMap,
+    sync::Arc,
+    time::{Duration, Instant},
+};
+use tokio_util::sync::CancellationToken;
+
+use pageserver_api::{controller_api::NodeAvailability, models::PageserverUtilization};
+
+use thiserror::Error;
+use utils::id::NodeId;
+
+use crate::node::Node;
+
+struct HeartbeaterTask {
+    receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
+    cancel: CancellationToken,
+
+    state: HashMap<NodeId, PageserverState>,
+
+    max_offline_interval: Duration,
+    max_warming_up_interval: Duration,
+    jwt_token: Option<String>,
+}
+
+#[derive(Debug, Clone)]
+pub(crate) enum PageserverState {
+    Available {
+        last_seen_at: Instant,
+        utilization: PageserverUtilization,
+    },
+    WarmingUp {
+        started_at: Instant,
+    },
+    Offline,
+}
+
+#[derive(Debug)]
+pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, PageserverState)>);
+
+#[derive(Debug, Error)]
+pub(crate) enum HeartbeaterError {
+    #[error("Cancelled")]
+    Cancel,
+}
+
+struct HeartbeatRequest {
+    pageservers: Arc<HashMap<NodeId, Node>>,
+    reply: tokio::sync::oneshot::Sender<Result<AvailablityDeltas, HeartbeaterError>>,
+}
+
+pub(crate) struct Heartbeater {
+    sender: tokio::sync::mpsc::UnboundedSender<HeartbeatRequest>,
+}
+
+impl Heartbeater {
+    pub(crate) fn new(
+        jwt_token: Option<String>,
+        max_offline_interval: Duration,
+        max_warming_up_interval: Duration,
+        cancel: CancellationToken,
+    ) -> Self {
+        let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::<HeartbeatRequest>();
+        let mut heartbeater = HeartbeaterTask::new(
+            receiver,
+            jwt_token,
+            max_offline_interval,
+            max_warming_up_interval,
+            cancel,
+        );
+        tokio::task::spawn(async move { heartbeater.run().await });
+
+        Self { sender }
+    }
+
+    pub(crate) async fn heartbeat(
+        &self,
+        pageservers: Arc<HashMap<NodeId, Node>>,
+    ) -> Result<AvailablityDeltas, HeartbeaterError> {
+        let (sender, receiver) = tokio::sync::oneshot::channel();
+        self.sender
+            .send(HeartbeatRequest {
+                pageservers,
+                reply: sender,
+            })
+            .map_err(|_| HeartbeaterError::Cancel)?;
+
+        receiver
+            .await
+            .map_err(|_| HeartbeaterError::Cancel)
+            .and_then(|x| x)
+    }
+}
+
+impl HeartbeaterTask {
+    fn new(
+        receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
+        jwt_token: Option<String>,
+        max_offline_interval: Duration,
+        max_warming_up_interval: Duration,
+        cancel: CancellationToken,
+    ) -> Self {
+        Self {
+            receiver,
+            cancel,
+            state: HashMap::new(),
+            max_offline_interval,
+            max_warming_up_interval,
+            jwt_token,
+        }
+    }
+
+    async fn run(&mut self) {
+        loop {
+            tokio::select! {
+                request = self.receiver.recv() => {
+                    match request {
+                        Some(req) => {
+                            let res = self.heartbeat(req.pageservers).await;
+                            req.reply.send(res).unwrap();
+                        },
+                        None => { return; }
+                    }
+                },
+                _ = self.cancel.cancelled() => return
+            }
+        }
+    }
+
+    async fn heartbeat(
+        &mut self,
+        pageservers: Arc<HashMap<NodeId, Node>>,
+    ) -> Result<AvailablityDeltas, HeartbeaterError> {
+        let mut new_state = HashMap::new();
+
+        let mut heartbeat_futs = FuturesUnordered::new();
+        for (node_id, node) in &*pageservers {
+            heartbeat_futs.push({
+                let jwt_token = self.jwt_token.clone();
+                let cancel = self.cancel.clone();
+
+                // Clone the node and mark it as available such that the request
+                // goes through to the pageserver even when the node is marked offline.
+                // This doesn't impact the availability observed by [`crate::service::Service`].
+                let mut node_clone = node.clone();
+                node_clone
+                    .set_availability(NodeAvailability::Active(PageserverUtilization::full()));
+
+                async move {
+                    let response = node_clone
+                        .with_client_retries(
+                            |client| async move { client.get_utilization().await },
+                            &jwt_token,
+                            3,
+                            3,
+                            Duration::from_secs(1),
+                            &cancel,
+                        )
+                        .await;
+
+                    let response = match response {
+                        Some(r) => r,
+                        None => {
+                            // This indicates cancellation of the request.
+                            // We ignore the node in this case.
+                            return None;
+                        }
+                    };
+
+                    let status = if let Ok(utilization) = response {
+                        PageserverState::Available {
+                            last_seen_at: Instant::now(),
+                            utilization,
+                        }
+                    } else if let NodeAvailability::WarmingUp(last_seen_at) =
+                        node.get_availability()
+                    {
+                        PageserverState::WarmingUp {
+                            started_at: *last_seen_at,
+                        }
+                    } else {
+                        PageserverState::Offline
+                    };
+
+                    Some((*node_id, status))
+                }
+            });
+
+            loop {
+                let maybe_status = tokio::select! {
+                    next = heartbeat_futs.next() => {
+                        match next {
+                            Some(result) => result,
+                            None => { break; }
+                        }
+                    },
+                    _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); }
+                };
+
+                if let Some((node_id, status)) = maybe_status {
+                    new_state.insert(node_id, status);
+                }
+            }
+        }
+
+        let mut warming_up = 0;
+        let mut offline = 0;
+        for state in new_state.values() {
+            match state {
+                PageserverState::WarmingUp { .. } => {
+                    warming_up += 1;
+                }
+                PageserverState::Offline { .. } => offline += 1,
+                PageserverState::Available { .. } => {}
+            }
+        }
+
+        tracing::info!(
+            "Heartbeat round complete for {} nodes, {} warming-up, {} offline",
+            new_state.len(),
+            warming_up,
+            offline
+        );
+
+        let mut deltas = Vec::new();
+        let now = Instant::now();
+        for (node_id, ps_state) in new_state.iter_mut() {
+            use std::collections::hash_map::Entry::*;
+            let entry = self.state.entry(*node_id);
+
+            let mut needs_update = false;
+            match entry {
+                Occupied(ref occ) => match (occ.get(), &ps_state) {
+                    (PageserverState::Offline, PageserverState::Offline) => {}
+                    (PageserverState::Available { last_seen_at, .. }, PageserverState::Offline) => {
+                        if now - *last_seen_at >= self.max_offline_interval {
+                            deltas.push((*node_id, ps_state.clone()));
+                            needs_update = true;
+                        }
+                    }
+                    (_, PageserverState::WarmingUp { started_at }) => {
+                        if now - *started_at >= self.max_warming_up_interval {
+                            *ps_state = PageserverState::Offline;
+                        }
+
+                        deltas.push((*node_id, ps_state.clone()));
+                        needs_update = true;
+                    }
+                    _ => {
+                        deltas.push((*node_id, ps_state.clone()));
+                        needs_update = true;
+                    }
+                },
+                Vacant(_) => {
+                    // This is a new node. Don't generate a delta for it.
+                    deltas.push((*node_id, ps_state.clone()));
+                }
+            }
+
+            match entry {
+                Occupied(mut occ) if needs_update => {
+                    (*occ.get_mut()) = ps_state.clone();
+                }
+                Vacant(vac) => {
+                    vac.insert(ps_state.clone());
+                }
+                _ => {}
+            }
+        }
+
+        Ok(AvailablityDeltas(deltas))
+    }
+}
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
new file mode 100644
index 0000000000..5d4d0460be
--- /dev/null
+++ b/storage_controller/src/http.rs
@@ -0,0 +1,1295 @@
+use crate::metrics::{
+    HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
+    METRICS_REGISTRY,
+};
+use crate::persistence::SafekeeperPersistence;
+use crate::reconciler::ReconcileError;
+use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT};
+use anyhow::Context;
+use futures::Future;
+use hyper::header::CONTENT_TYPE;
+use hyper::{Body, Request, Response};
+use hyper::{StatusCode, Uri};
+use metrics::{BuildInfo, NeonMetrics};
+use pageserver_api::controller_api::{
+    MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
+    MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
+    ShardsPreferredAzsRequest, TenantCreateRequest,
+};
+use pageserver_api::models::{
+    TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
+    TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest,
+};
+use pageserver_api::shard::TenantShardId;
+use pageserver_client::mgmt_api;
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+use tokio_util::sync::CancellationToken;
+use utils::auth::{Scope, SwappableJwtAuth};
+use utils::failpoint_support::failpoints_handler;
+use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
+use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param};
+use utils::id::{TenantId, TimelineId};
+
+use utils::{
+    http::{
+        endpoint::{self},
+        error::ApiError,
+        json::{json_request, json_response},
+        RequestExt, RouterBuilder,
+    },
+    id::NodeId,
+};
+
+use pageserver_api::controller_api::{
+    NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest,
+    TenantShardMigrateRequest,
+};
+use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
+
+use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
+
+use routerify::Middleware;
+
+/// State available to HTTP request handlers
+pub struct HttpState {
+    service: Arc<Service>,
+    auth: Option<Arc<SwappableJwtAuth>>,
+    neon_metrics: NeonMetrics,
+    allowlist_routes: Vec<Uri>,
+}
+
+impl HttpState {
+    pub fn new(
+        service: Arc<Service>,
+        auth: Option<Arc<SwappableJwtAuth>>,
+        build_info: BuildInfo,
+    ) -> Self {
+        let allowlist_routes = ["/status", "/ready", "/metrics"]
+            .iter()
+            .map(|v| v.parse().unwrap())
+            .collect::<Vec<Uri>>();
+        Self {
+            service,
+            auth,
+            neon_metrics: NeonMetrics::new(build_info),
+            allowlist_routes,
+        }
+    }
+}
+
+#[inline(always)]
+fn get_state(request: &Request<Body>) -> &HttpState {
+    request
+        .data::<Arc<HttpState>>()
+        .expect("unknown state type")
+        .as_ref()
+}
+
+/// Pageserver calls into this on startup, to learn which tenants it should attach
+async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::GenerationsApi)?;
+
+    let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
+    let state = get_state(&req);
+    json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?)
+}
+
+/// Pageserver calls into this before doing deletions, to confirm that it still
+/// holds the latest generation for the tenants with deletions enqueued
+async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::GenerationsApi)?;
+
+    let validate_req = json_request::<ValidateRequest>(&mut req).await?;
+    let state = get_state(&req);
+    json_response(StatusCode::OK, state.service.validate(validate_req).await?)
+}
+
+/// Call into this before attaching a tenant to a pageserver, to acquire a generation number
+/// (in the real control plane this is unnecessary, because the same program is managing
+/// generation numbers and doing attachments).
+async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
+    let state = get_state(&req);
+
+    json_response(
+        StatusCode::OK,
+        state
+            .service
+            .attach_hook(attach_req)
+            .await
+            .map_err(ApiError::InternalServerError)?,
+    )
+}
+
+async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let inspect_req = json_request::<InspectRequest>(&mut req).await?;
+
+    let state = get_state(&req);
+
+    json_response(StatusCode::OK, state.service.inspect(inspect_req))
+}
+
+async fn handle_tenant_create(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
+
+    json_response(
+        StatusCode::CREATED,
+        service.tenant_create(create_req).await?,
+    )
+}
+
+async fn handle_tenant_location_config(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
+    json_response(
+        StatusCode::OK,
+        service
+            .tenant_location_config(tenant_shard_id, config_req)
+            .await?,
+    )
+}
+
+async fn handle_tenant_config_set(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let config_req = json_request::<TenantConfigRequest>(&mut req).await?;
+
+    json_response(StatusCode::OK, service.tenant_config_set(config_req).await?)
+}
+
+async fn handle_tenant_config_get(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?)
+}
+
+async fn handle_tenant_time_travel_remote_storage(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let time_travel_req = json_request::<TenantTimeTravelRequest>(&mut req).await?;
+
+    let timestamp_raw = must_get_query_param(&req, "travel_to")?;
+    let _timestamp = humantime::parse_rfc3339(&timestamp_raw).map_err(|_e| {
+        ApiError::BadRequest(anyhow::anyhow!(
+            "Invalid time for travel_to: {timestamp_raw:?}"
+        ))
+    })?;
+
+    let done_if_after_raw = must_get_query_param(&req, "done_if_after")?;
+    let _done_if_after = humantime::parse_rfc3339(&done_if_after_raw).map_err(|_e| {
+        ApiError::BadRequest(anyhow::anyhow!(
+            "Invalid time for done_if_after: {done_if_after_raw:?}"
+        ))
+    })?;
+
+    service
+        .tenant_time_travel_remote_storage(
+            &time_travel_req,
+            tenant_id,
+            timestamp_raw,
+            done_if_after_raw,
+        )
+        .await?;
+    json_response(StatusCode::OK, ())
+}
+
+fn map_reqwest_hyper_status(status: reqwest::StatusCode) -> Result<hyper::StatusCode, ApiError> {
+    hyper::StatusCode::from_u16(status.as_u16())
+        .context("invalid status code")
+        .map_err(ApiError::InternalServerError)
+}
+
+async fn handle_tenant_secondary_download(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis);
+
+    let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?;
+    json_response(map_reqwest_hyper_status(status)?, progress)
+}
+
+async fn handle_tenant_delete(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let status_code = service
+        .tenant_delete(tenant_id)
+        .await
+        .and_then(map_reqwest_hyper_status)?;
+
+    if status_code == StatusCode::NOT_FOUND {
+        // The pageserver uses 404 for successful deletion, but we use 200
+        json_response(StatusCode::OK, ())
+    } else {
+        json_response(status_code, ())
+    }
+}
+
+async fn handle_tenant_timeline_create(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
+    json_response(
+        StatusCode::CREATED,
+        service
+            .tenant_timeline_create(tenant_id, create_req)
+            .await?,
+    )
+}
+
+async fn handle_tenant_timeline_delete(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
+    // For timeline deletions, which both implement an "initially return 202, then 404 once
+    // we're done" semantic, we wrap with a retry loop to expose a simpler API upstream.
+    async fn deletion_wrapper<R, F>(service: Arc<Service>, f: F) -> Result<Response<Body>, ApiError>
+    where
+        R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
+        F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
+    {
+        let started_at = Instant::now();
+        // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
+        // completed.
+        let mut retry_period = Duration::from_secs(1);
+        // On subsequent retries, wait longer.
+        let max_retry_period = Duration::from_secs(5);
+        // Enable callers with a 30 second request timeout to reliably get a response
+        let max_wait = Duration::from_secs(25);
+
+        loop {
+            let status = f(service.clone()).await?;
+            match status {
+                StatusCode::ACCEPTED => {
+                    tracing::info!("Deletion accepted, waiting to try again...");
+                    tokio::time::sleep(retry_period).await;
+                    retry_period = max_retry_period;
+                }
+                StatusCode::NOT_FOUND => {
+                    tracing::info!("Deletion complete");
+                    return json_response(StatusCode::OK, ());
+                }
+                _ => {
+                    tracing::warn!("Unexpected status {status}");
+                    return json_response(status, ());
+                }
+            }
+
+            let now = Instant::now();
+            if now + retry_period > started_at + max_wait {
+                tracing::info!("Deletion timed out waiting for 404");
+                // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
+                // the pageserver's swagger definition for this endpoint, and has the same desired
+                // effect of causing the control plane to retry later.
+                return json_response(StatusCode::CONFLICT, ());
+            }
+        }
+    }
+
+    deletion_wrapper(service, move |service| async move {
+        service
+            .tenant_timeline_delete(tenant_id, timeline_id)
+            .await
+            .and_then(map_reqwest_hyper_status)
+    })
+    .await
+}
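To make deletion_wrapper's timing concrete: with the constants above, polls land at roughly t = 0s, 1s, 6s, 11s, 16s, and 21s, after which the wrapper returns 409 rather than start a wait that would cross the 25 second budget. A toy simulation of that schedule (an approximation of the loop above, not a copy of it):

fn main() {
    let (mut t, mut period, max_wait) = (0u64, 1u64, 25u64);
    let mut polls = vec![t];
    // Mirrors: sleep, retry_period jumps to max_retry_period, then the give-up check.
    while t + period <= max_wait {
        t += period;
        period = 5;
        polls.push(t);
    }
    assert_eq!(polls, vec![0, 1, 6, 11, 16, 21]);
}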
+
+async fn handle_tenant_timeline_archival_config(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
+    let create_req = json_request::<TimelineArchivalConfigRequest>(&mut req).await?;
+
+    service
+        .tenant_timeline_archival_config(tenant_id, timeline_id, create_req)
+        .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
+async fn handle_tenant_timeline_detach_ancestor(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
+    let res = service
+        .tenant_timeline_detach_ancestor(tenant_id, timeline_id)
+        .await?;
+
+    json_response(StatusCode::OK, res)
+}
+
+async fn handle_tenant_timeline_passthrough(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let Some(path) = req.uri().path_and_query() else {
+        // This should never happen, our request router only calls us if there is a path
+        return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path")));
+    };
+
+    tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
+
+    // Find the node that holds shard zero
+    let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?;
+
+    // Callers will always pass an unsharded tenant ID. Before proxying, we must
+    // rewrite this to a shard-aware shard zero ID.
+    let path = format!("{}", path);
+    let tenant_str = tenant_id.to_string();
+    let tenant_shard_str = format!("{}", tenant_shard_id);
+    let path = path.replace(&tenant_str, &tenant_shard_str);
+
+    let latency = &METRICS_REGISTRY
+        .metrics_group
+        .storage_controller_passthrough_request_latency;
+
+    // This is a bit awkward. We remove the param from the request
+    // and join the words by '_' to get a label for the request.
+    let just_path = path.replace(&tenant_shard_str, "");
+    let path_label = just_path
+        .split('/')
+        .filter(|token| !token.is_empty())
+        .collect::<Vec<_>>()
+        .join("_");
+    let labels = PageserverRequestLabelGroup {
+        pageserver_id: &node.get_id().to_string(),
+        path: &path_label,
+        method: crate::metrics::Method::Get,
+    };
+
+    let _timer = latency.start_timer(labels.clone());
+
+    let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref());
+    let resp = client.get_raw(path).await.map_err(|_e|
+        // FIXME: give ApiError a proper Unavailable variant. We return 503 here because
+        // if we can't successfully send a request to the pageserver, we aren't available.
+        ApiError::ShuttingDown)?;
+
+    if !resp.status().is_success() {
+        let error_counter = &METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_passthrough_request_error;
+        error_counter.inc(labels);
+    }
+
+    // We have a reqwest::Response, would like an http::Response
+    let mut builder = hyper::Response::builder().status(map_reqwest_hyper_status(resp.status())?);
+    for (k, v) in resp.headers() {
+        builder = builder.header(k.as_str(), v.as_bytes());
+    }
+
+    let response = builder
+        .body(Body::wrap_stream(resp.bytes_stream()))
+        .map_err(|e| ApiError::InternalServerError(e.into()))?;
+
+    Ok(response)
+}
+
+async fn handle_tenant_locate(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
+}
+
+async fn handle_tenant_describe(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Scrubber)?;
+
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
+}
+
+async fn handle_tenant_list(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    json_response(StatusCode::OK, service.tenant_list())
+}
+
+async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
+    let state = get_state(&req);
+    state.service.node_register(register_req).await?;
+    json_response(StatusCode::OK, ())
+}
+
+async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let nodes = state.service.node_list().await?;
+    let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
+
+    json_response(StatusCode::OK, api_nodes)
+}
+
+async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+    json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
+}
+
+async fn handle_node_delete(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+    json_response(StatusCode::OK, state.service.node_delete(node_id).await?)
+}
+
+async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+    let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
+    if node_id != config_req.node_id {
+        return Err(ApiError::BadRequest(anyhow::anyhow!(
+            "Path and body node_id differ"
+        )));
+    }
+    let state = get_state(&req);
+
+    json_response(
+        StatusCode::OK,
+        state
+            .service
+            .external_node_configure(
+                config_req.node_id,
+                config_req.availability.map(NodeAvailability::from),
+                config_req.scheduling,
+            )
+            .await?,
+    )
+}
+
+async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+
+    let node_status = state.service.get_node(node_id).await?;
+
+    json_response(StatusCode::OK, node_status)
+}
+
+async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let leader = state.service.get_leader().await.map_err(|err| {
+        ApiError::InternalServerError(anyhow::anyhow!(
+            "Failed to read leader from database: {err}"
+        ))
+    })?;
+
+    json_response(StatusCode::OK, leader)
+}
+
+async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+
+    state.service.start_node_drain(node_id).await?;
+
+    json_response(StatusCode::ACCEPTED, ())
+}
+
+async fn handle_cancel_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+
+    state.service.cancel_node_drain(node_id).await?;
+
+    json_response(StatusCode::ACCEPTED, ())
+}
+
+async fn handle_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+
+    state.service.start_node_fill(node_id).await?;
+
+    json_response(StatusCode::ACCEPTED, ())
+}
+
+async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+
+    state.service.cancel_node_fill(node_id).await?;
+
+    json_response(StatusCode::ACCEPTED, ())
+}
+
+async fn handle_metadata_health_update(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Scrubber)?;
+
+    let update_req = json_request::<MetadataHealthUpdateRequest>(&mut req).await?;
+    let state = get_state(&req);
+
+    state.service.metadata_health_update(update_req).await?;
+
+    json_response(StatusCode::OK, MetadataHealthUpdateResponse {})
+}
+
+async fn handle_metadata_health_list_unhealthy(
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?;
+
+    json_response(
+        StatusCode::OK,
+        MetadataHealthListUnhealthyResponse {
+            unhealthy_tenant_shards,
+        },
+    )
+}
.metadata_health_list_outdated(list_outdated_req.not_scrubbed_for) + .await?; + + json_response( + StatusCode::OK, + MetadataHealthListOutdatedResponse { health_records }, + ) +} + +async fn handle_tenant_shard_split( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let split_req = json_request::(&mut req).await?; + + json_response( + StatusCode::OK, + service.tenant_shard_split(tenant_id, split_req).await?, + ) +} + +async fn handle_tenant_shard_migrate( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + let migrate_req = json_request::(&mut req).await?; + json_response( + StatusCode::OK, + service + .tenant_shard_migrate(tenant_shard_id, migrate_req) + .await?, + ) +} + +async fn handle_tenant_update_policy(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let update_req = json_request::(&mut req).await?; + let state = get_state(&req); + + json_response( + StatusCode::OK, + state + .service + .tenant_update_policy(tenant_id, update_req) + .await?, + ) +} + +async fn handle_update_preferred_azs(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let azs_req = json_request::(&mut req).await?; + let state = get_state(&req); + + json_response( + StatusCode::OK, + state.service.update_shards_preferred_azs(azs_req).await?, + ) +} + +async fn handle_step_down(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + json_response(StatusCode::OK, state.service.step_down().await) +} + +async fn handle_tenant_drop(req: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let state = get_state(&req); + + json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?) +} + +async fn handle_tenant_import(req: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let state = get_state(&req); + + json_response( + StatusCode::OK, + state.service.tenant_import(tenant_id).await?, + ) +} + +async fn handle_tenants_dump(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + state.service.tenants_dump() +} + +async fn handle_scheduler_dump(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + state.service.scheduler_dump() +} + +async fn handle_consistency_check(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + + json_response(StatusCode::OK, state.service.consistency_check().await?) +} + +async fn handle_reconcile_all(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + + json_response(StatusCode::OK, state.service.reconcile_all_now().await?) +} + +/// Status endpoint is just used for checking that our HTTP listener is up +async fn handle_status(_req: Request) -> Result, ApiError> { + json_response(StatusCode::OK, ()) +} + +/// Readiness endpoint indicates when we're done doing startup I/O (e.g. 
reconciling
+/// with remote pageserver nodes). This is intended for use as a kubernetes readiness probe.
+async fn handle_ready(req: Request) -> Result, ApiError> {
+ let state = get_state(&req);
+ if state.service.startup_complete.is_ready() {
+ json_response(StatusCode::OK, ())
+ } else {
+ json_response(StatusCode::SERVICE_UNAVAILABLE, ())
+ }
+}
+
+impl From for ApiError {
+ fn from(value: ReconcileError) -> Self {
+ ApiError::Conflict(format!("Reconciliation error: {}", value))
+ }
+}
+
+/// Return the safekeeper record by instance id, or 404.
+///
+/// Not used by anything except manual testing.
+async fn handle_get_safekeeper(req: Request) -> Result, ApiError> {
+ check_permissions(&req, Scope::Admin)?;
+
+ let id = parse_request_param::(&req, "id")?;
+
+ let state = get_state(&req);
+
+ let res = state.service.get_safekeeper(id).await;
+
+ match res {
+ Ok(b) => json_response(StatusCode::OK, b),
+ Err(crate::persistence::DatabaseError::Query(diesel::result::Error::NotFound)) => {
+ Err(ApiError::NotFound("unknown instance_id".into()))
+ }
+ Err(other) => Err(other.into()),
+ }
+}
+
+/// Used as part of deployment scripts.
+///
+/// Assumes information is only relayed to the storage controller after first selecting a unique id
+/// on the control plane database, which means we have an id field in the request and payload.
+async fn handle_upsert_safekeeper(mut req: Request) -> Result, ApiError> {
+ check_permissions(&req, Scope::Admin)?;
+
+ let body = json_request::(&mut req).await?;
+ let id = parse_request_param::(&req, "id")?;
+
+ if id != body.id {
+ // the id in the body must repeat the id in the URL
+ return Err(ApiError::BadRequest(anyhow::anyhow!(
+ "id mismatch: url={id:?}, body={:?}",
+ body.id
+ )));
+ }
+
+ let state = get_state(&req);
+
+ state.service.upsert_safekeeper(body).await?;
+
+ Ok(Response::builder()
+ .status(StatusCode::NO_CONTENT)
+ .body(Body::empty())
+ .unwrap())
+}
+
+/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
+/// be allowed to run if Service has finished its initial reconciliation.
+async fn tenant_service_handler(
+ request: Request,
+ handler: H,
+ request_name: RequestName,
+) -> R::Output
+where
+ R: std::future::Future, ApiError>> + Send + 'static,
+ H: FnOnce(Arc, Request) -> R + Send + Sync + 'static,
+{
+ let state = get_state(&request);
+ let service = state.service.clone();
+
+ let startup_complete = service.startup_complete.clone();
+ if tokio::time::timeout(STARTUP_RECONCILE_TIMEOUT, startup_complete.wait())
+ .await
+ .is_err()
+ {
+ // This shouldn't happen: it is the responsibility of [`Service::startup_reconcile`] to use appropriate
+ // timeouts around its remote calls, to bound its runtime.
+ return Err(ApiError::Timeout(
+ "Timed out waiting for service readiness".into(),
+ ));
+ }
+
+ named_request_span(
+ request,
+ |request| async move { handler(service, request).await },
+ request_name,
+ )
+ .await
+}
+
+/// Check that the required scope is held in the request's token; a token
+/// with 'admin' scope is always permitted.
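
The scope-fallback rule described above is easy to invert by accident, so it is worth pinning down in isolation. A minimal, self-contained sketch of the same pattern, with a stand-in `has_scope` predicate instead of the real `crate::auth` module:

```rust
#[derive(Clone, Copy, PartialEq, Debug)]
enum Scope { Admin, PageServerApi, Scrubber }

struct Claims { scopes: Vec<Scope> }

fn has_scope(claims: &Claims, scope: Scope) -> Result<(), String> {
    if claims.scopes.contains(&scope) {
        Ok(())
    } else {
        Err(format!("missing scope {scope:?}"))
    }
}

/// Mirrors check_permissions: an admin token may do anything, and the error
/// surfaced is the one for the scope the caller actually required.
fn check(claims: &Claims, required: Scope) -> Result<(), String> {
    match has_scope(claims, required) {
        Ok(()) => Ok(()),
        Err(e) => has_scope(claims, Scope::Admin).map_err(|_| e),
    }
}

fn main() {
    let admin = Claims { scopes: vec![Scope::Admin] };
    let scrubber = Claims { scopes: vec![Scope::Scrubber] };
    assert!(check(&admin, Scope::PageServerApi).is_ok());
    assert!(check(&scrubber, Scope::PageServerApi).is_err());
}
```

Note that the reported error names the scope that was required, not the failed admin fallback.
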
+fn check_permissions(request: &Request, required_scope: Scope) -> Result<(), ApiError> { + check_permission_with(request, |claims| { + match crate::auth::check_permission(claims, required_scope) { + Err(e) => match crate::auth::check_permission(claims, Scope::Admin) { + Ok(()) => Ok(()), + Err(_) => Err(e), + }, + Ok(()) => Ok(()), + } + }) +} + +#[derive(Clone, Debug)] +struct RequestMeta { + method: hyper::http::Method, + at: Instant, +} + +pub fn prologue_leadership_status_check_middleware< + B: hyper::body::HttpBody + Send + Sync + 'static, +>() -> Middleware { + Middleware::pre(move |req| async move { + let state = get_state(&req); + let leadership_status = state.service.get_leadership_status(); + + enum AllowedRoutes<'a> { + All, + Some(Vec<&'a str>), + } + + let allowed_routes = match leadership_status { + LeadershipStatus::Leader => AllowedRoutes::All, + LeadershipStatus::SteppedDown => { + // TODO: does it make sense to allow /status here? + AllowedRoutes::Some(["/control/v1/step_down", "/status", "/metrics"].to_vec()) + } + LeadershipStatus::Candidate => { + AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec()) + } + }; + + let uri = req.uri().to_string(); + match allowed_routes { + AllowedRoutes::All => Ok(req), + AllowedRoutes::Some(allowed) if allowed.contains(&uri.as_str()) => Ok(req), + _ => { + tracing::info!( + "Request {} not allowed due to current leadership state", + req.uri() + ); + + Err(ApiError::ResourceUnavailable( + format!("Current leadership status is {leadership_status}").into(), + )) + } + } + }) +} + +fn prologue_metrics_middleware( +) -> Middleware { + Middleware::pre(move |req| async move { + let meta = RequestMeta { + method: req.method().clone(), + at: Instant::now(), + }; + + req.set_context(meta); + + Ok(req) + }) +} + +fn epilogue_metrics_middleware( +) -> Middleware { + Middleware::post_with_info(move |resp, req_info| async move { + let request_name = match req_info.context::() { + Some(name) => name, + None => { + return Ok(resp); + } + }; + + if let Some(meta) = req_info.context::() { + let status = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_http_request_status; + let latency = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_http_request_latency; + + status.inc(HttpRequestStatusLabelGroup { + path: request_name.0, + method: meta.method.clone().into(), + status: crate::metrics::StatusCode(resp.status()), + }); + + latency.observe( + HttpRequestLatencyLabelGroup { + path: request_name.0, + method: meta.method.into(), + }, + meta.at.elapsed().as_secs_f64(), + ); + } + Ok(resp) + }) +} + +pub async fn measured_metrics_handler(req: Request) -> Result, ApiError> { + pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4"; + + let state = get_state(&req); + let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics); + let response = Response::builder() + .status(200) + .header(CONTENT_TYPE, TEXT_FORMAT) + .body(payload.into()) + .unwrap(); + + Ok(response) +} + +#[derive(Clone)] +struct RequestName(&'static str); + +async fn named_request_span( + request: Request, + handler: H, + name: RequestName, +) -> R::Output +where + R: Future, ApiError>> + Send + 'static, + H: FnOnce(Request) -> R + Send + Sync + 'static, +{ + request.set_context(name); + request_span(request, handler).await +} + +pub fn make_router( + service: Arc, + auth: Option>, + build_info: BuildInfo, +) -> RouterBuilder { + let mut router = endpoint::make_router() + 
.middleware(prologue_leadership_status_check_middleware()) + .middleware(prologue_metrics_middleware()) + .middleware(epilogue_metrics_middleware()); + if auth.is_some() { + router = router.middleware(auth_middleware(|request| { + let state = get_state(request); + if state.allowlist_routes.contains(request.uri()) { + None + } else { + state.auth.as_deref() + } + })); + } + + router + .data(Arc::new(HttpState::new(service, auth, build_info))) + .get("/metrics", |r| { + named_request_span(r, measured_metrics_handler, RequestName("metrics")) + }) + // Non-prefixed generic endpoints (status, metrics) + .get("/status", |r| { + named_request_span(r, handle_status, RequestName("status")) + }) + .get("/ready", |r| { + named_request_span(r, handle_ready, RequestName("ready")) + }) + // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix + .post("/upcall/v1/re-attach", |r| { + named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach")) + }) + .post("/upcall/v1/validate", |r| { + named_request_span(r, handle_validate, RequestName("upcall_v1_validate")) + }) + // Test/dev/debug endpoints + .post("/debug/v1/attach-hook", |r| { + named_request_span(r, handle_attach_hook, RequestName("debug_v1_attach_hook")) + }) + .post("/debug/v1/inspect", |r| { + named_request_span(r, handle_inspect, RequestName("debug_v1_inspect")) + }) + .post("/debug/v1/tenant/:tenant_id/drop", |r| { + named_request_span(r, handle_tenant_drop, RequestName("debug_v1_tenant_drop")) + }) + .post("/debug/v1/node/:node_id/drop", |r| { + named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop")) + }) + .post("/debug/v1/tenant/:tenant_id/import", |r| { + named_request_span( + r, + handle_tenant_import, + RequestName("debug_v1_tenant_import"), + ) + }) + .get("/debug/v1/tenant", |r| { + named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant")) + }) + .get("/debug/v1/tenant/:tenant_id/locate", |r| { + tenant_service_handler( + r, + handle_tenant_locate, + RequestName("debug_v1_tenant_locate"), + ) + }) + .get("/debug/v1/scheduler", |r| { + named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler")) + }) + .post("/debug/v1/consistency_check", |r| { + named_request_span( + r, + handle_consistency_check, + RequestName("debug_v1_consistency_check"), + ) + }) + .post("/debug/v1/reconcile_all", |r| { + request_span(r, handle_reconcile_all) + }) + .put("/debug/v1/failpoints", |r| { + request_span(r, |r| failpoints_handler(r, CancellationToken::new())) + }) + // Node operations + .post("/control/v1/node", |r| { + named_request_span(r, handle_node_register, RequestName("control_v1_node")) + }) + .delete("/control/v1/node/:node_id", |r| { + named_request_span(r, handle_node_delete, RequestName("control_v1_node_delete")) + }) + .get("/control/v1/node", |r| { + named_request_span(r, handle_node_list, RequestName("control_v1_node")) + }) + .put("/control/v1/node/:node_id/config", |r| { + named_request_span( + r, + handle_node_configure, + RequestName("control_v1_node_config"), + ) + }) + .get("/control/v1/node/:node_id", |r| { + named_request_span(r, handle_node_status, RequestName("control_v1_node_status")) + }) + .get("/control/v1/leader", |r| { + named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader")) + }) + .put("/control/v1/node/:node_id/drain", |r| { + named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain")) + }) + .delete("/control/v1/node/:node_id/drain", |r| { + named_request_span( + r, + 
handle_cancel_node_drain, + RequestName("control_v1_cancel_node_drain"), + ) + }) + .put("/control/v1/node/:node_id/fill", |r| { + named_request_span(r, handle_node_fill, RequestName("control_v1_node_fill")) + }) + .delete("/control/v1/node/:node_id/fill", |r| { + named_request_span( + r, + handle_cancel_node_fill, + RequestName("control_v1_cancel_node_fill"), + ) + }) + // Metadata health operations + .post("/control/v1/metadata_health/update", |r| { + named_request_span( + r, + handle_metadata_health_update, + RequestName("control_v1_metadata_health_update"), + ) + }) + .get("/control/v1/metadata_health/unhealthy", |r| { + named_request_span( + r, + handle_metadata_health_list_unhealthy, + RequestName("control_v1_metadata_health_list_unhealthy"), + ) + }) + .post("/control/v1/metadata_health/outdated", |r| { + named_request_span( + r, + handle_metadata_health_list_outdated, + RequestName("control_v1_metadata_health_list_outdated"), + ) + }) + // Tenant Shard operations + .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { + tenant_service_handler( + r, + handle_tenant_shard_migrate, + RequestName("control_v1_tenant_migrate"), + ) + }) + .put("/control/v1/tenant/:tenant_id/shard_split", |r| { + tenant_service_handler( + r, + handle_tenant_shard_split, + RequestName("control_v1_tenant_shard_split"), + ) + }) + .get("/control/v1/tenant/:tenant_id", |r| { + tenant_service_handler( + r, + handle_tenant_describe, + RequestName("control_v1_tenant_describe"), + ) + }) + .get("/control/v1/tenant", |r| { + tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list")) + }) + .put("/control/v1/tenant/:tenant_id/policy", |r| { + named_request_span( + r, + handle_tenant_update_policy, + RequestName("control_v1_tenant_policy"), + ) + }) + .put("/control/v1/preferred_azs", |r| { + named_request_span( + r, + handle_update_preferred_azs, + RequestName("control_v1_preferred_azs"), + ) + }) + .put("/control/v1/step_down", |r| { + named_request_span(r, handle_step_down, RequestName("control_v1_step_down")) + }) + .get("/control/v1/safekeeper/:id", |r| { + named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper")) + }) + .post("/control/v1/safekeeper/:id", |r| { + // id is in the body + named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper")) + }) + // Tenant operations + // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into + // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. 
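
For context on how the `/v1/` routes registered next are consumed: a shard-naive client can treat the storage controller as if it were a single pageserver. A hedged sketch using `reqwest`; the base URL, tenant id and token are placeholders, and `/timeline` stands in for any pageserver GET path that the wildcard passthrough route forwards:

```rust
use anyhow::Result;

/// Hypothetical client call: GETs under /v1/tenant/:tenant_id/* are proxied
/// to shard zero's pageserver, so the caller can use the familiar pageserver
/// API shape against the storage controller.
async fn fetch_timelines(base: &str, tenant_id: &str, jwt: &str) -> Result<String> {
    let url = format!("{base}/v1/tenant/{tenant_id}/timeline");
    let resp = reqwest::Client::new()
        .get(url)
        .bearer_auth(jwt)
        .send()
        .await?
        .error_for_status()?;
    Ok(resp.text().await?)
}
```
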
+ .post("/v1/tenant", |r| { + tenant_service_handler(r, handle_tenant_create, RequestName("v1_tenant")) + }) + .delete("/v1/tenant/:tenant_id", |r| { + tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant")) + }) + .put("/v1/tenant/config", |r| { + tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config")) + }) + .get("/v1/tenant/:tenant_id/config", |r| { + tenant_service_handler(r, handle_tenant_config_get, RequestName("v1_tenant_config")) + }) + .put("/v1/tenant/:tenant_shard_id/location_config", |r| { + tenant_service_handler( + r, + handle_tenant_location_config, + RequestName("v1_tenant_location_config"), + ) + }) + .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| { + tenant_service_handler( + r, + handle_tenant_time_travel_remote_storage, + RequestName("v1_tenant_time_travel_remote_storage"), + ) + }) + .post("/v1/tenant/:tenant_id/secondary/download", |r| { + tenant_service_handler( + r, + handle_tenant_secondary_download, + RequestName("v1_tenant_secondary_download"), + ) + }) + // Timeline operations + .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { + tenant_service_handler( + r, + handle_tenant_timeline_delete, + RequestName("v1_tenant_timeline"), + ) + }) + .post("/v1/tenant/:tenant_id/timeline", |r| { + tenant_service_handler( + r, + handle_tenant_timeline_create, + RequestName("v1_tenant_timeline"), + ) + }) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/archival_config", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_archival_config, + RequestName("v1_tenant_timeline_archival_config"), + ) + }, + ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_detach_ancestor, + RequestName("v1_tenant_timeline_detach_ancestor"), + ) + }, + ) + // Tenant detail GET passthrough to shard zero: + .get("/v1/tenant/:tenant_id", |r| { + tenant_service_handler( + r, + handle_tenant_timeline_passthrough, + RequestName("v1_tenant_passthrough"), + ) + }) + // The `*` in the URL is a wildcard: any tenant/timeline GET APIs on the pageserver + // are implicitly exposed here. This must be last in the list to avoid + // taking precedence over other GET methods we might implement by hand. 
+ .get("/v1/tenant/:tenant_id/*", |r| { + tenant_service_handler( + r, + handle_tenant_timeline_passthrough, + RequestName("v1_tenant_passthrough"), + ) + }) +} diff --git a/storage_controller/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs new file mode 100644 index 0000000000..fcd3eb57e2 --- /dev/null +++ b/storage_controller/src/id_lock_map.rs @@ -0,0 +1,223 @@ +use std::fmt::Display; +use std::time::Instant; +use std::{collections::HashMap, sync::Arc}; + +use std::time::Duration; + +use crate::service::RECONCILE_TIMEOUT; + +const LOCK_TIMEOUT_ALERT_THRESHOLD: Duration = RECONCILE_TIMEOUT; + +/// A wrapper around `OwnedRwLockWriteGuard` used for tracking the +/// operation that holds the lock, and print a warning if it exceeds +/// the LOCK_TIMEOUT_ALERT_THRESHOLD time +pub struct TracingExclusiveGuard { + guard: tokio::sync::OwnedRwLockWriteGuard>, + start: Instant, +} + +impl TracingExclusiveGuard { + pub fn new(guard: tokio::sync::OwnedRwLockWriteGuard>) -> Self { + Self { + guard, + start: Instant::now(), + } + } +} + +impl Drop for TracingExclusiveGuard { + fn drop(&mut self) { + let duration = self.start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Exclusive lock by {} was held for {:?}", + self.guard.as_ref().unwrap(), + duration + ); + } + *self.guard = None; + } +} + +// A wrapper around `OwnedRwLockReadGuard` used for tracking the +/// operation that holds the lock, and print a warning if it exceeds +/// the LOCK_TIMEOUT_ALERT_THRESHOLD time +pub struct TracingSharedGuard { + _guard: tokio::sync::OwnedRwLockReadGuard>, + operation: T, + start: Instant, +} + +impl TracingSharedGuard { + pub fn new(guard: tokio::sync::OwnedRwLockReadGuard>, operation: T) -> Self { + Self { + _guard: guard, + operation, + start: Instant::now(), + } + } +} + +impl Drop for TracingSharedGuard { + fn drop(&mut self) { + let duration = self.start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Shared lock by {} was held for {:?}", + self.operation, + duration + ); + } + } +} + +/// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't +/// want to embed a lock in each one, or if your locking granularity is different to your object granularity. +/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking +/// is needed at a tenant-wide granularity. +pub(crate) struct IdLockMap +where + T: Eq + PartialEq + std::hash::Hash, +{ + /// A synchronous lock for getting/setting the async locks that our callers will wait on. 
+ entities: std::sync::Mutex>>>>, +} + +impl IdLockMap +where + T: Eq + PartialEq + std::hash::Hash, + I: Display, +{ + pub(crate) fn shared( + &self, + key: T, + operation: I, + ) -> impl std::future::Future> { + let mut locked = self.entities.lock().unwrap(); + let entry = locked.entry(key).or_default().clone(); + async move { TracingSharedGuard::new(entry.read_owned().await, operation) } + } + + pub(crate) fn exclusive( + &self, + key: T, + operation: I, + ) -> impl std::future::Future> { + let mut locked = self.entities.lock().unwrap(); + let entry = locked.entry(key).or_default().clone(); + async move { + let mut guard = TracingExclusiveGuard::new(entry.write_owned().await); + *guard.guard = Some(operation); + guard + } + } + + /// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do + /// periodic housekeeping to avoid the map growing indefinitely + pub(crate) fn housekeeping(&self) { + let mut locked = self.entities.lock().unwrap(); + locked.retain(|_k, entry| entry.try_write().is_err()) + } +} + +impl Default for IdLockMap +where + T: Eq + PartialEq + std::hash::Hash, +{ + fn default() -> Self { + Self { + entities: std::sync::Mutex::new(HashMap::new()), + } + } +} + +pub async fn trace_exclusive_lock< + T: Clone + Display + Eq + PartialEq + std::hash::Hash, + I: Clone + Display, +>( + op_locks: &IdLockMap, + key: T, + operation: I, +) -> TracingExclusiveGuard { + let start = Instant::now(); + let guard = op_locks.exclusive(key.clone(), operation.clone()).await; + + let duration = start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Operation {} on key {} has waited {:?} for exclusive lock", + operation, + key, + duration + ); + } + + guard +} + +pub async fn trace_shared_lock< + T: Clone + Display + Eq + PartialEq + std::hash::Hash, + I: Clone + Display, +>( + op_locks: &IdLockMap, + key: T, + operation: I, +) -> TracingSharedGuard { + let start = Instant::now(); + let guard = op_locks.shared(key.clone(), operation.clone()).await; + + let duration = start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Operation {} on key {} has waited {:?} for shared lock", + operation, + key, + duration + ); + } + + guard +} + +#[cfg(test)] +mod tests { + use super::IdLockMap; + + #[derive(Clone, Debug, strum_macros::Display, PartialEq)] + enum Operations { + Op1, + Op2, + } + + #[tokio::test] + async fn multiple_shared_locks() { + let id_lock_map: IdLockMap = IdLockMap::default(); + + let shared_lock_1 = id_lock_map.shared(1, Operations::Op1).await; + let shared_lock_2 = id_lock_map.shared(1, Operations::Op2).await; + + assert_eq!(shared_lock_1.operation, Operations::Op1); + assert_eq!(shared_lock_2.operation, Operations::Op2); + } + + #[tokio::test] + async fn exclusive_locks() { + let id_lock_map = IdLockMap::default(); + let resource_id = 1; + + { + let _ex_lock = id_lock_map.exclusive(resource_id, Operations::Op1).await; + assert_eq!(_ex_lock.guard.clone().unwrap(), Operations::Op1); + + let _ex_lock_2 = tokio::time::timeout( + tokio::time::Duration::from_millis(1), + id_lock_map.exclusive(resource_id, Operations::Op2), + ) + .await; + assert!(_ex_lock_2.is_err()); + } + + let shared_lock_1 = id_lock_map.shared(resource_id, Operations::Op1).await; + assert_eq!(shared_lock_1.operation, Operations::Op1); + } +} diff --git a/storage_controller/src/leadership.rs b/storage_controller/src/leadership.rs new file mode 100644 index 0000000000..5fae8991ec --- /dev/null +++ 
b/storage_controller/src/leadership.rs
@@ -0,0 +1,135 @@
+use std::sync::Arc;
+
+use hyper::Uri;
+use tokio_util::sync::CancellationToken;
+
+use crate::{
+ peer_client::{GlobalObservedState, PeerClient},
+ persistence::{ControllerPersistence, DatabaseError, DatabaseResult, Persistence},
+ service::Config,
+};
+
+/// Helper for storage controller leadership acquisition
+pub(crate) struct Leadership {
+ persistence: Arc,
+ config: Config,
+ cancel: CancellationToken,
+}
+
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum Error {
+ #[error(transparent)]
+ Database(#[from] DatabaseError),
+}
+
+pub(crate) type Result = std::result::Result;
+
+impl Leadership {
+ pub(crate) fn new(
+ persistence: Arc,
+ config: Config,
+ cancel: CancellationToken,
+ ) -> Self {
+ Self {
+ persistence,
+ config,
+ cancel,
+ }
+ }
+
+ /// Find the current leader in the database and request it to step down if required.
+ /// Should be called early in the start-up sequence.
+ ///
+ /// Returns a tuple of two optionals: the current leader and its observed state
+ pub(crate) async fn step_down_current_leader(
+ &self,
+ ) -> Result<(Option, Option)> {
+ let leader = self.current_leader().await?;
+ let leader_step_down_state = if let Some(ref leader) = leader {
+ if self.config.start_as_candidate {
+ self.request_step_down(leader).await
+ } else {
+ None
+ }
+ } else {
+ tracing::info!("No leader found to request step down from. Will build observed state.");
+ None
+ };
+
+ Ok((leader, leader_step_down_state))
+ }
+
+ /// Mark the current storage controller instance as the leader in the database
+ pub(crate) async fn become_leader(
+ &self,
+ current_leader: Option,
+ ) -> Result<()> {
+ if let Some(address_for_peers) = &self.config.address_for_peers {
+ // TODO: `address-for-peers` can become a mandatory cli arg
+ // after we update the k8s setup
+ let proposed_leader = ControllerPersistence {
+ address: address_for_peers.to_string(),
+ started_at: chrono::Utc::now(),
+ };
+
+ self.persistence
+ .update_leader(current_leader, proposed_leader)
+ .await
+ .map_err(Error::Database)
+ } else {
+ tracing::info!("No address-for-peers provided. Skipping leader persistence.");
+ Ok(())
+ }
+ }
+
+ async fn current_leader(&self) -> DatabaseResult> {
+ let res = self.persistence.get_leader().await;
+ if let Err(DatabaseError::Query(diesel::result::Error::DatabaseError(_kind, ref err))) = res
+ {
+ const REL_NOT_FOUND_MSG: &str = "relation \"controllers\" does not exist";
+ if err.message().trim() == REL_NOT_FOUND_MSG {
+ // Special case: if this is a brand new storage controller, migrations will not
+ // have run at this point yet, and, hence, the controllers table does not exist.
+ // Detect this case via the error string (diesel doesn't type it) and allow it.
+ tracing::info!("Detected first storage controller start-up. Allowing missing controllers table ...");
+ return Ok(None);
+ }
+ }
+
+ res
+ }
+
+ /// Request a step down from the currently registered leader in the database
+ ///
+ /// If such an entry is persisted, the success path returns the observed
+ /// state and details of the leader. Otherwise, None is returned indicating
+ /// there is no leader currently.
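
A compact sketch of the step-down handshake this function implements, with the peer call stubbed out; the real implementation goes through `PeerClient` and the `/control/v1/step_down` route registered earlier in this diff:

```rust
/// Sketch of the start-up handshake: a new candidate asks the recorded leader
/// to step down, and treats any error as "leader unreachable" rather than a
/// hard failure, falling back to rebuilding observed state itself.
async fn step_down_or_continue(leader_addr: Option<&str>) -> Option<String> {
    let addr = leader_addr?;
    match ask_leader_to_step_down(addr).await {
        Ok(observed_state) => Some(observed_state),
        Err(err) => {
            eprintln!("leader {addr} did not respond to step-down: {err}");
            None // proceed; observed state is rebuilt from the pageservers
        }
    }
}

// Stand-in for PeerClient::step_down; a real implementation would issue an
// authenticated PUT to the peer's /control/v1/step_down endpoint.
async fn ask_leader_to_step_down(_addr: &str) -> Result<String, String> {
    Err("connection refused".to_string())
}
```
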
+ async fn request_step_down( + &self, + leader: &ControllerPersistence, + ) -> Option { + tracing::info!("Sending step down request to {leader:?}"); + + let client = PeerClient::new( + Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"), + self.config.peer_jwt_token.clone(), + ); + let state = client.step_down(&self.cancel).await; + match state { + Ok(state) => Some(state), + Err(err) => { + // TODO: Make leaders periodically update a timestamp field in the + // database and, if the leader is not reachable from the current instance, + // but inferred as alive from the timestamp, abort start-up. This avoids + // a potential scenario in which we have two controllers acting as leaders. + tracing::error!( + "Leader ({}) did not respond to step-down request: {}", + leader.address, + err + ); + + None + } + } + } +} diff --git a/control_plane/attachment_service/src/lib.rs b/storage_controller/src/lib.rs similarity index 55% rename from control_plane/attachment_service/src/lib.rs rename to storage_controller/src/lib.rs index e4ca9aa304..60e613bb5c 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -1,27 +1,26 @@ -use serde::{Deserialize, Serialize}; +use serde::Serialize; use utils::seqwait::MonotonicCounter; +mod auth; +mod background_node_operations; mod compute_hook; +mod drain_utils; +mod heartbeater; pub mod http; +mod id_lock_map; +mod leadership; +pub mod metrics; mod node; +mod pageserver_client; +mod peer_client; pub mod persistence; mod reconciler; mod scheduler; +mod schema; pub mod service; -mod tenant_state; +mod tenant_shard; -#[derive(Clone, Serialize, Deserialize)] -enum PlacementPolicy { - /// Cheapest way to attach a tenant: just one pageserver, no secondary - Single, - /// Production-ready way to attach a tenant: one attached pageserver and - /// some number of secondaries. 
- Double(usize),
- /// Do not attach to any pageservers
- Detached,
-}
-
-#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
+#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
 struct Sequence(u64);
 
 impl Sequence {
@@ -36,6 +35,12 @@ impl std::fmt::Display for Sequence {
 }
 
+impl std::fmt::Debug for Sequence {
+ fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+ write!(f, "{}", self.0)
+ }
+}
+
 impl MonotonicCounter for Sequence {
 fn cnt_advance(&mut self, v: Sequence) {
 assert!(*self <= v);
@@ -51,9 +56,3 @@ impl Sequence {
 Sequence(self.0 + 1)
 }
 }
-
-impl Default for PlacementPolicy {
- fn default() -> Self {
- PlacementPolicy::Double(1)
- }
-}
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
new file mode 100644
index 0000000000..00e90f4467
--- /dev/null
+++ b/storage_controller/src/main.rs
@@ -0,0 +1,385 @@
+use anyhow::{anyhow, Context};
+use clap::Parser;
+use hyper::Uri;
+use metrics::launch_timestamp::LaunchTimestamp;
+use metrics::BuildInfo;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::Duration;
+use storage_controller::http::make_router;
+use storage_controller::metrics::preinitialize_metrics;
+use storage_controller::persistence::Persistence;
+use storage_controller::service::chaos_injector::ChaosInjector;
+use storage_controller::service::{
+ Config, Service, HEARTBEAT_INTERVAL_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT,
+ MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
+};
+use tokio::signal::unix::SignalKind;
+use tokio_util::sync::CancellationToken;
+use tracing::Instrument;
+use utils::auth::{JwtAuth, SwappableJwtAuth};
+use utils::logging::{self, LogFormat};
+
+use utils::sentry_init::init_sentry;
+use utils::{project_build_tag, project_git_version, tcp_listener};
+
+project_git_version!(GIT_VERSION);
+project_build_tag!(BUILD_TAG);
+
+#[derive(Parser)]
+#[command(author, version, about, long_about = None)]
+#[command(arg_required_else_help(true))]
+struct Cli {
+ /// Host and port to listen on, like `127.0.0.1:1234`
+ #[arg(short, long)]
+ listen: std::net::SocketAddr,
+
+ /// Public key for JWT authentication of clients
+ #[arg(long)]
+ public_key: Option,
+
+ /// Token for authenticating this service with the pageservers it controls
+ #[arg(long)]
+ jwt_token: Option,
+
+ /// Token for authenticating this service with the control plane, when calling
+ /// the compute notification endpoint
+ #[arg(long)]
+ control_plane_jwt_token: Option,
+
+ #[arg(long)]
+ peer_jwt_token: Option,
+
+ /// URL to control plane compute notification endpoint
+ #[arg(long)]
+ compute_hook_url: Option,
+
+ /// URL to connect to postgres, like postgresql://localhost:1234/storage_controller
+ #[arg(long)]
+ database_url: Option,
+
+ /// Flag to enable dev mode, which permits running without auth
+ #[arg(long, default_value = "false")]
+ dev: bool,
+
+ /// Grace period before marking unresponsive pageserver offline
+ #[arg(long)]
+ max_offline_interval: Option,
+
+ /// More tolerant grace period before marking an unresponsive pageserver offline,
+ /// used around pageserver restarts
+ #[arg(long)]
+ max_warming_up_interval: Option,
+
+ /// Size threshold for automatically splitting shards (disabled by default)
+ #[arg(long)]
+ split_threshold: Option,
+
+ /// Maximum number of reconcilers that may run in parallel
+ #[arg(long)]
+ reconciler_concurrency: Option,
+
+ /// How long to wait for the initial database connection to be available.
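
The duration-valued flags above rely on `humantime::Duration`, which parses human-friendly values via `FromStr`, so clap's derive picks it up directly. A self-contained sketch of the same clap + humantime combination (the struct and flag names here are illustrative, not the real `Cli`):

```rust
use clap::Parser;

/// Operators can write `--timeout 90s`, `--timeout 5m`, etc.
#[derive(Parser)]
struct Demo {
    #[arg(long, default_value = "5s")]
    timeout: humantime::Duration,
}

fn main() {
    let args = Demo::parse_from(["demo", "--timeout", "90s"]);
    // humantime::Duration converts into std::time::Duration, as the
    // `.map(humantime::Duration::into)` calls later in main() rely on.
    let std_duration: std::time::Duration = args.timeout.into();
    assert_eq!(std_duration.as_secs(), 90);
}
```
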
+ #[arg(long, default_value = "5s")] + db_connect_timeout: humantime::Duration, + + #[arg(long, default_value = "false")] + start_as_candidate: bool, + + // TODO: make this mandatory once the helm chart gets updated + #[arg(long)] + address_for_peers: Option, + + /// `neon_local` sets this to the path of the neon_local repo dir. + /// Only relevant for testing. + // TODO: make `cfg(feature = "testing")` + #[arg(long)] + neon_local_repo_dir: Option, + + /// Chaos testing + #[arg(long)] + chaos_interval: Option, + + // Maximum acceptable lag for the secondary location while draining + // a pageserver + #[arg(long)] + max_secondary_lag_bytes: Option, + + // Period with which to send heartbeats to registered nodes + #[arg(long)] + heartbeat_interval: Option, +} + +enum StrictMode { + /// In strict mode, we will require that all secrets are loaded, i.e. security features + /// may not be implicitly turned off by omitting secrets in the environment. + Strict, + /// In dev mode, secrets are optional, and omitting a particular secret will implicitly + /// disable the auth related to it (e.g. no pageserver jwt key -> send unauthenticated + /// requests, no public key -> don't authenticate incoming requests). + Dev, +} + +impl Default for StrictMode { + fn default() -> Self { + Self::Strict + } +} + +/// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this +/// type encapsulates the logic to decide which and do the loading. +struct Secrets { + database_url: String, + public_key: Option, + jwt_token: Option, + control_plane_jwt_token: Option, + peer_jwt_token: Option, +} + +impl Secrets { + const DATABASE_URL_ENV: &'static str = "DATABASE_URL"; + const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN"; + const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN"; + const PEER_JWT_TOKEN_ENV: &'static str = "PEER_JWT_TOKEN"; + const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY"; + + /// Load secrets from, in order of preference: + /// - CLI args if database URL is provided on the CLI + /// - Environment variables if DATABASE_URL is set. 
+ async fn load(args: &Cli) -> anyhow::Result { + let Some(database_url) = Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV) + else { + anyhow::bail!( + "Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)" + ) + }; + + let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV) { + Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?), + None => None, + }; + + let this = Self { + database_url, + public_key, + jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV), + control_plane_jwt_token: Self::load_secret( + &args.control_plane_jwt_token, + Self::CONTROL_PLANE_JWT_TOKEN_ENV, + ), + peer_jwt_token: Self::load_secret(&args.peer_jwt_token, Self::PEER_JWT_TOKEN_ENV), + }; + + Ok(this) + } + + fn load_secret(cli: &Option, env_name: &str) -> Option { + if let Some(v) = cli { + Some(v.clone()) + } else if let Ok(v) = std::env::var(env_name) { + Some(v) + } else { + None + } + } +} + +fn main() -> anyhow::Result<()> { + logging::init( + LogFormat::Plain, + logging::TracingErrorLayerEnablement::Disabled, + logging::Output::Stdout, + )?; + + // log using tracing so we don't get confused output by default hook writing to stderr + utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); + + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + + let hook = std::panic::take_hook(); + std::panic::set_hook(Box::new(move |info| { + // let sentry send a message (and flush) + // and trace the error + hook(info); + + std::process::exit(1); + })); + + tokio::runtime::Builder::new_current_thread() + // We use spawn_blocking for database operations, so require approximately + // as many blocking threads as we will open database connections. + .max_blocking_threads(Persistence::MAX_CONNECTIONS as usize) + .enable_all() + .build() + .unwrap() + .block_on(async_main()) +} + +async fn async_main() -> anyhow::Result<()> { + let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate())); + + preinitialize_metrics(); + + let args = Cli::parse(); + tracing::info!( + "version: {}, launch_timestamp: {}, build_tag {}, listening on {}", + GIT_VERSION, + launch_ts.to_string(), + BUILD_TAG, + args.listen + ); + + let build_info = BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }; + + let strict_mode = if args.dev { + StrictMode::Dev + } else { + StrictMode::Strict + }; + + let secrets = Secrets::load(&args).await?; + + // Validate required secrets and arguments are provided in strict mode + match strict_mode { + StrictMode::Strict + if (secrets.public_key.is_none() + || secrets.jwt_token.is_none() + || secrets.control_plane_jwt_token.is_none()) => + { + // Production systems should always have secrets configured: if public_key was not set + // then we would implicitly disable auth. + anyhow::bail!( + "Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode" + ); + } + StrictMode::Strict if args.compute_hook_url.is_none() => { + // Production systems should always have a compute hook set, to prevent falling + // back to trying to use neon_local. 
+ anyhow::bail!( + "`--compute-hook-url` is not set: this is only permitted in `--dev` mode" + ); + } + StrictMode::Strict => { + tracing::info!("Starting in strict mode: configuration is OK.") + } + StrictMode::Dev => { + tracing::warn!("Starting in dev mode: this may be an insecure configuration.") + } + } + + let config = Config { + jwt_token: secrets.jwt_token, + control_plane_jwt_token: secrets.control_plane_jwt_token, + peer_jwt_token: secrets.peer_jwt_token, + compute_hook_url: args.compute_hook_url, + max_offline_interval: args + .max_offline_interval + .map(humantime::Duration::into) + .unwrap_or(MAX_OFFLINE_INTERVAL_DEFAULT), + max_warming_up_interval: args + .max_warming_up_interval + .map(humantime::Duration::into) + .unwrap_or(MAX_WARMING_UP_INTERVAL_DEFAULT), + reconciler_concurrency: args + .reconciler_concurrency + .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), + split_threshold: args.split_threshold, + neon_local_repo_dir: args.neon_local_repo_dir, + max_secondary_lag_bytes: args.max_secondary_lag_bytes, + heartbeat_interval: args + .heartbeat_interval + .map(humantime::Duration::into) + .unwrap_or(HEARTBEAT_INTERVAL_DEFAULT), + address_for_peers: args.address_for_peers, + start_as_candidate: args.start_as_candidate, + http_service_port: args.listen.port() as i32, + }; + + // Validate that we can connect to the database + Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; + + let persistence = Arc::new(Persistence::new(secrets.database_url)); + + let service = Service::spawn(config, persistence.clone()).await?; + + let http_listener = tcp_listener::bind(args.listen)?; + + let auth = secrets + .public_key + .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth))); + let router = make_router(service.clone(), auth, build_info) + .build() + .map_err(|err| anyhow!(err))?; + let router_service = utils::http::RouterService::new(router).unwrap(); + + // Start HTTP server + let server_shutdown = CancellationToken::new(); + let server = hyper::Server::from_tcp(http_listener)? + .serve(router_service) + .with_graceful_shutdown({ + let server_shutdown = server_shutdown.clone(); + async move { + server_shutdown.cancelled().await; + } + }); + tracing::info!("Serving on {0}", args.listen); + let server_task = tokio::task::spawn(server); + + let chaos_task = args.chaos_interval.map(|interval| { + let service = service.clone(); + let cancel = CancellationToken::new(); + let cancel_bg = cancel.clone(); + ( + tokio::task::spawn( + async move { + let mut chaos_injector = ChaosInjector::new(service, interval.into()); + chaos_injector.run(cancel_bg).await + } + .instrument(tracing::info_span!("chaos_injector")), + ), + cancel, + ) + }); + + // Wait until we receive a signal + let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?; + let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?; + let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate())?; + tokio::select! { + _ = sigint.recv() => {}, + _ = sigterm.recv() => {}, + _ = sigquit.recv() => {}, + } + tracing::info!("Terminating on signal"); + + // Stop HTTP server first, so that we don't have to service requests + // while shutting down Service. 
server_shutdown.cancel();
+ match tokio::time::timeout(Duration::from_secs(5), server_task).await {
+ Ok(Ok(_)) => {
+ tracing::info!("Joined HTTP server task");
+ }
+ Ok(Err(e)) => {
+ tracing::error!("Error joining HTTP server task: {e}")
+ }
+ Err(_) => {
+ tracing::warn!("Timed out joining HTTP server task");
+ // We will fall through and shut down the service anyway, any request handlers
+ // in flight will experience cancellation & their clients will see a torn connection.
+ }
+ }
+
+ // If we were injecting chaos, stop that so that we're not calling into Service while it shuts down
+ if let Some((chaos_jh, chaos_cancel)) = chaos_task {
+ chaos_cancel.cancel();
+ chaos_jh.await.ok();
+ }
+
+ service.shutdown().await;
+ tracing::info!("Service shutdown complete");
+
+ std::process::exit(0);
+}
diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs
new file mode 100644
index 0000000000..5cfcfb4b1f
--- /dev/null
+++ b/storage_controller/src/metrics.rs
@@ -0,0 +1,261 @@
+//!
+//! This module provides metric definitions for the storage controller.
+//!
+//! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
+//! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`]
+//! constant.
+//!
+//! The rest of the code defines label group types and deals with converting outer types to labels.
+//!
+use bytes::Bytes;
+use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup};
+use metrics::NeonMetrics;
+use once_cell::sync::Lazy;
+use std::sync::Mutex;
+use strum::IntoEnumIterator;
+
+use crate::{
+ persistence::{DatabaseError, DatabaseOperation},
+ service::LeadershipStatus,
+};
+
+pub(crate) static METRICS_REGISTRY: Lazy =
+ Lazy::new(StorageControllerMetrics::default);
+
+pub fn preinitialize_metrics() {
+ Lazy::force(&METRICS_REGISTRY);
+}
+
+pub(crate) struct StorageControllerMetrics {
+ pub(crate) metrics_group: StorageControllerMetricGroup,
+ encoder: Mutex,
+}
+
+#[derive(measured::MetricGroup)]
+#[metric(new())]
+pub(crate) struct StorageControllerMetricGroup {
+ /// Count of how many times we spawn a reconcile task
+ pub(crate) storage_controller_reconcile_spawn: measured::Counter,
+
+ /// Reconciler tasks completed, broken down by success/failure/cancelled
+ pub(crate) storage_controller_reconcile_complete:
+ measured::CounterVec,
+
+ /// Count of how many times we make an optimization change to a tenant's scheduling
+ pub(crate) storage_controller_schedule_optimization: measured::Counter,
+
+ /// HTTP request status counters for handled requests
+ pub(crate) storage_controller_http_request_status:
+ measured::CounterVec,
+
+ /// HTTP request handler latency across all status codes
+ #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
+ pub(crate) storage_controller_http_request_latency:
+ measured::HistogramVec,
+
+ /// Count of HTTP requests to the pageserver that resulted in an error,
+ /// broken down by the pageserver node id, request name and method
+ pub(crate) storage_controller_pageserver_request_error:
+ measured::CounterVec,
+
+ /// Latency of HTTP requests to the pageserver, broken down by pageserver
+ /// node id, request name and method. This includes both successful and unsuccessful
+ /// requests.
+ #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
+ pub(crate) storage_controller_pageserver_request_latency:
+ measured::HistogramVec,
+
+ /// Count of pass-through HTTP requests to the pageserver that resulted in an error,
+ /// broken down by the pageserver node id, request name and method
+ pub(crate) storage_controller_passthrough_request_error:
+ measured::CounterVec,
+
+ /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
+ /// node id, request name and method. This includes both successful and unsuccessful
+ /// requests.
+ #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
+ pub(crate) storage_controller_passthrough_request_latency:
+ measured::HistogramVec,
+
+ /// Count of errors in database queries, broken down by error type and operation.
+ pub(crate) storage_controller_database_query_error:
+ measured::CounterVec,
+
+ /// Latency of database queries, broken down by operation.
+ #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
+ pub(crate) storage_controller_database_query_latency:
+ measured::HistogramVec,
+
+ pub(crate) storage_controller_leadership_status: measured::GaugeVec,
+}
+
+impl StorageControllerMetrics {
+ pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes {
+ let mut encoder = self.encoder.lock().unwrap();
+ neon_metrics
+ .collect_group_into(&mut *encoder)
+ .unwrap_or_else(|infallible| match infallible {});
+ self.metrics_group
+ .collect_group_into(&mut *encoder)
+ .unwrap_or_else(|infallible| match infallible {});
+ encoder.finish()
+ }
+}
+
+impl Default for StorageControllerMetrics {
+ fn default() -> Self {
+ let mut metrics_group = StorageControllerMetricGroup::new();
+ metrics_group
+ .storage_controller_reconcile_complete
+ .init_all_dense();
+
+ Self {
+ metrics_group,
+ encoder: Mutex::new(measured::text::BufferedTextEncoder::new()),
+ }
+ }
+}
+
+#[derive(measured::LabelGroup)]
+#[label(set = ReconcileCompleteLabelGroupSet)]
+pub(crate) struct ReconcileCompleteLabelGroup {
+ pub(crate) status: ReconcileOutcome,
+}
+
+#[derive(measured::LabelGroup)]
+#[label(set = HttpRequestStatusLabelGroupSet)]
+pub(crate) struct HttpRequestStatusLabelGroup<'a> {
+ #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+ pub(crate) path: &'a str,
+ pub(crate) method: Method,
+ pub(crate) status: StatusCode,
+}
+
+#[derive(measured::LabelGroup)]
+#[label(set = HttpRequestLatencyLabelGroupSet)]
+pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
+ #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+ pub(crate) path: &'a str,
+ pub(crate) method: Method,
+}
+
+#[derive(measured::LabelGroup, Clone)]
+#[label(set = PageserverRequestLabelGroupSet)]
+pub(crate) struct PageserverRequestLabelGroup<'a> {
+ #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+ pub(crate) pageserver_id: &'a str,
+ #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+ pub(crate) path: &'a str,
+ pub(crate) method: Method,
+}
+
+#[derive(measured::LabelGroup)]
+#[label(set = DatabaseQueryErrorLabelGroupSet)]
+pub(crate) struct DatabaseQueryErrorLabelGroup {
+ pub(crate) error_type: DatabaseErrorLabel,
+ pub(crate) operation: DatabaseOperation,
+}
+
+#[derive(measured::LabelGroup)]
+#[label(set = DatabaseQueryLatencyLabelGroupSet)]
+pub(crate) struct DatabaseQueryLatencyLabelGroup {
+ pub(crate) operation: DatabaseOperation,
+}
+
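These label groups are what call sites hand to the counters and histograms declared above; the `measured_request!` macro later in this diff does exactly this. A hypothetical stand-alone call site, assuming this module's names are in scope:

```rust
// Record one failed pageserver request outcome (the label values are made up).
fn record_failed_request() {
    let labels = crate::metrics::PageserverRequestLabelGroup {
        pageserver_id: "42",
        path: "tenant_heatmap_upload",
        method: crate::metrics::Method::Post,
    };
    crate::metrics::METRICS_REGISTRY
        .metrics_group
        .storage_controller_pageserver_request_error
        .inc(labels);
}
```
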
+#[derive(measured::LabelGroup)]
+#[label(set = LeadershipStatusGroupSet)]
+pub(crate) struct LeadershipStatusGroup {
+ pub(crate) status: LeadershipStatus,
+}
+
+#[derive(FixedCardinalityLabel, Clone, Copy)]
+pub(crate) enum ReconcileOutcome {
+ #[label(rename = "ok")]
+ Success,
+ Error,
+ Cancel,
+}
+
+#[derive(FixedCardinalityLabel, Copy, Clone)]
+pub(crate) enum Method {
+ Get,
+ Put,
+ Post,
+ Delete,
+ Other,
+}
+
+impl From for Method {
+ fn from(value: hyper::Method) -> Self {
+ if value == hyper::Method::GET {
+ Method::Get
+ } else if value == hyper::Method::PUT {
+ Method::Put
+ } else if value == hyper::Method::POST {
+ Method::Post
+ } else if value == hyper::Method::DELETE {
+ Method::Delete
+ } else {
+ Method::Other
+ }
+ }
+}
+
+#[derive(Clone, Copy)]
+pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
+
+impl LabelValue for StatusCode {
+ fn visit(&self, v: V) -> V::Output {
+ v.write_int(self.0.as_u16() as i64)
+ }
+}
+
+impl FixedCardinalityLabel for StatusCode {
+ fn cardinality() -> usize {
+ (100..1000).len()
+ }
+
+ fn encode(&self) -> usize {
+ self.0.as_u16() as usize
+ }
+
+ fn decode(value: usize) -> Self {
+ Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
+ }
+}
+
+#[derive(FixedCardinalityLabel, Clone, Copy)]
+pub(crate) enum DatabaseErrorLabel {
+ Query,
+ Connection,
+ ConnectionPool,
+ Logical,
+ Migration,
+}
+
+impl DatabaseError {
+ pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
+ match self {
+ Self::Query(_) => DatabaseErrorLabel::Query,
+ Self::Connection(_) => DatabaseErrorLabel::Connection,
+ Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
+ Self::Logical(_) => DatabaseErrorLabel::Logical,
+ Self::Migration(_) => DatabaseErrorLabel::Migration,
+ }
+ }
+}
+
+/// Update the leadership status metric gauges to reflect the requested status
+pub(crate) fn update_leadership_status(status: LeadershipStatus) {
+ let status_metric = &METRICS_REGISTRY
+ .metrics_group
+ .storage_controller_leadership_status;
+
+ for s in LeadershipStatus::iter() {
+ if s == status {
+ status_metric.set(LeadershipStatusGroup { status: s }, 1);
+ } else {
+ status_metric.set(LeadershipStatusGroup { status: s }, 0);
+ }
+ }
+}
diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs
new file mode 100644
index 0000000000..cb9ce10d23
--- /dev/null
+++ b/storage_controller/src/node.rs
@@ -0,0 +1,320 @@
+use std::{str::FromStr, time::Duration};
+
+use pageserver_api::{
+ controller_api::{
+ NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
+ TenantLocateResponseShard,
+ },
+ shard::TenantShardId,
+};
+use pageserver_client::mgmt_api;
+use reqwest::StatusCode;
+use serde::Serialize;
+use tokio_util::sync::CancellationToken;
+use utils::{backoff, id::NodeId};
+
+use crate::{
+ pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule,
+};
+
+/// Represents the in-memory description of a Node.
+///
+/// Scheduling statistics are maintained separately in [`crate::scheduler`].
+///
+/// The persistent subset of the Node is defined in [`crate::persistence::NodePersistence`]: the
+/// implementation of serialization on this type is only for debug dumps.
+#[derive(Clone, Serialize)]
+pub(crate) struct Node {
+ id: NodeId,
+
+ availability: NodeAvailability,
+ scheduling: NodeSchedulingPolicy,
+
+ listen_http_addr: String,
+ listen_http_port: u16,
+
+ listen_pg_addr: String,
+ listen_pg_port: u16,
+
+ availability_zone_id: String,
+
+ // This cancellation token means "stop any RPCs in flight to this node, and don't start
+ // any more".
It is not related to process shutdown. + #[serde(skip)] + cancel: CancellationToken, +} + +/// When updating [`Node::availability`] we use this type to indicate to the caller +/// whether/how they changed it. +pub(crate) enum AvailabilityTransition { + ToActive, + ToWarmingUpFromActive, + ToWarmingUpFromOffline, + ToOffline, + Unchanged, +} + +impl Node { + pub(crate) fn base_url(&self) -> String { + format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + } + + pub(crate) fn get_id(&self) -> NodeId { + self.id + } + + #[allow(unused)] + pub(crate) fn get_availability_zone_id(&self) -> &str { + self.availability_zone_id.as_str() + } + + pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy { + self.scheduling + } + + pub(crate) fn set_scheduling(&mut self, scheduling: NodeSchedulingPolicy) { + self.scheduling = scheduling + } + + /// Does this registration request match `self`? This is used when deciding whether a registration + /// request should be allowed to update an existing record with the same node ID. + pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool { + self.id == register_req.node_id + && self.listen_http_addr == register_req.listen_http_addr + && self.listen_http_port == register_req.listen_http_port + && self.listen_pg_addr == register_req.listen_pg_addr + && self.listen_pg_port == register_req.listen_pg_port + && self.availability_zone_id == register_req.availability_zone_id + } + + /// For a shard located on this node, populate a response object + /// with this node's address information. + pub(crate) fn shard_location(&self, shard_id: TenantShardId) -> TenantLocateResponseShard { + TenantLocateResponseShard { + shard_id, + node_id: self.id, + listen_http_addr: self.listen_http_addr.clone(), + listen_http_port: self.listen_http_port, + listen_pg_addr: self.listen_pg_addr.clone(), + listen_pg_port: self.listen_pg_port, + } + } + + pub(crate) fn get_availability(&self) -> &NodeAvailability { + &self.availability + } + + pub(crate) fn set_availability(&mut self, availability: NodeAvailability) { + use AvailabilityTransition::*; + use NodeAvailability::WarmingUp; + + match self.get_availability_transition(&availability) { + ToActive => { + // Give the node a new cancellation token, effectively resetting it to un-cancelled. Any + // users of previously-cloned copies of the node will still see the old cancellation + // state. For example, Reconcilers in flight will have to complete and be spawned + // again to realize that the node has become available. + self.cancel = CancellationToken::new(); + } + ToOffline | ToWarmingUpFromActive => { + // Fire the node's cancellation token to cancel any in-flight API requests to it + self.cancel.cancel(); + } + Unchanged | ToWarmingUpFromOffline => {} + } + + if let (WarmingUp(crnt), WarmingUp(proposed)) = (&self.availability, &availability) { + self.availability = WarmingUp(std::cmp::max(*crnt, *proposed)); + } else { + self.availability = availability; + } + } + + /// Without modifying the availability of the node, convert the intended availability + /// into a description of the transition. 
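
The transition rules implemented below are easiest to review as a pinned-down mapping. A self-contained mirror with the utilization payloads elided (the enum names are simplified stand-ins, not the real `NodeAvailability`):

```rust
#[derive(Clone, Copy, PartialEq, Debug)]
enum Avail { Offline, WarmingUp, Active }

#[derive(PartialEq, Debug)]
enum Transition { ToActive, ToWarmingUpFromActive, ToWarmingUpFromOffline, ToOffline, Unchanged }

/// Simplified mirror of get_availability_transition.
fn transition(from: Avail, to: Avail) -> Transition {
    use Avail::*;
    use Transition::*;
    match (from, to) {
        (Offline, Active) | (WarmingUp, Active) => ToActive,
        (Active, Offline) | (WarmingUp, Offline) => ToOffline,
        (Active, WarmingUp) => ToWarmingUpFromActive,
        (Offline, WarmingUp) => ToWarmingUpFromOffline,
        _ => Unchanged,
    }
}

fn main() {
    assert_eq!(transition(Avail::Offline, Avail::Active), Transition::ToActive);
    assert_eq!(transition(Avail::Active, Avail::Active), Transition::Unchanged);
}
```

In `set_availability`, ToActive resets the node's cancellation token, while ToOffline and ToWarmingUpFromActive fire it to cancel in-flight RPCs.
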
+ pub(crate) fn get_availability_transition(
+ &self,
+ availability: &NodeAvailability,
+ ) -> AvailabilityTransition {
+ use AvailabilityTransition::*;
+ use NodeAvailability::*;
+
+ match (&self.availability, availability) {
+ (Offline, Active(_)) => ToActive,
+ (Active(_), Offline) => ToOffline,
+ (Active(_), WarmingUp(_)) => ToWarmingUpFromActive,
+ (WarmingUp(_), Offline) => ToOffline,
+ (WarmingUp(_), Active(_)) => ToActive,
+ (Offline, WarmingUp(_)) => ToWarmingUpFromOffline,
+ _ => Unchanged,
+ }
+ }
+
+ /// Whether we may send API requests to this node.
+ pub(crate) fn is_available(&self) -> bool {
+ // When we clone a node, [`Self::availability`] is a snapshot, but [`Self::cancel`] holds
+ // a reference to the original Node's cancellation status. Checking both of these results
+ // in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable
+ // when we cloned it, or if the original Node instance's cancellation token was fired.
+ matches!(self.availability, NodeAvailability::Active(_)) && !self.cancel.is_cancelled()
+ }
+
+ /// Is this node eligible to have work scheduled onto it?
+ pub(crate) fn may_schedule(&self) -> MaySchedule {
+ let utilization = match &self.availability {
+ NodeAvailability::Active(u) => u.clone(),
+ NodeAvailability::Offline | NodeAvailability::WarmingUp(_) => return MaySchedule::No,
+ };
+
+ match self.scheduling {
+ NodeSchedulingPolicy::Active => MaySchedule::Yes(utilization),
+ NodeSchedulingPolicy::Draining => MaySchedule::No,
+ NodeSchedulingPolicy::Filling => MaySchedule::Yes(utilization),
+ NodeSchedulingPolicy::Pause => MaySchedule::No,
+ NodeSchedulingPolicy::PauseForRestart => MaySchedule::No,
+ }
+ }
+
+ pub(crate) fn new(
+ id: NodeId,
+ listen_http_addr: String,
+ listen_http_port: u16,
+ listen_pg_addr: String,
+ listen_pg_port: u16,
+ availability_zone_id: String,
+ ) -> Self {
+ Self {
+ id,
+ listen_http_addr,
+ listen_http_port,
+ listen_pg_addr,
+ listen_pg_port,
+ scheduling: NodeSchedulingPolicy::Active,
+ availability: NodeAvailability::Offline,
+ availability_zone_id,
+ cancel: CancellationToken::new(),
+ }
+ }
+
+ pub(crate) fn to_persistent(&self) -> NodePersistence {
+ NodePersistence {
+ node_id: self.id.0 as i64,
+ scheduling_policy: self.scheduling.into(),
+ listen_http_addr: self.listen_http_addr.clone(),
+ listen_http_port: self.listen_http_port as i32,
+ listen_pg_addr: self.listen_pg_addr.clone(),
+ listen_pg_port: self.listen_pg_port as i32,
+ availability_zone_id: self.availability_zone_id.clone(),
+ }
+ }
+
+ pub(crate) fn from_persistent(np: NodePersistence) -> Self {
+ Self {
+ id: NodeId(np.node_id as u64),
+ // At startup we consider a node offline until proven otherwise.
+ availability: NodeAvailability::Offline,
+ scheduling: NodeSchedulingPolicy::from_str(&np.scheduling_policy)
+ .expect("Bad scheduling policy in DB"),
+ listen_http_addr: np.listen_http_addr,
+ listen_http_port: np.listen_http_port as u16,
+ listen_pg_addr: np.listen_pg_addr,
+ listen_pg_port: np.listen_pg_port as u16,
+ availability_zone_id: np.availability_zone_id,
+ cancel: CancellationToken::new(),
+ }
+ }
+
+ /// Wrapper for issuing requests to the pageserver management API: takes care of generic
+ /// retry/backoff for retryable HTTP status codes.
+ ///
+ /// This will return None to indicate cancellation. Cancellation may happen from
+ /// the cancellation token passed in, or from Self's cancellation token (i.e. node
+ /// going offline).
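
A hypothetical call site for this wrapper, assuming `Node`, `PageserverClient` and a cancellation token are in scope as in the surrounding service code (the chosen thresholds and timeout are illustrative):

```rust
async fn fetch_location_configs(
    node: &Node,
    jwt: Option<String>,
    cancel: &tokio_util::sync::CancellationToken,
) {
    let result = node
        .with_client_retries(
            |client| async move { client.list_location_config().await },
            &jwt,
            1,                                 // warn_threshold
            3,                                 // max_retries
            std::time::Duration::from_secs(5), // per-request timeout
            cancel,
        )
        .await;

    match result {
        None => { /* cancelled: shutdown requested or node marked offline */ }
        Some(Ok(_configs)) => { /* success */ }
        Some(Err(_e)) => { /* fatal error, or retries exhausted */ }
    }
}
```
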
+    /// Wrapper for issuing requests to pageserver management API: takes care of generic
+    /// retry/backoff for retryable HTTP status codes.
+    ///
+    /// This will return None to indicate cancellation. Cancellation may happen from
+    /// the cancellation token passed in, or from Self's cancellation token (i.e. node
+    /// going offline).
+    pub(crate) async fn with_client_retries<T, O, F>(
+        &self,
+        mut op: O,
+        jwt: &Option<String>,
+        warn_threshold: u32,
+        max_retries: u32,
+        timeout: Duration,
+        cancel: &CancellationToken,
+    ) -> Option<mgmt_api::Result<T>>
+    where
+        O: FnMut(PageserverClient) -> F,
+        F: std::future::Future<Output = mgmt_api::Result<T>>,
+    {
+        fn is_fatal(e: &mgmt_api::Error) -> bool {
+            use mgmt_api::Error::*;
+            match e {
+                SendRequest(_) | ReceiveBody(_) | ReceiveErrorBody(_) => false,
+                ApiError(StatusCode::SERVICE_UNAVAILABLE, _)
+                | ApiError(StatusCode::GATEWAY_TIMEOUT, _)
+                | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false,
+                ApiError(_, _) => true,
+                Cancelled => true,
+            }
+        }
+
+        backoff::retry(
+            || {
+                let http_client = reqwest::ClientBuilder::new()
+                    .timeout(timeout)
+                    .build()
+                    .expect("Failed to construct HTTP client");
+
+                let client = PageserverClient::from_client(
+                    self.get_id(),
+                    http_client,
+                    self.base_url(),
+                    jwt.as_deref(),
+                );
+
+                let node_cancel_fut = self.cancel.cancelled();
+
+                let op_fut = op(client);
+
+                async {
+                    tokio::select! {
+                        r = op_fut => { r },
+                        _ = node_cancel_fut => {
+                            Err(mgmt_api::Error::Cancelled)
+                        }
+                    }
+                }
+            },
+            is_fatal,
+            warn_threshold,
+            max_retries,
+            &format!(
+                "Call to node {} ({}:{}) management API",
+                self.id, self.listen_http_addr, self.listen_http_port
+            ),
+            cancel,
+        )
+        .await
+    }
+
+    /// Generate the simplified API-friendly description of a node's state
+    pub(crate) fn describe(&self) -> NodeDescribeResponse {
+        NodeDescribeResponse {
+            id: self.id,
+            availability: self.availability.clone().into(),
+            scheduling: self.scheduling,
+            listen_http_addr: self.listen_http_addr.clone(),
+            listen_http_port: self.listen_http_port,
+            listen_pg_addr: self.listen_pg_addr.clone(),
+            listen_pg_port: self.listen_pg_port,
+        }
+    }
+}
+
+impl std::fmt::Display for Node {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{} ({})", self.id, self.listen_http_addr)
+    }
+}
+
+impl std::fmt::Debug for Node {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{} ({})", self.id, self.listen_http_addr)
+    }
+}
diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs
new file mode 100644
index 0000000000..20770ed703
--- /dev/null
+++ b/storage_controller/src/pageserver_client.rs
@@ -0,0 +1,281 @@
+use pageserver_api::{
+    models::{
+        detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse,
+        PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse,
+        TenantShardSplitRequest, TenantShardSplitResponse, TimelineArchivalConfigRequest,
+        TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse,
+    },
+    shard::TenantShardId,
+};
+use pageserver_client::mgmt_api::{Client, Result};
+use reqwest::StatusCode;
+use utils::id::{NodeId, TenantId, TimelineId};
+
+/// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage
+/// controller to collect metrics in a non-intrusive manner.
+#[derive(Debug, Clone)]
+pub(crate) struct PageserverClient {
+    inner: Client,
+    node_id_label: String,
+}
+
+macro_rules!
measured_request { + ($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{ + let labels = crate::metrics::PageserverRequestLabelGroup { + pageserver_id: $node_id, + path: $name, + method: $method, + }; + + let latency = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_request_latency; + let _timer_guard = latency.start_timer(labels.clone()); + + let res = $invoke; + + if res.is_err() { + let error_counters = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_request_error; + error_counters.inc(labels) + } + + res + }}; +} + +impl PageserverClient { + pub(crate) fn new(node_id: NodeId, mgmt_api_endpoint: String, jwt: Option<&str>) -> Self { + Self { + inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt), + node_id_label: node_id.0.to_string(), + } + } + + pub(crate) fn from_client( + node_id: NodeId, + raw_client: reqwest::Client, + mgmt_api_endpoint: String, + jwt: Option<&str>, + ) -> Self { + Self { + inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt), + node_id_label: node_id.0.to_string(), + } + } + + pub(crate) async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result { + measured_request!( + "tenant", + crate::metrics::Method::Delete, + &self.node_id_label, + self.inner.tenant_delete(tenant_shard_id).await + ) + } + + pub(crate) async fn tenant_time_travel_remote_storage( + &self, + tenant_shard_id: TenantShardId, + timestamp: &str, + done_if_after: &str, + ) -> Result<()> { + measured_request!( + "tenant_time_travel_remote_storage", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .tenant_time_travel_remote_storage(tenant_shard_id, timestamp, done_if_after) + .await + ) + } + + pub(crate) async fn tenant_scan_remote_storage( + &self, + tenant_id: TenantId, + ) -> Result { + measured_request!( + "tenant_scan_remote_storage", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.tenant_scan_remote_storage(tenant_id).await + ) + } + + pub(crate) async fn tenant_secondary_download( + &self, + tenant_id: TenantShardId, + wait: Option, + ) -> Result<(StatusCode, SecondaryProgress)> { + measured_request!( + "tenant_secondary_download", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.tenant_secondary_download(tenant_id, wait).await + ) + } + + pub(crate) async fn tenant_secondary_status( + &self, + tenant_shard_id: TenantShardId, + ) -> Result { + measured_request!( + "tenant_secondary_status", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.tenant_secondary_status(tenant_shard_id).await + ) + } + + pub(crate) async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> { + measured_request!( + "tenant_heatmap_upload", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.tenant_heatmap_upload(tenant_id).await + ) + } + + pub(crate) async fn location_config( + &self, + tenant_shard_id: TenantShardId, + config: LocationConfig, + flush_ms: Option, + lazy: bool, + ) -> Result<()> { + measured_request!( + "location_config", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .location_config(tenant_shard_id, config, flush_ms, lazy) + .await + ) + } + + pub(crate) async fn list_location_config(&self) -> Result { + measured_request!( + "location_configs", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.list_location_config().await + ) + } + + pub(crate) async fn get_location_config( + &self, + tenant_shard_id: TenantShardId, + ) -> 
Result> { + measured_request!( + "location_config", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.get_location_config(tenant_shard_id).await + ) + } + + pub(crate) async fn timeline_create( + &self, + tenant_shard_id: TenantShardId, + req: &TimelineCreateRequest, + ) -> Result { + measured_request!( + "timeline", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.timeline_create(tenant_shard_id, req).await + ) + } + + pub(crate) async fn timeline_delete( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + measured_request!( + "timeline", + crate::metrics::Method::Delete, + &self.node_id_label, + self.inner + .timeline_delete(tenant_shard_id, timeline_id) + .await + ) + } + + pub(crate) async fn tenant_shard_split( + &self, + tenant_shard_id: TenantShardId, + req: TenantShardSplitRequest, + ) -> Result { + measured_request!( + "tenant_shard_split", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner.tenant_shard_split(tenant_shard_id, req).await + ) + } + + pub(crate) async fn timeline_list( + &self, + tenant_shard_id: &TenantShardId, + ) -> Result> { + measured_request!( + "timelines", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.timeline_list(tenant_shard_id).await + ) + } + + pub(crate) async fn timeline_archival_config( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + req: &TimelineArchivalConfigRequest, + ) -> Result<()> { + measured_request!( + "timeline_archival_config", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .timeline_archival_config(tenant_shard_id, timeline_id, req) + .await + ) + } + + pub(crate) async fn timeline_detach_ancestor( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + measured_request!( + "timeline_detach_ancestor", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .await + ) + } + + pub(crate) async fn get_utilization(&self) -> Result { + measured_request!( + "utilization", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.get_utilization().await + ) + } + + pub(crate) async fn top_tenant_shards( + &self, + request: TopTenantShardsRequest, + ) -> Result { + measured_request!( + "top_tenants", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.top_tenant_shards(request).await + ) + } +} diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs new file mode 100644 index 0000000000..3f8520fe55 --- /dev/null +++ b/storage_controller/src/peer_client.rs @@ -0,0 +1,108 @@ +use crate::tenant_shard::ObservedState; +use pageserver_api::shard::TenantShardId; +use serde::{Deserialize, Serialize}; +use std::{collections::HashMap, time::Duration}; +use tokio_util::sync::CancellationToken; + +use hyper::Uri; +use reqwest::{StatusCode, Url}; +use utils::{backoff, http::error::HttpErrorBody}; + +#[derive(Debug, Clone)] +pub(crate) struct PeerClient { + uri: Uri, + jwt: Option, + client: reqwest::Client, +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum StorageControllerPeerError { + #[error("failed to deserialize error response with status code {0} at {1}: {2}")] + DeserializationError(StatusCode, Url, reqwest::Error), + #[error("storage controller peer API error ({0}): {1}")] + ApiError(StatusCode, String), + #[error("failed to send HTTP request: {0}")] + SendError(reqwest::Error), + #[error("Cancelled")] + Cancelled, +} + 
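As an aside, a hypothetical helper (not part of this diff) showing how these error variants could drive a retry decision; note that the real `step_down` below simply retries every error:

```rust
// Hypothetical sketch: classify a StorageControllerPeerError for retry.
fn is_retryable(e: &StorageControllerPeerError) -> bool {
    use StorageControllerPeerError::*;
    match e {
        // Network-level send failures are usually transient.
        SendError(_) => true,
        // Server-side errors may clear up; client errors will not.
        ApiError(status, _) => status.is_server_error(),
        // A garbled body or cancellation is not worth retrying.
        DeserializationError(..) | Cancelled => false,
    }
}
```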
+pub(crate) type Result<T> = std::result::Result<T, StorageControllerPeerError>;
+
+pub(crate) trait ResponseErrorMessageExt: Sized {
+    fn error_from_body(self) -> impl std::future::Future<Output = Result<Self>> + Send;
+}
+
+impl ResponseErrorMessageExt for reqwest::Response {
+    async fn error_from_body(self) -> Result<Self> {
+        let status = self.status();
+        if !(status.is_client_error() || status.is_server_error()) {
+            return Ok(self);
+        }
+
+        let url = self.url().to_owned();
+        Err(match self.json::<HttpErrorBody>().await {
+            Ok(HttpErrorBody { msg }) => StorageControllerPeerError::ApiError(status, msg),
+            Err(err) => StorageControllerPeerError::DeserializationError(status, url, err),
+        })
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug, Default)]
+pub(crate) struct GlobalObservedState(pub(crate) HashMap<TenantShardId, ObservedState>);
+
+impl PeerClient {
+    pub(crate) fn new(uri: Uri, jwt: Option<String>) -> Self {
+        Self {
+            uri,
+            jwt,
+            client: reqwest::Client::new(),
+        }
+    }
+
+    async fn request_step_down(&self) -> Result<GlobalObservedState> {
+        let step_down_path = format!("{}control/v1/step_down", self.uri);
+        let req = self.client.put(step_down_path);
+        let req = if let Some(jwt) = &self.jwt {
+            req.header(reqwest::header::AUTHORIZATION, format!("Bearer {jwt}"))
+        } else {
+            req
+        };
+
+        let req = req.timeout(Duration::from_secs(2));
+
+        let res = req
+            .send()
+            .await
+            .map_err(StorageControllerPeerError::SendError)?;
+        let response = res.error_from_body().await?;
+
+        let status = response.status();
+        let url = response.url().to_owned();
+
+        response
+            .json()
+            .await
+            .map_err(|err| StorageControllerPeerError::DeserializationError(status, url, err))
+    }
+
+    /// Request the peer to step down and return its current observed state.
+    /// All errors are retried with exponential backoff for a maximum of 4 attempts.
+    /// Assuming all retries are performed, the function times out after roughly 4 seconds.
+    pub(crate) async fn step_down(
+        &self,
+        cancel: &CancellationToken,
+    ) -> Result<GlobalObservedState> {
+        backoff::retry(
+            || self.request_step_down(),
+            |_e| false,
+            2,
+            4,
+            "Send step down request",
+            cancel,
+        )
+        .await
+        .ok_or_else(|| StorageControllerPeerError::Cancelled)
+        .and_then(|x| x)
+    }
+}
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
new file mode 100644
index 0000000000..1dc1040d96
--- /dev/null
+++ b/storage_controller/src/persistence.rs
@@ -0,0 +1,1230 @@
+pub(crate) mod split_state;
+use std::collections::HashMap;
+use std::str::FromStr;
+use std::time::Duration;
+use std::time::Instant;
+
+use self::split_state::SplitState;
+use diesel::pg::PgConnection;
+use diesel::prelude::*;
+use diesel::Connection;
+use itertools::Itertools;
+use pageserver_api::controller_api::MetadataHealthRecord;
+use pageserver_api::controller_api::ShardSchedulingPolicy;
+use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
+use pageserver_api::models::TenantConfig;
+use pageserver_api::shard::ShardConfigError;
+use pageserver_api::shard::ShardIdentity;
+use pageserver_api::shard::ShardStripeSize;
+use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
+use serde::{Deserialize, Serialize};
+use utils::generation::Generation;
+use utils::id::{NodeId, TenantId};
+
+use crate::metrics::{
+    DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY,
+};
+use crate::node::Node;
+
+use diesel_migrations::{embed_migrations, EmbeddedMigrations};
+const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
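Since the migrations are embedded in the binary, startup can apply them without external tooling. A hypothetical wiring sketch (the function name is invented) using only helpers defined in this file:

```rust
// Hypothetical startup wiring, not part of this diff: wait for Postgres,
// build the pool, then run the embedded migrations before serving requests.
async fn init_persistence(database_url: String) -> anyhow::Result<Persistence> {
    Persistence::await_connection(&database_url, Duration::from_secs(10)).await?;
    let persistence = Persistence::new(database_url);
    persistence.migration_run().await?;
    Ok(persistence)
}
```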
+
+/// ## What do we store?
+///
+/// The storage controller service does not store most of its state durably.
+///
+/// The essential things to store durably are:
+/// - generation numbers, as these must always advance monotonically to ensure data safety.
+/// - Tenant's PlacementPolicy and TenantConfig, as the source of truth for these is something external.
+/// - Node's scheduling policies, as the source of truth for these is something external.
+///
+/// Other things we store durably as an implementation detail:
+/// - Node's host/port: this could be avoided if we made nodes emit a self-registering heartbeat,
+///   but it is operationally simpler to make this service the authority for which nodes
+///   it talks to.
+///
+/// ## Performance/efficiency
+///
+/// The storage controller service does not go via the database for most things: there are
+/// a couple of places where we must, and where efficiency matters:
+/// - Incrementing generation numbers: the Reconciler has to wait for this to complete
+///   before it can attach a tenant, so this acts as a bound on how fast things like
+///   failover can happen.
+/// - Pageserver re-attach: we will increment many shards' generations when this happens,
+///   so it is important to avoid e.g. issuing O(N) queries.
+///
+/// Database calls relating to nodes have low performance requirements, as they are very rarely
+/// updated, and reads of nodes are always from memory, not the database. We only require that
+/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
+pub struct Persistence {
+    connection_pool: diesel::r2d2::Pool<diesel::r2d2::ConnectionManager<PgConnection>>,
+}
+
+/// Legacy format, for use in JSON compat objects in test environment
+#[derive(Serialize, Deserialize)]
+struct JsonPersistence {
+    tenants: HashMap<TenantShardId, TenantShardPersistence>,
+}
+
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum DatabaseError {
+    #[error(transparent)]
+    Query(#[from] diesel::result::Error),
+    #[error(transparent)]
+    Connection(#[from] diesel::result::ConnectionError),
+    #[error(transparent)]
+    ConnectionPool(#[from] r2d2::Error),
+    #[error("Logical error: {0}")]
+    Logical(String),
+    #[error("Migration error: {0}")]
+    Migration(String),
+}
+
+#[derive(measured::FixedCardinalityLabel, Copy, Clone)]
+pub(crate) enum DatabaseOperation {
+    InsertNode,
+    UpdateNode,
+    DeleteNode,
+    ListNodes,
+    BeginShardSplit,
+    CompleteShardSplit,
+    AbortShardSplit,
+    Detach,
+    ReAttach,
+    IncrementGeneration,
+    TenantGenerations,
+    ShardGenerations,
+    ListTenantShards,
+    InsertTenantShards,
+    UpdateTenantShard,
+    DeleteTenant,
+    UpdateTenantConfig,
+    UpdateMetadataHealth,
+    ListMetadataHealth,
+    ListMetadataHealthUnhealthy,
+    ListMetadataHealthOutdated,
+    GetLeader,
+    UpdateLeader,
+    SetPreferredAzs,
+}
+
+#[must_use]
+pub(crate) enum AbortShardSplitStatus {
+    /// We aborted the split in the database by reverting to the parent shards
+    Aborted,
+    /// The split had already been persisted.
+    Complete,
+}
+
+pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
+
+/// Some methods can operate on either a whole tenant or a single shard
+pub(crate) enum TenantFilter {
+    Tenant(TenantId),
+    Shard(TenantShardId),
+}
+
+/// Represents the results of looking up generation+pageserver for the shards of a tenant
+pub(crate) struct ShardGenerationState {
+    pub(crate) tenant_shard_id: TenantShardId,
+    pub(crate) generation: Option<Generation>,
+    pub(crate) generation_pageserver: Option<NodeId>,
+}
+
+impl Persistence {
+    // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under
+    // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect.
+ pub const MAX_CONNECTIONS: u32 = 99; + + // We don't want to keep a lot of connections alive: close them down promptly if they aren't being used. + const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); + const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); + + pub fn new(database_url: String) -> Self { + let manager = diesel::r2d2::ConnectionManager::::new(database_url); + + // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time + // to execute queries (database queries are not generally on latency-sensitive paths). + let connection_pool = diesel::r2d2::Pool::builder() + .max_size(Self::MAX_CONNECTIONS) + .max_lifetime(Some(Self::MAX_CONNECTION_LIFETIME)) + .idle_timeout(Some(Self::IDLE_CONNECTION_TIMEOUT)) + // Always keep at least one connection ready to go + .min_idle(Some(1)) + .test_on_check_out(true) + .build(manager) + .expect("Could not build connection pool"); + + Self { connection_pool } + } + + /// A helper for use during startup, where we would like to tolerate concurrent restarts of the + /// database and the storage controller, therefore the database might not be available right away + pub async fn await_connection( + database_url: &str, + timeout: Duration, + ) -> Result<(), diesel::ConnectionError> { + let started_at = Instant::now(); + loop { + match PgConnection::establish(database_url) { + Ok(_) => { + tracing::info!("Connected to database."); + return Ok(()); + } + Err(e) => { + if started_at.elapsed() > timeout { + return Err(e); + } else { + tracing::info!("Database not yet available, waiting... ({e})"); + tokio::time::sleep(Duration::from_millis(100)).await; + } + } + } + } + } + + /// Execute the diesel migrations that are built into this binary + pub(crate) async fn migration_run(&self) -> DatabaseResult<()> { + use diesel_migrations::{HarnessWithOutput, MigrationHarness}; + + self.with_conn(move |conn| -> DatabaseResult<()> { + HarnessWithOutput::write_to_stdout(conn) + .run_pending_migrations(MIGRATIONS) + .map(|_| ()) + .map_err(|e| DatabaseError::Migration(e.to_string())) + }) + .await + } + + /// Wraps `with_conn` in order to collect latency and error metrics + async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult + where + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + R: Send + 'static, + { + let latency = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_latency; + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); + + let res = self.with_conn(func).await; + + if let Err(err) = &res { + let error_counter = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_error; + error_counter.inc(DatabaseQueryErrorLabelGroup { + error_type: err.error_label(), + operation: op, + }) + } + + res + } + + /// Call the provided function in a tokio blocking thread, with a Diesel database connection. + async fn with_conn(&self, func: F) -> DatabaseResult + where + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + R: Send + 'static, + { + // A generous allowance for how many times we may retry serializable transactions + // before giving up. This is not expected to be hit: it is a defensive measure in case we + // somehow engineer a situation where duelling transactions might otherwise live-lock. 
+        const MAX_RETRIES: usize = 128;
+
+        let mut conn = self.connection_pool.get()?;
+        tokio::task::spawn_blocking(move || -> DatabaseResult<R> {
+            let mut retry_count = 0;
+            loop {
+                match conn.build_transaction().serializable().run(|c| func(c)) {
+                    Ok(r) => break Ok(r),
+                    Err(
+                        err @ DatabaseError::Query(diesel::result::Error::DatabaseError(
+                            diesel::result::DatabaseErrorKind::SerializationFailure,
+                            _,
+                        )),
+                    ) => {
+                        retry_count += 1;
+                        if retry_count > MAX_RETRIES {
+                            tracing::error!(
+                                "Exceeded max retries on SerializationFailure errors: {err:?}"
+                            );
+                            break Err(err);
+                        } else {
+                            // Retry on serialization errors: these are expected, because even though our
+                            // transactions don't fight for the same rows, they will occasionally collide
+                            // on index pages (e.g. increment_generation for unrelated shards can collide)
+                            tracing::debug!(
+                                "Retrying transaction on serialization failure {err:?}"
+                            );
+                            continue;
+                        }
+                    }
+                    Err(e) => break Err(e),
+                }
+            }
+        })
+        .await
+        .expect("Task panic")
+    }
+
+    /// When a node is first registered, persist it before using it for anything
+    pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
+        let np = node.to_persistent();
+        self.with_measured_conn(
+            DatabaseOperation::InsertNode,
+            move |conn| -> DatabaseResult<()> {
+                diesel::insert_into(crate::schema::nodes::table)
+                    .values(&np)
+                    .execute(conn)?;
+                Ok(())
+            },
+        )
+        .await
+    }
+
+    /// At startup, populate the list of nodes which our shards may be placed on
+    pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
+        let nodes: Vec<NodePersistence> = self
+            .with_measured_conn(
+                DatabaseOperation::ListNodes,
+                move |conn| -> DatabaseResult<_> {
+                    Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
+                },
+            )
+            .await?;
+
+        tracing::info!("list_nodes: loaded {} nodes", nodes.len());
+
+        Ok(nodes)
+    }
+
+    pub(crate) async fn update_node(
+        &self,
+        input_node_id: NodeId,
+        input_scheduling: NodeSchedulingPolicy,
+    ) -> DatabaseResult<()> {
+        use crate::schema::nodes::dsl::*;
+        let updated = self
+            .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| {
+                let updated = diesel::update(nodes)
+                    .filter(node_id.eq(input_node_id.0 as i64))
+                    .set((scheduling_policy.eq(String::from(input_scheduling)),))
+                    .execute(conn)?;
+                Ok(updated)
+            })
+            .await?;
+
+        if updated != 1 {
+            Err(DatabaseError::Logical(format!(
+                "Node {node_id:?} not found for update",
+            )))
+        } else {
+            Ok(())
+        }
+    }
+
+    /// At startup, load the high level state for shards, such as their config + policy. This will
+    /// be enriched at runtime with state discovered on pageservers.
+    pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
+        self.with_measured_conn(
+            DatabaseOperation::ListTenantShards,
+            move |conn| -> DatabaseResult<_> {
+                Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
+            },
+        )
+        .await
+    }
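Every query in this file follows the `with_measured_conn` pattern: the closure runs inside a serializable transaction on a blocking thread, and the operation label drives the latency and error metrics. To show the shape, a hypothetical extra method on `Persistence` (not in the diff, reusing an existing metrics label):

```rust
// Hypothetical example: count registered nodes using the same
// measured-connection plumbing as the real queries in this file.
pub(crate) async fn count_nodes(&self) -> DatabaseResult<i64> {
    self.with_measured_conn(
        DatabaseOperation::ListNodes, // reusing an existing label for the sketch
        move |conn| -> DatabaseResult<_> {
            Ok(crate::schema::nodes::table.count().get_result(conn)?)
        },
    )
    .await
}
```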
+    /// Tenants must be persisted before we schedule them for the first time. This enables us
+    /// to correctly retain generation monotonicity, and the externally provided placement policy & config.
+    pub(crate) async fn insert_tenant_shards(
+        &self,
+        shards: Vec<TenantShardPersistence>,
+    ) -> DatabaseResult<()> {
+        use crate::schema::metadata_health;
+        use crate::schema::tenant_shards;
+
+        let now = chrono::Utc::now();
+
+        let metadata_health_records = shards
+            .iter()
+            .map(|t| MetadataHealthPersistence {
+                tenant_id: t.tenant_id.clone(),
+                shard_number: t.shard_number,
+                shard_count: t.shard_count,
+                healthy: true,
+                last_scrubbed_at: now,
+            })
+            .collect::<Vec<_>>();
+
+        self.with_measured_conn(
+            DatabaseOperation::InsertTenantShards,
+            move |conn| -> DatabaseResult<()> {
+                diesel::insert_into(tenant_shards::table)
+                    .values(&shards)
+                    .execute(conn)?;
+
+                diesel::insert_into(metadata_health::table)
+                    .values(&metadata_health_records)
+                    .execute(conn)?;
+                Ok(())
+            },
+        )
+        .await
+    }
+
+    /// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for
+    /// the tenant from memory on this server.
+    pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_measured_conn(
+            DatabaseOperation::DeleteTenant,
+            move |conn| -> DatabaseResult<()> {
+                // `metadata_health` status (if exists) is also deleted based on the cascade behavior.
+                diesel::delete(tenant_shards)
+                    .filter(tenant_id.eq(del_tenant_id.to_string()))
+                    .execute(conn)?;
+                Ok(())
+            },
+        )
+        .await
+    }
+
+    pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
+        use crate::schema::nodes::dsl::*;
+        self.with_measured_conn(
+            DatabaseOperation::DeleteNode,
+            move |conn| -> DatabaseResult<()> {
+                diesel::delete(nodes)
+                    .filter(node_id.eq(del_node_id.0 as i64))
+                    .execute(conn)?;
+
+                Ok(())
+            },
+        )
+        .await
+    }
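The re-attach path below is the bulk hot path at pageserver restart; a hypothetical call-site sketch (not in the diff) of what it returns:

```rust
// Hypothetical handler sketch: a pageserver restarted and calls /re-attach.
// One batched UPDATE bumps the generation of every shard it was attached to,
// fencing off any writes from its previous incarnation.
async fn handle_re_attach_request(
    persistence: &Persistence,
    node_id: NodeId,
) -> DatabaseResult<()> {
    let generations = persistence.re_attach(node_id).await?;
    for (tenant_shard_id, generation) in generations {
        tracing::info!("{tenant_shard_id}: re-attached in {generation:?}");
    }
    Ok(())
}
```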
+    /// When a tenant invokes the /re-attach API, this function is responsible for doing an efficient
+    /// batched increment of the generations of all tenants whose generation_pageserver is equal to
+    /// the node that called /re-attach.
+    #[tracing::instrument(skip_all, fields(node_id))]
+    pub(crate) async fn re_attach(
+        &self,
+        input_node_id: NodeId,
+    ) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
+        use crate::schema::nodes::dsl::scheduling_policy;
+        use crate::schema::nodes::dsl::*;
+        use crate::schema::tenant_shards::dsl::*;
+        let updated = self
+            .with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
+                let rows_updated = diesel::update(tenant_shards)
+                    .filter(generation_pageserver.eq(input_node_id.0 as i64))
+                    .set(generation.eq(generation + 1))
+                    .execute(conn)?;
+
+                tracing::info!("Incremented {} tenants' generations", rows_updated);
+
+                // TODO: UPDATE+SELECT in one query
+
+                let updated = tenant_shards
+                    .filter(generation_pageserver.eq(input_node_id.0 as i64))
+                    .select(TenantShardPersistence::as_select())
+                    .load(conn)?;
+
+                // If the node went through a drain and restart phase before re-attaching,
+                // then reset its node scheduling policy to active.
+                diesel::update(nodes)
+                    .filter(node_id.eq(input_node_id.0 as i64))
+                    .filter(
+                        scheduling_policy
+                            .eq(String::from(NodeSchedulingPolicy::PauseForRestart))
+                            .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Draining)))
+                            .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Filling))),
+                    )
+                    .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active)))
+                    .execute(conn)?;
+
+                Ok(updated)
+            })
+            .await?;
+
+        let mut result = HashMap::new();
+        for tsp in updated {
+            let tenant_shard_id = TenantShardId {
+                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())
+                    .map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?,
+                shard_number: ShardNumber(tsp.shard_number as u8),
+                shard_count: ShardCount::new(tsp.shard_count as u8),
+            };
+
+            let Some(g) = tsp.generation else {
+                // If the generation_pageserver column was non-NULL, then the generation column should also be non-NULL:
+                // we only set generation_pageserver when setting generation.
+                return Err(DatabaseError::Logical(
+                    "Generation should always be set after incrementing".to_string(),
+                ));
+            };
+            result.insert(tenant_shard_id, Generation::new(g as u32));
+        }
+
+        Ok(result)
+    }
+
+    /// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically
+    /// advancing generation number. We also store the NodeId for which the generation was issued, so that in
+    /// [`Self::re_attach`] we can do a bulk UPDATE on the generations for that node.
+    pub(crate) async fn increment_generation(
+        &self,
+        tenant_shard_id: TenantShardId,
+        node_id: NodeId,
+    ) -> anyhow::Result<Generation> {
+        use crate::schema::tenant_shards::dsl::*;
+        let updated = self
+            .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| {
+                let updated = diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
+                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
+                    .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
+                    .set((
+                        generation.eq(generation + 1),
+                        generation_pageserver.eq(node_id.0 as i64),
+                    ))
+                    // TODO: only returning() the generation column
+                    .returning(TenantShardPersistence::as_returning())
+                    .get_result(conn)?;
+
+                Ok(updated)
+            })
+            .await?;
+
+        // Generation is always non-null in the result: if the generation column had been NULL, then we
+        // should have experienced an SQL Conflict error while executing a query that tries to increment it.
+        debug_assert!(updated.generation.is_some());
+        let Some(g) = updated.generation else {
+            return Err(DatabaseError::Logical(
+                "Generation should always be set after incrementing".to_string(),
+            )
+            .into());
+        };
+
+        Ok(Generation::new(g as u32))
+    }
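To make the generation-fencing discipline concrete, a hypothetical reconciler-side sketch (not in the diff):

```rust
// Hypothetical: acquire a fresh generation immediately before attaching a
// shard to `node`, so the pageserver can fence out any stale attachment.
async fn prepare_attach(
    persistence: &Persistence,
    tenant_shard_id: TenantShardId,
    node: &Node,
) -> anyhow::Result<Generation> {
    let generation = persistence
        .increment_generation(tenant_shard_id, node.get_id())
        .await?;
    // The caller now sends a location_config carrying this generation; any
    // pageserver still holding an older generation will be rejected.
    Ok(generation)
}
```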
+    /// When we want to call out to the running shards for a tenant, e.g. during timeline CRUD operations,
+    /// we need to know where the shard is attached, _and_ the generation, so that we can re-check the generation
+    /// afterwards to confirm that our timeline CRUD operation is truly persistent (it must have happened in the
+    /// latest generation).
+    ///
+    /// If the tenant doesn't exist, an empty vector is returned.
+    ///
+    /// Output is sorted by shard number
+    pub(crate) async fn tenant_generations(
+        &self,
+        filter_tenant_id: TenantId,
+    ) -> Result<Vec<ShardGenerationState>, DatabaseError> {
+        use crate::schema::tenant_shards::dsl::*;
+        let rows = self
+            .with_measured_conn(DatabaseOperation::TenantGenerations, move |conn| {
+                let result = tenant_shards
+                    .filter(tenant_id.eq(filter_tenant_id.to_string()))
+                    .select(TenantShardPersistence::as_select())
+                    .order(shard_number)
+                    .load(conn)?;
+                Ok(result)
+            })
+            .await?;
+
+        Ok(rows
+            .into_iter()
+            .map(|p| ShardGenerationState {
+                tenant_shard_id: p
+                    .get_tenant_shard_id()
+                    .expect("Corrupt tenant shard id in database"),
+                generation: p.generation.map(|g| Generation::new(g as u32)),
+                generation_pageserver: p.generation_pageserver.map(|n| NodeId(n as u64)),
+            })
+            .collect())
+    }
+
+    /// Read the generation number of specific tenant shards
+    ///
+    /// Output is unsorted. Output may not include values for all inputs, if they are missing in the database.
+    pub(crate) async fn shard_generations(
+        &self,
+        mut tenant_shard_ids: impl Iterator<Item = TenantShardId>,
+    ) -> Result<Vec<(TenantShardId, Option<Generation>)>, DatabaseError> {
+        let mut rows = Vec::with_capacity(tenant_shard_ids.size_hint().0);
+
+        // We will chunk our input to avoid composing arbitrarily long `IN` clauses. Typically we are
+        // called with a single digit number of IDs, but in principle we could be called with tens
+        // of thousands (all the shards on one pageserver) from the generation validation API.
+        loop {
+            // A modest hardcoded chunk size to handle typical cases in a single query but never generate particularly
+            // large query strings.
+            let chunk_ids = tenant_shard_ids.by_ref().take(32);
+
+            // Compose a comma separated list of tuples for matching on (tenant_id, shard_number, shard_count)
+            let in_clause = chunk_ids
+                .map(|tsid| {
+                    format!(
+                        "('{}', {}, {})",
+                        tsid.tenant_id, tsid.shard_number.0, tsid.shard_count.0
+                    )
+                })
+                .join(",");
+
+            // We are done when our iterator gives us nothing to filter on
+            if in_clause.is_empty() {
+                break;
+            }
+
+            let chunk_rows = self
+                .with_measured_conn(DatabaseOperation::ShardGenerations, move |conn| {
+                    // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because
+                    // the inputs are strongly typed and cannot carry any user-supplied raw string content.
+                    let result: Vec<TenantShardPersistence> = diesel::sql_query(
+                        format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str()
+                    ).load(conn)?;
+
+                    Ok(result)
+                })
+                .await?;
+            rows.extend(chunk_rows.into_iter())
+        }
+
+        Ok(rows
+            .into_iter()
+            .map(|tsp| {
+                (
+                    tsp.get_tenant_shard_id()
+                        .expect("Bad tenant ID in database"),
+                    tsp.generation.map(|g| Generation::new(g as u32)),
+                )
+            })
+            .collect())
+    }
+
+    #[allow(non_local_definitions)]
+    /// For use when updating a persistent property of a tenant, such as its config or placement_policy.
+    ///
+    /// Do not use this for setting generation, unless in the special onboarding code path (/location_config)
+    /// API: use [`Self::increment_generation`] instead. Setting the generation via this route is a one-time thing
+    /// that we only do the first time a tenant is set to an attached policy via /location_config.
+ pub(crate) async fn update_tenant_shard( + &self, + tenant: TenantFilter, + input_placement_policy: Option, + input_config: Option, + input_generation: Option, + input_scheduling_policy: Option, + ) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + + self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { + let query = match tenant { + TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .into_boxed(), + TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(input_tenant_id.to_string())) + .into_boxed(), + }; + + #[derive(AsChangeset)] + #[diesel(table_name = crate::schema::tenant_shards)] + struct ShardUpdate { + generation: Option, + placement_policy: Option, + config: Option, + scheduling_policy: Option, + } + + let update = ShardUpdate { + generation: input_generation.map(|g| g.into().unwrap() as i32), + placement_policy: input_placement_policy + .as_ref() + .map(|p| serde_json::to_string(&p).unwrap()), + config: input_config + .as_ref() + .map(|c| serde_json::to_string(&c).unwrap()), + scheduling_policy: input_scheduling_policy + .map(|p| serde_json::to_string(&p).unwrap()), + }; + + query.set(update).execute(conn)?; + + Ok(()) + }) + .await?; + + Ok(()) + } + + pub(crate) async fn set_tenant_shard_preferred_azs( + &self, + preferred_azs: Vec<(TenantShardId, String)>, + ) -> DatabaseResult> { + use crate::schema::tenant_shards::dsl::*; + + self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { + let mut shards_updated = Vec::default(); + + for (tenant_shard_id, preferred_az) in preferred_azs.iter() { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(preferred_az_id.eq(preferred_az)) + .execute(conn)?; + + if updated == 1 { + shards_updated.push((*tenant_shard_id, preferred_az.clone())); + } + } + + Ok(shards_updated) + }) + .await + } + + pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { + use crate::schema::tenant_shards::dsl::*; + self.with_measured_conn(DatabaseOperation::Detach, move |conn| { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation_pageserver.eq(Option::::None), + placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), + )) + .execute(conn)?; + + Ok(updated) + }) + .await?; + + Ok(()) + } + + // When we start shard splitting, we must durably mark the tenant so that + // on restart, we know that we must go through recovery. + // + // We create the child shards here, so that they will be available for increment_generation calls + // if some pageserver holding a child shard needs to restart before the overall tenant split is complete. 
+ pub(crate) async fn begin_shard_split( + &self, + old_shard_count: ShardCount, + split_tenant_id: TenantId, + parent_to_children: Vec<(TenantShardId, Vec)>, + ) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> { + // Mark parent shards as splitting + + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(old_shard_count.literal() as i32)) + .set((splitting.eq(1),)) + .execute(conn)?; + if u8::try_from(updated) + .map_err(|_| DatabaseError::Logical( + format!("Overflow existing shard count {} while splitting", updated)) + )? != old_shard_count.count() { + // Perhaps a deletion or another split raced with this attempt to split, mutating + // the parent shards that we intend to split. In this case the split request should fail. + return Err(DatabaseError::Logical( + format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count()) + )); + } + + // FIXME: spurious clone to sidestep closure move rules + let parent_to_children = parent_to_children.clone(); + + // Insert child shards + for (parent_shard_id, children) in parent_to_children { + let mut parent = crate::schema::tenant_shards::table + .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) + .load::(conn)?; + let parent = if parent.len() != 1 { + return Err(DatabaseError::Logical(format!( + "Parent shard {parent_shard_id} not found" + ))); + } else { + parent.pop().unwrap() + }; + for mut shard in children { + // Carry the parent's generation into the child + shard.generation = parent.generation; + + debug_assert!(shard.splitting == SplitState::Splitting); + diesel::insert_into(tenant_shards) + .values(shard) + .execute(conn)?; + } + } + + Ok(()) + }) + .await + } + + // When we finish shard splitting, we must atomically clean up the old shards + // and insert the new shards, and clear the splitting marker. + pub(crate) async fn complete_shard_split( + &self, + split_tenant_id: TenantId, + old_shard_count: ShardCount, + ) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + self.with_measured_conn( + DatabaseOperation::CompleteShardSplit, + move |conn| -> DatabaseResult<()> { + // Drop parent shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(old_shard_count.literal() as i32)) + .execute(conn)?; + + // Clear sharding flag + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .set((splitting.eq(0),)) + .execute(conn)?; + debug_assert!(updated > 0); + + Ok(()) + }, + ) + .await + } + + /// Used when the remote part of a shard split failed: we will revert the database state to have only + /// the parent shards, with SplitState::Idle. 
+    pub(crate) async fn abort_shard_split(
+        &self,
+        split_tenant_id: TenantId,
+        new_shard_count: ShardCount,
+    ) -> DatabaseResult<AbortShardSplitStatus> {
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_measured_conn(
+            DatabaseOperation::AbortShardSplit,
+            move |conn| -> DatabaseResult<AbortShardSplitStatus> {
+                // Clear the splitting state on parent shards
+                let updated = diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(shard_count.ne(new_shard_count.literal() as i32))
+                    .set((splitting.eq(0),))
+                    .execute(conn)?;
+
+                // Parent shards are already gone: we cannot abort.
+                if updated == 0 {
+                    return Ok(AbortShardSplitStatus::Complete);
+                }
+
+                // Sanity check: if parent shards were present, their cardinality should
+                // be less than the number of child shards.
+                if updated >= new_shard_count.count() as usize {
+                    return Err(DatabaseError::Logical(format!(
+                        "Unexpected parent shard count {updated} while aborting split to \
+                         count {new_shard_count:?} on tenant {split_tenant_id}"
+                    )));
+                }
+
+                // Erase child shards
+                diesel::delete(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(shard_count.eq(new_shard_count.literal() as i32))
+                    .execute(conn)?;
+
+                Ok(AbortShardSplitStatus::Aborted)
+            },
+        )
+        .await
+    }
+
+    /// Stores all the latest metadata health updates durably. Updates existing entry on conflict.
+    ///
+    /// **Correctness:** `metadata_health_updates` should all belong to the tenant shards managed by the storage controller.
+    #[allow(dead_code)]
+    pub(crate) async fn update_metadata_health_records(
+        &self,
+        healthy_records: Vec<MetadataHealthPersistence>,
+        unhealthy_records: Vec<MetadataHealthPersistence>,
+        now: chrono::DateTime<chrono::Utc>,
+    ) -> DatabaseResult<()> {
+        use crate::schema::metadata_health::dsl::*;
+
+        self.with_measured_conn(
+            DatabaseOperation::UpdateMetadataHealth,
+            move |conn| -> DatabaseResult<_> {
+                diesel::insert_into(metadata_health)
+                    .values(&healthy_records)
+                    .on_conflict((tenant_id, shard_number, shard_count))
+                    .do_update()
+                    .set((healthy.eq(true), last_scrubbed_at.eq(now)))
+                    .execute(conn)?;
+
+                diesel::insert_into(metadata_health)
+                    .values(&unhealthy_records)
+                    .on_conflict((tenant_id, shard_number, shard_count))
+                    .do_update()
+                    .set((healthy.eq(false), last_scrubbed_at.eq(now)))
+                    .execute(conn)?;
+                Ok(())
+            },
+        )
+        .await
+    }
+
+    /// Lists all the metadata health records.
+    #[allow(dead_code)]
+    pub(crate) async fn list_metadata_health_records(
+        &self,
+    ) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
+        self.with_measured_conn(
+            DatabaseOperation::ListMetadataHealth,
+            move |conn| -> DatabaseResult<_> {
+                Ok(
+                    crate::schema::metadata_health::table
+                        .load::<MetadataHealthPersistence>(conn)?,
+                )
+            },
+        )
+        .await
+    }
+
+    /// Lists all the metadata health records that are unhealthy.
+    #[allow(dead_code)]
+    pub(crate) async fn list_unhealthy_metadata_health_records(
+        &self,
+    ) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
+        use crate::schema::metadata_health::dsl::*;
+        self.with_measured_conn(
+            DatabaseOperation::ListMetadataHealthUnhealthy,
+            move |conn| -> DatabaseResult<_> {
+                Ok(crate::schema::metadata_health::table
+                    .filter(healthy.eq(false))
+                    .load::<MetadataHealthPersistence>(conn)?)
+            },
+        )
+        .await
+    }
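The `get_leader`/`update_leader` pair below implements a compare-exchange handshake; a hypothetical startup sketch (not in the diff) of how a new controller would use them:

```rust
// Hypothetical: claim leadership with compare-exchange semantics. If another
// controller won the race, update_leader matches zero rows and returns a
// logical error instead of silently overwriting the winner's entry.
async fn try_become_leader(
    persistence: &Persistence,
    me: ControllerPersistence,
) -> DatabaseResult<()> {
    let prev = persistence.get_leader().await?;
    persistence.update_leader(prev, me).await
}
```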
+    /// Lists all the metadata health records that have not been updated since an `earlier` time.
+    #[allow(dead_code)]
+    pub(crate) async fn list_outdated_metadata_health_records(
+        &self,
+        earlier: chrono::DateTime<chrono::Utc>,
+    ) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
+        use crate::schema::metadata_health::dsl::*;
+
+        self.with_measured_conn(
+            DatabaseOperation::ListMetadataHealthOutdated,
+            move |conn| -> DatabaseResult<_> {
+                let query = metadata_health.filter(last_scrubbed_at.lt(earlier));
+                let res = query.load::<MetadataHealthPersistence>(conn)?;
+
+                Ok(res)
+            },
+        )
+        .await
+    }
+
+    /// Get the current entry from the `leader` table if one exists.
+    /// It is an error for the table to contain more than one entry.
+    pub(crate) async fn get_leader(&self) -> DatabaseResult<Option<ControllerPersistence>> {
+        let mut leader: Vec<ControllerPersistence> = self
+            .with_measured_conn(
+                DatabaseOperation::GetLeader,
+                move |conn| -> DatabaseResult<_> {
+                    Ok(crate::schema::controllers::table.load::<ControllerPersistence>(conn)?)
+                },
+            )
+            .await?;
+
+        if leader.len() > 1 {
+            return Err(DatabaseError::Logical(format!(
+                "More than one entry present in the leader table: {leader:?}"
+            )));
+        }
+
+        Ok(leader.pop())
+    }
+
+    /// Update the new leader with compare-exchange semantics. If `prev` does not
+    /// match the current leader entry, then the update is treated as a failure.
+    /// When `prev` is not specified, the update is forced.
+    pub(crate) async fn update_leader(
+        &self,
+        prev: Option<ControllerPersistence>,
+        new: ControllerPersistence,
+    ) -> DatabaseResult<()> {
+        use crate::schema::controllers::dsl::*;
+
+        let updated = self
+            .with_measured_conn(
+                DatabaseOperation::UpdateLeader,
+                move |conn| -> DatabaseResult<usize> {
+                    let updated = match &prev {
+                        Some(prev) => diesel::update(controllers)
+                            .filter(address.eq(prev.address.clone()))
+                            .filter(started_at.eq(prev.started_at))
+                            .set((
+                                address.eq(new.address.clone()),
+                                started_at.eq(new.started_at),
+                            ))
+                            .execute(conn)?,
+                        None => diesel::insert_into(controllers)
+                            .values(new.clone())
+                            .execute(conn)?,
+                    };
+
+                    Ok(updated)
+                },
+            )
+            .await?;
+
+        if updated == 0 {
+            return Err(DatabaseError::Logical(
+                "Leader table update failed".to_string(),
+            ));
+        }
+
+        Ok(())
+    }
+
+    pub(crate) async fn safekeeper_get(
+        &self,
+        id: i64,
+    ) -> Result<SafekeeperPersistence, DatabaseError> {
+        use crate::schema::safekeepers::dsl::{id as id_column, safekeepers};
+        self.with_conn(move |conn| -> DatabaseResult<SafekeeperPersistence> {
+            Ok(safekeepers
+                .filter(id_column.eq(&id))
+                .select(SafekeeperPersistence::as_select())
+                .get_result(conn)?)
+        })
+        .await
+    }
+
+    pub(crate) async fn safekeeper_upsert(
+        &self,
+        record: SafekeeperPersistence,
+    ) -> Result<(), DatabaseError> {
+        use crate::schema::safekeepers::dsl::*;
+
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            let bind = record.as_insert_or_update();
+
+            let inserted_updated = diesel::insert_into(safekeepers)
+                .values(&bind)
+                .on_conflict(id)
+                .do_update()
+                .set(&bind)
+                .execute(conn)?;
+
+            if inserted_updated != 1 {
+                return Err(DatabaseError::Logical(format!(
+                    "unexpected number of rows ({})",
+                    inserted_updated
+                )));
+            }
+
+            Ok(())
+        })
+        .await
+    }
+}
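These rows double as JSON-compat objects (see `JsonPersistence` above); a hypothetical round-trip sketch showing why the `#[serde(default)]` and `#[serde(rename)]` annotations on the row type below matter:

```rust
// Hypothetical: deserialize a minimal legacy JSON object into a row. Fields
// marked #[serde(default)] may be omitted entirely, and "pageserver" maps to
// generation_pageserver via the serde rename.
fn parse_legacy_shard(json: &str) -> serde_json::Result<TenantShardPersistence> {
    serde_json::from_str(json)
}

// e.g. parse_legacy_shard(r#"{"tenant_id":"...","generation":1,"pageserver":42}"#)
```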
+/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
+#[derive(
+    QueryableByName, Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq,
+)]
+#[diesel(table_name = crate::schema::tenant_shards)]
+pub(crate) struct TenantShardPersistence {
+    #[serde(default)]
+    pub(crate) tenant_id: String,
+    #[serde(default)]
+    pub(crate) shard_number: i32,
+    #[serde(default)]
+    pub(crate) shard_count: i32,
+    #[serde(default)]
+    pub(crate) shard_stripe_size: i32,
+
+    // Latest generation number: next time we attach, increment this
+    // and use the incremented number when attaching.
+    //
+    // Generation is only None when first onboarding a tenant, where it may
+    // be in PlacementPolicy::Secondary and therefore have no valid generation state.
+    pub(crate) generation: Option<i32>,
+
+    // Currently attached pageserver
+    #[serde(rename = "pageserver")]
+    pub(crate) generation_pageserver: Option<i64>,
+
+    #[serde(default)]
+    pub(crate) placement_policy: String,
+    #[serde(default)]
+    pub(crate) splitting: SplitState,
+    #[serde(default)]
+    pub(crate) config: String,
+    #[serde(default)]
+    pub(crate) scheduling_policy: String,
+
+    // Hint that we should attempt to schedule this tenant shard in the given
+    // availability zone in order to minimise the chances of cross-AZ communication
+    // with compute.
+    pub(crate) preferred_az_id: Option<String>,
+}
+
+impl TenantShardPersistence {
+    pub(crate) fn get_shard_identity(&self) -> Result<ShardIdentity, ShardConfigError> {
+        if self.shard_count == 0 {
+            Ok(ShardIdentity::unsharded())
+        } else {
+            Ok(ShardIdentity::new(
+                ShardNumber(self.shard_number as u8),
+                ShardCount::new(self.shard_count as u8),
+                ShardStripeSize(self.shard_stripe_size as u32),
+            )?)
+        }
+    }
+
+    pub(crate) fn get_tenant_shard_id(&self) -> anyhow::Result<TenantShardId> {
+        Ok(TenantShardId {
+            tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
+            shard_number: ShardNumber(self.shard_number as u8),
+            shard_count: ShardCount::new(self.shard_count as u8),
+        })
+    }
+}
+
+/// Parts of [`crate::node::Node`] that are stored durably
+#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)]
+#[diesel(table_name = crate::schema::nodes)]
+pub(crate) struct NodePersistence {
+    pub(crate) node_id: i64,
+    pub(crate) scheduling_policy: String,
+    pub(crate) listen_http_addr: String,
+    pub(crate) listen_http_port: i32,
+    pub(crate) listen_pg_addr: String,
+    pub(crate) listen_pg_port: i32,
+    pub(crate) availability_zone_id: String,
+}
+
+/// Tenant metadata health status that is stored durably.
+#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)] +#[diesel(table_name = crate::schema::metadata_health)] +pub(crate) struct MetadataHealthPersistence { + #[serde(default)] + pub(crate) tenant_id: String, + #[serde(default)] + pub(crate) shard_number: i32, + #[serde(default)] + pub(crate) shard_count: i32, + + pub(crate) healthy: bool, + pub(crate) last_scrubbed_at: chrono::DateTime, +} + +impl MetadataHealthPersistence { + pub fn new( + tenant_shard_id: TenantShardId, + healthy: bool, + last_scrubbed_at: chrono::DateTime, + ) -> Self { + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_number = tenant_shard_id.shard_number.0 as i32; + let shard_count = tenant_shard_id.shard_count.literal() as i32; + + MetadataHealthPersistence { + tenant_id, + shard_number, + shard_count, + healthy, + last_scrubbed_at, + } + } + + #[allow(dead_code)] + pub(crate) fn get_tenant_shard_id(&self) -> Result { + Ok(TenantShardId { + tenant_id: TenantId::from_str(self.tenant_id.as_str())?, + shard_number: ShardNumber(self.shard_number as u8), + shard_count: ShardCount::new(self.shard_count as u8), + }) + } +} + +impl From for MetadataHealthRecord { + fn from(value: MetadataHealthPersistence) -> Self { + MetadataHealthRecord { + tenant_shard_id: value + .get_tenant_shard_id() + .expect("stored tenant id should be valid"), + healthy: value.healthy, + last_scrubbed_at: value.last_scrubbed_at, + } + } +} + +#[derive( + Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq, Debug, Clone, +)] +#[diesel(table_name = crate::schema::controllers)] +pub(crate) struct ControllerPersistence { + pub(crate) address: String, + pub(crate) started_at: chrono::DateTime, +} + +#[derive(Serialize, Deserialize, Queryable, Selectable, Eq, PartialEq, Debug, Clone)] +#[diesel(table_name = crate::schema::safekeepers)] +pub(crate) struct SafekeeperPersistence { + pub(crate) id: i64, + pub(crate) region_id: String, + /// 1 is special, it means just created (not currently posted to storcon). + /// Zero or negative is not really expected. + /// Otherwise the number from `release-$(number_of_commits_on_branch)` tag. 
+    pub(crate) version: i64,
+    pub(crate) host: String,
+    pub(crate) port: i32,
+    pub(crate) active: bool,
+    pub(crate) http_port: i32,
+    pub(crate) availability_zone_id: String,
+}
+
+impl SafekeeperPersistence {
+    fn as_insert_or_update(&self) -> InsertUpdateSafekeeper<'_> {
+        InsertUpdateSafekeeper {
+            id: self.id,
+            region_id: &self.region_id,
+            version: self.version,
+            host: &self.host,
+            port: self.port,
+            active: self.active,
+            http_port: self.http_port,
+            availability_zone_id: &self.availability_zone_id,
+        }
+    }
+}
+
+#[derive(Insertable, AsChangeset)]
+#[diesel(table_name = crate::schema::safekeepers)]
+struct InsertUpdateSafekeeper<'a> {
+    id: i64,
+    region_id: &'a str,
+    version: i64,
+    host: &'a str,
+    port: i32,
+    active: bool,
+    http_port: i32,
+    availability_zone_id: &'a str,
+}
diff --git a/storage_controller/src/persistence/split_state.rs b/storage_controller/src/persistence/split_state.rs
new file mode 100644
index 0000000000..bce1a75843
--- /dev/null
+++ b/storage_controller/src/persistence/split_state.rs
@@ -0,0 +1,46 @@
+use diesel::pg::{Pg, PgValue};
+use diesel::{
+    deserialize::FromSql, deserialize::FromSqlRow, expression::AsExpression, serialize::ToSql,
+    sql_types::Int2,
+};
+use serde::{Deserialize, Serialize};
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, FromSqlRow, AsExpression)]
+#[diesel(sql_type = SplitStateSQLRepr)]
+#[derive(Deserialize, Serialize)]
+pub enum SplitState {
+    Idle = 0,
+    Splitting = 1,
+}
+
+impl Default for SplitState {
+    fn default() -> Self {
+        Self::Idle
+    }
+}
+
+type SplitStateSQLRepr = Int2;
+
+impl ToSql<SplitStateSQLRepr, Pg> for SplitState {
+    fn to_sql<'a>(
+        &'a self,
+        out: &mut diesel::serialize::Output<'a, '_, Pg>,
+    ) -> diesel::serialize::Result {
+        let raw_value: i16 = *self as i16;
+        let mut new_out = out.reborrow();
+        ToSql::<SplitStateSQLRepr, Pg>::to_sql(&raw_value, &mut new_out)
+    }
+}
+
+impl FromSql<SplitStateSQLRepr, Pg> for SplitState {
+    fn from_sql(pg_value: PgValue) -> diesel::deserialize::Result<Self> {
+        match FromSql::<SplitStateSQLRepr, Pg>::from_sql(pg_value).map(|v| match v {
+            0 => Some(Self::Idle),
+            1 => Some(Self::Splitting),
+            _ => None,
+        })?
{ + Some(v) => Ok(v), + None => Err(format!("Invalid SplitState value, was: {:?}", pg_value.as_bytes()).into()), + } + } +} diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs new file mode 100644 index 0000000000..83b7b2b4f2 --- /dev/null +++ b/storage_controller/src/reconciler.rs @@ -0,0 +1,912 @@ +use crate::pageserver_client::PageserverClient; +use crate::persistence::Persistence; +use crate::service; +use pageserver_api::controller_api::PlacementPolicy; +use pageserver_api::models::{ + LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, +}; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; +use pageserver_client::mgmt_api; +use reqwest::StatusCode; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{Duration, Instant}; +use tokio_util::sync::CancellationToken; +use utils::backoff::exponential_backoff; +use utils::failpoint_support; +use utils::generation::Generation; +use utils::id::{NodeId, TimelineId}; +use utils::lsn::Lsn; +use utils::pausable_failpoint; +use utils::sync::gate::GateGuard; + +use crate::compute_hook::{ComputeHook, NotifyError}; +use crate::node::Node; +use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation}; + +const DEFAULT_HEATMAP_PERIOD: &str = "60s"; + +/// Object with the lifetime of the background reconcile task that is created +/// for tenants which have a difference between their intent and observed states. +pub(super) struct Reconciler { + /// See [`crate::tenant_shard::TenantShard`] for the meanings of these fields: they are a snapshot + /// of a tenant's state from when we spawned a reconcile task. + pub(super) tenant_shard_id: TenantShardId, + pub(crate) shard: ShardIdentity, + pub(crate) placement_policy: PlacementPolicy, + pub(crate) generation: Option, + pub(crate) intent: TargetState, + + /// Nodes not referenced by [`Self::intent`], from which we should try + /// to detach this tenant shard. + pub(crate) detach: Vec, + + /// Configuration specific to this reconciler + pub(crate) reconciler_config: ReconcilerConfig, + + pub(crate) config: TenantConfig, + pub(crate) observed: ObservedState, + + pub(crate) service_config: service::Config, + + /// A hook to notify the running postgres instances when we change the location + /// of a tenant. Use this via [`Self::compute_notify`] to update our failure flag + /// and guarantee eventual retries. + pub(crate) compute_hook: Arc, + + /// To avoid stalling if the cloud control plane is unavailable, we may proceed + /// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed + /// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry. + pub(crate) compute_notify_failure: bool, + + /// Reconciler is responsible for keeping alive semaphore units that limit concurrency on how many + /// we will spawn. + pub(crate) _resource_units: ReconcileUnits, + + /// A means to abort background reconciliation: it is essential to + /// call this when something changes in the original TenantShard that + /// will make this reconciliation impossible or unnecessary, for + /// example when a pageserver node goes offline, or the PlacementPolicy for + /// the tenant is changed. + pub(crate) cancel: CancellationToken, + + /// Reconcilers are registered with a Gate so that during a graceful shutdown we + /// can wait for all the reconcilers to respond to their cancellation tokens. 
+    pub(crate) _gate_guard: GateGuard,
+
+    /// Access to persistent storage for updating generation numbers
+    pub(crate) persistence: Arc<Persistence>,
+}
+
+pub(crate) struct ReconcilerConfigBuilder {
+    config: ReconcilerConfig,
+}
+
+impl ReconcilerConfigBuilder {
+    pub(crate) fn new() -> Self {
+        Self {
+            config: ReconcilerConfig::default(),
+        }
+    }
+
+    pub(crate) fn secondary_warmup_timeout(self, value: Duration) -> Self {
+        Self {
+            config: ReconcilerConfig {
+                secondary_warmup_timeout: Some(value),
+                ..self.config
+            },
+        }
+    }
+
+    pub(crate) fn secondary_download_request_timeout(self, value: Duration) -> Self {
+        Self {
+            config: ReconcilerConfig {
+                secondary_download_request_timeout: Some(value),
+                ..self.config
+            },
+        }
+    }
+
+    pub(crate) fn build(self) -> ReconcilerConfig {
+        self.config
+    }
+}
+
+#[derive(Default, Debug, Copy, Clone)]
+pub(crate) struct ReconcilerConfig {
+    // During live migration give up on warming-up the secondary
+    // after this timeout.
+    secondary_warmup_timeout: Option<Duration>,
+
+    // During live migrations this is the amount of time that
+    // the pageserver will hold our poll.
+    secondary_download_request_timeout: Option<Duration>,
+}
+
+impl ReconcilerConfig {
+    pub(crate) fn get_secondary_warmup_timeout(&self) -> Duration {
+        const SECONDARY_WARMUP_TIMEOUT_DEFAULT: Duration = Duration::from_secs(300);
+        self.secondary_warmup_timeout
+            .unwrap_or(SECONDARY_WARMUP_TIMEOUT_DEFAULT)
+    }
+
+    pub(crate) fn get_secondary_download_request_timeout(&self) -> Duration {
+        const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT: Duration = Duration::from_secs(20);
+        self.secondary_download_request_timeout
+            .unwrap_or(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT)
+    }
+}
+
+/// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O
+pub(crate) struct ReconcileUnits {
+    _sem_units: tokio::sync::OwnedSemaphorePermit,
+}
+
+impl ReconcileUnits {
+    pub(crate) fn new(sem_units: tokio::sync::OwnedSemaphorePermit) -> Self {
+        Self {
+            _sem_units: sem_units,
+        }
+    }
+}
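A brief usage sketch for the builder above, with hypothetical values (not in the diff):

```rust
// Hypothetical: configure a reconciler for a drain, where we are willing to
// wait two minutes for secondary warm-up and let each long-poll download
// request run for 30 seconds.
fn example_config() -> ReconcilerConfig {
    ReconcilerConfigBuilder::new()
        .secondary_warmup_timeout(Duration::from_secs(120))
        .secondary_download_request_timeout(Duration::from_secs(30))
        .build()
}
```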
+#[derive(Debug)]
+pub(crate) struct TargetState {
+    pub(crate) attached: Option<Node>,
+    pub(crate) secondary: Vec<Node>,
+}
+
+impl TargetState {
+    pub(crate) fn from_intent(nodes: &HashMap<NodeId, Node>, intent: &IntentState) -> Self {
+        Self {
+            attached: intent.get_attached().map(|n| {
+                nodes
+                    .get(&n)
+                    .expect("Intent attached referenced non-existent node")
+                    .clone()
+            }),
+            secondary: intent
+                .get_secondary()
+                .iter()
+                .map(|n| {
+                    nodes
+                        .get(n)
+                        .expect("Intent secondary referenced non-existent node")
+                        .clone()
+                })
+                .collect(),
+        }
+    }
+}
+
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum ReconcileError {
+    #[error(transparent)]
+    Remote(#[from] mgmt_api::Error),
+    #[error(transparent)]
+    Notify(#[from] NotifyError),
+    #[error("Cancelled")]
+    Cancel,
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+impl Reconciler {
+    async fn location_config(
+        &mut self,
+        node: &Node,
+        config: LocationConfig,
+        flush_ms: Option<Duration>,
+        lazy: bool,
+    ) -> Result<(), ReconcileError> {
+        if !node.is_available() && config.mode == LocationConfigMode::Detached {
+            // Attempts to detach from offline nodes may be simulated without doing I/O: a node which is offline
+            // will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of
+            // what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`]
+            tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation");
+            self.observed.locations.remove(&node.get_id());
+            return Ok(());
+        }
+
+        self.observed
+            .locations
+            .insert(node.get_id(), ObservedStateLocation { conf: None });
+
+        // TODO: amend locations that use long-polling: they will hit this timeout.
+        let timeout = Duration::from_secs(25);
+
+        tracing::info!("location_config({node}) calling: {:?}", config);
+        let tenant_shard_id = self.tenant_shard_id;
+        let config_ref = &config;
+        match node
+            .with_client_retries(
+                |client| async move {
+                    let config = config_ref.clone();
+                    client
+                        .location_config(tenant_shard_id, config.clone(), flush_ms, lazy)
+                        .await
+                },
+                &self.service_config.jwt_token,
+                1,
+                3,
+                timeout,
+                &self.cancel,
+            )
+            .await
+        {
+            Some(Ok(_)) => {}
+            Some(Err(e)) => return Err(e.into()),
+            None => return Err(ReconcileError::Cancel),
+        };
+        tracing::info!("location_config({node}) complete: {:?}", config);
+
+        match config.mode {
+            LocationConfigMode::Detached => {
+                self.observed.locations.remove(&node.get_id());
+            }
+            _ => {
+                self.observed
+                    .locations
+                    .insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
+            }
+        }
+
+        Ok(())
+    }
+
+    fn get_node(&self, node_id: &NodeId) -> Option<&Node> {
+        if let Some(node) = self.intent.attached.as_ref() {
+            if node.get_id() == *node_id {
+                return Some(node);
+            }
+        }
+
+        if let Some(node) = self
+            .intent
+            .secondary
+            .iter()
+            .find(|n| n.get_id() == *node_id)
+        {
+            return Some(node);
+        }
+
+        if let Some(node) = self.detach.iter().find(|n| n.get_id() == *node_id) {
+            return Some(node);
+        }
+
+        None
+    }
+
+    async fn maybe_live_migrate(&mut self) -> Result<(), ReconcileError> {
+        let destination = if let Some(node) = &self.intent.attached {
+            match self.observed.locations.get(&node.get_id()) {
+                Some(conf) => {
+                    // We will do a live migration only if the intended destination is not
+                    // currently in an attached state.
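+                    // Concretely (mirroring the arms below): an observed Secondary
+                    // config makes this node a live-migration destination; an observed
+                    // attached or indeterminate (None) config means we fall back to
+                    // general-case reconciliation instead.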
+                    match &conf.conf {
+                        Some(conf) if conf.mode == LocationConfigMode::Secondary => {
+                            // Fall through to do a live migration
+                            node
+                        }
+                        None | Some(_) => {
+                            // Attached or uncertain: don't do a live migration, proceed
+                            // with a general-case reconciliation
+                            tracing::info!("maybe_live_migrate: destination is None or attached");
+                            return Ok(());
+                        }
+                    }
+                }
+                None => {
+                    // Our destination is not attached: maybe live migrate if some other
+                    // node is currently attached. Fall through.
+                    node
+                }
+            }
+        } else {
+            // No intent to be attached
+            tracing::info!("maybe_live_migrate: no attached intent");
+            return Ok(());
+        };
+
+        let mut origin = None;
+        for (node_id, state) in &self.observed.locations {
+            if let Some(observed_conf) = &state.conf {
+                if observed_conf.mode == LocationConfigMode::AttachedSingle {
+                    // We will only attempt live migration if the origin is not offline: this
+                    // avoids trying to do it while reconciling after responding to an HA failover.
+                    if let Some(node) = self.get_node(node_id) {
+                        if node.is_available() {
+                            origin = Some(node.clone());
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+
+        let Some(origin) = origin else {
+            tracing::info!("maybe_live_migrate: no origin found");
+            return Ok(());
+        };
+
+        // We have an origin and a destination: proceed to do the live migration
+        tracing::info!("Live migrating {}->{}", origin, destination);
+        self.live_migrate(origin, destination.clone()).await?;
+
+        Ok(())
+    }
+
+    async fn get_lsns(
+        &self,
+        tenant_shard_id: TenantShardId,
+        node: &Node,
+    ) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
+        let client = PageserverClient::new(
+            node.get_id(),
+            node.base_url(),
+            self.service_config.jwt_token.as_deref(),
+        );
+
+        let timelines = client.timeline_list(&tenant_shard_id).await?;
+        Ok(timelines
+            .into_iter()
+            .map(|t| (t.timeline_id, t.last_record_lsn))
+            .collect())
+    }
+
+    async fn secondary_download(
+        &self,
+        tenant_shard_id: TenantShardId,
+        node: &Node,
+    ) -> Result<(), ReconcileError> {
+        // This is not the timeout for a request, but the total amount of time we're willing to wait
+        // for a secondary location to get up to date before we give up and proceed with the
+        // migration anyway.
+        let total_download_timeout = self.reconciler_config.get_secondary_warmup_timeout();
+
+        // This is the long-polling interval for the secondary download requests we send to the
+        // destination pageserver during a migration.
+        let request_download_timeout = self
+            .reconciler_config
+            .get_secondary_download_request_timeout();
+
+        let started_at = Instant::now();
+
+        loop {
+            let (status, progress) = match node
+                .with_client_retries(
+                    |client| async move {
+                        client
+                            .tenant_secondary_download(
+                                tenant_shard_id,
+                                Some(request_download_timeout),
+                            )
+                            .await
+                    },
+                    &self.service_config.jwt_token,
+                    1,
+                    3,
+                    request_download_timeout * 2,
+                    &self.cancel,
+                )
+                .await
+            {
+                None => Err(ReconcileError::Cancel),
+                Some(Ok(v)) => Ok(v),
+                Some(Err(e)) => {
+                    // Give up, but proceed: it's unfortunate if we couldn't freshen the destination before
+                    // attaching, but we should not let an issue with a secondary location stop us proceeding
+                    // with a live migration.
+ tracing::warn!("Failed to prepare by downloading layers on node {node}: {e})"); + return Ok(()); + } + }?; + + if status == StatusCode::OK { + tracing::info!( + "Downloads to {} complete: {}/{} layers, {}/{} bytes", + node, + progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); + return Ok(()); + } else if status == StatusCode::ACCEPTED { + let total_runtime = started_at.elapsed(); + if total_runtime > total_download_timeout { + tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes", + total_runtime.as_millis(), + progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); + // Give up, but proceed: an incompletely warmed destination doesn't prevent migration working, + // it just makes the I/O performance for users less good. + return Ok(()); + } + + // Log and proceed around the loop to retry. We don't sleep between requests, because our HTTP call + // to the pageserver is a long-poll. + tracing::info!( + "Downloads to {} not yet complete: {}/{} layers, {}/{} bytes", + node, + progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); + } + } + } + + async fn await_lsn( + &self, + tenant_shard_id: TenantShardId, + node: &Node, + baseline: HashMap, + ) -> anyhow::Result<()> { + loop { + let latest = match self.get_lsns(tenant_shard_id, node).await { + Ok(l) => l, + Err(e) => { + tracing::info!("🕑 Can't get LSNs on node {node} yet, waiting ({e})",); + std::thread::sleep(Duration::from_millis(500)); + continue; + } + }; + + let mut any_behind: bool = false; + for (timeline_id, baseline_lsn) in &baseline { + match latest.get(timeline_id) { + Some(latest_lsn) => { + tracing::info!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}"); + if latest_lsn < baseline_lsn { + any_behind = true; + } + } + None => { + // Expected timeline isn't yet visible on migration destination. + // (IRL we would have to account for timeline deletion, but this + // is just test helper) + any_behind = true; + } + } + } + + if !any_behind { + tracing::info!("✅ LSN caught up. 
Proceeding..."); + break; + } else { + std::thread::sleep(Duration::from_millis(500)); + } + } + + Ok(()) + } + + pub async fn live_migrate( + &mut self, + origin_ps: Node, + dest_ps: Node, + ) -> Result<(), ReconcileError> { + // `maybe_live_migrate` is responsibble for sanity of inputs + assert!(origin_ps.get_id() != dest_ps.get_id()); + + fn build_location_config( + shard: &ShardIdentity, + config: &TenantConfig, + mode: LocationConfigMode, + generation: Option, + secondary_conf: Option, + ) -> LocationConfig { + LocationConfig { + mode, + generation: generation.map(|g| g.into().unwrap()), + secondary_conf, + tenant_conf: config.clone(), + shard_number: shard.number.0, + shard_count: shard.count.literal(), + shard_stripe_size: shard.stripe_size.0, + } + } + + tracing::info!("🔁 Switching origin node {origin_ps} to stale mode",); + + // FIXME: it is incorrect to use self.generation here, we should use the generation + // from the ObservedState of the origin pageserver (it might be older than self.generation) + let stale_conf = build_location_config( + &self.shard, + &self.config, + LocationConfigMode::AttachedStale, + self.generation, + None, + ); + self.location_config(&origin_ps, stale_conf, Some(Duration::from_secs(10)), false) + .await?; + + let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps).await?); + + // If we are migrating to a destination that has a secondary location, warm it up first + if let Some(destination_conf) = self.observed.locations.get(&dest_ps.get_id()) { + if let Some(destination_conf) = &destination_conf.conf { + if destination_conf.mode == LocationConfigMode::Secondary { + tracing::info!("🔁 Downloading latest layers to destination node {dest_ps}",); + self.secondary_download(self.tenant_shard_id, &dest_ps) + .await?; + } + } + } + + // Increment generation before attaching to new pageserver + self.generation = Some( + self.persistence + .increment_generation(self.tenant_shard_id, dest_ps.get_id()) + .await?, + ); + + let dest_conf = build_location_config( + &self.shard, + &self.config, + LocationConfigMode::AttachedMulti, + self.generation, + None, + ); + + tracing::info!("🔁 Attaching to pageserver {dest_ps}"); + self.location_config(&dest_ps, dest_conf, None, false) + .await?; + + if let Some(baseline) = baseline_lsns { + tracing::info!("🕑 Waiting for LSN to catch up..."); + self.await_lsn(self.tenant_shard_id, &dest_ps, baseline) + .await?; + } + + tracing::info!("🔁 Notifying compute to use pageserver {dest_ps}"); + + // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach + // the origin without notifying compute, we will render the tenant unavailable. + let mut notify_attempts = 0; + while let Err(e) = self.compute_notify().await { + match e { + NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)), + NotifyError::ShuttingDown => return Err(ReconcileError::Cancel), + _ => { + tracing::warn!( + "Live migration blocked by compute notification error, retrying: {e}" + ); + } + } + + exponential_backoff( + notify_attempts, + // Generous waits: control plane operations which might be blocking us usually complete on the order + // of hundreds to thousands of milliseconds, so no point busy polling. + 1.0, + 10.0, + &self.cancel, + ) + .await; + notify_attempts += 1; + } + + pausable_failpoint!("reconciler-live-migrate-post-notify"); + + // Downgrade the origin to secondary. 
If the tenant's policy is PlacementPolicy::Attached(0), then + // this location will be deleted in the general case reconciliation that runs after this. + let origin_secondary_conf = build_location_config( + &self.shard, + &self.config, + LocationConfigMode::Secondary, + None, + Some(LocationConfigSecondary { warm: true }), + ); + self.location_config(&origin_ps, origin_secondary_conf.clone(), None, false) + .await?; + // TODO: we should also be setting the ObservedState on earlier API calls, in case we fail + // partway through. In fact, all location conf API calls should be in a wrapper that sets + // the observed state to None, then runs, then sets it to what we wrote. + self.observed.locations.insert( + origin_ps.get_id(), + ObservedStateLocation { + conf: Some(origin_secondary_conf), + }, + ); + + tracing::info!("🔁 Switching to AttachedSingle mode on node {dest_ps}",); + let dest_final_conf = build_location_config( + &self.shard, + &self.config, + LocationConfigMode::AttachedSingle, + self.generation, + None, + ); + self.location_config(&dest_ps, dest_final_conf.clone(), None, false) + .await?; + self.observed.locations.insert( + dest_ps.get_id(), + ObservedStateLocation { + conf: Some(dest_final_conf), + }, + ); + + tracing::info!("✅ Migration complete"); + + Ok(()) + } + + async fn maybe_refresh_observed(&mut self) -> Result<(), ReconcileError> { + // If the attached node has uncertain state, read it from the pageserver before proceeding: this + // is important to avoid spurious generation increments. + // + // We don't need to do this for secondary/detach locations because it's harmless to just PUT their + // location conf, whereas for attached locations it can interrupt clients if we spuriously destroy/recreate + // the `Timeline` object in the pageserver. + + let Some(attached_node) = self.intent.attached.as_ref() else { + // Nothing to do + return Ok(()); + }; + + if matches!( + self.observed.locations.get(&attached_node.get_id()), + Some(ObservedStateLocation { conf: None }) + ) { + let tenant_shard_id = self.tenant_shard_id; + let observed_conf = match attached_node + .with_client_retries( + |client| async move { client.get_location_config(tenant_shard_id).await }, + &self.service_config.jwt_token, + 1, + 1, + Duration::from_secs(5), + &self.cancel, + ) + .await + { + Some(Ok(observed)) => Some(observed), + Some(Err(mgmt_api::Error::ApiError(status, _msg))) + if status == StatusCode::NOT_FOUND => + { + None + } + Some(Err(e)) => return Err(e.into()), + None => return Err(ReconcileError::Cancel), + }; + tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}"); + match observed_conf { + Some(conf) => { + // Pageserver returned a state: update it in observed. This may still be an indeterminate (None) state, + // if internally the pageserver's TenantSlot was being mutated (e.g. some long running API call is still running) + self.observed + .locations + .insert(attached_node.get_id(), ObservedStateLocation { conf }); + } + None => { + // Pageserver returned 404: we have confirmation that there is no state for this shard on that pageserver. + self.observed.locations.remove(&attached_node.get_id()); + } + } + } + + Ok(()) + } + + /// Reconciling a tenant makes API calls to pageservers until the observed state + /// matches the intended state. + /// + /// First we apply special case handling (e.g. 
for live migrations), and then a
+    /// general case reconciliation where we walk through the intent by pageserver
+    /// and call out to the pageserver to apply the desired state.
+    pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> {
+        // Prepare: if we have uncertain `observed` state for our would-be attachment location, then refresh it
+        self.maybe_refresh_observed().await?;
+
+        // Special case: live migration
+        self.maybe_live_migrate().await?;
+
+        // If the tenant is not attached to its intended pageserver, attach it now.
+        if let Some(node) = self.intent.attached.as_ref() {
+            // If we are in an attached policy, then generation must have been set (null generations
+            // are only present when a tenant is initially loaded with a secondary policy)
+            debug_assert!(self.generation.is_some());
+            let Some(generation) = self.generation else {
+                return Err(ReconcileError::Other(anyhow::anyhow!(
+                    "Attempted to attach with NULL generation"
+                )));
+            };
+
+            let mut wanted_conf = attached_location_conf(
+                generation,
+                &self.shard,
+                &self.config,
+                &self.placement_policy,
+            );
+            match self.observed.locations.get(&node.get_id()) {
+                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
+                    // Nothing to do
+                    tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.")
+                }
+                observed => {
+                    // In all cases other than a matching observed configuration, we will
+                    // reconcile this location. This includes locations with different configurations, as well
+                    // as locations with unknown (None) observed state.
+
+                    // Incrementing generation is the safe general case, but is inefficient for changes that only
+                    // modify some details (e.g. the tenant's config).
+                    let increment_generation = match observed {
+                        None => true,
+                        Some(ObservedStateLocation { conf: None }) => true,
+                        Some(ObservedStateLocation {
+                            conf: Some(observed),
+                        }) => {
+                            let generations_match = observed.generation == wanted_conf.generation;
+
+                            // We may skip incrementing the generation if the location is already in the expected mode and
+                            // generation. In principle it would also be safe to skip from certain other modes (e.g. AttachedStale),
+                            // but such states are handled inside `live_migrate`, and if we see that state here we're cleaning up
+                            // after a restart/crash, so fall back to the universally safe path of incrementing generation.
+                            !generations_match || (observed.mode != wanted_conf.mode)
+                        }
+                    };
+
+                    if increment_generation {
+                        let generation = self
+                            .persistence
+                            .increment_generation(self.tenant_shard_id, node.get_id())
+                            .await?;
+                        self.generation = Some(generation);
+                        wanted_conf.generation = generation.into();
+                    }
+                    tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
+
+                    // Because `node` comes from a ref to &self, clone it before calling into a &mut self
+                    // function: this could be avoided by refactoring the state mutated by location_config into
+                    // a separate type to Self.
+                    let node = node.clone();
+
+                    // Use lazy=true, because we may run many of Self concurrently, and do not want to
+                    // overload the pageserver with logical size calculations.
+                    self.location_config(&node, wanted_conf, None, true).await?;
+                    self.compute_notify().await?;
+                }
+            }
+        }
+
+        // Configure secondary locations: if these were previously attached this
+        // implicitly downgrades them from attached to secondary.
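+        // E.g. a node whose observed state still says AttachedSingle, but which the
+        // intent now lists only as a secondary, gets demoted by the plain secondary
+        // config we push below.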
+ let mut changes = Vec::new(); + for node in &self.intent.secondary { + let wanted_conf = secondary_location_conf(&self.shard, &self.config); + match self.observed.locations.get(&node.get_id()) { + Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { + // Nothing to do + tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") + } + _ => { + // In all cases other than a matching observed configuration, we will + // reconcile this location. + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + changes.push((node.clone(), wanted_conf)) + } + } + } + + // Detach any extraneous pageservers that are no longer referenced + // by our intent. + for node in &self.detach { + changes.push(( + node.clone(), + LocationConfig { + mode: LocationConfigMode::Detached, + generation: None, + secondary_conf: None, + shard_number: self.shard.number.0, + shard_count: self.shard.count.literal(), + shard_stripe_size: self.shard.stripe_size.0, + tenant_conf: self.config.clone(), + }, + )); + } + + for (node, conf) in changes { + if self.cancel.is_cancelled() { + return Err(ReconcileError::Cancel); + } + self.location_config(&node, conf, None, false).await?; + } + + failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue"); + + Ok(()) + } + + pub(crate) async fn compute_notify(&mut self) -> Result<(), NotifyError> { + // Whenever a particular Reconciler emits a notification, it is always notifying for the intended + // destination. + if let Some(node) = &self.intent.attached { + let result = self + .compute_hook + .notify( + self.tenant_shard_id, + node.get_id(), + self.shard.stripe_size, + &self.cancel, + ) + .await; + if let Err(e) = &result { + // It is up to the caller whether they want to drop out on this error, but they don't have to: + // in general we should avoid letting unavailability of the cloud control plane stop us from + // making progress. + if !matches!(e, NotifyError::ShuttingDown) { + tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); + } + + // Set this flag so that in our ReconcileResult we will set the flag on the shard that it + // needs to retry at some point. + self.compute_notify_failure = true; + } + result + } else { + Ok(()) + } + } +} + +/// We tweak the externally-set TenantConfig while configuring +/// locations, using our awareness of whether secondary locations +/// are in use to automatically enable/disable heatmap uploads. 
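+///
+/// E.g. when secondaries are in use and the tenant config leaves `heatmap_period`
+/// unset, this fills in the 60s default so the attached location publishes heatmaps
+/// for its secondaries; with no secondaries the period is forced back to `None`.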
+fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig {
+    let mut config = config.clone();
+    if has_secondaries {
+        if config.heatmap_period.is_none() {
+            config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string());
+        }
+    } else {
+        config.heatmap_period = None;
+    }
+    config
+}
+
+pub(crate) fn attached_location_conf(
+    generation: Generation,
+    shard: &ShardIdentity,
+    config: &TenantConfig,
+    policy: &PlacementPolicy,
+) -> LocationConfig {
+    let has_secondaries = match policy {
+        PlacementPolicy::Attached(0) | PlacementPolicy::Detached | PlacementPolicy::Secondary => {
+            false
+        }
+        PlacementPolicy::Attached(_) => true,
+    };
+
+    LocationConfig {
+        mode: LocationConfigMode::AttachedSingle,
+        generation: generation.into(),
+        secondary_conf: None,
+        shard_number: shard.number.0,
+        shard_count: shard.count.literal(),
+        shard_stripe_size: shard.stripe_size.0,
+        tenant_conf: ha_aware_config(config, has_secondaries),
+    }
+}
+
+pub(crate) fn secondary_location_conf(
+    shard: &ShardIdentity,
+    config: &TenantConfig,
+) -> LocationConfig {
+    LocationConfig {
+        mode: LocationConfigMode::Secondary,
+        generation: None,
+        secondary_conf: Some(LocationConfigSecondary { warm: true }),
+        shard_number: shard.number.0,
+        shard_count: shard.count.literal(),
+        shard_stripe_size: shard.stripe_size.0,
+        tenant_conf: ha_aware_config(config, true),
+    }
+}
diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs
new file mode 100644
index 0000000000..deb5f27226
--- /dev/null
+++ b/storage_controller/src/scheduler.rs
@@ -0,0 +1,732 @@
+use crate::{node::Node, tenant_shard::TenantShard};
+use itertools::Itertools;
+use pageserver_api::models::PageserverUtilization;
+use serde::Serialize;
+use std::collections::HashMap;
+use utils::{http::error::ApiError, id::NodeId};
+
+/// Scenarios in which we cannot find a suitable location for a tenant shard
+#[derive(thiserror::Error, Debug)]
+pub enum ScheduleError {
+    #[error("No pageservers found")]
+    NoPageservers,
+    #[error("No pageserver found matching constraint")]
+    ImpossibleConstraint,
+}
+
+impl From<ScheduleError> for ApiError {
+    fn from(value: ScheduleError) -> Self {
+        ApiError::Conflict(format!("Scheduling error: {}", value))
+    }
+}
+
+#[derive(Serialize)]
+pub enum MaySchedule {
+    Yes(PageserverUtilization),
+    No,
+}
+
+#[derive(Serialize)]
+struct SchedulerNode {
+    /// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
+    shard_count: usize,
+    /// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`].
+    attached_shard_count: usize,
+
+    /// Whether this node is currently eligible to have new shards scheduled (this is derived
+    /// from a node's availability state and scheduling policy).
+    may_schedule: MaySchedule,
+}
+
+impl PartialEq for SchedulerNode {
+    fn eq(&self, other: &Self) -> bool {
+        let may_schedule_matches = matches!(
+            (&self.may_schedule, &other.may_schedule),
+            (MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No)
+        );
+
+        may_schedule_matches
+            && self.shard_count == other.shard_count
+            && self.attached_shard_count == other.attached_shard_count
+    }
+}
+
+impl Eq for SchedulerNode {}
+
+/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
+/// on which to run.
+///
+/// The type has no persistent state of its own: this is all populated at startup. The Serialize
+/// impl is only for debug dumps.
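+///
+/// Typical lifecycle: built once from the node set at startup via [`Scheduler::new`],
+/// then kept in sync through [`Scheduler::node_upsert`] / [`Scheduler::node_remove`]
+/// as nodes come and go, and via [`Scheduler::update_node_ref_counts`] as tenant
+/// shard intents change.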
+#[derive(Serialize)]
+pub(crate) struct Scheduler {
+    nodes: HashMap<NodeId, SchedulerNode>,
+}
+
+/// Score for soft constraint scheduling: lower scores are preferred to higher scores.
+///
+/// For example, we may set an affinity score based on the number of shards from the same
+/// tenant already on a node, to implicitly prefer to balance out shards.
+#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
+pub(crate) struct AffinityScore(pub(crate) usize);
+
+impl AffinityScore {
+    /// If we have no anti-affinity at all toward a node, this is its score. It means
+    /// the scheduler has a free choice amongst nodes with this score, and may pick a node
+    /// based on other information such as total utilization.
+    pub(crate) const FREE: Self = Self(0);
+
+    pub(crate) fn inc(&mut self) {
+        self.0 += 1;
+    }
+}
+
+impl std::ops::Add for AffinityScore {
+    type Output = Self;
+
+    fn add(self, rhs: Self) -> Self::Output {
+        Self(self.0 + rhs.0)
+    }
+}
+
+/// Hint for whether this is a sincere attempt to schedule, or a speculative
+/// check for where we _would_ schedule (done during optimization)
+#[derive(Debug)]
+pub(crate) enum ScheduleMode {
+    Normal,
+    Speculative,
+}
+
+impl Default for ScheduleMode {
+    fn default() -> Self {
+        Self::Normal
+    }
+}
+
+// For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling
+// it for many shards in the same tenant.
+#[derive(Debug, Default)]
+pub(crate) struct ScheduleContext {
+    /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
+    pub(crate) nodes: HashMap<NodeId, AffinityScore>,
+
+    /// Specifically how many _attached_ locations are on each node
+    pub(crate) attached_nodes: HashMap<NodeId, usize>,
+
+    pub(crate) mode: ScheduleMode,
+}
+
+impl ScheduleContext {
+    /// Input is a list of nodes we would like to avoid using again within this context. The more
+    /// times a node is passed into this call, the less inclined we are to use it.
+    pub(crate) fn avoid(&mut self, nodes: &[NodeId]) {
+        for node_id in nodes {
+            let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE);
+            entry.inc()
+        }
+    }
+
+    pub(crate) fn push_attached(&mut self, node_id: NodeId) {
+        let entry = self.attached_nodes.entry(node_id).or_default();
+        *entry += 1;
+    }
+
+    pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
+        self.nodes
+            .get(&node_id)
+            .copied()
+            .unwrap_or(AffinityScore::FREE)
+    }
+
+    pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
+        self.attached_nodes.get(&node_id).copied().unwrap_or(0)
+    }
+}
+
+pub(crate) enum RefCountUpdate {
+    PromoteSecondary,
+    Attach,
+    Detach,
+    DemoteAttached,
+    AddSecondary,
+    RemoveSecondary,
+}
+
+impl Scheduler {
+    pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
+        let mut scheduler_nodes = HashMap::new();
+        for node in nodes {
+            scheduler_nodes.insert(
+                node.get_id(),
+                SchedulerNode {
+                    shard_count: 0,
+                    attached_shard_count: 0,
+                    may_schedule: node.may_schedule(),
+                },
+            );
+        }
+
+        Self {
+            nodes: scheduler_nodes,
+        }
+    }
+
+    /// For debug/support: check that our internal statistics are in sync with the state of
+    /// the nodes & tenant shards.
+    ///
+    /// If anything is inconsistent, log details and return an error.
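+    ///
+    /// Concretely: recompute every node's `shard_count` and `attached_shard_count`
+    /// from scratch out of the shards' IntentStates, and require the result to match
+    /// the counters this scheduler has been maintaining incrementally.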
+    pub(crate) fn consistency_check<'a>(
+        &self,
+        nodes: impl Iterator<Item = &'a Node>,
+        shards: impl Iterator<Item = &'a TenantShard>,
+    ) -> anyhow::Result<()> {
+        let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
+        for node in nodes {
+            expect_nodes.insert(
+                node.get_id(),
+                SchedulerNode {
+                    shard_count: 0,
+                    attached_shard_count: 0,
+                    may_schedule: node.may_schedule(),
+                },
+            );
+        }
+
+        for shard in shards {
+            if let Some(node_id) = shard.intent.get_attached() {
+                match expect_nodes.get_mut(node_id) {
+                    Some(node) => {
+                        node.shard_count += 1;
+                        node.attached_shard_count += 1;
+                    }
+                    None => anyhow::bail!(
+                        "Tenant {} references nonexistent node {}",
+                        shard.tenant_shard_id,
+                        node_id
+                    ),
+                }
+            }
+
+            for node_id in shard.intent.get_secondary() {
+                match expect_nodes.get_mut(node_id) {
+                    Some(node) => node.shard_count += 1,
+                    None => anyhow::bail!(
+                        "Tenant {} references nonexistent node {}",
+                        shard.tenant_shard_id,
+                        node_id
+                    ),
+                }
+            }
+        }
+
+        for (node_id, expect_node) in &expect_nodes {
+            let Some(self_node) = self.nodes.get(node_id) else {
+                anyhow::bail!("Node {node_id} not found in Self")
+            };
+
+            if self_node != expect_node {
+                tracing::error!("Inconsistency detected in scheduling state for node {node_id}");
+                tracing::error!("Expected state: {}", serde_json::to_string(expect_node)?);
+                tracing::error!("Self state: {}", serde_json::to_string(self_node)?);
+
+                anyhow::bail!("Inconsistent state on {node_id}");
+            }
+        }
+
+        if expect_nodes.len() != self.nodes.len() {
+            // We just checked that all the expected nodes are present. If the lengths don't match,
+            // it means that we have nodes in Self that are unexpected.
+            for node_id in self.nodes.keys() {
+                if !expect_nodes.contains_key(node_id) {
+                    anyhow::bail!("Node {node_id} found in Self but not in expected nodes");
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Update the reference counts of a node. These reference counts are used to guide scheduling
+    /// decisions, not for memory management: they represent the number of tenant shards whose IntentState
+    /// targets this node and the number of tenant shards whose IntentState is attached to this
+    /// node.
+    ///
+    /// It is an error to call this for a node that is not known to the scheduler (i.e. passed into
+    /// [`Self::new`] or [`Self::node_upsert`])
+    pub(crate) fn update_node_ref_counts(&mut self, node_id: NodeId, update: RefCountUpdate) {
+        let Some(node) = self.nodes.get_mut(&node_id) else {
+            debug_assert!(false);
+            tracing::error!("Scheduler missing node {node_id}");
+            return;
+        };
+
+        match update {
+            RefCountUpdate::PromoteSecondary => {
+                node.attached_shard_count += 1;
+            }
+            RefCountUpdate::Attach => {
+                node.shard_count += 1;
+                node.attached_shard_count += 1;
+            }
+            RefCountUpdate::Detach => {
+                node.shard_count -= 1;
+                node.attached_shard_count -= 1;
+            }
+            RefCountUpdate::DemoteAttached => {
+                node.attached_shard_count -= 1;
+            }
+            RefCountUpdate::AddSecondary => {
+                node.shard_count += 1;
+            }
+            RefCountUpdate::RemoveSecondary => {
+                node.shard_count -= 1;
+            }
+        }
+
+        // Maybe update PageserverUtilization
+        match update {
+            RefCountUpdate::AddSecondary | RefCountUpdate::Attach => {
+                // Referencing the node: if this takes our shard_count above the utilization structure's
+                // shard count, then artificially bump it: this ensures that the scheduler immediately
+                // recognizes that this node has more work on it, without waiting for the next heartbeat
+                // to update the utilization.
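+                // E.g. (illustrative): the last heartbeat reported 10 shards, and two
+                // Attach updates have landed since, so shard_count is now 12; we clamp
+                // the cached utilization's shard count up to 12 rather than scheduling
+                // against the stale figure.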
+                if let MaySchedule::Yes(utilization) = &mut node.may_schedule {
+                    utilization.adjust_shard_count_max(node.shard_count as u32);
+                }
+            }
+            RefCountUpdate::PromoteSecondary
+            | RefCountUpdate::Detach
+            | RefCountUpdate::RemoveSecondary
+            | RefCountUpdate::DemoteAttached => {
+                // De-referencing the node: leave the utilization's shard_count at a stale higher
+                // value until some future heartbeat after we have physically removed this shard
+                // from the node: this prevents the scheduler over-optimistically trying to schedule
+                // more work onto the node before earlier detaches are done.
+            }
+        }
+    }
+
+    // Check if the number of shards attached to a given node is lagging below
+    // the cluster average. If that's the case, the node should be filled.
+    pub(crate) fn compute_fill_requirement(&self, node_id: NodeId) -> usize {
+        let Some(node) = self.nodes.get(&node_id) else {
+            debug_assert!(false);
+            tracing::error!("Scheduler missing node {node_id}");
+            return 0;
+        };
+        assert!(!self.nodes.is_empty());
+        let expected_attached_shards_per_node = self.expected_attached_shard_count();
+
+        for (node_id, node) in self.nodes.iter() {
+            tracing::trace!(%node_id, "attached_shard_count={} shard_count={} expected={}", node.attached_shard_count, node.shard_count, expected_attached_shards_per_node);
+        }
+
+        if node.attached_shard_count < expected_attached_shards_per_node {
+            expected_attached_shards_per_node - node.attached_shard_count
+        } else {
+            0
+        }
+    }
+
+    pub(crate) fn expected_attached_shard_count(&self) -> usize {
+        let total_attached_shards: usize =
+            self.nodes.values().map(|n| n.attached_shard_count).sum();
+
+        assert!(!self.nodes.is_empty());
+        total_attached_shards / self.nodes.len()
+    }
+
+    pub(crate) fn nodes_by_attached_shard_count(&self) -> Vec<(NodeId, usize)> {
+        self.nodes
+            .iter()
+            .map(|(node_id, stats)| (*node_id, stats.attached_shard_count))
+            .sorted_by(|lhs, rhs| Ord::cmp(&lhs.1, &rhs.1).reverse())
+            .collect()
+    }
+
+    pub(crate) fn node_upsert(&mut self, node: &Node) {
+        use std::collections::hash_map::Entry::*;
+        match self.nodes.entry(node.get_id()) {
+            Occupied(mut entry) => {
+                // Updates to MaySchedule are how we receive updated PageserverUtilization: adjust these values
+                // to account for any shards scheduled on the controller but not yet visible to the pageserver.
+                let mut may_schedule = node.may_schedule();
+                match &mut may_schedule {
+                    MaySchedule::Yes(utilization) => {
+                        utilization.adjust_shard_count_max(entry.get().shard_count as u32);
+                    }
+                    MaySchedule::No => { // Nothing to tweak
+                    }
+                }
+
+                entry.get_mut().may_schedule = may_schedule;
+            }
+            Vacant(entry) => {
+                entry.insert(SchedulerNode {
+                    shard_count: 0,
+                    attached_shard_count: 0,
+                    may_schedule: node.may_schedule(),
+                });
+            }
+        }
+    }
+
+    pub(crate) fn node_remove(&mut self, node_id: NodeId) {
+        if self.nodes.remove(&node_id).is_none() {
+            tracing::warn!(node_id=%node_id, "Removed non-existent node from scheduler");
+        }
+    }
+
+    /// Where we have several nodes to choose from, for example when picking a secondary location
+    /// to promote to an attached location, this method may be used to pick the best choice based
+    /// on the scheduler's knowledge of utilization and availability.
+    ///
+    /// If the input is empty, or all the nodes are not eligible for scheduling, return None: the
+    /// caller can pick a node some other way.
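+    ///
+    /// E.g. when promoting one of a shard's secondaries after its attached node
+    /// failed: pass the secondary locations and this returns one that is currently
+    /// schedulable, or None if none of them are.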
+    pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option<NodeId> {
+        if nodes.is_empty() {
+            return None;
+        }
+
+        // TODO: When the utilization score returned by the pageserver becomes meaningful,
+        // schedule based on that instead of the shard count.
+        let node = nodes
+            .iter()
+            .map(|node_id| {
+                let may_schedule = self
+                    .nodes
+                    .get(node_id)
+                    .map(|n| !matches!(n.may_schedule, MaySchedule::No))
+                    .unwrap_or(false);
+                (*node_id, may_schedule)
+            })
+            .max_by_key(|(_n, may_schedule)| *may_schedule);
+
+        // If even the preferred node has may_schedule==false, return None
+        node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
+    }
+
+    /// hard_exclude: it is forbidden to use nodes in this list, typically because they
+    /// are already in use by this shard -- we use this to avoid picking the same node
+    /// as both attached and secondary location. This is a hard constraint: if we cannot
+    /// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`].
+    ///
+    /// context: we prefer to avoid using nodes identified in the context, according
+    /// to their anti-affinity score. We use this to prefer to avoid placing shards in
+    /// the same tenant on the same node. This is a soft constraint: the context will never
+    /// cause us to fail to schedule a shard.
+    pub(crate) fn schedule_shard(
+        &mut self,
+        hard_exclude: &[NodeId],
+        context: &ScheduleContext,
+    ) -> Result<NodeId, ScheduleError> {
+        if self.nodes.is_empty() {
+            return Err(ScheduleError::NoPageservers);
+        }
+
+        let mut scores: Vec<(NodeId, AffinityScore, u64, usize)> = self
+            .nodes
+            .iter_mut()
+            .filter_map(|(k, v)| match &mut v.may_schedule {
+                MaySchedule::No => None,
+                MaySchedule::Yes(_) if hard_exclude.contains(k) => None,
+                MaySchedule::Yes(utilization) => Some((
+                    *k,
+                    context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
+                    utilization.cached_score(),
+                    v.attached_shard_count,
+                )),
+            })
+            .collect();
+
+        // Exclude nodes whose utilization is critically high, if there are alternatives available. This will
+        // cause us to violate affinity rules if it is necessary to avoid critically overloading nodes: for example
+        // we may place shards in the same tenant together on the same pageserver if all other pageservers are
+        // overloaded.
+        let non_overloaded_scores = scores
+            .iter()
+            .filter(|i| !PageserverUtilization::is_overloaded(i.2))
+            .copied()
+            .collect::<Vec<_>>();
+        if !non_overloaded_scores.is_empty() {
+            scores = non_overloaded_scores;
+        }
+
+        // Sort by, in order of precedence:
+        //  1st: Affinity score. We should never pick a higher-score node if a lower-score node is available
+        //  2nd: Utilization score (this combines shard count and disk utilization)
+        //  3rd: Attached shard count. When nodes have identical utilization (e.g. when populating some
+        //       empty nodes), this acts as an anti-affinity between attached shards.
+        //  4th: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
+        scores.sort_by_key(|i| (i.1, i.2, i.3, i.0));
+
+        if scores.is_empty() {
+            // After applying constraints, no pageservers were left.
+            if !matches!(context.mode, ScheduleMode::Speculative) {
+                // If this was not a speculative attempt, log details to understand why we couldn't
+                // schedule: this may help an engineer understand if some nodes are marked offline
+                // in a way that's preventing progress.
+                tracing::info!(
+                    "Scheduling failure, while excluding {hard_exclude:?}, node states:"
+                );
+                for (node_id, node) in &self.nodes {
+                    tracing::info!(
+                        "Node {node_id}: may_schedule={} shards={}",
+                        !matches!(node.may_schedule, MaySchedule::No),
+                        node.shard_count
+                    );
+                }
+            }
+            return Err(ScheduleError::ImpossibleConstraint);
+        }
+
+        // Lowest score wins
+        let node_id = scores.first().unwrap().0;
+
+        if !matches!(context.mode, ScheduleMode::Speculative) {
+            tracing::info!(
+                "scheduler selected node {node_id} (eligible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
+                scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
+            );
+        }
+
+        // Note that we do not update shard count here to reflect the scheduling: that
+        // is IntentState's job when the scheduled location is used.
+
+        Ok(node_id)
+    }
+
+    /// Unit test access to internal state
+    #[cfg(test)]
+    pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
+        self.nodes.get(&node_id).unwrap().shard_count
+    }
+
+    #[cfg(test)]
+    pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize {
+        self.nodes.get(&node_id).unwrap().attached_shard_count
+    }
+}
+
+#[cfg(test)]
+pub(crate) mod test_utils {
+
+    use crate::node::Node;
+    use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization};
+    use std::collections::HashMap;
+    use utils::id::NodeId;
+    /// Test helper: synthesize the requested number of nodes, all in active state.
+    ///
+    /// Node IDs start at one.
+    pub(crate) fn make_test_nodes(n: u64) -> HashMap<NodeId, Node> {
+        (1..n + 1)
+            .map(|i| {
+                (NodeId(i), {
+                    let mut node = Node::new(
+                        NodeId(i),
+                        format!("httphost-{i}"),
+                        80 + i as u16,
+                        format!("pghost-{i}"),
+                        5432 + i as u16,
+                        "test-az".to_string(),
+                    );
+                    node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0)));
+                    assert!(node.is_available());
+                    node
+                })
+            })
+            .collect()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization};
+
+    use super::*;
+
+    use crate::tenant_shard::IntentState;
+    #[test]
+    fn scheduler_basic() -> anyhow::Result<()> {
+        let nodes = test_utils::make_test_nodes(2);
+
+        let mut scheduler = Scheduler::new(nodes.values());
+        let mut t1_intent = IntentState::new();
+        let mut t2_intent = IntentState::new();
+
+        let context = ScheduleContext::default();
+
+        let scheduled = scheduler.schedule_shard(&[], &context)?;
+        t1_intent.set_attached(&mut scheduler, Some(scheduled));
+        let scheduled = scheduler.schedule_shard(&[], &context)?;
+        t2_intent.set_attached(&mut scheduler, Some(scheduled));
+
+        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);
+
+        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
+
+        let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?;
+        t1_intent.push_secondary(&mut scheduler, scheduled);
+
+        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);
+
+        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
+
+        t1_intent.clear(&mut scheduler);
+        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0);
+        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1);
+
+        let total_attached = scheduler.get_node_attached_shard_count(NodeId(1))
+            + scheduler.get_node_attached_shard_count(NodeId(2));
+        assert_eq!(total_attached, 1);
+
+        if cfg!(debug_assertions) {
+            // Dropping an IntentState without clearing it causes a panic in debug mode,
+            // because we have failed to properly update scheduler shard counts.
+            let result = std::panic::catch_unwind(move || {
+                drop(t2_intent);
+            });
+            assert!(result.is_err());
+        } else {
+            t2_intent.clear(&mut scheduler);
+
+            assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0);
+            assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 0);
+
+            assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 0);
+            assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0);
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    /// Test the PageserverUtilization's contribution to scheduling algorithm
+    fn scheduler_utilization() {
+        let mut nodes = test_utils::make_test_nodes(3);
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        // Need to keep these alive because they contribute to shard counts via RAII
+        let mut scheduled_intents = Vec::new();
+
+        let empty_context = ScheduleContext::default();
+
+        fn assert_scheduler_chooses(
+            expect_node: NodeId,
+            scheduled_intents: &mut Vec<IntentState>,
+            scheduler: &mut Scheduler,
+            context: &ScheduleContext,
+        ) {
+            let scheduled = scheduler.schedule_shard(&[], context).unwrap();
+            let mut intent = IntentState::new();
+            intent.set_attached(scheduler, Some(scheduled));
+            scheduled_intents.push(intent);
+            assert_eq!(scheduled, expect_node);
+        }
+
+        // Independent schedule calls onto empty nodes should round-robin, because each node's
+        // utilization's shard count is updated inline. The order is deterministic because when all other factors are
+        // equal, we order by node ID.
+        assert_scheduler_chooses(
+            NodeId(1),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &empty_context,
+        );
+        assert_scheduler_chooses(
+            NodeId(2),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &empty_context,
+        );
+        assert_scheduler_chooses(
+            NodeId(3),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &empty_context,
+        );
+
+        // Manually setting utilization higher should cause schedule calls to round-robin the other nodes
+        // which have equal utilization.
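+        // test_utilization::simple(10, 1024 * 1024 * 1024) below reports node 1 as
+        // carrying 10 shards and 1GiB, so its cached utilization score now exceeds
+        // the still-empty nodes 2 and 3, which alternate from here on.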
+ nodes + .get_mut(&NodeId(1)) + .unwrap() + .set_availability(NodeAvailability::Active(test_utilization::simple( + 10, + 1024 * 1024 * 1024, + ))); + scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); + + assert_scheduler_chooses( + NodeId(2), + &mut scheduled_intents, + &mut scheduler, + &empty_context, + ); + assert_scheduler_chooses( + NodeId(3), + &mut scheduled_intents, + &mut scheduler, + &empty_context, + ); + assert_scheduler_chooses( + NodeId(2), + &mut scheduled_intents, + &mut scheduler, + &empty_context, + ); + assert_scheduler_chooses( + NodeId(3), + &mut scheduled_intents, + &mut scheduler, + &empty_context, + ); + + // The scheduler should prefer nodes with lower affinity score, + // even if they have higher utilization (as long as they aren't utilized at >100%) + let mut context_prefer_node1 = ScheduleContext::default(); + context_prefer_node1.avoid(&[NodeId(2), NodeId(3)]); + assert_scheduler_chooses( + NodeId(1), + &mut scheduled_intents, + &mut scheduler, + &context_prefer_node1, + ); + assert_scheduler_chooses( + NodeId(1), + &mut scheduled_intents, + &mut scheduler, + &context_prefer_node1, + ); + + // If a node is over-utilized, it will not be used even if affinity scores prefer it + nodes + .get_mut(&NodeId(1)) + .unwrap() + .set_availability(NodeAvailability::Active(test_utilization::simple( + 20000, + 1024 * 1024 * 1024, + ))); + scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); + assert_scheduler_chooses( + NodeId(2), + &mut scheduled_intents, + &mut scheduler, + &context_prefer_node1, + ); + assert_scheduler_chooses( + NodeId(3), + &mut scheduled_intents, + &mut scheduler, + &context_prefer_node1, + ); + + for mut intent in scheduled_intents { + intent.clear(&mut scheduler); + } + } +} diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs new file mode 100644 index 0000000000..1717a9369d --- /dev/null +++ b/storage_controller/src/schema.rs @@ -0,0 +1,62 @@ +// @generated automatically by Diesel CLI. + +diesel::table! { + controllers (address, started_at) { + address -> Varchar, + started_at -> Timestamptz, + } +} + +diesel::table! { + metadata_health (tenant_id, shard_number, shard_count) { + tenant_id -> Varchar, + shard_number -> Int4, + shard_count -> Int4, + healthy -> Bool, + last_scrubbed_at -> Timestamptz, + } +} + +diesel::table! { + nodes (node_id) { + node_id -> Int8, + scheduling_policy -> Varchar, + listen_http_addr -> Varchar, + listen_http_port -> Int4, + listen_pg_addr -> Varchar, + listen_pg_port -> Int4, + availability_zone_id -> Varchar, + } +} + +diesel::table! { + tenant_shards (tenant_id, shard_number, shard_count) { + tenant_id -> Varchar, + shard_number -> Int4, + shard_count -> Int4, + shard_stripe_size -> Int4, + generation -> Nullable, + generation_pageserver -> Nullable, + placement_policy -> Varchar, + splitting -> Int2, + config -> Text, + scheduling_policy -> Varchar, + preferred_az_id -> Nullable, + } +} + +diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,); + +diesel::table! 
{ + safekeepers { + id -> Int8, + region_id -> Text, + version -> Int8, + instance_id -> Text, + host -> Text, + port -> Int4, + active -> Bool, + http_port -> Int4, + availability_zone_id -> Text, + } +} diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs new file mode 100644 index 0000000000..e7eae647df --- /dev/null +++ b/storage_controller/src/service.rs @@ -0,0 +1,6645 @@ +use hyper::Uri; +use std::{ + borrow::Cow, + cmp::Ordering, + collections::{BTreeMap, HashMap, HashSet}, + ops::Deref, + path::PathBuf, + str::FromStr, + sync::Arc, + time::{Duration, Instant}, +}; + +use crate::{ + background_node_operations::{ + Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION, + }, + compute_hook::NotifyError, + drain_utils::{self, TenantShardDrain, TenantShardIterator}, + id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, + leadership::Leadership, + metrics, + peer_client::GlobalObservedState, + persistence::{ + AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence, + ShardGenerationState, TenantFilter, + }, + reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, + scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, + tenant_shard::{ + MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization, + ScheduleOptimizationAction, + }, +}; +use anyhow::Context; +use control_plane::storage_controller::{ + AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, +}; +use diesel::result::DatabaseErrorKind; +use futures::{stream::FuturesUnordered, StreamExt}; +use itertools::Itertools; +use pageserver_api::{ + controller_api::{ + MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, + NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, ShardsPreferredAzsRequest, + ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse, + TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, + TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, + TenantShardMigrateResponse, + }, + models::{ + SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest, + TopTenantShardsRequest, + }, +}; +use reqwest::StatusCode; +use tracing::{instrument, Instrument}; + +use crate::pageserver_client::PageserverClient; +use pageserver_api::{ + models::{ + self, LocationConfig, LocationConfigListResponse, LocationConfigMode, + PageserverUtilization, ShardParameters, TenantConfig, TenantLocationConfigRequest, + TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, + TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo, + }, + shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, + upcall_api::{ + ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, + ValidateResponse, ValidateResponseTenant, + }, +}; +use pageserver_client::mgmt_api; +use tokio::sync::mpsc::error::TrySendError; +use tokio_util::sync::CancellationToken; +use utils::{ + completion::Barrier, + failpoint_support, + generation::Generation, + http::error::ApiError, + id::{NodeId, TenantId, TimelineId}, + sync::gate::Gate, +}; + +use crate::{ + compute_hook::ComputeHook, + heartbeater::{Heartbeater, PageserverState}, + node::{AvailabilityTransition, Node}, + persistence::{split_state::SplitState, DatabaseError, Persistence, 
TenantShardPersistence}, + reconciler::attached_location_conf, + scheduler::Scheduler, + tenant_shard::{ + IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, + ReconcilerWaiter, TenantShard, + }, +}; + +pub mod chaos_injector; + +// For operations that should be quick, like attaching a new tenant +const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); + +// For operations that might be slow, like migrating a tenant with +// some data in it. +pub const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); + +// If we receive a call using Secondary mode initially, it will omit generation. We will initialize +// tenant shards into this generation, and as long as it remains in this generation, we will accept +// input generation from future requests as authoritative. +const INITIAL_GENERATION: Generation = Generation::new(0); + +/// How long [`Service::startup_reconcile`] is allowed to take before it should give +/// up on unresponsive pageservers and proceed. +pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); + +/// How long a node may be unresponsive to heartbeats before we declare it offline. +/// This must be long enough to cover node restarts as well as normal operations: in future +pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); + +/// How long a node may be unresponsive to heartbeats during start up before we declare it +/// offline. +/// +/// This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's +/// handling of the re-attach response may take a long time and blocks heartbeats from +/// being handled on the pageserver side. +pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); + +/// How often to send heartbeats to registered nodes? +pub const HEARTBEAT_INTERVAL_DEFAULT: Duration = Duration::from_secs(5); + +#[derive(Clone, strum_macros::Display)] +enum TenantOperations { + Create, + LocationConfig, + ConfigSet, + TimeTravelRemoteStorage, + Delete, + UpdatePolicy, + ShardSplit, + SecondaryDownload, + TimelineCreate, + TimelineDelete, + AttachHook, + TimelineArchivalConfig, + TimelineDetachAncestor, +} + +#[derive(Clone, strum_macros::Display)] +enum NodeOperations { + Register, + Configure, + Delete, +} + +/// The leadership status for the storage controller process. +/// Allowed transitions are: +/// 1. Leader -> SteppedDown +/// 2. Candidate -> Leader +#[derive( + Eq, + PartialEq, + Copy, + Clone, + strum_macros::Display, + strum_macros::EnumIter, + measured::FixedCardinalityLabel, +)] +#[strum(serialize_all = "snake_case")] +pub(crate) enum LeadershipStatus { + /// This is the steady state where the storage controller can produce + /// side effects in the cluster. + Leader, + /// We've been notified to step down by another candidate. No reconciliations + /// take place in this state. + SteppedDown, + /// Initial state for a new storage controller instance. Will attempt to assume leadership. + #[allow(unused)] + Candidate, +} + +pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; + +// Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. +// This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly +// than they're being pushed onto the queue. 
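+// At this depth the queue holds at most a few hundred kilobytes of TenantShardIds;
+// when it is full, try_send fails and the shard is left to be picked up by a later
+// reconcile pass instead of growing the queue (hence the TrySendError import above).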
+const MAX_DELAYED_RECONCILES: usize = 10000;
+
+// Top level state available to all HTTP handlers
+struct ServiceState {
+    leadership_status: LeadershipStatus,
+
+    tenants: BTreeMap<TenantShardId, TenantShard>,
+
+    nodes: Arc<HashMap<NodeId, Node>>,
+
+    scheduler: Scheduler,
+
+    /// Ongoing background operation on the cluster if any is running.
+    /// Note that only one such operation may run at any given time,
+    /// hence the type choice.
+    ongoing_operation: Option<OperationHandler>,
+
+    /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile
+    delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
+}
+
+/// Transform an error from a pageserver into an error to return to callers of a storage
+/// controller API.
+fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError {
+    match e {
+        mgmt_api::Error::SendRequest(e) => {
+            // Presume errors sending requests are connectivity/availability issues
+            ApiError::ResourceUnavailable(format!("{node} error sending request: {e}").into())
+        }
+        mgmt_api::Error::ReceiveErrorBody(str) => {
+            // Presume errors receiving body are connectivity/availability issues
+            ApiError::ResourceUnavailable(
+                format!("{node} error receiving error body: {str}").into(),
+            )
+        }
+        mgmt_api::Error::ReceiveBody(str) => {
+            // Presume errors receiving body are connectivity/availability issues
+            ApiError::ResourceUnavailable(format!("{node} error receiving body: {str}").into())
+        }
+        mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, msg) => {
+            ApiError::NotFound(anyhow::anyhow!(format!("{node}: {msg}")).into())
+        }
+        mgmt_api::Error::ApiError(StatusCode::SERVICE_UNAVAILABLE, msg) => {
+            ApiError::ResourceUnavailable(format!("{node}: {msg}").into())
+        }
+        mgmt_api::Error::ApiError(status @ StatusCode::UNAUTHORIZED, msg)
+        | mgmt_api::Error::ApiError(status @ StatusCode::FORBIDDEN, msg) => {
+            // Auth errors talking to a pageserver are not auth errors for the caller: they are
+            // internal server errors, showing that something is wrong with the pageserver or
+            // storage controller's auth configuration.
+            ApiError::InternalServerError(anyhow::anyhow!("{node} {status}: {msg}"))
+        }
+        mgmt_api::Error::ApiError(status, msg) => {
+            // Presume general case of pageserver API errors is that we tried to do something
+            // that can't be done right now.
+            ApiError::Conflict(format!("{node} {status}: {status} {msg}"))
+        }
+        mgmt_api::Error::Cancelled => ApiError::ShuttingDown,
+    }
+}
+
+impl ServiceState {
+    fn new(
+        nodes: HashMap<NodeId, Node>,
+        tenants: BTreeMap<TenantShardId, TenantShard>,
+        scheduler: Scheduler,
+        delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
+        initial_leadership_status: LeadershipStatus,
+    ) -> Self {
+        metrics::update_leadership_status(initial_leadership_status);
+
+        Self {
+            leadership_status: initial_leadership_status,
+            tenants,
+            nodes: Arc::new(nodes),
+            scheduler,
+            ongoing_operation: None,
+            delayed_reconcile_rx,
+        }
+    }
+
+    fn parts_mut(
+        &mut self,
+    ) -> (
+        &mut Arc<HashMap<NodeId, Node>>,
+        &mut BTreeMap<TenantShardId, TenantShard>,
+        &mut Scheduler,
+    ) {
+        (&mut self.nodes, &mut self.tenants, &mut self.scheduler)
+    }
+
+    fn get_leadership_status(&self) -> LeadershipStatus {
+        self.leadership_status
+    }
+
+    fn step_down(&mut self) {
+        self.leadership_status = LeadershipStatus::SteppedDown;
+        metrics::update_leadership_status(self.leadership_status);
+    }
+
+    fn become_leader(&mut self) {
+        self.leadership_status = LeadershipStatus::Leader;
+        metrics::update_leadership_status(self.leadership_status);
+    }
+}
+
+#[derive(Clone)]
+pub struct Config {
+    // All pageservers managed by one instance of this service must have
+    // the same public key.
+    // this service to the pageservers it manages.
+    pub jwt_token: Option<String>,
+
+    // This JWT token will be used to authenticate this service to the control plane.
+    pub control_plane_jwt_token: Option<String>,
+
+    // This JWT token will be used to authenticate with other storage controller instances
+    pub peer_jwt_token: Option<String>,
+
+    /// Where the compute hook should send notifications of pageserver attachment locations
+    /// (this URL points to the control plane in prod). If this is None, the compute hook will
+    /// assume it is running in a test environment and try to update neon_local.
+    pub compute_hook_url: Option<String>,
+
+    /// Grace period within which a pageserver does not respond to heartbeats, but is still
+    /// considered active. Once the grace period elapses, the next heartbeat failure will
+    /// mark the pageserver offline.
+    pub max_offline_interval: Duration,
+
+    /// Extended grace period within which pageserver may not respond to heartbeats.
+    /// This extended grace period kicks in after the node has been drained for restart
+    /// and/or upon handling the re-attach request from a node.
+    pub max_warming_up_interval: Duration,
+
+    /// How many Reconcilers may be spawned concurrently
+    pub reconciler_concurrency: usize,
+
+    /// How large must a shard grow in bytes before we split it?
+    /// None disables auto-splitting.
+    pub split_threshold: Option<u64>,
+
+    // TODO: make this cfg(feature = "testing")
+    pub neon_local_repo_dir: Option<PathBuf>,
+
+    // Maximum acceptable download lag for the secondary location
+    // while draining a node. If the secondary location is lagging
+    // by more than the configured amount, then the secondary is not
+    // upgraded to primary.
+    pub max_secondary_lag_bytes: Option<u64>,
+
+    pub heartbeat_interval: Duration,
+
+    pub address_for_peers: Option<Uri>,
+
+    pub start_as_candidate: bool,
+
+    pub http_service_port: i32,
+}
+
+impl From<DatabaseError> for ApiError {
+    fn from(err: DatabaseError) -> ApiError {
+        match err {
+            DatabaseError::Query(e) => ApiError::InternalServerError(e.into()),
+            // FIXME: ApiError doesn't have an Unavailable variant, but ShuttingDown maps to 503.
+            DatabaseError::Connection(_) | DatabaseError::ConnectionPool(_) => {
+                ApiError::ShuttingDown
+            }
+            DatabaseError::Logical(reason) | DatabaseError::Migration(reason) => {
+                ApiError::InternalServerError(anyhow::anyhow!(reason))
+            }
+        }
+    }
+}
+
+enum InitialShardScheduleOutcome {
+    Scheduled(TenantCreateResponseShard),
+    NotScheduled,
+    ShardScheduleError(ScheduleError),
+}
+
+pub struct Service {
+    inner: Arc<std::sync::RwLock<ServiceState>>,
+    config: Config,
+    persistence: Arc<Persistence>,
+    compute_hook: Arc<ComputeHook>,
+    result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResultRequest>,
+
+    heartbeater: Heartbeater,
+
+    // Channel for background cleanup from failed operations that require cleanup, such as shard split
+    abort_tx: tokio::sync::mpsc::UnboundedSender<TenantShardSplitAbort>,
+
+    // Locking on a tenant granularity (covers all shards in the tenant):
+    // - Take exclusively for rare operations that mutate the tenant's persistent state (e.g. create/delete/split)
+    // - Take in shared mode for operations that need the set of shards to stay the same to complete reliably (e.g. timeline CRUD)
+    tenant_op_locks: IdLockMap<TenantId, TenantOperations>,
+
+    // Locking for node-mutating operations: take exclusively for operations that modify the node's persistent state, or
+    // that transition it to/from Active.
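+    //
+    // Editor's sketch, mirroring the tenant-lock call sites later in this file
+    // (e.g. `trace_exclusive_lock(&self.tenant_op_locks, ...)` in attach_hook):
+    //
+    //     let _node_lock = trace_exclusive_lock(
+    //         &self.node_op_locks,
+    //         node_id,
+    //         NodeOperations::Configure,
+    //     )
+    //     .await;
+    //
+    // Dropping the guard releases the lock; concurrent Register/Configure/Delete
+    // calls for the same node id serialize on it.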
+    node_op_locks: IdLockMap<NodeId, NodeOperations>,
+
+    // Limit how many Reconcilers we will spawn concurrently
+    reconciler_concurrency: Arc<tokio::sync::Semaphore>,
+
+    /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile
+    /// Send into this queue to promptly attempt to reconcile this shard next time units are available.
+    ///
+    /// Note that this state logically lives inside ServiceState, but carrying the Sender here makes the code simpler
+    /// by avoiding needing a &mut ref to something inside the ServiceState. This could be optimized to
+    /// use a VecDeque instead of a channel to reduce synchronization overhead, at the cost of some code complexity.
+    delayed_reconcile_tx: tokio::sync::mpsc::Sender<TenantShardId>,
+
+    // Process shutdown will fire this token
+    cancel: CancellationToken,
+
+    // Child token of [`Service::cancel`] used by reconcilers
+    reconcilers_cancel: CancellationToken,
+
+    // Background tasks will hold this gate
+    gate: Gate,
+
+    // Reconciler background tasks will hold this gate
+    reconcilers_gate: Gate,
+
+    /// This waits for initial reconciliation with pageservers to complete. Until this barrier
+    /// passes, it isn't safe to do any actions that mutate tenants.
+    pub(crate) startup_complete: Barrier,
+}
+
+impl From<ReconcileWaitError> for ApiError {
+    fn from(value: ReconcileWaitError) -> Self {
+        match value {
+            ReconcileWaitError::Shutdown => ApiError::ShuttingDown,
+            e @ ReconcileWaitError::Timeout(_) => ApiError::Timeout(format!("{e}").into()),
+            e @ ReconcileWaitError::Failed(..) => ApiError::InternalServerError(anyhow::anyhow!(e)),
+        }
+    }
+}
+
+impl From<OperationError> for ApiError {
+    fn from(value: OperationError) -> Self {
+        match value {
+            OperationError::NodeStateChanged(err) | OperationError::FinalizeError(err) => {
+                ApiError::InternalServerError(anyhow::anyhow!(err))
+            }
+            OperationError::Cancelled => ApiError::Conflict("Operation was cancelled".into()),
+        }
+    }
+}
+
+#[allow(clippy::large_enum_variant)]
+enum TenantCreateOrUpdate {
+    Create(TenantCreateRequest),
+    Update(Vec<ShardUpdate>),
+}
+
+struct ShardSplitParams {
+    old_shard_count: ShardCount,
+    new_shard_count: ShardCount,
+    new_stripe_size: Option<ShardStripeSize>,
+    targets: Vec<ShardSplitTarget>,
+    policy: PlacementPolicy,
+    config: TenantConfig,
+    shard_ident: ShardIdentity,
+}
+
+// When preparing for a shard split, we may either choose to proceed with the split,
+// or find that the work is already done and return NoOp.
+enum ShardSplitAction {
+    Split(ShardSplitParams),
+    NoOp(TenantShardSplitResponse),
+}
+
+// A parent shard which will be split
+struct ShardSplitTarget {
+    parent_id: TenantShardId,
+    node: Node,
+    child_ids: Vec<TenantShardId>,
+}
+
+/// When a tenant shard split operation fails, we may not be able to clean up immediately, because nodes
+/// might not be available. We therefore use a queue of abort operations processed in the background.
+struct TenantShardSplitAbort {
+    tenant_id: TenantId,
+    /// The target values from the request that failed
+    new_shard_count: ShardCount,
+    new_stripe_size: Option<ShardStripeSize>,
+    /// Until this abort op is complete, no other operations may be done on the tenant
+    _tenant_lock: TracingExclusiveGuard<TenantOperations>,
+}
+
+#[derive(thiserror::Error, Debug)]
+enum TenantShardSplitAbortError {
+    #[error(transparent)]
+    Database(#[from] DatabaseError),
+    #[error(transparent)]
+    Remote(#[from] mgmt_api::Error),
+    #[error("Unavailable")]
+    Unavailable,
+}
+
+struct ShardUpdate {
+    tenant_shard_id: TenantShardId,
+    placement_policy: PlacementPolicy,
+    tenant_config: TenantConfig,
+
+    /// If this is None, generation is not updated.
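+    /// (Editor's note, illustrative: a secondary-only update would pass
+    /// `generation: None`, while an attach path supplies `Some(generation)`
+    /// freshly issued via `Persistence::increment_generation`.)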
+    generation: Option<Generation>,
+}
+
+enum StopReconciliationsReason {
+    ShuttingDown,
+    SteppingDown,
+}
+
+impl std::fmt::Display for StopReconciliationsReason {
+    fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result {
+        let s = match self {
+            Self::ShuttingDown => "Shutting down",
+            Self::SteppingDown => "Stepping down",
+        };
+        write!(writer, "{}", s)
+    }
+}
+
+pub(crate) enum ReconcileResultRequest {
+    ReconcileResult(ReconcileResult),
+    Stop,
+}
+
+impl Service {
+    pub fn get_config(&self) -> &Config {
+        &self.config
+    }
+
+    /// Called once on startup, this function attempts to contact all pageservers to build an up-to-date
+    /// view of the world, and determine which pageservers are responsive.
+    #[instrument(skip_all)]
+    async fn startup_reconcile(
+        self: &Arc<Self>,
+        current_leader: Option<ControllerPersistence>,
+        leader_step_down_state: Option<GlobalObservedState>,
+        bg_compute_notify_result_tx: tokio::sync::mpsc::Sender<
+            Result<(), (TenantShardId, NotifyError)>,
+        >,
+    ) {
+        // Startup reconciliation does I/O to other services: whether they
+        // are responsive or not, we should aim to finish within our deadline, because:
+        // - If we don't, a k8s readiness hook watching /ready will kill us.
+        // - While we're waiting for startup reconciliation, we are not fully
+        //   available for end user operations like creating/deleting tenants and timelines.
+        //
+        // We set multiple deadlines to break up the time available between the phases of work: this is
+        // arbitrary, but avoids a situation where the first phase could burn our entire timeout period.
+        let start_at = Instant::now();
+        let node_scan_deadline = start_at
+            .checked_add(STARTUP_RECONCILE_TIMEOUT / 2)
+            .expect("Reconcile timeout is a modest constant");
+
+        let observed = if let Some(state) = leader_step_down_state {
+            tracing::info!(
+                "Using observed state received from leader at {}",
+                current_leader.as_ref().unwrap().address
+            );
+
+            state
+        } else {
+            self.build_global_observed_state(node_scan_deadline).await
+        };
+
+        // Accumulate a list of any tenant locations that ought to be detached
+        let mut cleanup = Vec::new();
+
+        // Send initial heartbeat requests to all nodes loaded from the database
+        let all_nodes = {
+            let locked = self.inner.read().unwrap();
+            locked.nodes.clone()
+        };
+        let mut nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await;
+
+        // List of tenants for which we will attempt to notify compute of their location at startup
+        let mut compute_notifications = Vec::new();
+
+        // Populate intent and observed states for all tenants, based on reported state on pageservers
+        tracing::info!("Populating tenant shards' states from initial pageserver scan...");
+        let shard_count = {
+            let mut locked = self.inner.write().unwrap();
+            let (nodes, tenants, scheduler) = locked.parts_mut();
+
+            // Mark nodes online if they responded to us: nodes are offline by default after a restart.
+            let mut new_nodes = (**nodes).clone();
+            for (node_id, node) in new_nodes.iter_mut() {
+                if let Some(utilization) = nodes_online.remove(node_id) {
+                    node.set_availability(NodeAvailability::Active(utilization));
+                    scheduler.node_upsert(node);
+                }
+            }
+            *nodes = Arc::new(new_nodes);
+
+            for (tenant_shard_id, observed_state) in observed.0 {
+                let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else {
+                    for node_id in observed_state.locations.keys() {
+                        cleanup.push((tenant_shard_id, *node_id));
+                    }
+
+                    continue;
+                };
+
+                tenant_shard.observed = observed_state;
+            }
+
+            // Populate each tenant's intent state
+            let mut schedule_context = ScheduleContext::default();
+            for (tenant_shard_id, tenant_shard) in tenants.iter_mut() {
+                if tenant_shard_id.shard_number == ShardNumber(0) {
+                    // Reset scheduling context each time we advance to the next Tenant
+                    schedule_context = ScheduleContext::default();
+                }
+
+                tenant_shard.intent_from_observed(scheduler);
+                if let Err(e) = tenant_shard.schedule(scheduler, &mut schedule_context) {
+                    // Non-fatal error: we are unable to properly schedule the tenant, perhaps because
+                    // not enough pageservers are available. The tenant may well still be available
+                    // to clients.
+                    tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
+                } else {
+                    // If we're both intending and observed to be attached at a particular node, we will
+                    // emit a compute notification for this. In the case where our observed state does not
+                    // yet match our intent, we will eventually reconcile, and that will emit a compute notification.
+                    if let Some(attached_at) = tenant_shard.stably_attached() {
+                        compute_notifications.push((
+                            *tenant_shard_id,
+                            attached_at,
+                            tenant_shard.shard.stripe_size,
+                        ));
+                    }
+                }
+            }
+
+            tenants.len()
+        };
+
+        // Before making any observable changes to the cluster, persist self
+        // as leader in database and memory.
+        let leadership = Leadership::new(
+            self.persistence.clone(),
+            self.config.clone(),
+            self.cancel.child_token(),
+        );
+
+        if let Err(e) = leadership.become_leader(current_leader).await {
+            tracing::error!("Failed to persist self as leader: {e}. Aborting start-up ...");
+            std::process::exit(1);
+        }
+
+        self.inner.write().unwrap().become_leader();
+
+        // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
+        // generation_pageserver in the database.
+
+        // Emit compute hook notifications for all tenants which are already stably attached. Other tenants
+        // will emit compute hook notifications when they reconcile.
+        //
+        // Ordering: our calls to notify_background synchronously establish a relative order for these notifications vs. any later
+        // calls into the ComputeHook for the same tenant: we can leave these to run to completion in the background and any later
+        // calls will be correctly ordered wrt these.
+        //
+        // Concurrency: we call notify_background for all tenants, which will create O(N) tokio tasks, but almost all of them
+        // will just wait on the ComputeHook::API_CONCURRENCY semaphore immediately, so very cheap until they get that semaphore
+        // unit and start doing I/O.
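+        //
+        // Editor's sketch of that pattern (illustrative names; assumes
+        // ComputeHook::API_CONCURRENCY is a tokio::sync::Semaphore):
+        //
+        //     let _permit = API_CONCURRENCY.acquire().await?; // parked here cheaply
+        //     hook.notify(tenant_shard_id, attached_at, stripe_size).await?;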
+ tracing::info!( + "Sending {} compute notifications", + compute_notifications.len() + ); + self.compute_hook.notify_background( + compute_notifications, + bg_compute_notify_result_tx.clone(), + &self.cancel, + ); + + // Finally, now that the service is up and running, launch reconcile operations for any tenants + // which require it: under normal circumstances this should only include tenants that were in some + // transient state before we restarted, or any tenants whose compute hooks failed above. + tracing::info!("Checking for shards in need of reconciliation..."); + let reconcile_tasks = self.reconcile_all(); + // We will not wait for these reconciliation tasks to run here: we're now done with startup and + // normal operations may proceed. + + // Clean up any tenants that were found on pageservers but are not known to us. Do this in the + // background because it does not need to complete in order to proceed with other work. + if !cleanup.is_empty() { + tracing::info!("Cleaning up {} locations in the background", cleanup.len()); + tokio::task::spawn({ + let cleanup_self = self.clone(); + async move { cleanup_self.cleanup_locations(cleanup).await } + }); + } + + tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); + } + + async fn initial_heartbeat_round<'a>( + &self, + node_ids: impl Iterator, + ) -> HashMap { + assert!(!self.startup_complete.is_ready()); + + let all_nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + + let mut nodes_to_heartbeat = HashMap::new(); + for node_id in node_ids { + match all_nodes.get(node_id) { + Some(node) => { + nodes_to_heartbeat.insert(*node_id, node.clone()); + } + None => { + tracing::warn!("Node {node_id} was removed during start-up"); + } + } + } + + tracing::info!("Sending initial heartbeats..."); + let res = self + .heartbeater + .heartbeat(Arc::new(nodes_to_heartbeat)) + .await; + + let mut online_nodes = HashMap::new(); + if let Ok(deltas) = res { + for (node_id, status) in deltas.0 { + match status { + PageserverState::Available { utilization, .. } => { + online_nodes.insert(node_id, utilization); + } + PageserverState::Offline => {} + PageserverState::WarmingUp { .. } => { + unreachable!("Nodes are never marked warming-up during startup reconcile") + } + } + } + } + + online_nodes + } + + /// Used during [`Self::startup_reconcile`]: issue GETs to all nodes concurrently, with a deadline. + /// + /// The result includes only nodes which responded within the deadline + async fn scan_node_locations( + &self, + deadline: Instant, + ) -> HashMap { + let nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + + let mut node_results = HashMap::new(); + + let mut node_list_futs = FuturesUnordered::new(); + + tracing::info!("Scanning shards on {} nodes...", nodes.len()); + for node in nodes.values() { + node_list_futs.push({ + async move { + tracing::info!("Scanning shards on node {node}..."); + let timeout = Duration::from_secs(1); + let response = node + .with_client_retries( + |client| async move { client.list_location_config().await }, + &self.config.jwt_token, + 1, + 5, + timeout, + &self.cancel, + ) + .await; + (node.get_id(), response) + } + }); + } + + loop { + let (node_id, result) = tokio::select! 
{ + next = node_list_futs.next() => { + match next { + Some(result) => result, + None =>{ + // We got results for all our nodes + break; + } + + } + }, + _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { + // Give up waiting for anyone who hasn't responded: we will yield the results that we have + tracing::info!("Reached deadline while waiting for nodes to respond to location listing requests"); + break; + } + }; + + let Some(list_response) = result else { + tracing::info!("Shutdown during startup_reconcile"); + break; + }; + + match list_response { + Err(e) => { + tracing::warn!("Could not scan node {} ({e})", node_id); + } + Ok(listing) => { + node_results.insert(node_id, listing); + } + } + } + + node_results + } + + async fn build_global_observed_state(&self, deadline: Instant) -> GlobalObservedState { + let node_listings = self.scan_node_locations(deadline).await; + let mut observed = GlobalObservedState::default(); + + for (node_id, location_confs) in node_listings { + tracing::info!( + "Received {} shard statuses from pageserver {}", + location_confs.tenant_shards.len(), + node_id + ); + + for (tid, location_conf) in location_confs.tenant_shards { + let entry = observed.0.entry(tid).or_default(); + entry.locations.insert( + node_id, + ObservedStateLocation { + conf: location_conf, + }, + ); + } + } + + observed + } + + /// Used during [`Self::startup_reconcile`]: detach a list of unknown-to-us tenants from pageservers. + /// + /// This is safe to run in the background, because if we don't have this TenantShardId in our map of + /// tenants, then it is probably something incompletely deleted before: we will not fight with any + /// other task trying to attach it. + #[instrument(skip_all)] + async fn cleanup_locations(&self, cleanup: Vec<(TenantShardId, NodeId)>) { + let nodes = self.inner.read().unwrap().nodes.clone(); + + for (tenant_shard_id, node_id) in cleanup { + // A node reported a tenant_shard_id which is unknown to us: detach it. + let Some(node) = nodes.get(&node_id) else { + // This is legitimate; we run in the background and [`Self::startup_reconcile`] might have identified + // a location to clean up on a node that has since been removed. + tracing::info!( + "Not cleaning up location {node_id}/{tenant_shard_id}: node not found" + ); + continue; + }; + + if self.cancel.is_cancelled() { + break; + } + + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); + match client + .location_config( + tenant_shard_id, + LocationConfig { + mode: LocationConfigMode::Detached, + generation: None, + secondary_conf: None, + shard_number: tenant_shard_id.shard_number.0, + shard_count: tenant_shard_id.shard_count.literal(), + shard_stripe_size: 0, + tenant_conf: models::TenantConfig::default(), + }, + None, + false, + ) + .await + { + Ok(()) => { + tracing::info!( + "Detached unknown shard {tenant_shard_id} on pageserver {node_id}" + ); + } + Err(e) => { + // Non-fatal error: leaving a tenant shard behind that we are not managing shouldn't + // break anything. + tracing::error!( + "Failed to detach unknkown shard {tenant_shard_id} on pageserver {node_id}: {e}" + ); + } + } + } + } + + /// Long running background task that periodically wakes up and looks for shards that need + /// reconciliation. Reconciliation is fallible, so any reconciliation tasks that fail during + /// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible + /// for those retries. 
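+    /// A failed reconcile is therefore retried, at the latest, on the next
+    /// BACKGROUND_RECONCILE_PERIOD tick (20s below), for as long as this
+    /// controller remains the leader.
+    ///
+    /// Illustrative cadence (editor's note), matching the loop body below:
+    ///
+    ///     reconcile_all() -> 0 spawned
+    ///       -> optimize_all() -> 0 optimizations
+    ///         -> autosplit_tenants()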
+    #[instrument(skip_all)]
+    async fn background_reconcile(self: &Arc<Self>) {
+        self.startup_complete.clone().wait().await;
+
+        const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20);
+
+        let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD);
+        while !self.reconcilers_cancel.is_cancelled() {
+            tokio::select! {
+                _ = interval.tick() => {
+                    let reconciles_spawned = self.reconcile_all();
+                    if reconciles_spawned == 0 {
+                        // Run optimizer only when we didn't find any other work to do
+                        let optimizations = self.optimize_all().await;
+                        if optimizations == 0 {
+                            // Run new splits only when no optimizations are pending
+                            self.autosplit_tenants().await;
+                        }
+                    }
+                }
+                _ = self.reconcilers_cancel.cancelled() => return
+            }
+        }
+    }
+
+    #[instrument(skip_all)]
+    async fn spawn_heartbeat_driver(&self) {
+        self.startup_complete.clone().wait().await;
+
+        let mut interval = tokio::time::interval(self.config.heartbeat_interval);
+        while !self.cancel.is_cancelled() {
+            tokio::select! {
+                _ = interval.tick() => { }
+                _ = self.cancel.cancelled() => return
+            };
+
+            let nodes = {
+                let locked = self.inner.read().unwrap();
+                locked.nodes.clone()
+            };
+
+            let res = self.heartbeater.heartbeat(nodes).await;
+            if let Ok(deltas) = res {
+                for (node_id, state) in deltas.0 {
+                    let new_availability = match state {
+                        PageserverState::Available { utilization, .. } => {
+                            NodeAvailability::Active(utilization)
+                        }
+                        PageserverState::WarmingUp { started_at } => {
+                            NodeAvailability::WarmingUp(started_at)
+                        }
+                        PageserverState::Offline => {
+                            // The node might have been placed in the WarmingUp state
+                            // while the heartbeat round was on-going. Hence, filter out
+                            // offline transitions for WarmingUp nodes that are still within
+                            // their grace period.
+                            if let Ok(NodeAvailability::WarmingUp(started_at)) = self
+                                .get_node(node_id)
+                                .await
+                                .as_ref()
+                                .map(|n| n.get_availability())
+                            {
+                                let now = Instant::now();
+                                if now - *started_at >= self.config.max_warming_up_interval {
+                                    NodeAvailability::Offline
+                                } else {
+                                    NodeAvailability::WarmingUp(*started_at)
+                                }
+                            } else {
+                                NodeAvailability::Offline
+                            }
+                        }
+                    };
+
+                    // This is the code path for genuine availability transitions (i.e. node
+                    // goes unavailable and/or comes back online).
+                    let res = self
+                        .node_configure(node_id, Some(new_availability), None)
+                        .await;
+
+                    match res {
+                        Ok(()) => {}
+                        Err(ApiError::NotFound(_)) => {
+                            // This should be rare, but legitimate since the heartbeats are done
+                            // on a snapshot of the nodes.
+                            tracing::info!("Node {} was not found after heartbeat round", node_id);
+                        }
+                        Err(err) => {
+                            // Transition to active involves reconciling: if a node responds to a heartbeat then
+                            // becomes unavailable again, we may get an error here.
+                            tracing::error!(
+                                "Failed to update node {} after heartbeat round: {}",
+                                node_id,
+                                err
+                            );
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation
+    /// was successful and intent hasn't changed since the Reconciler was spawned, this will update
+    /// the observed state of the tenant such that subsequent calls to [`TenantShard::get_reconcile_needed`]
+    /// will indicate that reconciliation is not needed.
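+    ///
+    /// Illustrative race (editor's note): if the shard's intent moved on to
+    /// sequence N+1 while the Reconciler for sequence N was in flight, this merely
+    /// records N's observed state; a later maybe_reconcile pass spawns a fresh
+    /// Reconciler for N+1 rather than trusting the stale result.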
+ #[instrument(skip_all, fields( + tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), + sequence=%result.sequence + ))] + fn process_result(&self, mut result: ReconcileResult) { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); + let Some(tenant) = tenants.get_mut(&result.tenant_shard_id) else { + // A reconciliation result might race with removing a tenant: drop results for + // tenants that aren't in our map. + return; + }; + + // Usually generation should only be updated via this path, so the max() isn't + // needed, but it is used to handle out-of-band updates via. e.g. test hook. + tenant.generation = std::cmp::max(tenant.generation, result.generation); + + // If the reconciler signals that it failed to notify compute, set this state on + // the shard so that a future [`TenantShard::maybe_reconcile`] will try again. + tenant.pending_compute_notification = result.pending_compute_notification; + + // Let the TenantShard know it is idle. + tenant.reconcile_complete(result.sequence); + + // In case a node was deleted while this reconcile is in flight, filter it out of the update we will + // make to the tenant + result + .observed + .locations + .retain(|node_id, _loc| nodes.contains_key(node_id)); + + match result.result { + Ok(()) => { + for (node_id, loc) in &result.observed.locations { + if let Some(conf) = &loc.conf { + tracing::info!("Updating observed location {}: {:?}", node_id, conf); + } else { + tracing::info!("Setting observed location {} to None", node_id,) + } + } + + tenant.observed = result.observed; + tenant.waiter.advance(result.sequence); + } + Err(e) => { + match e { + ReconcileError::Cancel => { + tracing::info!("Reconciler was cancelled"); + } + ReconcileError::Remote(mgmt_api::Error::Cancelled) => { + // This might be due to the reconciler getting cancelled, or it might + // be due to the `Node` being marked offline. + tracing::info!("Reconciler cancelled during pageserver API call"); + } + _ => { + tracing::warn!("Reconcile error: {}", e); + } + } + + // Ordering: populate last_error before advancing error_seq, + // so that waiters will see the correct error after waiting. + tenant.set_last_error(result.sequence, e); + + for (node_id, o) in result.observed.locations { + tenant.observed.locations.insert(node_id, o); + } + } + } + + // Maybe some other work can proceed now that this job finished. + if self.reconciler_concurrency.available_permits() > 0 { + while let Ok(tenant_shard_id) = locked.delayed_reconcile_rx.try_recv() { + let (nodes, tenants, _scheduler) = locked.parts_mut(); + if let Some(shard) = tenants.get_mut(&tenant_shard_id) { + shard.delayed_reconcile = false; + self.maybe_reconcile_shard(shard, nodes); + } + + if self.reconciler_concurrency.available_permits() == 0 { + break; + } + } + } + } + + async fn process_results( + &self, + mut result_rx: tokio::sync::mpsc::UnboundedReceiver, + mut bg_compute_hook_result_rx: tokio::sync::mpsc::Receiver< + Result<(), (TenantShardId, NotifyError)>, + >, + ) { + loop { + // Wait for the next result, or for cancellation + tokio::select! 
{ + r = result_rx.recv() => { + match r { + Some(ReconcileResultRequest::ReconcileResult(result)) => {self.process_result(result);}, + None | Some(ReconcileResultRequest::Stop) => {break;} + } + } + _ = async{ + match bg_compute_hook_result_rx.recv().await { + Some(result) => { + if let Err((tenant_shard_id, notify_error)) = result { + tracing::warn!("Marking shard {tenant_shard_id} for notification retry, due to error {notify_error}"); + let mut locked = self.inner.write().unwrap(); + if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) { + shard.pending_compute_notification = true; + } + + } + }, + None => { + // This channel is dead, but we don't want to terminate the outer loop{}: just wait for shutdown + self.cancel.cancelled().await; + } + } + } => {}, + _ = self.cancel.cancelled() => { + break; + } + }; + } + } + + async fn process_aborts( + &self, + mut abort_rx: tokio::sync::mpsc::UnboundedReceiver, + ) { + loop { + // Wait for the next result, or for cancellation + let op = tokio::select! { + r = abort_rx.recv() => { + match r { + Some(op) => {op}, + None => {break;} + } + } + _ = self.cancel.cancelled() => { + break; + } + }; + + // Retry until shutdown: we must keep this request object alive until it is properly + // processed, as it holds a lock guard that prevents other operations trying to do things + // to the tenant while it is in a weird part-split state. + while !self.cancel.is_cancelled() { + match self.abort_tenant_shard_split(&op).await { + Ok(_) => break, + Err(e) => { + tracing::warn!( + "Failed to abort shard split on {}, will retry: {e}", + op.tenant_id + ); + + // If a node is unavailable, we hope that it has been properly marked Offline + // when we retry, so that the abort op will succeed. If the abort op is failing + // for some other reason, we will keep retrying forever, or until a human notices + // and does something about it (either fixing a pageserver or restarting the controller). + tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled()) + .await + .ok(); + } + } + } + } + } + + pub async fn spawn(config: Config, persistence: Arc) -> anyhow::Result> { + let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel(); + let (abort_tx, abort_rx) = tokio::sync::mpsc::unbounded_channel(); + + let leadership_cancel = CancellationToken::new(); + let leadership = Leadership::new(persistence.clone(), config.clone(), leadership_cancel); + let (leader, leader_step_down_state) = leadership.step_down_current_leader().await?; + + // Apply the migrations **after** the current leader has stepped down + // (or we've given up waiting for it), but **before** reading from the + // database. The only exception is reading the current leader before + // migrating. + persistence.migration_run().await?; + + tracing::info!("Loading nodes from database..."); + let nodes = persistence + .list_nodes() + .await? 
+            .into_iter()
+            .map(Node::from_persistent)
+            .collect::<Vec<_>>();
+        let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.get_id(), n)).collect();
+        tracing::info!("Loaded {} nodes from database.", nodes.len());
+
+        tracing::info!("Loading shards from database...");
+        let mut tenant_shard_persistence = persistence.list_tenant_shards().await?;
+        tracing::info!(
+            "Loaded {} shards from database.",
+            tenant_shard_persistence.len()
+        );
+
+        // If any shard splits were in progress, reset the database state to abort them
+        let mut tenant_shard_count_min_max: HashMap<TenantId, (ShardCount, ShardCount)> =
+            HashMap::new();
+        for tsp in &mut tenant_shard_persistence {
+            let shard = tsp.get_shard_identity()?;
+            let tenant_shard_id = tsp.get_tenant_shard_id()?;
+            let entry = tenant_shard_count_min_max
+                .entry(tenant_shard_id.tenant_id)
+                .or_insert_with(|| (shard.count, shard.count));
+            entry.0 = std::cmp::min(entry.0, shard.count);
+            entry.1 = std::cmp::max(entry.1, shard.count);
+        }
+
+        for (tenant_id, (count_min, count_max)) in tenant_shard_count_min_max {
+            if count_min != count_max {
+                // Aborting the split in the database and dropping the child shards is sufficient: the reconciliation in
+                // [`Self::startup_reconcile`] will implicitly drop the child shards on remote pageservers, or they'll
+                // be dropped later in [`Self::node_activate_reconcile`] if it isn't available right now.
+                tracing::info!("Aborting shard split {tenant_id} {count_min:?} -> {count_max:?}");
+                let abort_status = persistence.abort_shard_split(tenant_id, count_max).await?;
+
+                // We may never see the Complete status here: if the split was complete, we wouldn't have
+                // identified this tenant as having mismatching min/max counts.
+                assert!(matches!(abort_status, AbortShardSplitStatus::Aborted));
+
+                // Clear the splitting status in-memory, to reflect that we just aborted in the database
+                tenant_shard_persistence.iter_mut().for_each(|tsp| {
+                    // Set idle split state on those shards that we will retain.
+                    let tsp_tenant_id = TenantId::from_str(tsp.tenant_id.as_str()).unwrap();
+                    if tsp_tenant_id == tenant_id
+                        && tsp.get_shard_identity().unwrap().count == count_min
+                    {
+                        tsp.splitting = SplitState::Idle;
+                    } else if tsp_tenant_id == tenant_id {
+                        // Leave the splitting state on the child shards: this will be used next to
+                        // drop them.
+                        tracing::info!(
+                            "Shard {tsp_tenant_id} will be dropped after shard split abort",
+                        );
+                    }
+                });
+
+                // Drop shards for this tenant which we didn't just mark idle (i.e. child shards of the aborted split)
+                tenant_shard_persistence.retain(|tsp| {
+                    TenantId::from_str(tsp.tenant_id.as_str()).unwrap() != tenant_id
+                        || tsp.splitting == SplitState::Idle
+                });
+            }
+        }
+
+        let mut tenants = BTreeMap::new();
+
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        #[cfg(feature = "testing")]
+        {
+            // Hack: insert scheduler state for all nodes referenced by shards, as compatibility
+            // tests only store the shards, not the nodes. The nodes will be loaded shortly
+            // after when pageservers start up and register.
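+            //
+            // (Editor's note) Concretely: every distinct `generation_pageserver`
+            // referenced by a loaded shard gets a placeholder Node (empty host,
+            // dummy ports) upserted into the scheduler, so per-node shard counts
+            // have somewhere to accumulate until real registrations arrive.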
+ let mut node_ids = HashSet::new(); + for tsp in &tenant_shard_persistence { + if let Some(node_id) = tsp.generation_pageserver { + node_ids.insert(node_id); + } + } + for node_id in node_ids { + tracing::info!("Creating node {} in scheduler for tests", node_id); + let node = Node::new( + NodeId(node_id as u64), + "".to_string(), + 123, + "".to_string(), + 123, + "test_az".to_string(), + ); + + scheduler.node_upsert(&node); + } + } + for tsp in tenant_shard_persistence { + let tenant_shard_id = tsp.get_tenant_shard_id()?; + + // We will populate intent properly later in [`Self::startup_reconcile`], initially populate + // it with what we can infer: the node for which a generation was most recently issued. + let mut intent = IntentState::new(); + if let Some(generation_pageserver) = tsp.generation_pageserver.map(|n| NodeId(n as u64)) + { + if nodes.contains_key(&generation_pageserver) { + intent.set_attached(&mut scheduler, Some(generation_pageserver)); + } else { + // If a node was removed before being completely drained, it is legal for it to leave behind a `generation_pageserver` referring + // to a non-existent node, because node deletion doesn't block on completing the reconciliations that will issue new generations + // on different pageservers. + tracing::warn!("Tenant shard {tenant_shard_id} references non-existent node {generation_pageserver} in database, will be rescheduled"); + } + } + let new_tenant = TenantShard::from_persistent(tsp, intent)?; + + tenants.insert(tenant_shard_id, new_tenant); + } + + let (startup_completion, startup_complete) = utils::completion::channel(); + + // This channel is continuously consumed by process_results, so doesn't need to be very large. + let (bg_compute_notify_result_tx, bg_compute_notify_result_rx) = + tokio::sync::mpsc::channel(512); + + let (delayed_reconcile_tx, delayed_reconcile_rx) = + tokio::sync::mpsc::channel(MAX_DELAYED_RECONCILES); + + let cancel = CancellationToken::new(); + let reconcilers_cancel = cancel.child_token(); + + let heartbeater = Heartbeater::new( + config.jwt_token.clone(), + config.max_offline_interval, + config.max_warming_up_interval, + cancel.clone(), + ); + + let initial_leadership_status = if config.start_as_candidate { + LeadershipStatus::Candidate + } else { + LeadershipStatus::Leader + }; + + let this = Arc::new(Self { + inner: Arc::new(std::sync::RwLock::new(ServiceState::new( + nodes, + tenants, + scheduler, + delayed_reconcile_rx, + initial_leadership_status, + ))), + config: config.clone(), + persistence, + compute_hook: Arc::new(ComputeHook::new(config.clone())), + result_tx, + heartbeater, + reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new( + config.reconciler_concurrency, + )), + delayed_reconcile_tx, + abort_tx, + startup_complete: startup_complete.clone(), + cancel, + reconcilers_cancel, + gate: Gate::default(), + reconcilers_gate: Gate::default(), + tenant_op_locks: Default::default(), + node_op_locks: Default::default(), + }); + + let result_task_this = this.clone(); + tokio::task::spawn(async move { + // Block shutdown until we're done (we must respect self.cancel) + if let Ok(_gate) = result_task_this.gate.enter() { + result_task_this + .process_results(result_rx, bg_compute_notify_result_rx) + .await + } + }); + + tokio::task::spawn({ + let this = this.clone(); + async move { + // Block shutdown until we're done (we must respect self.cancel) + if let Ok(_gate) = this.gate.enter() { + this.process_aborts(abort_rx).await + } + } + }); + + tokio::task::spawn({ + let this = 
this.clone(); + async move { + if let Ok(_gate) = this.gate.enter() { + loop { + tokio::select! { + _ = this.cancel.cancelled() => { + break; + }, + _ = tokio::time::sleep(Duration::from_secs(60)) => {} + }; + this.tenant_op_locks.housekeeping(); + } + } + } + }); + + tokio::task::spawn({ + let this = this.clone(); + // We will block the [`Service::startup_complete`] barrier until [`Self::startup_reconcile`] + // is done. + let startup_completion = startup_completion.clone(); + async move { + // Block shutdown until we're done (we must respect self.cancel) + let Ok(_gate) = this.gate.enter() else { + return; + }; + + this.startup_reconcile(leader, leader_step_down_state, bg_compute_notify_result_tx) + .await; + + drop(startup_completion); + } + }); + + tokio::task::spawn({ + let this = this.clone(); + let startup_complete = startup_complete.clone(); + async move { + startup_complete.wait().await; + this.background_reconcile().await; + } + }); + + tokio::task::spawn({ + let this = this.clone(); + let startup_complete = startup_complete.clone(); + async move { + startup_complete.wait().await; + this.spawn_heartbeat_driver().await; + } + }); + + Ok(this) + } + + pub(crate) async fn attach_hook( + &self, + attach_req: AttachHookRequest, + ) -> anyhow::Result { + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + attach_req.tenant_shard_id.tenant_id, + TenantOperations::AttachHook, + ) + .await; + + // This is a test hook. To enable using it on tenants that were created directly with + // the pageserver API (not via this service), we will auto-create any missing tenant + // shards with default state. + let insert = { + let locked = self.inner.write().unwrap(); + !locked.tenants.contains_key(&attach_req.tenant_shard_id) + }; + + if insert { + let tsp = TenantShardPersistence { + tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(), + shard_number: attach_req.tenant_shard_id.shard_number.0 as i32, + shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32, + shard_stripe_size: 0, + generation: attach_req.generation_override.or(Some(0)), + generation_pageserver: None, + placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(), + config: serde_json::to_string(&TenantConfig::default()).unwrap(), + splitting: SplitState::default(), + scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) + .unwrap(), + preferred_az_id: None, + }; + + match self.persistence.insert_tenant_shards(vec![tsp]).await { + Err(e) => match e { + DatabaseError::Query(diesel::result::Error::DatabaseError( + DatabaseErrorKind::UniqueViolation, + _, + )) => { + tracing::info!( + "Raced with another request to insert tenant {}", + attach_req.tenant_shard_id + ) + } + _ => return Err(e.into()), + }, + Ok(()) => { + tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id); + + let mut locked = self.inner.write().unwrap(); + locked.tenants.insert( + attach_req.tenant_shard_id, + TenantShard::new( + attach_req.tenant_shard_id, + ShardIdentity::unsharded(), + PlacementPolicy::Attached(0), + ), + ); + tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id); + } + } + } + + let new_generation = if let Some(req_node_id) = attach_req.node_id { + let maybe_tenant_conf = { + let locked = self.inner.write().unwrap(); + locked + .tenants + .get(&attach_req.tenant_shard_id) + .map(|t| t.config.clone()) + }; + + match maybe_tenant_conf { + Some(conf) => { + let new_generation = self + .persistence + 
.increment_generation(attach_req.tenant_shard_id, req_node_id)
+                        .await?;
+
+                    // Persist the placement policy update. This is required
+                    // when we re-attach a detached tenant.
+                    self.persistence
+                        .update_tenant_shard(
+                            TenantFilter::Shard(attach_req.tenant_shard_id),
+                            Some(PlacementPolicy::Attached(0)),
+                            Some(conf),
+                            None,
+                            None,
+                        )
+                        .await?;
+                    Some(new_generation)
+                }
+                None => {
+                    anyhow::bail!("Attach hook handling raced with tenant removal")
+                }
+            }
+        } else {
+            self.persistence.detach(attach_req.tenant_shard_id).await?;
+            None
+        };
+
+        let mut locked = self.inner.write().unwrap();
+        let (_nodes, tenants, scheduler) = locked.parts_mut();
+
+        let tenant_shard = tenants
+            .get_mut(&attach_req.tenant_shard_id)
+            .expect("Checked for existence above");
+
+        if let Some(new_generation) = new_generation {
+            tenant_shard.generation = Some(new_generation);
+            tenant_shard.policy = PlacementPolicy::Attached(0);
+        } else {
+            // This is a detach notification. We must update placement policy to avoid re-attaching
+            // during background scheduling/reconciliation, or during storage controller restart.
+            assert!(attach_req.node_id.is_none());
+            tenant_shard.policy = PlacementPolicy::Detached;
+        }
+
+        if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
+            tracing::info!(
+                tenant_id = %attach_req.tenant_shard_id,
+                ps_id = %attaching_pageserver,
+                generation = ?tenant_shard.generation,
+                "issuing",
+            );
+        } else if let Some(ps_id) = tenant_shard.intent.get_attached() {
+            tracing::info!(
+                tenant_id = %attach_req.tenant_shard_id,
+                %ps_id,
+                generation = ?tenant_shard.generation,
+                "dropping",
+            );
+        } else {
+            tracing::info!(
+                tenant_id = %attach_req.tenant_shard_id,
+                "no-op: tenant already has no pageserver");
+        }
+        tenant_shard
+            .intent
+            .set_attached(scheduler, attach_req.node_id);
+
+        tracing::info!(
+            "attach_hook: tenant {} set generation {:?}, pageserver {}",
+            attach_req.tenant_shard_id,
+            tenant_shard.generation,
+            // TODO: this is an odd number of 0xf's
+            attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
+        );
+
+        // Trick the reconciler into not doing anything for this tenant: this helps
+        // tests that manually configure a tenant on the pageserver, and then call this
+        // attach hook: they don't want background reconciliation to modify what they
+        // did to the pageserver.
+        #[cfg(feature = "testing")]
+        {
+            if let Some(node_id) = attach_req.node_id {
+                tenant_shard.observed.locations = HashMap::from([(
+                    node_id,
+                    ObservedStateLocation {
+                        conf: Some(attached_location_conf(
+                            tenant_shard.generation.unwrap(),
+                            &tenant_shard.shard,
+                            &tenant_shard.config,
+                            &PlacementPolicy::Attached(0),
+                        )),
+                    },
+                )]);
+            } else {
+                tenant_shard.observed.locations.clear();
+            }
+        }
+
+        Ok(AttachHookResponse {
+            gen: attach_req
+                .node_id
+                .map(|_| tenant_shard.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()),
+        })
+    }
+
+    pub(crate) fn inspect(&self, inspect_req: InspectRequest) -> InspectResponse {
+        let locked = self.inner.read().unwrap();
+
+        let tenant_shard = locked.tenants.get(&inspect_req.tenant_shard_id);
+
+        InspectResponse {
+            attachment: tenant_shard.and_then(|s| {
+                s.intent
+                    .get_attached()
+                    .map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps))
+            }),
+        }
+    }
+
+    // When the availability state of a node transitions to active, we must do a full reconciliation
+    // of LocationConfigs on that node. This is because while a node was offline:
+    // - we might have proceeded through startup_reconcile without checking for extraneous LocationConfigs on this node
+    // - aborting a tenant shard split might have left rogue child shards behind on this node.
+    //
+    // This function must complete _before_ setting a `Node` to Active: once it is set to Active, other
+    // Reconcilers might communicate with the node, and these must not overlap with the work we do in
+    // this function.
+    //
+    // The reconciliation logic in here is very similar to what [`Self::startup_reconcile`] does, but
+    // is written for a single node rather than as a batch job for all nodes.
+    #[tracing::instrument(skip_all, fields(node_id=%node.get_id()))]
+    async fn node_activate_reconcile(
+        &self,
+        mut node: Node,
+        _lock: &TracingExclusiveGuard<NodeOperations>,
+    ) -> Result<(), ApiError> {
+        // This Node is a mutable local copy: we will set it active so that we can use its
+        // API client to reconcile with the node. The Node in [`Self::nodes`] will get updated
+        // later.
+        node.set_availability(NodeAvailability::Active(PageserverUtilization::full()));
+
+        let configs = match node
+            .with_client_retries(
+                |client| async move { client.list_location_config().await },
+                &self.config.jwt_token,
+                1,
+                5,
+                SHORT_RECONCILE_TIMEOUT,
+                &self.cancel,
+            )
+            .await
+        {
+            None => {
+                // We're shutting down (the Node's cancellation token can't have fired, because
+                // we're the only scope that has a reference to it, and we didn't fire it).
+                return Err(ApiError::ShuttingDown);
+            }
+            Some(Err(e)) => {
+                // This node didn't succeed listing its locations: it may not proceed to active state
+                // as it is apparently unavailable.
+                return Err(ApiError::PreconditionFailed(
+                    format!("Failed to query node location configs, cannot activate ({e})").into(),
+                ));
+            }
+            Some(Ok(configs)) => configs,
+        };
+        tracing::info!("Loaded {} LocationConfigs", configs.tenant_shards.len());
+
+        let mut cleanup = Vec::new();
+        {
+            let mut locked = self.inner.write().unwrap();
+
+            for (tenant_shard_id, observed_loc) in configs.tenant_shards {
+                let Some(tenant_shard) = locked.tenants.get_mut(&tenant_shard_id) else {
+                    cleanup.push(tenant_shard_id);
+                    continue;
+                };
+                tenant_shard
+                    .observed
+                    .locations
+                    .insert(node.get_id(), ObservedStateLocation { conf: observed_loc });
+            }
+        }
+
+        for tenant_shard_id in cleanup {
+            tracing::info!("Detaching {tenant_shard_id}");
+            match node
+                .with_client_retries(
+                    |client| async move {
+                        let config = LocationConfig {
+                            mode: LocationConfigMode::Detached,
+                            generation: None,
+                            secondary_conf: None,
+                            shard_number: tenant_shard_id.shard_number.0,
+                            shard_count: tenant_shard_id.shard_count.literal(),
+                            shard_stripe_size: 0,
+                            tenant_conf: models::TenantConfig::default(),
+                        };
+                        client
+                            .location_config(tenant_shard_id, config, None, false)
+                            .await
+                    },
+                    &self.config.jwt_token,
+                    1,
+                    5,
+                    SHORT_RECONCILE_TIMEOUT,
+                    &self.cancel,
+                )
+                .await
+            {
+                None => {
+                    // We're shutting down (the Node's cancellation token can't have fired, because
+                    // we're the only scope that has a reference to it, and we didn't fire it).
+                    return Err(ApiError::ShuttingDown);
+                }
+                Some(Err(e)) => {
+                    // Do not let the node proceed to Active state if it is not responsive to requests
+                    // to detach. This could happen if e.g. a shutdown bug in the pageserver is preventing
+                    // detach completing: we should not let this node back into the set of nodes considered
+                    // okay for scheduling.
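+                    // (Editor's note) The node is left in its previous, non-Active
+                    // state; the Conflict below surfaces as HTTP 409 to the caller,
+                    // who can retry activation once the pageserver answers detach
+                    // requests again.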
+ return Err(ApiError::Conflict(format!( + "Node {node} failed to detach {tenant_shard_id}: {e}" + ))); + } + Some(Ok(_)) => {} + }; + } + + Ok(()) + } + + pub(crate) async fn re_attach( + &self, + reattach_req: ReAttachRequest, + ) -> Result { + if let Some(register_req) = reattach_req.register { + self.node_register(register_req).await?; + } + + // Ordering: we must persist generation number updates before making them visible in the in-memory state + let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?; + + tracing::info!( + node_id=%reattach_req.node_id, + "Incremented {} tenant shards' generations", + incremented_generations.len() + ); + + // Apply the updated generation to our in-memory state, and + // gather discover secondary locations. + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + let mut response = ReAttachResponse { + tenants: Vec::new(), + }; + + // TODO: cancel/restart any running reconciliation for this tenant, it might be trying + // to call location_conf API with an old generation. Wait for cancellation to complete + // before responding to this request. Requires well implemented CancellationToken logic + // all the way to where we call location_conf. Even then, there can still be a location_conf + // request in flight over the network: TODO handle that by making location_conf API refuse + // to go backward in generations. + + // Scan through all shards, applying updates for ones where we updated generation + // and identifying shards that intend to have a secondary location on this node. + for (tenant_shard_id, shard) in tenants { + if let Some(new_gen) = incremented_generations.get(tenant_shard_id) { + let new_gen = *new_gen; + response.tenants.push(ReAttachResponseTenant { + id: *tenant_shard_id, + gen: Some(new_gen.into().unwrap()), + // A tenant is only put into multi or stale modes in the middle of a [`Reconciler::live_migrate`] + // execution. If a pageserver is restarted during that process, then the reconcile pass will + // fail, and start from scratch, so it doesn't make sense for us to try and preserve + // the stale/multi states at this point. + mode: LocationConfigMode::AttachedSingle, + }); + + shard.generation = std::cmp::max(shard.generation, Some(new_gen)); + if let Some(observed) = shard.observed.locations.get_mut(&reattach_req.node_id) { + // Why can we update `observed` even though we're not sure our response will be received + // by the pageserver? Because the pageserver will not proceed with startup until + // it has processed response: if it loses it, we'll see another request and increment + // generation again, avoiding any uncertainty about dirtiness of tenant's state. + if let Some(conf) = observed.conf.as_mut() { + conf.generation = new_gen.into(); + } + } else { + // This node has no observed state for the shard: perhaps it was offline + // when the pageserver restarted. Insert a None, so that the Reconciler + // will be prompted to learn the location's state before it makes changes. + shard + .observed + .locations + .insert(reattach_req.node_id, ObservedStateLocation { conf: None }); + } + } else if shard.intent.get_secondary().contains(&reattach_req.node_id) { + // Ordering: pageserver will not accept /location_config requests until it has + // finished processing the response from re-attach. So we can update our in-memory state + // now, and be confident that we are not stamping on the result of some later location config. 
+ // TODO: however, we are not strictly ordered wrt ReconcileResults queue, + // so we might update observed state here, and then get over-written by some racing + // ReconcileResult. The impact is low however, since we have set state on pageserver something + // that matches intent, so worst case if we race then we end up doing a spurious reconcile. + + response.tenants.push(ReAttachResponseTenant { + id: *tenant_shard_id, + gen: None, + mode: LocationConfigMode::Secondary, + }); + + // We must not update observed, because we have no guarantee that our + // response will be received by the pageserver. This could leave it + // falsely dirty, but the resulting reconcile should be idempotent. + } + } + + // We consider a node Active once we have composed a re-attach response, but we + // do not call [`Self::node_activate_reconcile`]: the handling of the re-attach response + // implicitly synchronizes the LocationConfigs on the node. + // + // Setting a node active unblocks any Reconcilers that might write to the location config API, + // but those requests will not be accepted by the node until it has finished processing + // the re-attach response. + // + // Additionally, reset the nodes scheduling policy to match the conditional update done + // in [`Persistence::re_attach`]. + if let Some(node) = nodes.get(&reattach_req.node_id) { + let reset_scheduling = matches!( + node.get_scheduling(), + NodeSchedulingPolicy::PauseForRestart + | NodeSchedulingPolicy::Draining + | NodeSchedulingPolicy::Filling + ); + + let mut new_nodes = (**nodes).clone(); + if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) { + if reset_scheduling { + node.set_scheduling(NodeSchedulingPolicy::Active); + } + + tracing::info!("Marking {} warming-up on reattach", reattach_req.node_id); + node.set_availability(NodeAvailability::WarmingUp(std::time::Instant::now())); + + scheduler.node_upsert(node); + let new_nodes = Arc::new(new_nodes); + *nodes = new_nodes; + } else { + tracing::error!( + "Reattaching node {} was removed while processing the request", + reattach_req.node_id + ); + } + } + + Ok(response) + } + + pub(crate) async fn validate( + &self, + validate_req: ValidateRequest, + ) -> Result { + // Fast in-memory check: we may reject validation on anything that doesn't match our + // in-memory generation for a shard + let in_memory_result = { + let mut in_memory_result = Vec::new(); + let locked = self.inner.read().unwrap(); + for req_tenant in validate_req.tenants { + if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) { + let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen)); + tracing::info!( + "handle_validate: {}(gen {}): valid={valid} (latest {:?})", + req_tenant.id, + req_tenant.gen, + tenant_shard.generation + ); + + in_memory_result.push((req_tenant.id, Generation::new(req_tenant.gen), valid)); + } else { + // This is legal: for example during a shard split the pageserver may still + // have deletions in its queue from the old pre-split shard, or after deletion + // of a tenant that was busy with compaction/gc while being deleted. + tracing::info!( + "Refusing deletion validation for missing shard {}", + req_tenant.id + ); + } + } + + in_memory_result + }; + + // Database calls to confirm validity for anything that passed the in-memory check. We must do this + // in case of controller split-brain, where some other controller process might have incremented the generation. 
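+        // Editor's sketch of the invariant enforced below: a generation is only
+        // confirmed when both layers agree, i.e.
+        //
+        //     valid = (in_memory_gen == req.gen) && (db_gen == Some(req.gen))
+        //
+        // Failing validation is always safe: at worst the pageserver keeps an
+        // object in S3 that it could have deleted.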
+        let db_generations = self
+            .persistence
+            .shard_generations(in_memory_result.iter().filter_map(|i| {
+                if i.2 {
+                    Some(&i.0)
+                } else {
+                    None
+                }
+            }))
+            .await?;
+        let db_generations = db_generations.into_iter().collect::<HashMap<_, _>>();
+
+        let mut response = ValidateResponse {
+            tenants: Vec::new(),
+        };
+        for (tenant_shard_id, validate_generation, valid) in in_memory_result.into_iter() {
+            let valid = if valid {
+                let db_generation = db_generations.get(&tenant_shard_id);
+                db_generation == Some(&Some(validate_generation))
+            } else {
+                // If in-memory state says it's invalid, trust that. It's always safe to fail a validation, at worst
+                // this prevents a pageserver from cleaning up an object in S3.
+                false
+            };
+
+            response.tenants.push(ValidateResponseTenant {
+                id: tenant_shard_id,
+                valid,
+            })
+        }
+
+        Ok(response)
+    }
+
+    pub(crate) async fn tenant_create(
+        &self,
+        create_req: TenantCreateRequest,
+    ) -> Result<TenantCreateResponse, ApiError> {
+        let tenant_id = create_req.new_tenant_id.tenant_id;
+
+        // Exclude any concurrent attempts to create/access the same tenant ID
+        let _tenant_lock = trace_exclusive_lock(
+            &self.tenant_op_locks,
+            create_req.new_tenant_id.tenant_id,
+            TenantOperations::Create,
+        )
+        .await;
+        let (response, waiters) = self.do_tenant_create(create_req).await?;
+
+        if let Err(e) = self.await_waiters(waiters, RECONCILE_TIMEOUT).await {
+            // Avoid deadlock: reconcile may fail while notifying compute, if the cloud control plane refuses to
+            // accept compute notifications while it is in the process of creating. Reconciliation will
+            // be retried in the background.
+            tracing::warn!(%tenant_id, "Reconcile not done yet while creating tenant ({e})");
+        }
+        Ok(response)
+    }
+
+    pub(crate) async fn do_tenant_create(
+        &self,
+        create_req: TenantCreateRequest,
+    ) -> Result<(TenantCreateResponse, Vec<ReconcilerWaiter>), ApiError> {
+        let placement_policy = create_req
+            .placement_policy
+            .clone()
+            // As a default, zero secondaries is convenient for tests that don't choose a policy.
+            .unwrap_or(PlacementPolicy::Attached(0));
+
+        // This service expects to handle sharding itself: it is an error to try and directly create
+        // a particular shard here.
+        let tenant_id = if !create_req.new_tenant_id.is_unsharded() {
+            return Err(ApiError::BadRequest(anyhow::anyhow!(
+                "Attempted to create a specific shard, this API is for creating the whole tenant"
+            )));
+        } else {
+            create_req.new_tenant_id.tenant_id
+        };
+
+        tracing::info!(
+            "Creating tenant {}, shard_count={:?}",
+            create_req.new_tenant_id,
+            create_req.shard_parameters.count,
+        );
+
+        let create_ids = (0..create_req.shard_parameters.count.count())
+            .map(|i| TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(i),
+                shard_count: create_req.shard_parameters.count,
+            })
+            .collect::<Vec<_>>();
+
+        // If the caller specifies a None generation, it means "start from default". This is different
+        // to [`Self::tenant_location_config`], where a None generation is used to represent
+        // an incompletely-onboarded tenant.
+        let initial_generation = if matches!(placement_policy, PlacementPolicy::Secondary) {
+            tracing::info!(
+                "tenant_create: secondary mode, generation is_some={}",
+                create_req.generation.is_some()
+            );
+            create_req.generation.map(Generation::new)
+        } else {
+            tracing::info!(
+                "tenant_create: not secondary mode, generation is_some={}",
+                create_req.generation.is_some()
+            );
+            Some(
+                create_req
+                    .generation
+                    .map(Generation::new)
+                    .unwrap_or(INITIAL_GENERATION),
+            )
+        };
+
+        // Ordering: we persist tenant shards before creating them on the pageserver.
This enables a caller + // to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart + // during the creation, rather than risking leaving orphan objects in S3. + let persist_tenant_shards = create_ids + .iter() + .map(|tenant_shard_id| TenantShardPersistence { + tenant_id: tenant_shard_id.tenant_id.to_string(), + shard_number: tenant_shard_id.shard_number.0 as i32, + shard_count: tenant_shard_id.shard_count.literal() as i32, + shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32, + generation: initial_generation.map(|g| g.into().unwrap() as i32), + // The pageserver is not known until scheduling happens: we will set this column when + // incrementing the generation the first time we attach to a pageserver. + generation_pageserver: None, + placement_policy: serde_json::to_string(&placement_policy).unwrap(), + config: serde_json::to_string(&create_req.config).unwrap(), + splitting: SplitState::default(), + scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) + .unwrap(), + preferred_az_id: None, + }) + .collect(); + + match self + .persistence + .insert_tenant_shards(persist_tenant_shards) + .await + { + Ok(_) => {} + Err(DatabaseError::Query(diesel::result::Error::DatabaseError( + DatabaseErrorKind::UniqueViolation, + _, + ))) => { + // Unique key violation: this is probably a retry. Because the shard count is part of the unique key, + // if we see a unique key violation it means that the creation request's shard count matches the previous + // creation's shard count. + tracing::info!("Tenant shards already present in database, proceeding with idempotent creation..."); + } + // Any other database error is unexpected and a bug. + Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))), + }; + + let mut schedule_context = ScheduleContext::default(); + let mut schedule_error = None; + let mut response_shards = Vec::new(); + for tenant_shard_id in create_ids { + tracing::info!("Creating shard {tenant_shard_id}..."); + + let outcome = self + .do_initial_shard_scheduling( + tenant_shard_id, + initial_generation, + &create_req.shard_parameters, + create_req.config.clone(), + placement_policy.clone(), + &mut schedule_context, + ) + .await; + + match outcome { + InitialShardScheduleOutcome::Scheduled(resp) => response_shards.push(resp), + InitialShardScheduleOutcome::NotScheduled => {} + InitialShardScheduleOutcome::ShardScheduleError(err) => { + schedule_error = Some(err); + } + } + } + + let preferred_azs = { + let locked = self.inner.read().unwrap(); + response_shards + .iter() + .filter_map(|resp| { + let az_id = locked + .nodes + .get(&resp.node_id) + .map(|n| n.get_availability_zone_id().to_string())?; + + Some((resp.shard_id, az_id)) + }) + .collect::>() + }; + + // Note that we persist the preferred AZ for the new shards separately. + // In theory, we could "peek" the scheduler to determine where the shard will + // land, but the subsequent "real" call into the scheduler might select a different + // node. Hence, we do this awkward update to keep things consistent. 
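+        //
+        // (Editor's note) The database write below returns the set of rows it
+        // actually updated, and only those are copied into the in-memory shards,
+        // so memory never records a preferred AZ that the database did not accept.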
+        let updated = self
+            .persistence
+            .set_tenant_shard_preferred_azs(preferred_azs)
+            .await
+            .map_err(|err| {
+                ApiError::InternalServerError(anyhow::anyhow!(
+                    "Failed to persist preferred az ids: {err}"
+                ))
+            })?;
+
+        {
+            let mut locked = self.inner.write().unwrap();
+            for (tid, az_id) in updated {
+                if let Some(shard) = locked.tenants.get_mut(&tid) {
+                    shard.set_preferred_az(az_id);
+                }
+            }
+        }
+
+        // If we failed to schedule shards, then they are still created in the controller,
+        // but we return an error to the requester to avoid a silent failure when someone
+        // tries to e.g. create a tenant whose placement policy requires more nodes than
+        // are present in the system. We do this here rather than in the above loop, to
+        // avoid situations where we only create a subset of shards in the tenant.
+        if let Some(e) = schedule_error {
+            return Err(ApiError::Conflict(format!(
+                "Failed to schedule shard(s): {e}"
+            )));
+        }
+
+        let waiters = {
+            let mut locked = self.inner.write().unwrap();
+            let (nodes, tenants, _scheduler) = locked.parts_mut();
+            tenants
+                .range_mut(TenantShardId::tenant_range(tenant_id))
+                .filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes))
+                .collect::<Vec<_>>()
+        };
+
+        Ok((
+            TenantCreateResponse {
+                shards: response_shards,
+            },
+            waiters,
+        ))
+    }
+
+    /// Helper for tenant creation that does the scheduling for an individual shard. Covers both the
+    /// case of a new tenant and a pre-existing one.
+    async fn do_initial_shard_scheduling(
+        &self,
+        tenant_shard_id: TenantShardId,
+        initial_generation: Option<Generation>,
+        shard_params: &ShardParameters,
+        config: TenantConfig,
+        placement_policy: PlacementPolicy,
+        schedule_context: &mut ScheduleContext,
+    ) -> InitialShardScheduleOutcome {
+        let mut locked = self.inner.write().unwrap();
+        let (_nodes, tenants, scheduler) = locked.parts_mut();
+
+        use std::collections::btree_map::Entry;
+        match tenants.entry(tenant_shard_id) {
+            Entry::Occupied(mut entry) => {
+                tracing::info!("Tenant shard {tenant_shard_id} already exists while creating");
+
+                // TODO: schedule() should take an anti-affinity expression that pushes
+                // attached and secondary locations (independently) away from those
+                // pageservers also holding a shard for this tenant.
+
+                if let Err(err) = entry.get_mut().schedule(scheduler, schedule_context) {
+                    return InitialShardScheduleOutcome::ShardScheduleError(err);
+                }
+
+                if let Some(node_id) = entry.get().intent.get_attached() {
+                    let generation = entry
+                        .get()
+                        .generation
+                        .expect("Generation is set when in attached mode");
+                    InitialShardScheduleOutcome::Scheduled(TenantCreateResponseShard {
+                        shard_id: tenant_shard_id,
+                        node_id: *node_id,
+                        generation: generation.into().unwrap(),
+                    })
+                } else {
+                    InitialShardScheduleOutcome::NotScheduled
+                }
+            }
+            Entry::Vacant(entry) => {
+                let state = entry.insert(TenantShard::new(
+                    tenant_shard_id,
+                    ShardIdentity::from_params(tenant_shard_id.shard_number, shard_params),
+                    placement_policy,
+                ));
+
+                state.generation = initial_generation;
+                state.config = config;
+                if let Err(e) = state.schedule(scheduler, schedule_context) {
+                    return InitialShardScheduleOutcome::ShardScheduleError(e);
+                }
+
+                // Only include shards in result if we are attaching: the purpose
+                // of the response is to tell the caller where the shards are attached.
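+                // For orientation: the three outcomes modelled by this helper (the enum
+                // is defined elsewhere in this file) are roughly:
+                //   Scheduled(TenantCreateResponseShard) -- shard has an attached location
+                //   NotScheduled                         -- e.g. secondary-only placement
+                //   ShardScheduleError(_)                -- no suitable node was found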
+ if let Some(node_id) = state.intent.get_attached() { + let generation = state + .generation + .expect("Generation is set when in attached mode"); + InitialShardScheduleOutcome::Scheduled(TenantCreateResponseShard { + shard_id: tenant_shard_id, + node_id: *node_id, + generation: generation.into().unwrap(), + }) + } else { + InitialShardScheduleOutcome::NotScheduled + } + } + } + } + + /// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded + /// wait for reconciliation to complete before responding. + async fn await_waiters( + &self, + waiters: Vec, + timeout: Duration, + ) -> Result<(), ReconcileWaitError> { + let deadline = Instant::now().checked_add(timeout).unwrap(); + for waiter in waiters { + let timeout = deadline.duration_since(Instant::now()); + waiter.wait_timeout(timeout).await?; + } + + Ok(()) + } + + /// Same as [`Service::await_waiters`], but returns the waiters which are still + /// in progress + async fn await_waiters_remainder( + &self, + waiters: Vec, + timeout: Duration, + ) -> Vec { + let deadline = Instant::now().checked_add(timeout).unwrap(); + for waiter in waiters.iter() { + let timeout = deadline.duration_since(Instant::now()); + let _ = waiter.wait_timeout(timeout).await; + } + + waiters + .into_iter() + .filter(|waiter| matches!(waiter.get_status(), ReconcilerStatus::InProgress)) + .collect::>() + } + + /// Part of [`Self::tenant_location_config`]: dissect an incoming location config request, + /// and transform it into either a tenant creation of a series of shard updates. + /// + /// If the incoming request makes no changes, a [`TenantCreateOrUpdate::Update`] result will + /// still be returned. + fn tenant_location_config_prepare( + &self, + tenant_id: TenantId, + req: TenantLocationConfigRequest, + ) -> TenantCreateOrUpdate { + let mut updates = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + // Use location config mode as an indicator of policy. + let placement_policy = match req.config.mode { + LocationConfigMode::Detached => PlacementPolicy::Detached, + LocationConfigMode::Secondary => PlacementPolicy::Secondary, + LocationConfigMode::AttachedMulti + | LocationConfigMode::AttachedSingle + | LocationConfigMode::AttachedStale => { + if nodes.len() > 1 { + PlacementPolicy::Attached(1) + } else { + // Convenience for dev/test: if we just have one pageserver, import + // tenants into non-HA mode so that scheduling will succeed. + PlacementPolicy::Attached(0) + } + } + }; + + let mut create = true; + for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + // Saw an existing shard: this is not a creation + create = false; + + // Shards may have initially been created by a Secondary request, where we + // would have left generation as None. + // + // We only update generation the first time we see an attached-mode request, + // and if there is no existing generation set. The caller is responsible for + // ensuring that no non-storage-controller pageserver ever uses a higher + // generation than they passed in here. 
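+            // Worked example (hypothetical): the first AttachedSingle request carrying
+            // generation=Some(5) for a shard whose generation is None sets the shard's
+            // generation to Some(5); later requests leave it untouched, because from that
+            // point on the storage controller is the sole issuer of generation numbers.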
+            use LocationConfigMode::*;
+            let set_generation = match req.config.mode {
+                AttachedMulti | AttachedSingle | AttachedStale if shard.generation.is_none() => {
+                    req.config.generation.map(Generation::new)
+                }
+                _ => None,
+            };
+
+            updates.push(ShardUpdate {
+                tenant_shard_id: *shard_id,
+                placement_policy: placement_policy.clone(),
+                tenant_config: req.config.tenant_conf.clone(),
+                generation: set_generation,
+            });
+        }
+
+        if create {
+            use LocationConfigMode::*;
+            let generation = match req.config.mode {
+                AttachedMulti | AttachedSingle | AttachedStale => req.config.generation,
+                // If a caller provided a generation in a non-attached request, ignore it
+                // and leave our generation as None: this enables a subsequent update to set
+                // the generation when setting an attached mode for the first time.
+                _ => None,
+            };
+
+            TenantCreateOrUpdate::Create(
+                // Synthesize a creation request
+                TenantCreateRequest {
+                    new_tenant_id: tenant_shard_id,
+                    generation,
+                    shard_parameters: ShardParameters {
+                        count: tenant_shard_id.shard_count,
+                        // We only import un-sharded or single-sharded tenants, so stripe
+                        // size can be made up arbitrarily here.
+                        stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
+                    },
+                    placement_policy: Some(placement_policy),
+                    config: req.config.tenant_conf,
+                },
+            )
+        } else {
+            assert!(!updates.is_empty());
+            TenantCreateOrUpdate::Update(updates)
+        }
+    }
+
+    /// This API is used by the cloud control plane to migrate unsharded tenants that it created
+    /// directly with pageservers into this service.
+    ///
+    /// Cloud control plane MUST NOT continue issuing GENERATION NUMBERS for this tenant once it
+    /// has attempted to call this API. Failure to comply with this rule may lead to S3 corruption.
+    /// Think of the first attempt to call this API as a transfer of absolute authority over the
+    /// tenant's source of generation numbers.
+    ///
+    /// The mode in this request provides coarse-grained control of tenants:
+    /// - Call with mode Attached* to upsert the tenant.
+    /// - Call with mode Secondary to either onboard a tenant without attaching it, or
+    ///   to set an existing tenant to PolicyMode::Secondary
+    /// - Call with mode Detached to switch to PolicyMode::Detached
+    pub(crate) async fn tenant_location_config(
+        &self,
+        tenant_shard_id: TenantShardId,
+        req: TenantLocationConfigRequest,
+    ) -> Result<TenantLocationConfigResponse, ApiError> {
+        // We require an exclusive lock, because we are updating both persistent and in-memory state
+        let _tenant_lock = trace_exclusive_lock(
+            &self.tenant_op_locks,
+            tenant_shard_id.tenant_id,
+            TenantOperations::LocationConfig,
+        )
+        .await;
+
+        if !tenant_shard_id.is_unsharded() {
+            return Err(ApiError::BadRequest(anyhow::anyhow!(
+                "This API is for importing single-sharded or unsharded tenants"
+            )));
+        }
+
+        // First check if this is a creation or an update
+        let create_or_update = self.tenant_location_config_prepare(tenant_shard_id.tenant_id, req);
+
+        let mut result = TenantLocationConfigResponse {
+            shards: Vec::new(),
+            stripe_size: None,
+        };
+        let waiters = match create_or_update {
+            TenantCreateOrUpdate::Create(create_req) => {
+                let (create_resp, waiters) = self.do_tenant_create(create_req).await?;
+                result.shards = create_resp
+                    .shards
+                    .into_iter()
+                    .map(|s| TenantShardLocation {
+                        node_id: s.node_id,
+                        shard_id: s.shard_id,
+                    })
+                    .collect();
+                waiters
+            }
+            TenantCreateOrUpdate::Update(updates) => {
+                // Persist updates
+                // Ordering: write to the database before applying changes in-memory, so that
+                // we will not appear to time-travel backwards on a restart.
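+                // (If we crash between the database write below and the in-memory update,
+                // startup reloads the persisted state, so the worst case is a redundant
+                // reconciliation rather than a lost or reverted update.)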
+ let mut schedule_context = ScheduleContext::default(); + for ShardUpdate { + tenant_shard_id, + placement_policy, + tenant_config, + generation, + } in &updates + { + self.persistence + .update_tenant_shard( + TenantFilter::Shard(*tenant_shard_id), + Some(placement_policy.clone()), + Some(tenant_config.clone()), + *generation, + None, + ) + .await?; + } + + // Apply updates in-memory + let mut waiters = Vec::new(); + { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + for ShardUpdate { + tenant_shard_id, + placement_policy, + tenant_config, + generation: update_generation, + } in updates + { + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { + tracing::warn!("Shard {tenant_shard_id} removed while updating"); + continue; + }; + + // Update stripe size + if result.stripe_size.is_none() && shard.shard.count.count() > 1 { + result.stripe_size = Some(shard.shard.stripe_size); + } + + shard.policy = placement_policy; + shard.config = tenant_config; + if let Some(generation) = update_generation { + shard.generation = Some(generation); + } + + shard.schedule(scheduler, &mut schedule_context)?; + + let maybe_waiter = self.maybe_reconcile_shard(shard, nodes); + if let Some(waiter) = maybe_waiter { + waiters.push(waiter); + } + + if let Some(node_id) = shard.intent.get_attached() { + result.shards.push(TenantShardLocation { + shard_id: tenant_shard_id, + node_id: *node_id, + }) + } + } + } + waiters + } + }; + + if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + // Do not treat a reconcile error as fatal: we have already applied any requested + // Intent changes, and the reconcile can fail for external reasons like unavailable + // compute notification API. In these cases, it is important that we do not + // cause the cloud control plane to retry forever on this API. + tracing::warn!( + "Failed to reconcile after /location_config: {e}, returning success anyway" + ); + } + + // Logging the full result is useful because it lets us cross-check what the cloud control + // plane's tenant_shards table should contain. + tracing::info!("Complete, returning {result:?}"); + + Ok(result) + } + + pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> { + // We require an exclusive lock, because we are updating persistent and in-memory state + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + req.tenant_id, + TenantOperations::ConfigSet, + ) + .await; + + let tenant_id = req.tenant_id; + let config = req.config; + + self.persistence + .update_tenant_shard( + TenantFilter::Tenant(req.tenant_id), + None, + Some(config.clone()), + None, + None, + ) + .await?; + + let waiters = { + let mut waiters = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); + for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + shard.config = config.clone(); + if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + waiters.push(waiter); + } + } + waiters + }; + + if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + // Treat this as success because we have stored the configuration. If e.g. + // a node was unavailable at this time, it should not stop us accepting a + // configuration change. 
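+            // (A caller that needs the new config to be fully applied, rather than just
+            // durably recorded, can poll e.g. the tenant describe endpoint and retry;
+            // this handler only guarantees persistence.)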
+ tracing::warn!(%tenant_id, "Accepted configuration update but reconciliation failed: {e}"); + } + + Ok(()) + } + + pub(crate) fn tenant_config_get( + &self, + tenant_id: TenantId, + ) -> Result, ApiError> { + let config = { + let locked = self.inner.read().unwrap(); + + match locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .next() + { + Some((_tenant_shard_id, shard)) => shard.config.clone(), + None => { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )) + } + } + }; + + // Unlike the pageserver, we do not have a set of global defaults: the config is + // entirely per-tenant. Therefore the distinction between `tenant_specific_overrides` + // and `effective_config` in the response is meaningless, but we retain that syntax + // in order to remain compatible with the pageserver API. + + let response = HashMap::from([ + ( + "tenant_specific_overrides", + serde_json::to_value(&config) + .context("serializing tenant specific overrides") + .map_err(ApiError::InternalServerError)?, + ), + ( + "effective_config", + serde_json::to_value(&config) + .context("serializing effective config") + .map_err(ApiError::InternalServerError)?, + ), + ]); + + Ok(response) + } + + pub(crate) async fn tenant_time_travel_remote_storage( + &self, + time_travel_req: &TenantTimeTravelRequest, + tenant_id: TenantId, + timestamp: Cow<'_, str>, + done_if_after: Cow<'_, str>, + ) -> Result<(), ApiError> { + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimeTravelRemoteStorage, + ) + .await; + + let node = { + let mut locked = self.inner.write().unwrap(); + // Just a sanity check to prevent misuse: the API expects that the tenant is fully + // detached everywhere, and nothing writes to S3 storage. Here, we verify that, + // but only at the start of the process, so it's really just to prevent operator + // mistakes. 
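+        // Concretely, the check below requires two things of every shard: its intent has
+        // no attached and no secondary locations, and no observed location on any node
+        // reports a mode other than Detached.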
+ for (shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) { + if shard.intent.get_attached().is_some() || !shard.intent.get_secondary().is_empty() + { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "We want tenant to be attached in shard with tenant_shard_id={shard_id}" + ))); + } + let maybe_attached = shard + .observed + .locations + .iter() + .filter_map(|(node_id, observed_location)| { + observed_location + .conf + .as_ref() + .map(|loc| (node_id, observed_location, loc.mode)) + }) + .find(|(_, _, mode)| *mode != LocationConfigMode::Detached); + if let Some((node_id, _observed_location, mode)) = maybe_attached { + return Err(ApiError::InternalServerError(anyhow::anyhow!("We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}"))); + } + } + let scheduler = &mut locked.scheduler; + // Right now we only perform the operation on a single node without parallelization + // TODO fan out the operation to multiple nodes for better performance + let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?; + let node = locked + .nodes + .get(&node_id) + .expect("Pageservers may not be deleted while lock is active"); + node.clone() + }; + + // The shard count is encoded in the remote storage's URL, so we need to handle all historically used shard counts + let mut counts = time_travel_req + .shard_counts + .iter() + .copied() + .collect::>() + .into_iter() + .collect::>(); + counts.sort_unstable(); + + for count in counts { + let shard_ids = (0..count.count()) + .map(|i| TenantShardId { + tenant_id, + shard_number: ShardNumber(i), + shard_count: count, + }) + .collect::>(); + for tenant_shard_id in shard_ids { + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); + + tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); + + client + .tenant_time_travel_remote_storage( + tenant_shard_id, + ×tamp, + &done_if_after, + ) + .await + .map_err(|e| { + ApiError::InternalServerError(anyhow::anyhow!( + "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}", + node + )) + })?; + } + } + Ok(()) + } + + pub(crate) async fn tenant_secondary_download( + &self, + tenant_id: TenantId, + wait: Option, + ) -> Result<(StatusCode, SecondaryProgress), ApiError> { + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::SecondaryDownload, + ) + .await; + + // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + for node_id in shard.intent.get_secondary() { + let node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + } + targets + }; + + // Issue concurrent requests to all shards' locations + let mut futs = FuturesUnordered::new(); + for (tenant_shard_id, node) in targets { + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); + futs.push(async move { + let result = client + .tenant_secondary_download(tenant_shard_id, wait) + .await; + (result, node, tenant_shard_id) + }) + } + + // Handle any errors returned by pageservers. 
This includes cases like this request racing with + // a scheduling operation, such that the tenant shard we're calling doesn't exist on that pageserver any more, as + // well as more general cases like 503s, 500s, or timeouts. + let mut aggregate_progress = SecondaryProgress::default(); + let mut aggregate_status: Option = None; + let mut error: Option = None; + while let Some((result, node, tenant_shard_id)) = futs.next().await { + match result { + Err(e) => { + // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever + // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache + // than they had hoped for. + tracing::warn!("Secondary download error from pageserver {node}: {e}",); + error = Some(e) + } + Ok((status_code, progress)) => { + tracing::info!(%tenant_shard_id, "Shard status={status_code} progress: {progress:?}"); + aggregate_progress.layers_downloaded += progress.layers_downloaded; + aggregate_progress.layers_total += progress.layers_total; + aggregate_progress.bytes_downloaded += progress.bytes_downloaded; + aggregate_progress.bytes_total += progress.bytes_total; + aggregate_progress.heatmap_mtime = + std::cmp::max(aggregate_progress.heatmap_mtime, progress.heatmap_mtime); + aggregate_status = match aggregate_status { + None => Some(status_code), + Some(StatusCode::OK) => Some(status_code), + Some(cur) => { + // Other status codes (e.g. 202) -- do not overwrite. + Some(cur) + } + }; + } + } + } + + // If any of the shards return 202, indicate our result as 202. + match aggregate_status { + None => { + match error { + Some(e) => { + // No successes, and an error: surface it + Err(ApiError::Conflict(format!("Error from pageserver: {e}"))) + } + None => { + // No shards found + Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {} not found", tenant_id).into(), + )) + } + } + } + Some(aggregate_status) => Ok((aggregate_status, aggregate_progress)), + } + } + + pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result { + let _tenant_lock = + trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await; + + // Detach all shards + let (detach_waiters, shard_ids, node) = { + let mut shard_ids = Vec::new(); + let mut detach_waiters = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + for (tenant_shard_id, shard) in + tenants.range_mut(TenantShardId::tenant_range(tenant_id)) + { + shard_ids.push(*tenant_shard_id); + + // Update the tenant's intent to remove all attachments + shard.policy = PlacementPolicy::Detached; + shard + .schedule(scheduler, &mut ScheduleContext::default()) + .expect("De-scheduling is infallible"); + debug_assert!(shard.intent.get_attached().is_none()); + debug_assert!(shard.intent.get_secondary().is_empty()); + + if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + detach_waiters.push(waiter); + } + } + + // Pick an arbitrary node to use for remote deletions (does not have to be where the tenant + // was attached, just has to be able to see the S3 content) + let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?; + let node = nodes + .get(&node_id) + .expect("Pageservers may not be deleted while lock is active"); + (detach_waiters, shard_ids, node.clone()) + }; + + // This reconcile wait can fail in a few ways: + // A there is a very long queue for the reconciler semaphore + // B some pageserver is failing to handle a 
detach promptly + // C some pageserver goes offline right at the moment we send it a request. + // + // A and C are transient: the semaphore will eventually become available, and once a node is marked offline + // the next attempt to reconcile will silently skip detaches for an offline node and succeed. If B happens, + // it's a bug, and needs resolving at the pageserver level (we shouldn't just leave attachments behind while + // deleting the underlying data). + self.await_waiters(detach_waiters, RECONCILE_TIMEOUT) + .await?; + + let locations = shard_ids + .into_iter() + .map(|s| (s, node.clone())) + .collect::>(); + let results = self.tenant_for_shards_api( + locations, + |tenant_shard_id, client| async move { client.tenant_delete(tenant_shard_id).await }, + 1, + 3, + RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + for result in results { + match result { + Ok(StatusCode::ACCEPTED) => { + // This should never happen: we waited for detaches to finish above + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Unexpectedly still attached on {}", + node + ))); + } + Ok(_) => {} + Err(mgmt_api::Error::Cancelled) => { + return Err(ApiError::ShuttingDown); + } + Err(e) => { + // This is unexpected: remote deletion should be infallible, unless the object store + // at large is unavailable. + tracing::error!("Error deleting via node {}: {e}", node); + return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); + } + } + } + + // Fall through: deletion of the tenant on pageservers is complete, we may proceed to drop + // our in-memory state and database state. + + // Ordering: we delete persistent state first: if we then + // crash, we will drop the in-memory state. + + // Drop persistent state. + self.persistence.delete_tenant(tenant_id).await?; + + // Drop in-memory state + { + let mut locked = self.inner.write().unwrap(); + let (_nodes, tenants, scheduler) = locked.parts_mut(); + + // Dereference Scheduler from shards before dropping them + for (_tenant_shard_id, shard) in + tenants.range_mut(TenantShardId::tenant_range(tenant_id)) + { + shard.intent.clear(scheduler); + } + + tenants.retain(|tenant_shard_id, _shard| tenant_shard_id.tenant_id != tenant_id); + tracing::info!( + "Deleted tenant {tenant_id}, now have {} tenants", + locked.tenants.len() + ); + }; + + // Success is represented as 404, to imitate the existing pageserver deletion API + Ok(StatusCode::NOT_FOUND) + } + + /// Naming: this configures the storage controller's policies for a tenant, whereas [`Self::tenant_config_set`] is "set the TenantConfig" + /// for a tenant. 
The TenantConfig is passed through to pageservers, whereas this function modifies + /// the tenant's policies (configuration) within the storage controller + pub(crate) async fn tenant_update_policy( + &self, + tenant_id: TenantId, + req: TenantPolicyRequest, + ) -> Result<(), ApiError> { + // We require an exclusive lock, because we are updating persistent and in-memory state + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::UpdatePolicy, + ) + .await; + + failpoint_support::sleep_millis_async!("tenant-update-policy-exclusive-lock"); + + let TenantPolicyRequest { + placement, + scheduling, + } = req; + + self.persistence + .update_tenant_shard( + TenantFilter::Tenant(tenant_id), + placement.clone(), + None, + None, + scheduling, + ) + .await?; + + let mut schedule_context = ScheduleContext::default(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + if let Some(placement) = &placement { + shard.policy = placement.clone(); + + tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(), + "Updated placement policy to {placement:?}"); + } + + if let Some(scheduling) = &scheduling { + shard.set_scheduling_policy(*scheduling); + + tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(), + "Updated scheduling policy to {scheduling:?}"); + } + + // In case scheduling is being switched back on, try it now. + shard.schedule(scheduler, &mut schedule_context).ok(); + self.maybe_reconcile_shard(shard, nodes); + } + + Ok(()) + } + + pub(crate) async fn tenant_timeline_create( + &self, + tenant_id: TenantId, + mut create_req: TimelineCreateRequest, + ) -> Result { + tracing::info!( + "Creating timeline {}/{}", + tenant_id, + create_req.new_timeline_id, + ); + + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineCreate, + ) + .await; + failpoint_support::sleep_millis_async!("tenant-create-timeline-shared-lock"); + + self.tenant_remote_mutation(tenant_id, move |mut targets| async move { + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + }; + let shard_zero = targets.remove(0); + + async fn create_one( + tenant_shard_id: TenantShardId, + node: Node, + jwt: Option, + create_req: TimelineCreateRequest, + ) -> Result { + tracing::info!( + "Creating timeline on shard {}/{}, attached to node {node}", + tenant_shard_id, + create_req.new_timeline_id, + ); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + + client + .timeline_create(tenant_shard_id, &create_req) + .await + .map_err(|e| passthrough_api_error(&node, e)) + } + + // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then + // use whatever LSN that shard picked when creating on subsequent shards. We arbitrarily use shard zero as the shard + // that will get the first creation request, and propagate the LSN to all the >0 shards. 
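+            // Worked example (hypothetical LSN): shard zero creates the timeline and
+            // reports ancestor_lsn=0/169AD58; if the caller did not pin a start LSN, the
+            // remaining shards are then created with ancestor_start_lsn=Some(0/169AD58),
+            // so every shard branches at exactly the same point.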
+ let timeline_info = create_one( + shard_zero.0, + shard_zero.1, + self.config.jwt_token.clone(), + create_req.clone(), + ) + .await?; + + // Propagate the LSN that shard zero picked, if caller didn't provide one + if create_req.ancestor_timeline_id.is_some() && create_req.ancestor_start_lsn.is_none() + { + create_req.ancestor_start_lsn = timeline_info.ancestor_lsn; + } + + // Create timeline on remaining shards with number >0 + if !targets.is_empty() { + // If we had multiple shards, issue requests for the remainder now. + let jwt = &self.config.jwt_token; + self.tenant_for_shards( + targets.iter().map(|t| (t.0, t.1.clone())).collect(), + |tenant_shard_id: TenantShardId, node: Node| { + let create_req = create_req.clone(); + Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req)) + }, + ) + .await?; + } + + Ok(timeline_info) + }) + .await? + } + + pub(crate) async fn tenant_timeline_archival_config( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: TimelineArchivalConfigRequest, + ) -> Result<(), ApiError> { + tracing::info!( + "Setting archival config of timeline {tenant_id}/{timeline_id} to '{:?}'", + req.state + ); + + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineArchivalConfig, + ) + .await; + + self.tenant_remote_mutation(tenant_id, move |targets| async move { + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + } + async fn config_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + req: TimelineArchivalConfigRequest, + ) -> Result<(), ApiError> { + tracing::info!( + "Setting archival config of timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", + ); + + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + + client + .timeline_archival_config(tenant_shard_id, timeline_id, &req) + .await + .map_err(|e| match e { + mgmt_api::Error::ApiError(StatusCode::PRECONDITION_FAILED, msg) => { + ApiError::PreconditionFailed(msg.into_boxed_str()) + } + _ => passthrough_api_error(&node, e), + }) + } + + // no shard needs to go first/last; the operation should be idempotent + // TODO: it would be great to ensure that all shards return the same error + let results = self + .tenant_for_shards(targets, |tenant_shard_id, node| { + futures::FutureExt::boxed(config_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), + req.clone(), + )) + }) + .await?; + assert!(!results.is_empty(), "must have at least one result"); + + Ok(()) + }).await? 
+ } + + pub(crate) async fn tenant_timeline_detach_ancestor( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + tracing::info!("Detaching timeline {tenant_id}/{timeline_id}",); + + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineDetachAncestor, + ) + .await; + + self.tenant_remote_mutation(tenant_id, move |targets| async move { + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + } + + async fn detach_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> { + tracing::info!( + "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", + ); + + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + + client + .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .await + .map_err(|e| { + use mgmt_api::Error; + + match e { + // no ancestor (ever) + Error::ApiError(StatusCode::CONFLICT, msg) => ApiError::Conflict(format!( + "{node}: {}", + msg.strip_prefix("Conflict: ").unwrap_or(&msg) + )), + // too many ancestors + Error::ApiError(StatusCode::BAD_REQUEST, msg) => { + ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}")) + } + Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, msg) => { + // avoid turning these into conflicts to remain compatible with + // pageservers, 500 errors are sadly retryable with timeline ancestor + // detach + ApiError::InternalServerError(anyhow::anyhow!("{node}: {msg}")) + } + // rest can be mapped as usual + other => passthrough_api_error(&node, other), + } + }) + .map(|res| (tenant_shard_id.shard_number, res)) + } + + // no shard needs to go first/last; the operation should be idempotent + let mut results = self + .tenant_for_shards(targets, |tenant_shard_id, node| { + futures::FutureExt::boxed(detach_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), + )) + }) + .await?; + + let any = results.pop().expect("we must have at least one response"); + + let mismatching = results + .iter() + .filter(|(_, res)| res != &any.1) + .collect::>(); + if !mismatching.is_empty() { + // this can be hit by races which should not happen because operation lock on cplane + let matching = results.len() - mismatching.len(); + tracing::error!( + matching, + compared_against=?any, + ?mismatching, + "shards returned different results" + ); + + return Err(ApiError::InternalServerError(anyhow::anyhow!("pageservers returned mixed results for ancestor detach; manual intervention is required."))); + } + + Ok(any.1) + }).await? + } + + /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. + /// + /// On success, the returned vector contains exactly the same number of elements as the input `locations`. 
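+    ///
+    /// Illustrative call shape (mirrors the timeline call sites above; not part of the
+    /// original change):
+    /// ```ignore
+    /// let statuses = self
+    ///     .tenant_for_shards(targets, |tenant_shard_id, node| {
+    ///         Box::pin(delete_one(tenant_shard_id, timeline_id, node, jwt.clone()))
+    ///     })
+    ///     .await?;
+    /// ```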
+ async fn tenant_for_shards( + &self, + locations: Vec<(TenantShardId, Node)>, + mut req_fn: F, + ) -> Result, ApiError> + where + F: FnMut( + TenantShardId, + Node, + ) + -> std::pin::Pin> + Send>>, + { + let mut futs = FuturesUnordered::new(); + let mut results = Vec::with_capacity(locations.len()); + + for (tenant_shard_id, node) in locations { + futs.push(req_fn(tenant_shard_id, node)); + } + + while let Some(r) = futs.next().await { + results.push(r?); + } + + Ok(results) + } + + /// Concurrently invoke a pageserver API call on many shards at once + pub(crate) async fn tenant_for_shards_api( + &self, + locations: Vec<(TenantShardId, Node)>, + op: O, + warn_threshold: u32, + max_retries: u32, + timeout: Duration, + cancel: &CancellationToken, + ) -> Vec> + where + O: Fn(TenantShardId, PageserverClient) -> F + Copy, + F: std::future::Future>, + { + let mut futs = FuturesUnordered::new(); + let mut results = Vec::with_capacity(locations.len()); + + for (tenant_shard_id, node) in locations { + futs.push(async move { + node.with_client_retries( + |client| op(tenant_shard_id, client), + &self.config.jwt_token, + warn_threshold, + max_retries, + timeout, + cancel, + ) + .await + }); + } + + while let Some(r) = futs.next().await { + let r = r.unwrap_or(Err(mgmt_api::Error::Cancelled)); + results.push(r); + } + + results + } + + /// Helper for safely working with the shards in a tenant remotely on pageservers, for example + /// when creating and deleting timelines: + /// - Makes sure shards are attached somewhere if they weren't already + /// - Looks up the shards and the nodes where they were most recently attached + /// - Guarantees that after the inner function returns, the shards' generations haven't moved on: this + /// ensures that the remote operation acted on the most recent generation, and is therefore durable. + async fn tenant_remote_mutation( + &self, + tenant_id: TenantId, + op: O, + ) -> Result + where + O: FnOnce(Vec<(TenantShardId, Node)>) -> F, + F: std::future::Future, + { + let target_gens = { + let mut targets = Vec::new(); + + // Load the currently attached pageservers for the latest generation of each shard. This can + // run concurrently with reconciliations, and it is not guaranteed that the node we find here + // will still be the latest when we're done: we will check generations again at the end of + // this function to handle that. + let generations = self.persistence.tenant_generations(tenant_id).await?; + + if generations + .iter() + .any(|i| i.generation.is_none() || i.generation_pageserver.is_none()) + { + // One or more shards has not been attached to a pageserver. Check if this is because it's configured + // to be detached (409: caller should give up), or because it's meant to be attached but isn't yet (503: caller should retry) + let locked = self.inner.read().unwrap(); + for (shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + match shard.policy { + PlacementPolicy::Attached(_) => { + // This shard is meant to be attached: the caller is not wrong to try and + // use this function, but we can't service the request right now. 
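+                            // Fall through: after this loop we return ResourceUnavailable
+                            // (503), telling the caller to retry once attachment happens.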
+ } + PlacementPolicy::Secondary | PlacementPolicy::Detached => { + return Err(ApiError::Conflict(format!( + "Shard {shard_id} tenant has policy {:?}", + shard.policy + ))); + } + } + } + + return Err(ApiError::ResourceUnavailable( + "One or more shards in tenant is not yet attached".into(), + )); + } + + let locked = self.inner.read().unwrap(); + for ShardGenerationState { + tenant_shard_id, + generation, + generation_pageserver, + } in generations + { + let node_id = generation_pageserver.expect("We checked for None above"); + let node = locked + .nodes + .get(&node_id) + .ok_or(ApiError::Conflict(format!( + "Raced with removal of node {node_id}" + )))?; + targets.push((tenant_shard_id, node.clone(), generation)); + } + + targets + }; + + let targets = target_gens.iter().map(|t| (t.0, t.1.clone())).collect(); + let result = op(targets).await; + + // Post-check: are all the generations of all the shards the same as they were initially? This proves that + // our remote operation executed on the latest generation and is therefore persistent. + { + let latest_generations = self.persistence.tenant_generations(tenant_id).await?; + if latest_generations + .into_iter() + .map( + |ShardGenerationState { + tenant_shard_id, + generation, + generation_pageserver: _, + }| (tenant_shard_id, generation), + ) + .collect::>() + != target_gens + .into_iter() + .map(|i| (i.0, i.2)) + .collect::>() + { + // We raced with something that incremented the generation, and therefore cannot be + // confident that our actions are persistent (they might have hit an old generation). + // + // This is safe but requires a retry: ask the client to do that by giving them a 503 response. + return Err(ApiError::ResourceUnavailable( + "Tenant attachment changed, please retry".into(), + )); + } + } + + Ok(result) + } + + pub(crate) async fn tenant_timeline_delete( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,); + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineDelete, + ) + .await; + + self.tenant_remote_mutation(tenant_id, move |mut targets| async move { + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + } + let shard_zero = targets.remove(0); + + async fn delete_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + ) -> Result { + tracing::info!( + "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", + ); + + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + client + .timeline_delete(tenant_shard_id, timeline_id) + .await + .map_err(|e| { + ApiError::InternalServerError(anyhow::anyhow!( + "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", + )) + }) + } + + let statuses = self + .tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| { + Box::pin(delete_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), + )) + }) + .await?; + + // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero + if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) { + return Ok(StatusCode::ACCEPTED); + } + + // Delete shard zero last: this is not strictly necessary, but since a caller's GET on a timeline will be routed + // to shard zero, it gives a more obvious behavior that a GET returns 404 once the deletion is done. 
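+            // e.g. for a shard_count=4 tenant: shards 0104, 0204 and 0304 were deleted
+            // above, and only once they all report 404 do we delete shard zero (0004)
+            // below, so a client polling a GET (routed to shard zero) sees 404 only when
+            // the whole timeline is really gone.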
+ let shard_zero_status = delete_one( + shard_zero.0, + timeline_id, + shard_zero.1, + self.config.jwt_token.clone(), + ) + .await?; + Ok(shard_zero_status) + }).await? + } + + /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this + /// function looks up and returns node. If the tenant isn't found, returns Err(ApiError::NotFound) + pub(crate) fn tenant_shard0_node( + &self, + tenant_id: TenantId, + ) -> Result<(Node, TenantShardId), ApiError> { + let locked = self.inner.read().unwrap(); + let Some((tenant_shard_id, shard)) = locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .next() + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {tenant_id} not found").into(), + )); + }; + + // TODO: should use the ID last published to compute_hook, rather than the intent: the intent might + // point to somewhere we haven't attached yet. + let Some(node_id) = shard.intent.get_attached() else { + tracing::warn!( + tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Shard not scheduled (policy {:?}), cannot generate pass-through URL", + shard.policy + ); + return Err(ApiError::Conflict( + "Cannot call timeline API on non-attached tenant".to_string(), + )); + }; + + let Some(node) = locked.nodes.get(node_id) else { + // This should never happen + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Shard refers to nonexistent node" + ))); + }; + + Ok((node.clone(), *tenant_shard_id)) + } + + pub(crate) fn tenant_locate( + &self, + tenant_id: TenantId, + ) -> Result { + let locked = self.inner.read().unwrap(); + tracing::info!("Locating shards for tenant {tenant_id}"); + + let mut result = Vec::new(); + let mut shard_params: Option = None; + + for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let node_id = + shard + .intent + .get_attached() + .ok_or(ApiError::BadRequest(anyhow::anyhow!( + "Cannot locate a tenant that is not attached" + )))?; + + let node = locked + .nodes + .get(&node_id) + .expect("Pageservers may not be deleted while referenced"); + + result.push(node.shard_location(*tenant_shard_id)); + + match &shard_params { + None => { + shard_params = Some(ShardParameters { + stripe_size: shard.shard.stripe_size, + count: shard.shard.count, + }); + } + Some(params) => { + if params.stripe_size != shard.shard.stripe_size { + // This should never happen. We enforce at runtime because it's simpler than + // adding an extra per-tenant data structure to store the things that should be the same + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Inconsistent shard stripe size parameters!" 
+ ))); + } + } + } + } + + if result.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("No shards for this tenant ID found").into(), + )); + } + let shard_params = shard_params.expect("result is non-empty, therefore this is set"); + tracing::info!( + "Located tenant {} with params {:?} on shards {}", + tenant_id, + shard_params, + result + .iter() + .map(|s| format!("{:?}", s)) + .collect::>() + .join(",") + ); + + Ok(TenantLocateResponse { + shards: result, + shard_params, + }) + } + + /// Returns None if the input iterator of shards does not include a shard with number=0 + fn tenant_describe_impl<'a>( + &self, + shards: impl Iterator, + ) -> Option { + let mut shard_zero = None; + let mut describe_shards = Vec::new(); + + for shard in shards { + if shard.tenant_shard_id.is_shard_zero() { + shard_zero = Some(shard); + } + + describe_shards.push(TenantDescribeResponseShard { + tenant_shard_id: shard.tenant_shard_id, + node_attached: *shard.intent.get_attached(), + node_secondary: shard.intent.get_secondary().to_vec(), + last_error: shard + .last_error + .lock() + .unwrap() + .as_ref() + .map(|e| format!("{e}")) + .unwrap_or("".to_string()) + .clone(), + is_reconciling: shard.reconciler.is_some(), + is_pending_compute_notification: shard.pending_compute_notification, + is_splitting: matches!(shard.splitting, SplitState::Splitting), + scheduling_policy: *shard.get_scheduling_policy(), + preferred_az_id: shard.preferred_az().map(ToString::to_string), + }) + } + + let shard_zero = shard_zero?; + + Some(TenantDescribeResponse { + tenant_id: shard_zero.tenant_shard_id.tenant_id, + shards: describe_shards, + stripe_size: shard_zero.shard.stripe_size, + policy: shard_zero.policy.clone(), + config: shard_zero.config.clone(), + }) + } + + pub(crate) fn tenant_describe( + &self, + tenant_id: TenantId, + ) -> Result { + let locked = self.inner.read().unwrap(); + + self.tenant_describe_impl( + locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .map(|(_k, v)| v), + ) + .ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into())) + } + + pub(crate) fn tenant_list(&self) -> Vec { + let locked = self.inner.read().unwrap(); + + let mut result = Vec::new(); + for (_tenant_id, tenant_shards) in + &locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id) + { + result.push( + self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v)) + .expect("Groups are always non-empty"), + ); + } + + result + } + + #[instrument(skip_all, fields(tenant_id=%op.tenant_id))] + async fn abort_tenant_shard_split( + &self, + op: &TenantShardSplitAbort, + ) -> Result<(), TenantShardSplitAbortError> { + // Cleaning up a split: + // - Parent shards are not destroyed during a split, just detached. + // - Failed pageserver split API calls can leave the remote node with just the parent attached, + // just the children attached, or both. + // + // Therefore our work to do is to: + // 1. Clean up storage controller's internal state to just refer to parents, no children + // 2. Call out to pageservers to ensure that children are detached + // 3. Call out to pageservers to ensure that parents are attached. + // + // Crash safety: + // - If the storage controller stops running during this cleanup *after* clearing the splitting state + // from our database, then [`Self::startup_reconcile`] will regard child attachments as garbage + // and detach them. 
+ // - TODO: If the storage controller stops running during this cleanup *before* clearing the splitting state + // from our database, then we will re-enter this cleanup routine on startup. + + let TenantShardSplitAbort { + tenant_id, + new_shard_count, + new_stripe_size, + .. + } = op; + + // First abort persistent state, if any exists. + match self + .persistence + .abort_shard_split(*tenant_id, *new_shard_count) + .await? + { + AbortShardSplitStatus::Aborted => { + // Proceed to roll back any child shards created on pageservers + } + AbortShardSplitStatus::Complete => { + // The split completed (we might hit that path if e.g. our database transaction + // to write the completion landed in the database, but we dropped connection + // before seeing the result). + // + // We must update in-memory state to reflect the successful split. + self.tenant_shard_split_commit_inmem( + *tenant_id, + *new_shard_count, + *new_stripe_size, + ); + return Ok(()); + } + } + + // Clean up in-memory state, and accumulate the list of child locations that need detaching + let detach_locations: Vec<(Node, TenantShardId)> = { + let mut detach_locations = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + for (tenant_shard_id, shard) in + tenants.range_mut(TenantShardId::tenant_range(op.tenant_id)) + { + if shard.shard.count == op.new_shard_count { + // Surprising: the phase of [`Self::do_tenant_shard_split`] which inserts child shards in-memory + // is infallible, so if we got an error we shouldn't have got that far. + tracing::warn!( + "During split abort, child shard {tenant_shard_id} found in-memory" + ); + continue; + } + + // Add the children of this shard to this list of things to detach + if let Some(node_id) = shard.intent.get_attached() { + for child_id in tenant_shard_id.split(*new_shard_count) { + detach_locations.push(( + nodes + .get(node_id) + .expect("Intent references nonexistent node") + .clone(), + child_id, + )); + } + } else { + tracing::warn!( + "During split abort, shard {tenant_shard_id} has no attached location" + ); + } + + tracing::info!("Restoring parent shard {tenant_shard_id}"); + shard.splitting = SplitState::Idle; + if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) { + // If this shard can't be scheduled now (perhaps due to offline nodes or + // capacity issues), that must not prevent us rolling back a split. In this + // case it should be eventually scheduled in the background. + tracing::warn!("Failed to schedule {tenant_shard_id} during shard abort: {e}") + } + + self.maybe_reconcile_shard(shard, nodes); + } + + // We don't expect any new_shard_count shards to exist here, but drop them just in case + tenants.retain(|_id, s| s.shard.count != *new_shard_count); + + detach_locations + }; + + for (node, child_id) in detach_locations { + if !node.is_available() { + // An unavailable node cannot be cleaned up now: to avoid blocking forever, we will permit this, and + // rely on the reconciliation that happens when a node transitions to Active to clean up. Since we have + // removed child shards from our in-memory state and database, the reconciliation will implicitly remove + // them from the node. + tracing::warn!("Node {node} unavailable, can't clean up during split abort. It will be cleaned up when it is reactivated."); + continue; + } + + // Detach the remote child. If the pageserver split API call is still in progress, this call will get + // a 503 and retry, up to our limit. 
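+            // ("Our limit" here is the retry budget passed to with_client_retries below:
+            // warn after 1 failed attempt, give up after 10 attempts of up to 5s each.)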
+ tracing::info!("Detaching {child_id} on {node}..."); + match node + .with_client_retries( + |client| async move { + let config = LocationConfig { + mode: LocationConfigMode::Detached, + generation: None, + secondary_conf: None, + shard_number: child_id.shard_number.0, + shard_count: child_id.shard_count.literal(), + // Stripe size and tenant config don't matter when detaching + shard_stripe_size: 0, + tenant_conf: TenantConfig::default(), + }; + + client.location_config(child_id, config, None, false).await + }, + &self.config.jwt_token, + 1, + 10, + Duration::from_secs(5), + &self.cancel, + ) + .await + { + Some(Ok(_)) => {} + Some(Err(e)) => { + // We failed to communicate with the remote node. This is problematic: we may be + // leaving it with a rogue child shard. + tracing::warn!( + "Failed to detach child {child_id} from node {node} during abort" + ); + return Err(e.into()); + } + None => { + // Cancellation: we were shutdown or the node went offline. Shutdown is fine, we'll + // clean up on restart. The node going offline requires a retry. + return Err(TenantShardSplitAbortError::Unavailable); + } + }; + } + + tracing::info!("Successfully aborted split"); + Ok(()) + } + + /// Infallible final stage of [`Self::tenant_shard_split`]: update the contents + /// of the tenant map to reflect the child shards that exist after the split. + fn tenant_shard_split_commit_inmem( + &self, + tenant_id: TenantId, + new_shard_count: ShardCount, + new_stripe_size: Option, + ) -> ( + TenantShardSplitResponse, + Vec<(TenantShardId, NodeId, ShardStripeSize)>, + Vec, + ) { + let mut response = TenantShardSplitResponse { + new_shards: Vec::new(), + }; + let mut child_locations = Vec::new(); + let mut waiters = Vec::new(); + + { + let mut locked = self.inner.write().unwrap(); + + let parent_ids = locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .map(|(shard_id, _)| *shard_id) + .collect::>(); + + let (nodes, tenants, scheduler) = locked.parts_mut(); + for parent_id in parent_ids { + let child_ids = parent_id.split(new_shard_count); + + let (pageserver, generation, policy, parent_ident, config) = { + let mut old_state = tenants + .remove(&parent_id) + .expect("It was present, we just split it"); + + // A non-splitting state is impossible, because [`Self::tenant_shard_split`] holds + // a TenantId lock and passes it through to [`TenantShardSplitAbort`] in case of cleanup: + // nothing else can clear this. 
+ assert!(matches!(old_state.splitting, SplitState::Splitting)); + + let old_attached = old_state.intent.get_attached().unwrap(); + old_state.intent.clear(scheduler); + let generation = old_state.generation.expect("Shard must have been attached"); + ( + old_attached, + generation, + old_state.policy, + old_state.shard, + old_state.config, + ) + }; + + let mut schedule_context = ScheduleContext::default(); + for child in child_ids { + let mut child_shard = parent_ident; + child_shard.number = child.shard_number; + child_shard.count = child.shard_count; + if let Some(stripe_size) = new_stripe_size { + child_shard.stripe_size = stripe_size; + } + + let mut child_observed: HashMap = HashMap::new(); + child_observed.insert( + pageserver, + ObservedStateLocation { + conf: Some(attached_location_conf( + generation, + &child_shard, + &config, + &policy, + )), + }, + ); + + let mut child_state = TenantShard::new(child, child_shard, policy.clone()); + child_state.intent = IntentState::single(scheduler, Some(pageserver)); + child_state.observed = ObservedState { + locations: child_observed, + }; + child_state.generation = Some(generation); + child_state.config = config.clone(); + + // The child's TenantShard::splitting is intentionally left at the default value of Idle, + // as at this point in the split process we have succeeded and this part is infallible: + // we will never need to do any special recovery from this state. + + child_locations.push((child, pageserver, child_shard.stripe_size)); + + if let Err(e) = child_state.schedule(scheduler, &mut schedule_context) { + // This is not fatal, because we've implicitly already got an attached + // location for the child shard. Failure here just means we couldn't + // find a secondary (e.g. because cluster is overloaded). + tracing::warn!("Failed to schedule child shard {child}: {e}"); + } + // In the background, attach secondary locations for the new shards + if let Some(waiter) = self.maybe_reconcile_shard(&mut child_state, nodes) { + waiters.push(waiter); + } + + tenants.insert(child, child_state); + response.new_shards.push(child); + } + } + (response, child_locations, waiters) + } + } + + async fn tenant_shard_split_start_secondaries( + &self, + tenant_id: TenantId, + waiters: Vec, + ) { + // Wait for initial reconcile of child shards, this creates the secondary locations + if let Err(e) = self.await_waiters(waiters, RECONCILE_TIMEOUT).await { + // This is not a failure to split: it's some issue reconciling the new child shards, perhaps + // their secondaries couldn't be attached. + tracing::warn!("Failed to reconcile after split: {e}"); + return; + } + + // Take the state lock to discover the attached & secondary intents for all shards + let (attached, secondary) = { + let locked = self.inner.read().unwrap(); + let mut attached = Vec::new(); + let mut secondary = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let Some(node_id) = shard.intent.get_attached() else { + // Unexpected. Race with a PlacementPolicy change? + tracing::warn!( + "No attached node on {tenant_shard_id} immediately after shard split!" + ); + continue; + }; + + let Some(secondary_node_id) = shard.intent.get_secondary().first() else { + // No secondary location. Nothing for us to do. 
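+                    // (A legitimate state: e.g. PlacementPolicy::Attached(0)
+                    // intentionally has zero secondaries.)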
+                    continue;
+                };
+
+                let attached_node = locked
+                    .nodes
+                    .get(node_id)
+                    .expect("Pageservers may not be deleted while referenced");
+
+                let secondary_node = locked
+                    .nodes
+                    .get(secondary_node_id)
+                    .expect("Pageservers may not be deleted while referenced");
+
+                attached.push((*tenant_shard_id, attached_node.clone()));
+                secondary.push((*tenant_shard_id, secondary_node.clone()));
+            }
+            (attached, secondary)
+        };
+
+        if secondary.is_empty() {
+            // No secondary locations; nothing for us to do
+            return;
+        }
+
+        for result in self
+            .tenant_for_shards_api(
+                attached,
+                |tenant_shard_id, client| async move {
+                    client.tenant_heatmap_upload(tenant_shard_id).await
+                },
+                1,
+                1,
+                SHORT_RECONCILE_TIMEOUT,
+                &self.cancel,
+            )
+            .await
+        {
+            if let Err(e) = result {
+                tracing::warn!("Error calling heatmap upload after shard split: {e}");
+                return;
+            }
+        }
+
+        for result in self
+            .tenant_for_shards_api(
+                secondary,
+                |tenant_shard_id, client| async move {
+                    client
+                        .tenant_secondary_download(tenant_shard_id, Some(Duration::ZERO))
+                        .await
+                },
+                1,
+                1,
+                SHORT_RECONCILE_TIMEOUT,
+                &self.cancel,
+            )
+            .await
+        {
+            if let Err(e) = result {
+                tracing::warn!("Error calling secondary download after shard split: {e}");
+                return;
+            }
+        }
+    }
+
+    pub(crate) async fn tenant_shard_split(
+        &self,
+        tenant_id: TenantId,
+        split_req: TenantShardSplitRequest,
+    ) -> Result<TenantShardSplitResponse, ApiError> {
+        // TODO: return 503 if we get stuck waiting for this lock
+        // (issue https://github.com/neondatabase/neon/issues/7108)
+        let _tenant_lock = trace_exclusive_lock(
+            &self.tenant_op_locks,
+            tenant_id,
+            TenantOperations::ShardSplit,
+        )
+        .await;
+
+        let new_shard_count = ShardCount::new(split_req.new_shard_count);
+        let new_stripe_size = split_req.new_stripe_size;
+
+        // Validate the request and construct parameters. This phase is fallible, but does not require
+        // rollback on errors, as it does no I/O and mutates no state.
+        let shard_split_params = match self.prepare_tenant_shard_split(tenant_id, split_req)? {
+            ShardSplitAction::NoOp(resp) => return Ok(resp),
+            ShardSplitAction::Split(params) => params,
+        };
+
+        // Execute this split: this phase mutates state and does remote I/O on pageservers. If it fails,
+        // we must roll back.
+        let r = self
+            .do_tenant_shard_split(tenant_id, shard_split_params)
+            .await;
+
+        let (response, waiters) = match r {
+            Ok(r) => r,
+            Err(e) => {
+                // Split might be part-done, we must do work to abort it.
+                tracing::warn!("Enqueuing background abort of split on {tenant_id}");
+                self.abort_tx
+                    .send(TenantShardSplitAbort {
+                        tenant_id,
+                        new_shard_count,
+                        new_stripe_size,
+                        _tenant_lock,
+                    })
+                    // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it.
+                    .ok();
+                return Err(e);
+            }
+        };
+
+        // The split is now complete. As an optimization, we will trigger all the child shards to upload
+        // a heatmap immediately, and all their secondary locations to start downloading: this avoids waiting
+        // for the background heatmap/download interval before secondaries get warm enough to migrate shards
+        // in [`Self::optimize_all`]
+        self.tenant_shard_split_start_secondaries(tenant_id, waiters)
+            .await;
+        Ok(response)
+    }
+
+    fn prepare_tenant_shard_split(
+        &self,
+        tenant_id: TenantId,
+        split_req: TenantShardSplitRequest,
+    ) -> Result<ShardSplitAction, ApiError> {
+        fail::fail_point!("shard-split-validation", |_| Err(ApiError::BadRequest(
+            anyhow::anyhow!("failpoint")
+        )));
+
+        let mut policy = None;
+        let mut config = None;
+        let mut shard_ident = None;
+        // Validate input, and calculate which shards we will create
+        let (old_shard_count, targets) =
+            {
+                let locked = self.inner.read().unwrap();
+
+                let pageservers = locked.nodes.clone();
+
+                let mut targets = Vec::new();
+
+                // In case this is a retry, count how many already-split shards we found
+                let mut children_found = Vec::new();
+                let mut old_shard_count = None;
+
+                for (tenant_shard_id, shard) in
+                    locked.tenants.range(TenantShardId::tenant_range(tenant_id))
+                {
+                    match shard.shard.count.count().cmp(&split_req.new_shard_count) {
+                        Ordering::Equal => {
+                            // Already split this
+                            children_found.push(*tenant_shard_id);
+                            continue;
+                        }
+                        Ordering::Greater => {
+                            return Err(ApiError::BadRequest(anyhow::anyhow!(
+                                "Requested count {} but already have shards at count {}",
+                                split_req.new_shard_count,
+                                shard.shard.count.count()
+                            )));
+                        }
+                        Ordering::Less => {
+                            // Fall through: this shard has a lower count than requested,
+                            // and is a candidate for splitting.
+                        }
+                    }
+
+                    match old_shard_count {
+                        None => old_shard_count = Some(shard.shard.count),
+                        Some(old_shard_count) => {
+                            if old_shard_count != shard.shard.count {
+                                // We may hit this case if a caller asked for two splits to
+                                // different sizes, before the first one is complete.
+                                // e.g. 1->2, 2->4, where the 4 call comes while we have a mixture
+                                // of shard_count=1 and shard_count=2 shards in the map.
+                                return Err(ApiError::Conflict(
+                                    "Cannot split, currently mid-split".to_string(),
+                                ));
+                            }
+                        }
+                    }
+                    if policy.is_none() {
+                        policy = Some(shard.policy.clone());
+                    }
+                    if shard_ident.is_none() {
+                        shard_ident = Some(shard.shard);
+                    }
+                    if config.is_none() {
+                        config = Some(shard.config.clone());
+                    }
+
+                    if tenant_shard_id.shard_count.count() == split_req.new_shard_count {
+                        tracing::info!(
+                            "Tenant shard {} already has shard count {}",
+                            tenant_shard_id,
+                            split_req.new_shard_count
+                        );
+                        continue;
+                    }
+
+                    let node_id = shard.intent.get_attached().ok_or(ApiError::BadRequest(
+                        anyhow::anyhow!("Cannot split a tenant that is not attached"),
+                    ))?;
+
+                    let node = pageservers
+                        .get(&node_id)
+                        .expect("Pageservers may not be deleted while referenced");
+
+                    targets.push(ShardSplitTarget {
+                        parent_id: *tenant_shard_id,
+                        node: node.clone(),
+                        child_ids: tenant_shard_id
+                            .split(ShardCount::new(split_req.new_shard_count)),
+                    });
+                }
+
+                if targets.is_empty() {
+                    if children_found.len() == split_req.new_shard_count as usize {
+                        return Ok(ShardSplitAction::NoOp(TenantShardSplitResponse {
+                            new_shards: children_found,
+                        }));
+                    } else {
+                        // No shards found to split, and no existing children found: the
+                        // tenant doesn't exist at all.
+                        return Err(ApiError::NotFound(
+                            anyhow::anyhow!("Tenant {} not found", tenant_id).into(),
+                        ));
+                    }
+                }
+
+                (old_shard_count, targets)
+            };
+
+        // unwrap safety: we would have returned above if we didn't find at least one shard to split
+        let old_shard_count = old_shard_count.unwrap();
+        let shard_ident = if let Some(new_stripe_size) = split_req.new_stripe_size {
+            // This ShardIdentity will be used as the template for all children, so this implicitly
+            // applies the new stripe size to the children.
+            let mut shard_ident = shard_ident.unwrap();
+            if shard_ident.count.count() > 1 && shard_ident.stripe_size != new_stripe_size {
+                return Err(ApiError::BadRequest(anyhow::anyhow!("Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", shard_ident.stripe_size)));
+            }
+
+            shard_ident.stripe_size = new_stripe_size;
+            tracing::info!("applied stripe size {}", shard_ident.stripe_size.0);
+            shard_ident
+        } else {
+            shard_ident.unwrap()
+        };
+        let policy = policy.unwrap();
+        let config = config.unwrap();
+
+        Ok(ShardSplitAction::Split(ShardSplitParams {
+            old_shard_count,
+            new_shard_count: ShardCount::new(split_req.new_shard_count),
+            new_stripe_size: split_req.new_stripe_size,
+            targets,
+            policy,
+            config,
+            shard_ident,
+        }))
+    }
+
+    async fn do_tenant_shard_split(
+        &self,
+        tenant_id: TenantId,
+        params: ShardSplitParams,
+    ) -> Result<(TenantShardSplitResponse, Vec<ReconcilerWaiter>), ApiError> {
+        // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another
+        // request could occur here, deleting or mutating the tenant. begin_shard_split checks that the
+        // parent shards exist as expected, but it would be neater to do the above pre-checks within the
+        // same database transaction rather than pre-check in-memory and then maybe-fail the database write.
+        // (https://github.com/neondatabase/neon/issues/6676)
+
+        let ShardSplitParams {
+            old_shard_count,
+            new_shard_count,
+            new_stripe_size,
+            mut targets,
+            policy,
+            config,
+            shard_ident,
+        } = params;
+
+        // Drop any secondary locations: pageservers do not support splitting these, and in any case the
+        // end-state for a split tenant will usually be to have secondary locations on different nodes.
+        // The reconciliation calls in this block also implicitly cancel+barrier wrt any ongoing reconciliation
+        // at the time of split.
+        let waiters = {
+            let mut locked = self.inner.write().unwrap();
+            let mut waiters = Vec::new();
+            let (nodes, tenants, scheduler) = locked.parts_mut();
+            for target in &mut targets {
+                let Some(shard) = tenants.get_mut(&target.parent_id) else {
+                    // Paranoia check: this shouldn't happen: we have the oplock for this tenant ID.
+                    return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                        "Shard {} not found",
+                        target.parent_id
+                    )));
+                };
+
+                if shard.intent.get_attached() != &Some(target.node.get_id()) {
+                    // Paranoia check: this shouldn't happen: we have the oplock for this tenant ID.
+                    return Err(ApiError::Conflict(format!(
+                        "Shard {} unexpectedly rescheduled during split",
+                        target.parent_id
+                    )));
+                }
+
+                // Irrespective of PlacementPolicy, clear secondary locations from intent
+                shard.intent.clear_secondary(scheduler);
+
+                // Run Reconciler to execute detach of secondary locations.
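+                // For example (illustrative): a parent under PlacementPolicy::Attached(1)
+                // with attached=node A and secondary=node B is reduced here to just its
+                // attachment on A; the child shards scheduled after the split then get
+                // fresh secondary locations of their own.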
+ if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + waiters.push(waiter); + } + } + waiters + }; + self.await_waiters(waiters, RECONCILE_TIMEOUT).await?; + + // Before creating any new child shards in memory or on the pageservers, persist them: this + // enables us to ensure that we will always be able to clean up if something goes wrong. This also + // acts as the protection against two concurrent attempts to split: one of them will get a database + // error trying to insert the child shards. + let mut child_tsps = Vec::new(); + for target in &targets { + let mut this_child_tsps = Vec::new(); + for child in &target.child_ids { + let mut child_shard = shard_ident; + child_shard.number = child.shard_number; + child_shard.count = child.shard_count; + + tracing::info!( + "Create child shard persistence with stripe size {}", + shard_ident.stripe_size.0 + ); + + this_child_tsps.push(TenantShardPersistence { + tenant_id: child.tenant_id.to_string(), + shard_number: child.shard_number.0 as i32, + shard_count: child.shard_count.literal() as i32, + shard_stripe_size: shard_ident.stripe_size.0 as i32, + // Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will + // populate the correct generation as part of its transaction, to protect us + // against racing with changes in the state of the parent. + generation: None, + generation_pageserver: Some(target.node.get_id().0 as i64), + placement_policy: serde_json::to_string(&policy).unwrap(), + config: serde_json::to_string(&config).unwrap(), + splitting: SplitState::Splitting, + + // Scheduling policies and preferred AZ do not carry through to children + scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) + .unwrap(), + preferred_az_id: None, + }); + } + + child_tsps.push((target.parent_id, this_child_tsps)); + } + + if let Err(e) = self + .persistence + .begin_shard_split(old_shard_count, tenant_id, child_tsps) + .await + { + match e { + DatabaseError::Query(diesel::result::Error::DatabaseError( + DatabaseErrorKind::UniqueViolation, + _, + )) => { + // Inserting a child shard violated a unique constraint: we raced with another call to + // this function + tracing::warn!("Conflicting attempt to split {tenant_id}: {e}"); + return Err(ApiError::Conflict("Tenant is already splitting".into())); + } + _ => return Err(ApiError::InternalServerError(e.into())), + } + } + fail::fail_point!("shard-split-post-begin", |_| Err( + ApiError::InternalServerError(anyhow::anyhow!("failpoint")) + )); + + // Now that I have persisted the splitting state, apply it in-memory. This is infallible, so + // callers may assume that if splitting is set in memory, then it was persisted, and if splitting + // is not set in memory, then it was not persisted. + { + let mut locked = self.inner.write().unwrap(); + for target in &targets { + if let Some(parent_shard) = locked.tenants.get_mut(&target.parent_id) { + parent_shard.splitting = SplitState::Splitting; + // Put the observed state to None, to reflect that it is indeterminate once we start the + // split operation. + parent_shard + .observed + .locations + .insert(target.node.get_id(), ObservedStateLocation { conf: None }); + } + } + } + + // TODO: issue split calls concurrently (this only matters once we're splitting + // N>1 shards into M shards -- initially we're usually splitting 1 shard into N). 
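+        // A possible shape for a concurrent version (sketch only, assuming a hypothetical
+        // per-target helper `split_one` wrapping the client call in the loop below):
+        //
+        //   let responses = futures::future::try_join_all(
+        //       targets.iter().map(|target| split_one(target)),
+        //   )
+        //   .await?;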
+
+        for target in &targets {
+            let ShardSplitTarget {
+                parent_id,
+                node,
+                child_ids,
+            } = target;
+            let client = PageserverClient::new(
+                node.get_id(),
+                node.base_url(),
+                self.config.jwt_token.as_deref(),
+            );
+            let response = client
+                .tenant_shard_split(
+                    *parent_id,
+                    TenantShardSplitRequest {
+                        new_shard_count: new_shard_count.literal(),
+                        new_stripe_size,
+                    },
+                )
+                .await
+                .map_err(|e| ApiError::Conflict(format!("Failed to split {}: {}", parent_id, e)))?;
+
+            fail::fail_point!("shard-split-post-remote", |_| Err(ApiError::Conflict(
+                "failpoint".to_string()
+            )));
+
+            failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel);
+
+            tracing::info!(
+                "Split {} into {}",
+                parent_id,
+                response
+                    .new_shards
+                    .iter()
+                    .map(|s| format!("{:?}", s))
+                    .collect::<Vec<_>>()
+                    .join(",")
+            );
+
+            if &response.new_shards != child_ids {
+                // This should never happen: the pageserver should agree with us on how shard splits work.
+                return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                    "Splitting shard {} resulted in unexpected IDs: {:?} (expected {:?})",
+                    parent_id,
+                    response.new_shards,
+                    child_ids
+                )));
+            }
+        }
+
+        // TODO: if the pageserver restarted concurrently with our split API call,
+        // the actual generation of the child shard might differ from the generation
+        // we expect it to have. In order for our in-database generation to end up
+        // correct, we should carry the child generation back in the response and apply it here
+        // in complete_shard_split (and apply the correct generation in memory)
+        // (or, we can carry generation in the request and reject the request if
+        // it doesn't match, but that requires more retry logic on this side)
+
+        self.persistence
+            .complete_shard_split(tenant_id, old_shard_count)
+            .await?;
+
+        fail::fail_point!("shard-split-post-complete", |_| Err(
+            ApiError::InternalServerError(anyhow::anyhow!("failpoint"))
+        ));
+
+        // Replace all the shards we just split with their children: this phase is infallible.
+        let (response, child_locations, waiters) =
+            self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size);
+
+        // Now that we have scheduled the child shards, attempt to set their preferred AZ
+        // to that of the pageserver they've been attached on.
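+        // E.g. a child attached to a pageserver in AZ "us-east-2a" (hypothetical ID) ends up
+        // with preferred_az_id="us-east-2a", so later scheduling decisions keep it there.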
+        let preferred_azs = {
+            let locked = self.inner.read().unwrap();
+            child_locations
+                .iter()
+                .filter_map(|(tid, node_id, _stripe_size)| {
+                    let az_id = locked
+                        .nodes
+                        .get(node_id)
+                        .map(|n| n.get_availability_zone_id().to_string())?;
+
+                    Some((*tid, az_id))
+                })
+                .collect::<Vec<_>>()
+        };
+
+        let updated = self
+            .persistence
+            .set_tenant_shard_preferred_azs(preferred_azs)
+            .await
+            .map_err(|err| {
+                ApiError::InternalServerError(anyhow::anyhow!(
+                    "Failed to persist preferred az ids: {err}"
+                ))
+            });
+
+        match updated {
+            Ok(updated) => {
+                let mut locked = self.inner.write().unwrap();
+                for (tid, az_id) in updated {
+                    if let Some(shard) = locked.tenants.get_mut(&tid) {
+                        shard.set_preferred_az(az_id);
+                    }
+                }
+            }
+            Err(err) => {
+                tracing::warn!("Failed to persist preferred AZs after split: {err}");
+            }
+        }
+
+        // Send compute notifications for all the new shards
+        let mut failed_notifications = Vec::new();
+        for (child_id, child_ps, stripe_size) in child_locations {
+            if let Err(e) = self
+                .compute_hook
+                .notify(child_id, child_ps, stripe_size, &self.cancel)
+                .await
+            {
+                tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})",
+                        child_id, child_ps);
+                failed_notifications.push(child_id);
+            }
+        }
+
+        // If we failed any compute notifications, make a note to retry later.
+        if !failed_notifications.is_empty() {
+            let mut locked = self.inner.write().unwrap();
+            for failed in failed_notifications {
+                if let Some(shard) = locked.tenants.get_mut(&failed) {
+                    shard.pending_compute_notification = true;
+                }
+            }
+        }
+
+        Ok((response, waiters))
+    }
+
+    pub(crate) async fn tenant_shard_migrate(
+        &self,
+        tenant_shard_id: TenantShardId,
+        migrate_req: TenantShardMigrateRequest,
+    ) -> Result<TenantShardMigrateResponse, ApiError> {
+        let waiter = {
+            let mut locked = self.inner.write().unwrap();
+            let (nodes, tenants, scheduler) = locked.parts_mut();
+
+            let Some(node) = nodes.get(&migrate_req.node_id) else {
+                return Err(ApiError::BadRequest(anyhow::anyhow!(
+                    "Node {} not found",
+                    migrate_req.node_id
+                )));
+            };
+
+            if !node.is_available() {
+                // Warn but proceed: the caller may intend to manually adjust the placement of
+                // a shard even if the node is down, e.g. if intervening during an incident.
+                tracing::warn!("Migrating to unavailable node {node}");
+            }
+
+            let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
+                return Err(ApiError::NotFound(
+                    anyhow::anyhow!("Tenant shard not found").into(),
+                ));
+            };
+
+            if shard.intent.get_attached() == &Some(migrate_req.node_id) {
+                // No-op case: we will still proceed to wait for reconciliation in case it is
+                // incomplete from an earlier update to the intent.
+                tracing::info!("Migrating: intent is unchanged {:?}", shard.intent);
+            } else {
+                let old_attached = *shard.intent.get_attached();
+
+                match shard.policy {
+                    PlacementPolicy::Attached(n) => {
+                        // If our new attached node was a secondary, it no longer should be.
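+                        // Worked example: with Attached(1), attached=A, secondary=[B],
+                        // migrating to B removes B from the secondaries, demotes A to a
+                        // secondary, and attaches B, so the intent becomes
+                        // attached=B, secondary=[A].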
+                        shard.intent.remove_secondary(scheduler, migrate_req.node_id);
+
+                        // If we were already attached to something, demote that to a secondary
+                        if let Some(old_attached) = old_attached {
+                            if n > 0 {
+                                // Remove other secondaries to make room for the location we'll demote
+                                while shard.intent.get_secondary().len() >= n {
+                                    shard.intent.pop_secondary(scheduler);
+                                }
+
+                                shard.intent.push_secondary(scheduler, old_attached);
+                            }
+                        }
+
+                        shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
+                    }
+                    PlacementPolicy::Secondary => {
+                        shard.intent.clear(scheduler);
+                        shard.intent.push_secondary(scheduler, migrate_req.node_id);
+                    }
+                    PlacementPolicy::Detached => {
+                        return Err(ApiError::BadRequest(anyhow::anyhow!(
+                            "Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first"
+                        )))
+                    }
+                }
+
+                tracing::info!("Migrating: new intent {:?}", shard.intent);
+                shard.sequence = shard.sequence.next();
+            }
+
+            self.maybe_reconcile_shard(shard, nodes)
+        };
+
+        if let Some(waiter) = waiter {
+            waiter.wait_timeout(RECONCILE_TIMEOUT).await?;
+        } else {
+            tracing::info!("Migration is a no-op");
+        }
+
+        Ok(TenantShardMigrateResponse {})
+    }
+
+    /// This is for debug/support only: we simply drop all state for a tenant, without
+    /// detaching or deleting it on pageservers.
+    pub(crate) async fn tenant_drop(&self, tenant_id: TenantId) -> Result<(), ApiError> {
+        self.persistence.delete_tenant(tenant_id).await?;
+
+        let mut locked = self.inner.write().unwrap();
+        let (_nodes, tenants, scheduler) = locked.parts_mut();
+        let mut shards = Vec::new();
+        for (tenant_shard_id, _) in tenants.range(TenantShardId::tenant_range(tenant_id)) {
+            shards.push(*tenant_shard_id);
+        }
+
+        for shard_id in shards {
+            if let Some(mut shard) = tenants.remove(&shard_id) {
+                shard.intent.clear(scheduler);
+            }
+        }
+
+        Ok(())
+    }
+
+    /// This is for debug/support only: assuming tenant data is already present in S3, we "create" a
+    /// tenant with a very high generation number so that it will see the existing data.
+    pub(crate) async fn tenant_import(
+        &self,
+        tenant_id: TenantId,
+    ) -> Result<TenantCreateResponse, ApiError> {
+        // Pick an arbitrary available pageserver to use for scanning the tenant in remote storage
+        let maybe_node = {
+            self.inner
+                .read()
+                .unwrap()
+                .nodes
+                .values()
+                .find(|n| n.is_available())
+                .cloned()
+        };
+        let Some(node) = maybe_node else {
+            return Err(ApiError::BadRequest(anyhow::anyhow!("No nodes available")));
+        };
+
+        let client = PageserverClient::new(
+            node.get_id(),
+            node.base_url(),
+            self.config.jwt_token.as_deref(),
+        );
+
+        let scan_result = client
+            .tenant_scan_remote_storage(tenant_id)
+            .await
+            .map_err(|e| passthrough_api_error(&node, e))?;
+
+        // A post-split tenant may contain a mixture of shard counts in remote storage: pick the highest count.
+        let Some(shard_count) = scan_result
+            .shards
+            .iter()
+            .map(|s| s.tenant_shard_id.shard_count)
+            .max()
+        else {
+            return Err(ApiError::NotFound(
+                anyhow::anyhow!("No shards found").into(),
+            ));
+        };
+
+        // Ideally we would set each newly imported shard's generation independently, but for correctness
+        // it is sufficient to use the highest generation observed across the scanned shards.
+        let generation = scan_result
+            .shards
+            .iter()
+            .map(|s| s.generation)
+            .max()
+            .expect("We already validated >0 shards");
+
+        // FIXME: we have no way to recover the shard stripe size from contents of remote storage: this will
+        // only work if they were using the default stripe size.
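+        // (Illustrative consequence: a tenant that was originally sharded with a
+        // non-default stripe size would be imported with the wrong stripe layout,
+        // so page lookups would map keys to the wrong shards.)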
+        let stripe_size = ShardParameters::DEFAULT_STRIPE_SIZE;
+
+        let (response, waiters) = self
+            .do_tenant_create(TenantCreateRequest {
+                new_tenant_id: TenantShardId::unsharded(tenant_id),
+                generation,
+
+                shard_parameters: ShardParameters {
+                    count: shard_count,
+                    stripe_size,
+                },
+                placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking
+
+                // There is no way to know what the tenant's config was: revert to defaults
+                //
+                // TODO: remove `switch_aux_file_policy` once we finish auxv2 migration
+                //
+                // we write to both v1+v2 storage, so that the test case can use either storage format for testing
+                config: TenantConfig {
+                    switch_aux_file_policy: Some(models::AuxFilePolicy::CrossValidation),
+                    ..TenantConfig::default()
+                },
+            })
+            .await?;
+
+        if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
+            // Since this is a debug/support operation, all kinds of weird issues are possible (e.g. this
+            // tenant doesn't exist in the control plane), so don't fail the request if it can't fully
+            // reconcile, as reconciliation includes notifying compute.
+            tracing::warn!(%tenant_id, "Reconcile not done yet while importing tenant ({e})");
+        }
+
+        Ok(response)
+    }
+
+    /// For debug/support: a full JSON dump of TenantShards. Returns a response so that
+    /// we don't have to make TenantShard clonable in the return path.
+    pub(crate) fn tenants_dump(&self) -> Result<hyper::Response<hyper::Body>, ApiError> {
+        let serialized = {
+            let locked = self.inner.read().unwrap();
+            let result = locked.tenants.values().collect::<Vec<_>>();
+            serde_json::to_string(&result).map_err(|e| ApiError::InternalServerError(e.into()))?
+        };
+
+        hyper::Response::builder()
+            .status(hyper::StatusCode::OK)
+            .header(hyper::header::CONTENT_TYPE, "application/json")
+            .body(hyper::Body::from(serialized))
+            .map_err(|e| ApiError::InternalServerError(e.into()))
+    }
+
+    /// Check the consistency of in-memory state vs. persistent state, and check that the
+    /// scheduler's statistics are up to date.
+    ///
+    /// These consistency checks expect an **idle** system. If changes are going on while
+    /// we run, then we can falsely indicate a consistency issue. This is sufficient for end-of-test
+    /// checks, but not suitable for running continuously in the background in the field.
+    pub(crate) async fn consistency_check(&self) -> Result<(), ApiError> {
+        let (mut expect_nodes, mut expect_shards) = {
+            let locked = self.inner.read().unwrap();
+
+            locked
+                .scheduler
+                .consistency_check(locked.nodes.values(), locked.tenants.values())
+                .context("Scheduler checks")
+                .map_err(ApiError::InternalServerError)?;
+
+            let expect_nodes = locked
+                .nodes
+                .values()
+                .map(|n| n.to_persistent())
+                .collect::<Vec<_>>();
+
+            let expect_shards = locked
+                .tenants
+                .values()
+                .map(|t| t.to_persistent())
+                .collect::<Vec<_>>();
+
+            // This method can only validate the state of an idle system: if a reconcile is in
+            // progress, fail out early to avoid giving false errors on state that won't match
+            // between database and memory until the ReconcileResult is processed.
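+            // Example race (hypothetical): a reconciler finishes and bumps a shard's
+            // generation after we snapshot memory here but before we read the database
+            // below; the two views would then differ without any real inconsistency.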
+            for t in locked.tenants.values() {
+                if t.reconciler.is_some() {
+                    return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                        "Shard {} reconciliation in progress",
+                        t.tenant_shard_id
+                    )));
+                }
+            }
+
+            (expect_nodes, expect_shards)
+        };
+
+        let mut nodes = self.persistence.list_nodes().await?;
+        expect_nodes.sort_by_key(|n| n.node_id);
+        nodes.sort_by_key(|n| n.node_id);
+
+        if nodes != expect_nodes {
+            tracing::error!("Consistency check failed on nodes.");
+            tracing::error!(
+                "Nodes in memory: {}",
+                serde_json::to_string(&expect_nodes)
+                    .map_err(|e| ApiError::InternalServerError(e.into()))?
+            );
+            tracing::error!(
+                "Nodes in database: {}",
+                serde_json::to_string(&nodes)
+                    .map_err(|e| ApiError::InternalServerError(e.into()))?
+            );
+            return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "Node consistency failure"
+            )));
+        }
+
+        let mut shards = self.persistence.list_tenant_shards().await?;
+        shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count));
+        expect_shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count));
+
+        if shards != expect_shards {
+            tracing::error!("Consistency check failed on shards.");
+            tracing::error!(
+                "Shards in memory: {}",
+                serde_json::to_string(&expect_shards)
+                    .map_err(|e| ApiError::InternalServerError(e.into()))?
+            );
+            tracing::error!(
+                "Shards in database: {}",
+                serde_json::to_string(&shards)
+                    .map_err(|e| ApiError::InternalServerError(e.into()))?
+            );
+            return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "Shard consistency failure"
+            )));
+        }
+
+        Ok(())
+    }
+
+    /// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that
+    /// we don't have to make TenantShard clonable in the return path.
+    pub(crate) fn scheduler_dump(&self) -> Result<hyper::Response<hyper::Body>, ApiError> {
+        let serialized = {
+            let locked = self.inner.read().unwrap();
+            serde_json::to_string(&locked.scheduler)
+                .map_err(|e| ApiError::InternalServerError(e.into()))?
+        };
+
+        hyper::Response::builder()
+            .status(hyper::StatusCode::OK)
+            .header(hyper::header::CONTENT_TYPE, "application/json")
+            .body(hyper::Body::from(serialized))
+            .map_err(|e| ApiError::InternalServerError(e.into()))
+    }
+
+    /// This is for debug/support only: we simply drop all state for a node, without
+    /// detaching or deleting anything on pageservers. We do not try to re-schedule any
+    /// tenants that were on this node.
+    pub(crate) async fn node_drop(&self, node_id: NodeId) -> Result<(), ApiError> {
+        self.persistence.delete_node(node_id).await?;
+
+        let mut locked = self.inner.write().unwrap();
+
+        for shard in locked.tenants.values_mut() {
+            shard.deref_node(node_id);
+            shard.observed.locations.remove(&node_id);
+        }
+
+        let mut nodes = (*locked.nodes).clone();
+        nodes.remove(&node_id);
+        locked.nodes = Arc::new(nodes);
+
+        locked.scheduler.node_remove(node_id);
+
+        Ok(())
+    }
+
+    /// If a node has any work on it, it will be rescheduled: this is "clean" in the sense
+    /// that we don't leave any bad state behind in the storage controller, but unclean
+    /// in the sense that we are not carefully draining the node.
+    pub(crate) async fn node_delete(&self, node_id: NodeId) -> Result<(), ApiError> {
+        let _node_lock =
+            trace_exclusive_lock(&self.node_op_locks, node_id, NodeOperations::Delete).await;
+
+        // 1. Atomically update in-memory state:
+        //    - set the scheduling state to Pause to make subsequent scheduling ops skip it
+        //    - update shards' intents to exclude the node, and reschedule any shards whose intents we modified.
+ // - drop the node from the main nodes map, so that when running reconciles complete they do not + // re-insert references to this node into the ObservedState of shards + // - drop the node from the scheduler + { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + { + let mut nodes_mut = (*nodes).deref().clone(); + match nodes_mut.get_mut(&node_id) { + Some(node) => { + // We do not bother setting this in the database, because we're about to delete the row anyway, and + // if we crash it would not be desirable to leave the node paused after a restart. + node.set_scheduling(NodeSchedulingPolicy::Pause); + } + None => { + tracing::info!( + "Node not found: presuming this is a retry and returning success" + ); + return Ok(()); + } + } + + *nodes = Arc::new(nodes_mut); + } + + for (tenant_shard_id, shard) in tenants { + if shard.deref_node(node_id) { + // FIXME: we need to build a ScheduleContext that reflects this shard's peers, otherwise + // it won't properly do anti-affinity. + let mut schedule_context = ScheduleContext::default(); + + if let Err(e) = shard.schedule(scheduler, &mut schedule_context) { + // TODO: implement force flag to remove a node even if we can't reschedule + // a tenant + tracing::error!("Refusing to delete node, shard {tenant_shard_id} can't be rescheduled: {e}"); + return Err(e.into()); + } else { + tracing::info!( + "Rescheduled shard {tenant_shard_id} away from node during deletion" + ) + } + + self.maybe_reconcile_shard(shard, nodes); + } + + // Here we remove an existing observed location for the node we're removing, and it will + // not be re-added by a reconciler's completion because we filter out removed nodes in + // process_result. + // + // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that + // means any reconciles we spawned will know about the node we're deleting, enabling them + // to do live migrations if it's still online. + shard.observed.locations.remove(&node_id); + } + + scheduler.node_remove(node_id); + + { + let mut nodes_mut = (**nodes).clone(); + nodes_mut.remove(&node_id); + *nodes = Arc::new(nodes_mut); + } + } + + // Note: some `generation_pageserver` columns on tenant shards in the database may still refer to + // the removed node, as this column means "The pageserver to which this generation was issued", and + // their generations won't get updated until the reconcilers moving them away from this node complete. + // That is safe because in Service::spawn we only use generation_pageserver if it refers to a node + // that exists. + + // 2. 
Actually delete the node from the database and from in-memory state
+        tracing::info!("Deleting node from database");
+        self.persistence.delete_node(node_id).await?;
+
+        Ok(())
+    }
+
+    pub(crate) async fn node_list(&self) -> Result<Vec<Node>, ApiError> {
+        let nodes = {
+            self.inner
+                .read()
+                .unwrap()
+                .nodes
+                .values()
+                .cloned()
+                .collect::<Vec<_>>()
+        };
+
+        Ok(nodes)
+    }
+
+    pub(crate) async fn get_node(&self, node_id: NodeId) -> Result<Node, ApiError> {
+        self.inner
+            .read()
+            .unwrap()
+            .nodes
+            .get(&node_id)
+            .cloned()
+            .ok_or(ApiError::NotFound(
+                format!("Node {node_id} not registered").into(),
+            ))
+    }
+
+    pub(crate) async fn get_leader(&self) -> DatabaseResult<Option<ControllerPersistence>> {
+        self.persistence.get_leader().await
+    }
+
+    pub(crate) async fn node_register(
+        &self,
+        register_req: NodeRegisterRequest,
+    ) -> Result<(), ApiError> {
+        let _node_lock = trace_exclusive_lock(
+            &self.node_op_locks,
+            register_req.node_id,
+            NodeOperations::Register,
+        )
+        .await;
+
+        enum RegistrationStatus {
+            Matched,
+            Mismatched,
+            New,
+        }
+
+        let registration_status = {
+            let locked = self.inner.read().unwrap();
+            if let Some(node) = locked.nodes.get(&register_req.node_id) {
+                if node.registration_match(&register_req) {
+                    RegistrationStatus::Matched
+                } else {
+                    RegistrationStatus::Mismatched
+                }
+            } else {
+                RegistrationStatus::New
+            }
+        };
+
+        match registration_status {
+            RegistrationStatus::Matched => {
+                tracing::info!(
+                    "Node {} re-registered with matching address",
+                    register_req.node_id
+                );
+
+                return Ok(());
+            }
+            RegistrationStatus::Mismatched => {
+                // TODO: decide if we want to allow modifying node addresses without removing and re-adding
+                // the node. Safest/simplest thing is to refuse it, and usually we deploy with
+                // a fixed address through the lifetime of a node.
+                tracing::warn!(
+                    "Node {} tried to register with different address",
+                    register_req.node_id
+                );
+                return Err(ApiError::Conflict(
+                    "Node is already registered with different address".to_string(),
+                ));
+            }
+            RegistrationStatus::New => {
+                // fallthrough
+            }
+        }
+
+        // We do not require that a node is actually online when registered (it will start life
+        // with its availability set to Offline), but we _do_ require that its DNS record exists. We're
+        // therefore not immune to asymmetric L3 connectivity issues, but we are protected against nodes
+        // that register themselves with a broken DNS config. We check only the HTTP hostname, because
+        // the postgres hostname might only be resolvable to clients (e.g. if we're on a different VPC than clients).
+        if tokio::net::lookup_host(format!(
+            "{}:{}",
+            register_req.listen_http_addr, register_req.listen_http_port
+        ))
+        .await
+        .is_err()
+        {
+            // If we have a transient DNS issue, it's up to the caller to retry their registration. Because
+            // we can't robustly distinguish between an intermittent issue and a totally bogus DNS situation,
+            // we return a soft 503 error, to encourage callers to retry past transient issues.
+            return Err(ApiError::ResourceUnavailable(
+                format!(
+                    "Node {} tried to register with unknown DNS name '{}'",
+                    register_req.node_id, register_req.listen_http_addr
+                )
+                .into(),
+            ));
+        }
+
+        // Ordering: we must persist the new node _before_ adding it to in-memory state.
+        // This ensures that before we use it for anything or expose it via any external
+        // API, it is guaranteed to be available after a restart.
+        let new_node = Node::new(
+            register_req.node_id,
+            register_req.listen_http_addr,
+            register_req.listen_http_port,
+            register_req.listen_pg_addr,
+            register_req.listen_pg_port,
+            register_req.availability_zone_id,
+        );
+
+        // TODO: idempotency if the node already exists in the database
+        self.persistence.insert_node(&new_node).await?;
+
+        let mut locked = self.inner.write().unwrap();
+        let mut new_nodes = (*locked.nodes).clone();
+
+        locked.scheduler.node_upsert(&new_node);
+        new_nodes.insert(register_req.node_id, new_node);
+
+        locked.nodes = Arc::new(new_nodes);
+
+        tracing::info!(
+            "Registered pageserver {}, now have {} pageservers",
+            register_req.node_id,
+            locked.nodes.len()
+        );
+        Ok(())
+    }
+
+    pub(crate) async fn node_configure(
+        &self,
+        node_id: NodeId,
+        availability: Option<NodeAvailability>,
+        scheduling: Option<NodeSchedulingPolicy>,
+    ) -> Result<(), ApiError> {
+        let _node_lock =
+            trace_exclusive_lock(&self.node_op_locks, node_id, NodeOperations::Configure).await;
+
+        if let Some(scheduling) = scheduling {
+            // Scheduling is a persistent part of Node: we must write updates to the database before
+            // applying them in memory
+            self.persistence.update_node(node_id, scheduling).await?;
+        }
+
+        // If we're activating a node, then before setting it active we must reconcile any shard locations
+        // on that node, in case it is out of sync, e.g. due to being unavailable during controller startup,
+        // by calling [`Self::node_activate_reconcile`]
+        //
+        // The transition we calculate here remains valid later in the function because we hold the op lock on the node:
+        // nothing else can mutate its availability while we run.
+        let availability_transition = if let Some(input_availability) = availability.as_ref() {
+            let (activate_node, availability_transition) = {
+                let locked = self.inner.read().unwrap();
+                let Some(node) = locked.nodes.get(&node_id) else {
+                    return Err(ApiError::NotFound(
+                        anyhow::anyhow!("Node {} not registered", node_id).into(),
+                    ));
+                };
+
+                (
+                    node.clone(),
+                    node.get_availability_transition(input_availability),
+                )
+            };
+
+            if matches!(availability_transition, AvailabilityTransition::ToActive) {
+                self.node_activate_reconcile(activate_node, &_node_lock)
+                    .await?;
+            }
+            availability_transition
+        } else {
+            AvailabilityTransition::Unchanged
+        };
+
+        // Apply changes from the request to our in-memory state for the Node
+        let mut locked = self.inner.write().unwrap();
+        let (nodes, tenants, scheduler) = locked.parts_mut();
+
+        let mut new_nodes = (**nodes).clone();
+
+        let Some(node) = new_nodes.get_mut(&node_id) else {
+            return Err(ApiError::NotFound(
+                anyhow::anyhow!("Node not registered").into(),
+            ));
+        };
+
+        if let Some(availability) = availability.as_ref() {
+            node.set_availability(availability.clone());
+        }
+
+        if let Some(scheduling) = scheduling {
+            node.set_scheduling(scheduling);
+        }
+
+        // Update the scheduler, in case the eligibility of the node for new shards has changed
+        scheduler.node_upsert(node);
+
+        let new_nodes = Arc::new(new_nodes);
+
+        // Modify scheduling state for any Tenants that are affected by a change in the node's availability state.
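+        // E.g. when heartbeats mark a node (say, node 3; hypothetical) Offline: each shard
+        // attached there has its observed config cleared, its attachment demoted to a
+        // secondary, and schedule() promotes a secondary on another node, which the
+        // spawned reconciler then makes real.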
+ match availability_transition { + AvailabilityTransition::ToOffline => { + tracing::info!("Node {} transition to offline", node_id); + let mut tenants_affected: usize = 0; + + for (tenant_shard_id, tenant_shard) in tenants { + if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { + // When a node goes offline, we set its observed configuration to None, indicating unknown: we will + // not assume our knowledge of the node's configuration is accurate until it comes back online + observed_loc.conf = None; + } + + if new_nodes.len() == 1 { + // Special case for single-node cluster: there is no point trying to reschedule + // any tenant shards: avoid doing so, in order to avoid spewing warnings about + // failures to schedule them. + continue; + } + + if !new_nodes + .values() + .any(|n| matches!(n.may_schedule(), MaySchedule::Yes(_))) + { + // Special case for when all nodes are unavailable and/or unschedulable: there is no point + // trying to reschedule since there's nowhere else to go. Without this + // branch we incorrectly detach tenants in response to node unavailability. + continue; + } + + if tenant_shard.intent.demote_attached(scheduler, node_id) { + tenant_shard.sequence = tenant_shard.sequence.next(); + + // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters + // for tenants without secondary locations: if they have a secondary location, then this + // schedule() call is just promoting an existing secondary) + let mut schedule_context = ScheduleContext::default(); + + match tenant_shard.schedule(scheduler, &mut schedule_context) { + Err(e) => { + // It is possible that some tenants will become unschedulable when too many pageservers + // go offline: in this case there isn't much we can do other than make the issue observable. + // TODO: give TenantShard a scheduling error attribute to be queried later. + tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); + } + Ok(()) => { + if self + .maybe_reconcile_shard(tenant_shard, &new_nodes) + .is_some() + { + tenants_affected += 1; + }; + } + } + } + } + tracing::info!( + "Launched {} reconciler tasks for tenants affected by node {} going offline", + tenants_affected, + node_id + ) + } + AvailabilityTransition::ToActive => { + tracing::info!("Node {} transition to active", node_id); + // When a node comes back online, we must reconcile any tenant that has a None observed + // location on the node. + for tenant_shard in locked.tenants.values_mut() { + // If a reconciliation is already in progress, rely on the previous scheduling + // decision and skip triggering a new reconciliation. + if tenant_shard.reconciler.is_some() { + continue; + } + + if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { + if observed_loc.conf.is_none() { + self.maybe_reconcile_shard(tenant_shard, &new_nodes); + } + } + } + + // TODO: in the background, we should balance work back onto this pageserver + } + // No action required for the intermediate unavailable state. + // When we transition into active or offline from the unavailable state, + // the correct handling above will kick in. 
+            AvailabilityTransition::ToWarmingUpFromActive => {
+                tracing::info!("Node {} transition to unavailable from active", node_id);
+            }
+            AvailabilityTransition::ToWarmingUpFromOffline => {
+                tracing::info!("Node {} transition to unavailable from offline", node_id);
+            }
+            AvailabilityTransition::Unchanged => {
+                tracing::debug!("Node {} no availability change during config", node_id);
+            }
+        }
+
+        locked.nodes = new_nodes;
+
+        Ok(())
+    }
+
+    /// Wrapper around [`Self::node_configure`] which only allows changes while there is no ongoing
+    /// background operation; used by the HTTP API.
+    pub(crate) async fn external_node_configure(
+        &self,
+        node_id: NodeId,
+        availability: Option<NodeAvailability>,
+        scheduling: Option<NodeSchedulingPolicy>,
+    ) -> Result<(), ApiError> {
+        {
+            let locked = self.inner.read().unwrap();
+            if let Some(op) = locked.ongoing_operation.as_ref().map(|op| op.operation) {
+                return Err(ApiError::PreconditionFailed(
+                    format!("Ongoing background operation forbids configuring: {op}").into(),
+                ));
+            }
+        }
+
+        self.node_configure(node_id, availability, scheduling).await
+    }
+
+    pub(crate) async fn start_node_drain(
+        self: &Arc<Self>,
+        node_id: NodeId,
+    ) -> Result<(), ApiError> {
+        let (ongoing_op, node_available, node_policy, schedulable_nodes_count) = {
+            let locked = self.inner.read().unwrap();
+            let nodes = &locked.nodes;
+            let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
+                anyhow::anyhow!("Node {} not registered", node_id).into(),
+            ))?;
+            let schedulable_nodes_count = nodes
+                .iter()
+                .filter(|(_, n)| matches!(n.may_schedule(), MaySchedule::Yes(_)))
+                .count();
+
+            (
+                locked
+                    .ongoing_operation
+                    .as_ref()
+                    .map(|ongoing| ongoing.operation),
+                node.is_available(),
+                node.get_scheduling(),
+                schedulable_nodes_count,
+            )
+        };
+
+        if let Some(ongoing) = ongoing_op {
+            return Err(ApiError::PreconditionFailed(
+                format!("Background operation already ongoing for node: {}", ongoing).into(),
+            ));
+        }
+
+        if !node_available {
+            return Err(ApiError::ResourceUnavailable(
+                format!("Node {node_id} is currently unavailable").into(),
+            ));
+        }
+
+        if schedulable_nodes_count == 0 {
+            return Err(ApiError::PreconditionFailed(
+                "No other schedulable nodes to drain to".into(),
+            ));
+        }
+
+        match node_policy {
+            NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Pause => {
+                self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Draining))
+                    .await?;
+
+                let cancel = self.cancel.child_token();
+                let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
+
+                self.inner.write().unwrap().ongoing_operation = Some(OperationHandler {
+                    operation: Operation::Drain(Drain { node_id }),
+                    cancel: cancel.clone(),
+                });
+
+                let span = tracing::info_span!(parent: None, "drain_node", %node_id);
+
+                tokio::task::spawn({
+                    let service = self.clone();
+                    let cancel = cancel.clone();
+                    async move {
+                        let _gate_guard = gate_guard;
+
+                        scopeguard::defer!
{
+                            let prev = service.inner.write().unwrap().ongoing_operation.take();
+
+                            if let Some(Operation::Drain(removed_drain)) = prev.map(|h| h.operation) {
+                                assert_eq!(removed_drain.node_id, node_id, "We always take the same operation");
+                            } else {
+                                panic!("We always remove the same operation")
+                            }
+                        }
+
+                        tracing::info!("Drain background operation starting");
+                        let res = service.drain_node(node_id, cancel).await;
+                        match res {
+                            Ok(()) => {
+                                tracing::info!("Drain background operation completed successfully");
+                            }
+                            Err(OperationError::Cancelled) => {
+                                tracing::info!("Drain background operation was cancelled");
+                            }
+                            Err(err) => {
+                                tracing::error!("Drain background operation encountered: {err}")
+                            }
+                        }
+                    }
+                }.instrument(span));
+            }
+            NodeSchedulingPolicy::Draining => {
+                return Err(ApiError::Conflict(format!(
+                    "Node {node_id} has drain in progress"
+                )));
+            }
+            policy => {
+                return Err(ApiError::PreconditionFailed(
+                    format!("Node {node_id} cannot be drained due to {policy:?} policy").into(),
+                ));
+            }
+        }
+
+        Ok(())
+    }
+
+    pub(crate) async fn cancel_node_drain(&self, node_id: NodeId) -> Result<(), ApiError> {
+        let node_available = {
+            let locked = self.inner.read().unwrap();
+            let nodes = &locked.nodes;
+            let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
+                anyhow::anyhow!("Node {} not registered", node_id).into(),
+            ))?;
+
+            node.is_available()
+        };
+
+        if !node_available {
+            return Err(ApiError::ResourceUnavailable(
+                format!("Node {node_id} is currently unavailable").into(),
+            ));
+        }
+
+        if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
+            if let Operation::Drain(drain) = op_handler.operation {
+                if drain.node_id == node_id {
+                    tracing::info!("Cancelling background drain operation for node {node_id}");
+                    op_handler.cancel.cancel();
+                    return Ok(());
+                }
+            }
+        }
+
+        Err(ApiError::PreconditionFailed(
+            format!("Node {node_id} has no drain in progress").into(),
+        ))
+    }
+
+    pub(crate) async fn start_node_fill(self: &Arc<Self>, node_id: NodeId) -> Result<(), ApiError> {
+        let (ongoing_op, node_available, node_policy, total_nodes_count) = {
+            let locked = self.inner.read().unwrap();
+            let nodes = &locked.nodes;
+            let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
+                anyhow::anyhow!("Node {} not registered", node_id).into(),
+            ))?;
+
+            (
+                locked
+                    .ongoing_operation
+                    .as_ref()
+                    .map(|ongoing| ongoing.operation),
+                node.is_available(),
+                node.get_scheduling(),
+                nodes.len(),
+            )
+        };
+
+        if let Some(ongoing) = ongoing_op {
+            return Err(ApiError::PreconditionFailed(
+                format!("Background operation already ongoing for node: {}", ongoing).into(),
+            ));
+        }
+
+        if !node_available {
+            return Err(ApiError::ResourceUnavailable(
+                format!("Node {node_id} is currently unavailable").into(),
+            ));
+        }
+
+        if total_nodes_count <= 1 {
+            return Err(ApiError::PreconditionFailed(
+                "No other nodes to fill from".into(),
+            ));
+        }
+
+        match node_policy {
+            NodeSchedulingPolicy::Active => {
+                self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Filling))
+                    .await?;
+
+                let cancel = self.cancel.child_token();
+                let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
+
+                self.inner.write().unwrap().ongoing_operation = Some(OperationHandler {
+                    operation: Operation::Fill(Fill { node_id }),
+                    cancel: cancel.clone(),
+                });
+
+                let span = tracing::info_span!(parent: None, "fill_node", %node_id);
+
+                tokio::task::spawn({
+                    let service = self.clone();
+                    let cancel = cancel.clone();
+                    async move {
+                        let _gate_guard = gate_guard;
+
+                        scopeguard::defer!
{
+                            let prev = service.inner.write().unwrap().ongoing_operation.take();
+
+                            if let Some(Operation::Fill(removed_fill)) = prev.map(|h| h.operation) {
+                                assert_eq!(removed_fill.node_id, node_id, "We always take the same operation");
+                            } else {
+                                panic!("We always remove the same operation")
+                            }
+                        }
+
+                        tracing::info!("Fill background operation starting");
+                        let res = service.fill_node(node_id, cancel).await;
+                        match res {
+                            Ok(()) => {
+                                tracing::info!("Fill background operation completed successfully");
+                            }
+                            Err(OperationError::Cancelled) => {
+                                tracing::info!("Fill background operation was cancelled");
+                            }
+                            Err(err) => {
+                                tracing::error!("Fill background operation encountered: {err}")
+                            }
+                        }
+                    }
+                }.instrument(span));
+            }
+            NodeSchedulingPolicy::Filling => {
+                return Err(ApiError::Conflict(format!(
+                    "Node {node_id} has fill in progress"
+                )));
+            }
+            policy => {
+                return Err(ApiError::PreconditionFailed(
+                    format!("Node {node_id} cannot be filled due to {policy:?} policy").into(),
+                ));
+            }
+        }
+
+        Ok(())
+    }
+
+    pub(crate) async fn cancel_node_fill(&self, node_id: NodeId) -> Result<(), ApiError> {
+        let node_available = {
+            let locked = self.inner.read().unwrap();
+            let nodes = &locked.nodes;
+            let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
+                anyhow::anyhow!("Node {} not registered", node_id).into(),
+            ))?;
+
+            node.is_available()
+        };
+
+        if !node_available {
+            return Err(ApiError::ResourceUnavailable(
+                format!("Node {node_id} is currently unavailable").into(),
+            ));
+        }
+
+        if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
+            if let Operation::Fill(fill) = op_handler.operation {
+                if fill.node_id == node_id {
+                    tracing::info!("Cancelling background fill operation for node {node_id}");
+                    op_handler.cancel.cancel();
+                    return Ok(());
+                }
+            }
+        }
+
+        Err(ApiError::PreconditionFailed(
+            format!("Node {node_id} has no fill in progress").into(),
+        ))
+    }
+
+    /// Like [`Self::maybe_configured_reconcile_shard`], but uses the default reconciler
+    /// configuration
+    fn maybe_reconcile_shard(
+        &self,
+        shard: &mut TenantShard,
+        nodes: &Arc<HashMap<NodeId, Node>>,
+    ) -> Option<ReconcilerWaiter> {
+        self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::default())
+    }
+
+    /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`].
+    fn maybe_configured_reconcile_shard(
+        &self,
+        shard: &mut TenantShard,
+        nodes: &Arc<HashMap<NodeId, Node>>,
+        reconciler_config: ReconcilerConfig,
+    ) -> Option<ReconcilerWaiter> {
+        let reconcile_needed = shard.get_reconcile_needed(nodes);
+
+        match reconcile_needed {
+            ReconcileNeeded::No => return None,
+            ReconcileNeeded::WaitExisting(waiter) => return Some(waiter),
+            ReconcileNeeded::Yes => {
+                // Fall through to try and acquire units for spawning reconciler
+            }
+        };
+
+        let units = match self.reconciler_concurrency.clone().try_acquire_owned() {
+            Ok(u) => ReconcileUnits::new(u),
+            Err(_) => {
+                tracing::info!(tenant_id=%shard.tenant_shard_id.tenant_id, shard_id=%shard.tenant_shard_id.shard_slug(),
+                    "Concurrency limited: enqueued for reconcile later");
+                if !shard.delayed_reconcile {
+                    match self.delayed_reconcile_tx.try_send(shard.tenant_shard_id) {
+                        Err(TrySendError::Closed(_)) => {
+                            // Weird mid-shutdown case?
+                        }
+                        Err(TrySendError::Full(_)) => {
+                            // It is safe to skip sending our ID in the channel: we will eventually get retried by the background reconcile task.
+                            tracing::warn!(
+                                "Many shards are waiting to reconcile: delayed_reconcile queue is full"
+                            );
+                        }
+                        Ok(()) => {
+                            shard.delayed_reconcile = true;
+                        }
+                    }
+                }
+
+                // We won't spawn a reconciler, but we will construct a waiter that waits for the shard's sequence
+                // number to advance. When this function is eventually called again and succeeds in getting units,
+                // it will spawn a reconciler that makes this waiter complete.
+                return Some(shard.future_reconcile_waiter());
+            }
+        };
+
+        let Ok(gate_guard) = self.reconcilers_gate.enter() else {
+            // Gate closed: we're shutting down, drop out.
+            return None;
+        };
+
+        shard.spawn_reconciler(
+            &self.result_tx,
+            nodes,
+            &self.compute_hook,
+            reconciler_config,
+            &self.config,
+            &self.persistence,
+            units,
+            gate_guard,
+            &self.reconcilers_cancel,
+        )
+    }
+
+    /// Check all tenants for pending reconciliation work, and reconcile those in need.
+    /// Additionally, reschedule tenants that require it.
+    ///
+    /// Returns how many reconciliation tasks were started, or `1` if no reconciles were
+    /// spawned but some _would_ have been spawned if `reconciler_concurrency` units were
+    /// available. A return value of 0 indicates that everything is fully reconciled already.
+    fn reconcile_all(&self) -> usize {
+        let mut locked = self.inner.write().unwrap();
+        let (nodes, tenants, _scheduler) = locked.parts_mut();
+        let pageservers = nodes.clone();
+
+        let mut schedule_context = ScheduleContext::default();
+
+        let mut reconciles_spawned = 0;
+        for (tenant_shard_id, shard) in tenants.iter_mut() {
+            if tenant_shard_id.is_shard_zero() {
+                schedule_context = ScheduleContext::default();
+            }
+
+            // Skip checking if this shard is already enqueued for reconciliation
+            if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 {
+                // If there is something delayed, then return a nonzero count so that
+                // callers like reconcile_all_now do not incorrectly get the impression
+                // that the system is in a quiescent state.
+                reconciles_spawned = std::cmp::max(1, reconciles_spawned);
+                continue;
+            }
+
+            // Eventual consistency: if an earlier reconcile job failed, and the shard is still
+            // dirty, spawn another one
+            if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
+                reconciles_spawned += 1;
+            }
+
+            schedule_context.avoid(&shard.intent.all_pageservers());
+        }
+
+        reconciles_spawned
+    }
+
+    /// `optimize` in this context means identifying shards which have valid scheduled locations, but
+    /// could be scheduled somewhere better:
+    /// - Cutting over to a secondary if the node with the secondary is more lightly loaded
+    ///   * e.g. after a node fails then recovers, to move some work back to it
+    /// - Cutting over to a secondary if it improves the spread of shard attachments within a tenant
+    ///   * e.g. after a shard split, the initial attached locations will all be on the node where
+    ///     we did the split, but are probably better placed elsewhere.
+    /// - Creating new secondary locations if it improves the spreading of a sharded tenant
+    ///   * e.g. after a shard split, some locations will be on the same node (where the split
+    ///     happened), and will probably be better placed elsewhere.
+    ///
+    /// To put it more briefly: whereas the scheduler respects soft constraints in a ScheduleContext at
+    /// the time of scheduling, this function looks for cases where a better-scoring location is available
+    /// according to those same soft constraints.
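+    ///
+    /// Example: after an 8-way split, all eight children start out attached to the node that
+    /// executed the split; successive optimize passes then trickle attachments and secondary
+    /// locations out to other nodes, a few shards at a time.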
+    async fn optimize_all(&self) -> usize {
+        // Limit on how many shards' optimizations each call to this function will execute. Combined
+        // with the frequency of background calls, this acts as an implicit rate limit that runs a small
+        // trickle of optimizations in the background, rather than executing a large number in parallel
+        // when a change occurs.
+        const MAX_OPTIMIZATIONS_EXEC_PER_PASS: usize = 2;
+
+        // Synchronous prepare: scan shards for possible scheduling optimizations
+        let candidate_work = self.optimize_all_plan();
+        let candidate_work_len = candidate_work.len();
+
+        // Asynchronous validate: I/O to pageservers to make sure shards are in a good state to apply the optimization
+        let validated_work = self.optimize_all_validate(candidate_work).await;
+
+        let was_work_filtered = validated_work.len() != candidate_work_len;
+
+        // Synchronous apply: update the shards' intent states according to validated optimizations
+        let mut reconciles_spawned = 0;
+        let mut optimizations_applied = 0;
+        let mut locked = self.inner.write().unwrap();
+        let (nodes, tenants, scheduler) = locked.parts_mut();
+        for (tenant_shard_id, optimization) in validated_work {
+            let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
+                // Shard was dropped between planning and execution;
+                continue;
+            };
+            if shard.apply_optimization(scheduler, optimization) {
+                optimizations_applied += 1;
+                if self.maybe_reconcile_shard(shard, nodes).is_some() {
+                    reconciles_spawned += 1;
+                }
+            }
+
+            if optimizations_applied >= MAX_OPTIMIZATIONS_EXEC_PER_PASS {
+                break;
+            }
+        }
+
+        if was_work_filtered {
+            // If we filtered any work out during validation, ensure we return a nonzero value to indicate
+            // to callers that the system is not in a truly quiet state; it's going to do some work as soon
+            // as these validations start passing.
+            reconciles_spawned = std::cmp::max(reconciles_spawned, 1);
+        }
+
+        reconciles_spawned
+    }
+
+    fn optimize_all_plan(&self) -> Vec<(TenantShardId, ScheduleOptimization)> {
+        let mut schedule_context = ScheduleContext::default();
+
+        let mut tenant_shards: Vec<&TenantShard> = Vec::new();
+
+        // How many candidate optimizations we will generate, before evaluating them for readiness: setting
+        // this higher than the execution limit gives us a chance to execute some work even if the first
+        // few optimizations we find are not ready.
+        const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 8;
+
+        let mut work = Vec::new();
+
+        let mut locked = self.inner.write().unwrap();
+        let (nodes, tenants, scheduler) = locked.parts_mut();
+        for (tenant_shard_id, shard) in tenants.iter() {
+            if tenant_shard_id.is_shard_zero() {
+                // Reset accumulators on the first shard in a tenant
+                schedule_context = ScheduleContext::default();
+                schedule_context.mode = ScheduleMode::Speculative;
+                tenant_shards.clear();
+            }
+
+            if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS {
+                break;
+            }
+
+            match shard.get_scheduling_policy() {
+                ShardSchedulingPolicy::Active => {
+                    // Ok to do optimization
+                }
+                ShardSchedulingPolicy::Essential
+                | ShardSchedulingPolicy::Pause
+                | ShardSchedulingPolicy::Stop => {
+                    // Policy prevents optimizing this shard.
+                    continue;
+                }
+            }
+
+            // Accumulate the schedule context for all the shards in a tenant: we must have
+            // the total view of all shards before we can try to optimize any of them.
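+            // E.g. for a 4-shard tenant with three shards attached to node A and one to
+            // node B, the accumulated context is what lets optimize_attachment() below
+            // see that A is over-represented for this tenant.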
+            schedule_context.avoid(&shard.intent.all_pageservers());
+            if let Some(attached) = shard.intent.get_attached() {
+                schedule_context.push_attached(*attached);
+            }
+            tenant_shards.push(shard);
+
+            // Once we have seen the last shard in the tenant, proceed to search across all shards
+            // in the tenant for optimizations
+            if shard.shard.number.0 == shard.shard.count.count() - 1 {
+                if tenant_shards.iter().any(|s| s.reconciler.is_some()) {
+                    // Do not start any optimizations while another change to the tenant is ongoing: this
+                    // is not necessary for correctness, but simplifies operations and implicitly throttles
+                    // optimization changes to happen in a "trickle" over time.
+                    continue;
+                }
+
+                if tenant_shards.iter().any(|s| {
+                    !matches!(s.splitting, SplitState::Idle)
+                        || matches!(s.policy, PlacementPolicy::Detached)
+                }) {
+                    // Never attempt to optimize a tenant that is currently being split, or
+                    // a tenant that is meant to be detached
+                    continue;
+                }
+
+                // TODO: optimization calculations are relatively expensive: create some fast-path for
+                // the common idle case (avoiding the search on tenants that we have recently checked)
+
+                for shard in &tenant_shards {
+                    if let Some(optimization) =
+                        // If idle, maybe optimize attachments: if a shard has a secondary location that is preferable to
+                        // its primary location based on soft constraints, cut it over.
+                        shard.optimize_attachment(nodes, &schedule_context)
+                    {
+                        work.push((shard.tenant_shard_id, optimization));
+                        break;
+                    } else if let Some(optimization) =
+                        // If idle, maybe optimize secondary locations: if a shard has a secondary location that would be
+                        // better placed on another node, based on ScheduleContext, then adjust it. This
+                        // covers cases like after a shard split, where we might have too many shards
+                        // in the same tenant with secondary locations on the node where they originally split.
+                        shard.optimize_secondary(scheduler, &schedule_context)
+                    {
+                        work.push((shard.tenant_shard_id, optimization));
+                        break;
+                    }
+
+                    // TODO: extend this mechanism to prefer attaching on nodes with fewer attached
+                    // tenants (i.e. extend schedule state to distinguish attached from secondary counts),
+                    // for the total number of attachments on a node (not just within a tenant).
+                }
+            }
+        }
+
+        work
+    }
+
+    async fn optimize_all_validate(
+        &self,
+        candidate_work: Vec<(TenantShardId, ScheduleOptimization)>,
+    ) -> Vec<(TenantShardId, ScheduleOptimization)> {
+        // Take a clone of the node map to use outside the lock in async validation phase
+        let validation_nodes = { self.inner.read().unwrap().nodes.clone() };
+
+        let mut want_secondary_status = Vec::new();
+
+        // Validate our plans: this is an async phase where we may do I/O to pageservers to
+        // check that the state of locations is acceptable to run the optimization, such as
+        // checking that a secondary location is sufficiently warmed-up to cleanly cut over
+        // in a live migration.
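+        // E.g. a MigrateAttachment whose destination secondary still has more than the
+        // freshness threshold (10GiB, see below) of layers left to download is dropped
+        // from this pass and will be re-planned on a later one.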
+        let mut validated_work = Vec::new();
+        for (tenant_shard_id, optimization) in candidate_work {
+            match optimization.action {
+                ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
+                    old_attached_node_id: _,
+                    new_attached_node_id,
+                }) => {
+                    match validation_nodes.get(&new_attached_node_id) {
+                        None => {
+                            // Node was dropped between planning and validation
+                        }
+                        Some(node) => {
+                            if !node.is_available() {
+                                tracing::info!("Skipping optimization migration of {tenant_shard_id} to {new_attached_node_id} because node unavailable");
+                            } else {
+                                // Accumulate optimizations that require fetching secondary status, so that we can execute these
+                                // remote API requests concurrently.
+                                want_secondary_status.push((
+                                    tenant_shard_id,
+                                    node.clone(),
+                                    optimization,
+                                ));
+                            }
+                        }
+                    }
+                }
+                ScheduleOptimizationAction::ReplaceSecondary(_) => {
+                    // No extra checks needed to replace a secondary: this does not interrupt client access
+                    validated_work.push((tenant_shard_id, optimization))
+                }
+            };
+        }
+
+        // Call into pageserver API to find out if the destination secondary location is warm enough for a reasonably smooth migration: we
+        // do this so that we avoid spawning a Reconciler that would have to wait minutes/hours for a destination to warm up: that reconciler
+        // would hold a precious reconcile semaphore unit the whole time it was waiting for the destination to warm up.
+        let results = self
+            .tenant_for_shards_api(
+                want_secondary_status
+                    .iter()
+                    .map(|i| (i.0, i.1.clone()))
+                    .collect(),
+                |tenant_shard_id, client| async move {
+                    client.tenant_secondary_status(tenant_shard_id).await
+                },
+                1,
+                1,
+                SHORT_RECONCILE_TIMEOUT,
+                &self.cancel,
+            )
+            .await;
+
+        for ((tenant_shard_id, node, optimization), secondary_status) in
+            want_secondary_status.into_iter().zip(results.into_iter())
+        {
+            match secondary_status {
+                Err(e) => {
+                    tracing::info!("Skipping migration of {tenant_shard_id} to {node}, error querying secondary: {e}");
+                }
+                Ok(progress) => {
+                    // We require secondary locations to have less than 10GiB of downloads pending before we will use
+                    // them in an optimization
+                    const DOWNLOAD_FRESHNESS_THRESHOLD: u64 = 10 * 1024 * 1024 * 1024;
+
+                    if progress.heatmap_mtime.is_none()
+                        || progress.bytes_total < DOWNLOAD_FRESHNESS_THRESHOLD
+                            && progress.bytes_downloaded != progress.bytes_total
+                        || progress.bytes_total - progress.bytes_downloaded
+                            > DOWNLOAD_FRESHNESS_THRESHOLD
+                    {
+                        tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}");
+                    } else {
+                        // Location looks ready: proceed
+                        tracing::info!(
+                            "{tenant_shard_id} secondary on {node} is warm enough for migration: {progress:?}"
+                        );
+                        validated_work.push((tenant_shard_id, optimization))
+                    }
+                }
+            }
+        }
+
+        validated_work
+    }
+
+    /// Look for shards which are oversized and in need of splitting
+    async fn autosplit_tenants(self: &Arc<Self>) {
+        let Some(split_threshold) = self.config.split_threshold else {
+            // Auto-splitting is disabled
+            return;
+        };
+
+        let nodes = self.inner.read().unwrap().nodes.clone();
+
+        const SPLIT_TO_MAX: ShardCount = ShardCount::new(8);
+
+        let mut top_n = Vec::new();
+
+        // Call into each node to look for big tenants
+        let top_n_request = TopTenantShardsRequest {
+            // We currently split based on logical size, for simplicity: logical size is a signal of
+            // the user's intent to run a large database, whereas physical/resident size can be symptoms
+            // of compaction issues. Eventually we should switch to using resident size to bound the
+            // disk space impact of one shard.
+            order_by: models::TenantSorting::MaxLogicalSize,
+            limit: 10,
+            where_shards_lt: Some(SPLIT_TO_MAX),
+            where_gt: Some(split_threshold),
+        };
+        for node in nodes.values() {
+            let request_ref = &top_n_request;
+            match node
+                .with_client_retries(
+                    |client| async move {
+                        let request = request_ref.clone();
+                        client.top_tenant_shards(request.clone()).await
+                    },
+                    &self.config.jwt_token,
+                    3,
+                    3,
+                    Duration::from_secs(5),
+                    &self.cancel,
+                )
+                .await
+            {
+                Some(Ok(node_top_n)) => {
+                    top_n.extend(node_top_n.shards.into_iter());
+                }
+                Some(Err(mgmt_api::Error::Cancelled)) => {
+                    continue;
+                }
+                Some(Err(e)) => {
+                    tracing::warn!("Failed to fetch top N tenants from {node}: {e}");
+                    continue;
+                }
+                None => {
+                    // Node is shutting down
+                    continue;
+                }
+            };
+        }
+
+        // Pick the biggest tenant to split first: sort descending by resident size
+        top_n.sort_by_key(|i| std::cmp::Reverse(i.resident_size));
+        let Some(split_candidate) = top_n.into_iter().next() else {
+            tracing::debug!("No split-eligible shards found");
+            return;
+        };
+
+        // We spawn a task to run this, so it's exactly like some external API client requesting it. We don't
+        // want to block the background reconcile loop on this.
+        tracing::info!("Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}");
+
+        let this = self.clone();
+        tokio::spawn(
+            async move {
+                match this
+                    .tenant_shard_split(
+                        split_candidate.id.tenant_id,
+                        TenantShardSplitRequest {
+                            // Always split to the max number of shards: this avoids stepping through
+                            // intervening shard counts and encountering the overhead of a split+cleanup
+                            // each time as a tenant grows, and is not too expensive because our max shard
+                            // count is relatively low anyway.
+                            // This policy will be adjusted in future once we support higher shard count.
+                            new_shard_count: SPLIT_TO_MAX.literal(),
+                            new_stripe_size: Some(ShardParameters::DEFAULT_STRIPE_SIZE),
+                        },
+                    )
+                    .await
+                {
+                    Ok(_) => {
+                        tracing::info!("Successful auto-split");
+                    }
+                    Err(e) => {
+                        tracing::error!("Auto-split failed: {e}");
+                    }
+                }
+            }
+            .instrument(tracing::info_span!("auto_split", tenant_id=%split_candidate.id.tenant_id)),
+        );
+    }
+
+    /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but
+    /// also wait for any generated Reconcilers to complete. Calling this until it returns zero should
+    /// put the system into a quiescent state where future background reconciliations won't do anything.
+    pub(crate) async fn reconcile_all_now(&self) -> Result<usize, ReconcileWaitError> {
+        let reconciles_spawned = self.reconcile_all();
+        let reconciles_spawned = if reconciles_spawned == 0 {
+            // Only optimize when we are otherwise idle
+            self.optimize_all().await
+        } else {
+            reconciles_spawned
+        };
+
+        let waiters = {
+            let mut waiters = Vec::new();
+            let locked = self.inner.read().unwrap();
+            for (_tenant_shard_id, shard) in locked.tenants.iter() {
+                if let Some(waiter) = shard.get_waiter() {
+                    waiters.push(waiter);
+                }
+            }
+            waiters
+        };
+
+        let waiter_count = waiters.len();
+        match self.await_waiters(waiters, RECONCILE_TIMEOUT).await {
+            Ok(()) => {}
+            Err(ReconcileWaitError::Failed(_, reconcile_error))
+                if matches!(*reconcile_error, ReconcileError::Cancel) =>
+            {
+                // Ignore reconciler cancel errors: this reconciler might have shut down
+                // because some other change superseded it. We will return a nonzero number,
+                // so the caller knows they might have to call again to quiesce the system.
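+                // For example, a test harness would typically poll to quiescence
+                // (sketch, assuming a `service: Arc<Service>` in scope):
+                //
+                //   while service.reconcile_all_now().await? > 0 {}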
+            }
+            Err(e) => {
+                return Err(e);
+            }
+        };
+
+        tracing::info!(
+            "{} reconciles in reconcile_all, {} waiters",
+            reconciles_spawned,
+            waiter_count
+        );
+
+        Ok(std::cmp::max(waiter_count, reconciles_spawned))
+    }
+
+    async fn stop_reconciliations(&self, reason: StopReconciliationsReason) {
+        // Cancel all on-going reconciles and wait for them to exit the gate.
+        tracing::info!("{reason}: cancelling and waiting for in-flight reconciles");
+        self.reconcilers_cancel.cancel();
+        self.reconcilers_gate.close().await;
+
+        // Signal the background loop in [`Service::process_results`] to exit once
+        // it has processed the results from all the reconciles we cancelled earlier.
+        tracing::info!("{reason}: processing results from previously in-flight reconciles");
+        self.result_tx.send(ReconcileResultRequest::Stop).ok();
+        self.result_tx.closed().await;
+    }
+
+    pub async fn shutdown(&self) {
+        self.stop_reconciliations(StopReconciliationsReason::ShuttingDown)
+            .await;
+
+        // Background tasks hold gate guards: this notifies them of the cancellation and
+        // waits for them all to complete.
+        tracing::info!("Shutting down: cancelling and waiting for background tasks to exit");
+        self.cancel.cancel();
+        self.gate.close().await;
+    }
+
+    /// Spot check the download lag for a secondary location of a shard.
+    /// Should be used as a heuristic, since it's not always precise: the
+    /// secondary might have not downloaded the new heat map yet and, hence,
+    /// is not aware of the lag.
+    ///
+    /// Returns:
+    /// * Ok(None) if the lag could not be determined from the status,
+    /// * Ok(Some(_)) if the lag could be determined
+    /// * Err on failures to query the pageserver.
+    async fn secondary_lag(
+        &self,
+        secondary: &NodeId,
+        tenant_shard_id: TenantShardId,
+    ) -> Result<Option<u64>, mgmt_api::Error> {
+        let nodes = self.inner.read().unwrap().nodes.clone();
+        let node = nodes.get(secondary).ok_or(mgmt_api::Error::ApiError(
+            StatusCode::NOT_FOUND,
+            format!("Node with id {} not found", secondary),
+        ))?;
+
+        match node
+            .with_client_retries(
+                |client| async move { client.tenant_secondary_status(tenant_shard_id).await },
+                &self.config.jwt_token,
+                1,
+                3,
+                Duration::from_millis(250),
+                &self.cancel,
+            )
+            .await
+        {
+            Some(Ok(status)) => match status.heatmap_mtime {
+                Some(_) => Ok(Some(status.bytes_total - status.bytes_downloaded)),
+                None => Ok(None),
+            },
+            Some(Err(e)) => Err(e),
+            None => Err(mgmt_api::Error::Cancelled),
+        }
+    }
+
+    /// Drain a node by moving the shards attached to it as primaries.
+    /// This is a long running operation and it should run as a separate Tokio task.
+    pub(crate) async fn drain_node(
+        self: &Arc<Self>,
+        node_id: NodeId,
+        cancel: CancellationToken,
+    ) -> Result<(), OperationError> {
+        const MAX_SECONDARY_LAG_BYTES_DEFAULT: u64 = 256 * 1024 * 1024;
+        let max_secondary_lag_bytes = self
+            .config
+            .max_secondary_lag_bytes
+            .unwrap_or(MAX_SECONDARY_LAG_BYTES_DEFAULT);
+
+        // By default, live migrations are generous about the wait time for getting
+        // the secondary location up to speed. When draining, give up earlier in order
+        // to not stall the operation when a cold secondary is encountered.
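+        // The tighter budget is applied through the ReconcilerConfigBuilder just below:
+        // during a drain a secondary gets ~20s to warm up before we give up on it, rather
+        // than the (much longer) default that ordinary live migrations are willing to wait.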
+        const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20);
+        const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5);
+        let reconciler_config = ReconcilerConfigBuilder::new()
+            .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT)
+            .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT)
+            .build();
+
+        let mut waiters = Vec::new();
+
+        let mut tid_iter = TenantShardIterator::new({
+            let service = self.clone();
+            move |last_inspected_shard: Option<TenantShardId>| {
+                let locked = &service.inner.read().unwrap();
+                let tenants = &locked.tenants;
+                let entry = match last_inspected_shard {
+                    Some(skip_past) => {
+                        // Skip to the last seen tenant shard id
+                        let mut cursor = tenants.iter().skip_while(|(tid, _)| **tid != skip_past);
+
+                        // Skip past the last seen
+                        cursor.nth(1)
+                    }
+                    None => tenants.first_key_value(),
+                };
+
+                entry.map(|(tid, _)| tid).copied()
+            }
+        });
+
+        while !tid_iter.finished() {
+            if cancel.is_cancelled() {
+                match self
+                    .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
+                    .await
+                {
+                    Ok(()) => return Err(OperationError::Cancelled),
+                    Err(err) => {
+                        return Err(OperationError::FinalizeError(
+                            format!(
+                                "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}",
+                                node_id, err
+                            )
+                            .into(),
+                        ));
+                    }
+                }
+            }
+
+            drain_utils::validate_node_state(&node_id, self.inner.read().unwrap().nodes.clone())?;
+
+            while waiters.len() < MAX_RECONCILES_PER_OPERATION {
+                let tid = match tid_iter.next() {
+                    Some(tid) => tid,
+                    None => {
+                        break;
+                    }
+                };
+
+                let tid_drain = TenantShardDrain {
+                    drained_node: node_id,
+                    tenant_shard_id: tid,
+                };
+
+                let dest_node_id = {
+                    let locked = self.inner.read().unwrap();
+
+                    match tid_drain
+                        .tenant_shard_eligible_for_drain(&locked.tenants, &locked.scheduler)
+                    {
+                        Some(node_id) => node_id,
+                        None => {
+                            continue;
+                        }
+                    }
+                };
+
+                match self.secondary_lag(&dest_node_id, tid).await {
+                    Ok(Some(lag)) if lag <= max_secondary_lag_bytes => {
+                        // The secondary is reasonably up to date.
+                        // Migrate to it
+                    }
+                    Ok(Some(lag)) => {
+                        tracing::info!(
+                            tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                            "Secondary on node {dest_node_id} is lagging by {lag}. Skipping reconcile."
+                        );
+                        continue;
+                    }
+                    Ok(None) => {
+                        tracing::info!(
+                            tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                            "Could not determine lag for secondary on node {dest_node_id}. Skipping reconcile."
+                        );
+                        continue;
+                    }
+                    Err(err) => {
+                        tracing::warn!(
+                            tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                            "Failed to get secondary lag from node {dest_node_id}. Skipping reconcile: {err}"
+                        );
+                        continue;
+                    }
+                }
+
+                {
+                    let mut locked = self.inner.write().unwrap();
+                    let (nodes, tenants, scheduler) = locked.parts_mut();
+                    let rescheduled = tid_drain.reschedule_to_secondary(
+                        dest_node_id,
+                        tenants,
+                        scheduler,
+                        nodes,
+                    )?;
+
+                    if let Some(tenant_shard) = rescheduled {
+                        let waiter = self.maybe_configured_reconcile_shard(
+                            tenant_shard,
+                            nodes,
+                            reconciler_config,
+                        );
+                        if let Some(some) = waiter {
+                            waiters.push(some);
+                        }
+                    }
+                }
+            }
+
+            waiters = self
+                .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
+                .await;
+
+            failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel);
+        }
+
+        while !waiters.is_empty() {
+            if cancel.is_cancelled() {
+                match self
+                    .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
+                    .await
+                {
+                    Ok(()) => return Err(OperationError::Cancelled),
+                    Err(err) => {
+                        return Err(OperationError::FinalizeError(
+                            format!(
+                                "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}",
+                                node_id, err
+                            )
+                            .into(),
+                        ));
+                    }
+                }
+            }
+
+            tracing::info!("Awaiting {} pending drain reconciliations", waiters.len());
+
+            waiters = self
+                .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
+                .await;
+        }
+
+        // At this point we have done the best we could to drain shards from this node.
+        // Set the node scheduling policy to [`NodeSchedulingPolicy::PauseForRestart`]
+        // to complete the drain.
+        if let Err(err) = self
+            .node_configure(node_id, None, Some(NodeSchedulingPolicy::PauseForRestart))
+            .await
+        {
+            // This is not fatal. Anything that is polling the node scheduling policy to detect
+            // the end of the drain operations will hang, but all such places should enforce an
+            // overall timeout. The scheduling policy will be updated upon node re-attach and/or
+            // by the counterpart fill operation.
+            return Err(OperationError::FinalizeError(
+                format!(
+                    "Failed to finalise drain of {node_id} by setting scheduling policy to PauseForRestart: {err}"
+                )
+                .into(),
+            ));
+        }
+
+        Ok(())
+    }
+
+    /// Create a node fill plan (pick secondaries to promote) that meets the following requirements:
+    /// 1. The node should be filled until it reaches the expected cluster average of
+    ///    attached shards. If there are not enough secondaries on the node, the plan stops early.
+    /// 2. Select tenant shards to promote such that the number of attached shards is balanced
+    ///    throughout the cluster. We achieve this by picking tenant shards from each node,
+    ///    starting from the ones with the largest number of attached shards, until the node
+    ///    reaches the expected cluster average.
+    /// 3. Avoid promoting more shards of the same tenant than required. The upper bound
+    ///    for the number of shards of the same tenant promoted to the node being filled is:
+    ///    shard count for the tenant divided by the number of nodes in the cluster.
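+    ///
+    /// An illustrative example (hypothetical numbers): in a 3-node cluster with 30 attached
+    /// shards in total, the expected average is 10 per node. Filling a freshly restarted node
+    /// promotes secondaries onto it, drawn preferentially from the peers that currently hold
+    /// the most attachments, and stops once the node reaches roughly that average.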
+    fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
+        let mut locked = self.inner.write().unwrap();
+        let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
+
+        let mut tids_by_node = locked
+            .tenants
+            .iter_mut()
+            .filter_map(|(tid, tenant_shard)| {
+                if tenant_shard.intent.get_secondary().contains(&node_id) {
+                    if let Some(primary) = tenant_shard.intent.get_attached() {
+                        return Some((*primary, *tid));
+                    }
+                }
+
+                None
+            })
+            .into_group_map();
+
+        let expected_attached = locked.scheduler.expected_attached_shard_count();
+        let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();
+
+        let mut promoted_per_tenant: HashMap<TenantId, usize> = HashMap::new();
+        let mut plan = Vec::new();
+
+        for (node_id, attached) in nodes_by_load {
+            let available = locked
+                .nodes
+                .get(&node_id)
+                .map_or(false, |n| n.is_available());
+            if !available {
+                continue;
+            }
+
+            if plan.len() >= fill_requirement
+                || tids_by_node.is_empty()
+                || attached <= expected_attached
+            {
+                break;
+            }
+
+            let can_take = attached - expected_attached;
+            let needed = fill_requirement - plan.len();
+            let mut take = std::cmp::min(can_take, needed);
+
+            let mut remove_node = false;
+            while take > 0 {
+                match tids_by_node.get_mut(&node_id) {
+                    Some(tids) => match tids.pop() {
+                        Some(tid) => {
+                            let max_promote_for_tenant = std::cmp::max(
+                                tid.shard_count.count() as usize / locked.nodes.len(),
+                                1,
+                            );
+                            let promoted = promoted_per_tenant.entry(tid.tenant_id).or_default();
+                            if *promoted < max_promote_for_tenant {
+                                plan.push(tid);
+                                *promoted += 1;
+                                take -= 1;
+                            }
+                        }
+                        None => {
+                            remove_node = true;
+                            break;
+                        }
+                    },
+                    None => {
+                        break;
+                    }
+                }
+            }
+
+            if remove_node {
+                tids_by_node.remove(&node_id);
+            }
+        }
+
+        plan
+    }
+
+    /// Fill a node by promoting its secondaries until the cluster is balanced
+    /// with regard to attached shard counts. Note that this operation only
+    /// makes sense as a counterpart to the drain implemented in [`Service::drain_node`].
+    /// This is a long running operation and it should run as a separate Tokio task.
+    pub(crate) async fn fill_node(
+        &self,
+        node_id: NodeId,
+        cancel: CancellationToken,
+    ) -> Result<(), OperationError> {
+        const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20);
+        const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5);
+        let reconciler_config = ReconcilerConfigBuilder::new()
+            .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT)
+            .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT)
+            .build();
+
+        let mut tids_to_promote = self.fill_node_plan(node_id);
+        let mut waiters = Vec::new();
+
+        // Execute the plan we've composed above. Before applying each move from the plan,
+        // we validate to ensure that it has not gone stale in the meantime.
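+        // "Stale" here covers, e.g., the node having left the Filling state, or the shard no
+        // longer having a secondary on this node; both checks appear inside the loop below.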
+        while !tids_to_promote.is_empty() {
+            if cancel.is_cancelled() {
+                match self
+                    .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
+                    .await
+                {
+                    Ok(()) => return Err(OperationError::Cancelled),
+                    Err(err) => {
+                        return Err(OperationError::FinalizeError(
+                            format!(
+                                "Failed to finalise fill cancel of {} by setting scheduling policy to Active: {}",
+                                node_id, err
+                            )
+                            .into(),
+                        ));
+                    }
+                }
+            }
+
+            {
+                let mut locked = self.inner.write().unwrap();
+                let (nodes, tenants, scheduler) = locked.parts_mut();
+
+                let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged(
+                    format!("node {node_id} was removed").into(),
+                ))?;
+
+                let current_policy = node.get_scheduling();
+                if !matches!(current_policy, NodeSchedulingPolicy::Filling) {
+                    // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think
+                    // about it
+                    return Err(OperationError::NodeStateChanged(
+                        format!("node {node_id} changed state to {current_policy:?}").into(),
+                    ));
+                }
+
+                while waiters.len() < MAX_RECONCILES_PER_OPERATION {
+                    if let Some(tid) = tids_to_promote.pop() {
+                        if let Some(tenant_shard) = tenants.get_mut(&tid) {
+                            // If the node being filled is not a secondary anymore,
+                            // skip the promotion.
+                            if !tenant_shard.intent.get_secondary().contains(&node_id) {
+                                continue;
+                            }
+
+                            let previously_attached_to = *tenant_shard.intent.get_attached();
+                            match tenant_shard.reschedule_to_secondary(Some(node_id), scheduler) {
+                                Err(e) => {
+                                    tracing::warn!(
+                                        tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                                        "Scheduling error when filling pageserver {} : {e}", node_id
+                                    );
+                                }
+                                Ok(()) => {
+                                    tracing::info!(
+                                        tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                                        "Rescheduled shard while filling node {}: {:?} -> {}",
+                                        node_id,
+                                        previously_attached_to,
+                                        node_id
+                                    );
+
+                                    if let Some(waiter) = self.maybe_configured_reconcile_shard(
+                                        tenant_shard,
+                                        nodes,
+                                        reconciler_config,
+                                    ) {
+                                        waiters.push(waiter);
+                                    }
+                                }
+                            }
+                        }
+                    } else {
+                        break;
+                    }
+                }
+            }
+
+            waiters = self
+                .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
+                .await;
+        }
+
+        while !waiters.is_empty() {
+            if cancel.is_cancelled() {
+                match self
+                    .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
+                    .await
+                {
+                    Ok(()) => return Err(OperationError::Cancelled),
+                    Err(err) => {
+                        return Err(OperationError::FinalizeError(
+                            format!(
+                                "Failed to finalise fill cancel of {} by setting scheduling policy to Active: {}",
+                                node_id, err
+                            )
+                            .into(),
+                        ));
+                    }
+                }
+            }
+
+            tracing::info!("Awaiting {} pending fill reconciliations", waiters.len());
+
+            waiters = self
+                .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
+                .await;
+        }
+
+        if let Err(err) = self
+            .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
+            .await
+        {
+            // This isn't a huge issue since the filling process starts upon request. However, it
+            // will prevent the next drain from starting. The only case in which this can fail
+            // is database unavailability. Such a case will require manual intervention.
+            return Err(OperationError::FinalizeError(
+                format!("Failed to finalise fill of {node_id} by setting scheduling policy to Active: {err}")
+                    .into(),
+            ));
+        }
+
+        Ok(())
+    }
+
+    /// Updates scrubber metadata health check results.
+    pub(crate) async fn metadata_health_update(
+        &self,
+        update_req: MetadataHealthUpdateRequest,
+    ) -> Result<(), ApiError> {
+        let now = chrono::offset::Utc::now();
+        let (healthy_records, unhealthy_records) = {
+            let locked = self.inner.read().unwrap();
+            let healthy_records = update_req
+                .healthy_tenant_shards
+                .into_iter()
+                // Retain only health records associated with tenant shards managed by storage controller.
+                .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
+                .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, true, now))
+                .collect();
+            let unhealthy_records = update_req
+                .unhealthy_tenant_shards
+                .into_iter()
+                .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
+                .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, false, now))
+                .collect();
+
+            (healthy_records, unhealthy_records)
+        };
+
+        self.persistence
+            .update_metadata_health_records(healthy_records, unhealthy_records, now)
+            .await?;
+        Ok(())
+    }
+
+    /// Lists the tenant shards that have an unhealthy metadata status.
+    pub(crate) async fn metadata_health_list_unhealthy(
+        &self,
+    ) -> Result<Vec<TenantShardId>, ApiError> {
+        let result = self
+            .persistence
+            .list_unhealthy_metadata_health_records()
+            .await?
+            .iter()
+            .map(|p| p.get_tenant_shard_id().unwrap())
+            .collect();
+
+        Ok(result)
+    }
+
+    /// Lists the tenant shards that have not been scrubbed for some duration.
+    pub(crate) async fn metadata_health_list_outdated(
+        &self,
+        not_scrubbed_for: Duration,
+    ) -> Result<Vec<MetadataHealthRecord>, ApiError> {
+        let earlier = chrono::offset::Utc::now() - not_scrubbed_for;
+        let result = self
+            .persistence
+            .list_outdated_metadata_health_records(earlier)
+            .await?
+            .into_iter()
+            .map(|record| record.into())
+            .collect();
+        Ok(result)
+    }
+
+    pub(crate) fn get_leadership_status(&self) -> LeadershipStatus {
+        self.inner.read().unwrap().get_leadership_status()
+    }
+
+    pub(crate) async fn step_down(&self) -> GlobalObservedState {
+        tracing::info!("Received step down request from peer");
+        failpoint_support::sleep_millis_async!("sleep-on-step-down-handling");
+
+        self.inner.write().unwrap().step_down();
+        // TODO: would it make sense to have a time-out for this?
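+        // The step-down sequence is: mark ourselves as no longer leader (above), drain all
+        // reconciliations, then snapshot our observed state so the new leader can seed its
+        // own in-memory state from it (the GlobalObservedState returned below).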
+        self.stop_reconciliations(StopReconciliationsReason::SteppingDown)
+            .await;
+
+        let mut global_observed = GlobalObservedState::default();
+        let locked = self.inner.read().unwrap();
+        for (tid, tenant_shard) in locked.tenants.iter() {
+            global_observed
+                .0
+                .insert(*tid, tenant_shard.observed.clone());
+        }
+
+        global_observed
+    }
+
+    pub(crate) async fn get_safekeeper(
+        &self,
+        id: i64,
+    ) -> Result<crate::persistence::SafekeeperPersistence, DatabaseError> {
+        self.persistence.safekeeper_get(id).await
+    }
+
+    pub(crate) async fn upsert_safekeeper(
+        &self,
+        record: crate::persistence::SafekeeperPersistence,
+    ) -> Result<(), DatabaseError> {
+        self.persistence.safekeeper_upsert(record).await
+    }
+
+    pub(crate) async fn update_shards_preferred_azs(
+        &self,
+        req: ShardsPreferredAzsRequest,
+    ) -> Result<ShardsPreferredAzsResponse, ApiError> {
+        let preferred_azs = req.preferred_az_ids.into_iter().collect::<Vec<_>>();
+        let updated = self
+            .persistence
+            .set_tenant_shard_preferred_azs(preferred_azs)
+            .await
+            .map_err(|err| {
+                ApiError::InternalServerError(anyhow::anyhow!(
+                    "Failed to persist preferred AZs: {err}"
+                ))
+            })?;
+
+        let mut updated_in_mem_and_db = Vec::default();
+
+        let mut locked = self.inner.write().unwrap();
+        for (tid, az_id) in updated {
+            let shard = locked.tenants.get_mut(&tid);
+            if let Some(shard) = shard {
+                shard.set_preferred_az(az_id);
+                updated_in_mem_and_db.push(tid);
+            }
+        }
+
+        Ok(ShardsPreferredAzsResponse {
+            updated: updated_in_mem_and_db,
+        })
+    }
+}
diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs
new file mode 100644
index 0000000000..99961d691c
--- /dev/null
+++ b/storage_controller/src/service/chaos_injector.rs
@@ -0,0 +1,71 @@
+use std::{sync::Arc, time::Duration};
+
+use rand::seq::SliceRandom;
+use rand::thread_rng;
+use tokio_util::sync::CancellationToken;
+
+use super::Service;
+
+pub struct ChaosInjector {
+    service: Arc<Service>,
+    interval: Duration,
+}
+
+impl ChaosInjector {
+    pub fn new(service: Arc<Service>, interval: Duration) -> Self {
+        Self { service, interval }
+    }
+
+    pub async fn run(&mut self, cancel: CancellationToken) {
+        let mut interval = tokio::time::interval(self.interval);
+
+        loop {
+            tokio::select! {
+                _ = interval.tick() => {}
+                _ = cancel.cancelled() => {
+                    tracing::info!("Shutting down");
+                    return;
+                }
+            }
+
+            self.inject_chaos().await;
+
+            tracing::info!("Chaos iteration...");
+        }
+    }
+
+    async fn inject_chaos(&mut self) {
+        // Pick some shards to interfere with
+        let batch_size = 128;
+        let mut inner = self.service.inner.write().unwrap();
+        let (nodes, tenants, scheduler) = inner.parts_mut();
+        let tenant_ids = tenants.keys().cloned().collect::<Vec<_>>();
+        let victims = tenant_ids.choose_multiple(&mut thread_rng(), batch_size);
+
+        for victim in victims {
+            let shard = tenants
+                .get_mut(victim)
+                .expect("Held lock between choosing ID and this get");
+
+            // Pick a secondary to promote
+            let Some(new_location) = shard
+                .intent
+                .get_secondary()
+                .choose(&mut thread_rng())
+                .cloned()
+            else {
+                tracing::info!("Skipping shard {victim}: no secondary location, can't migrate");
+                continue;
+            };
+
+            let Some(old_location) = *shard.intent.get_attached() else {
+                tracing::info!("Skipping shard {victim}: currently has no attached location");
+                continue;
+            };
+
+            shard.intent.demote_attached(scheduler, old_location);
+            shard.intent.promote_attached(scheduler, new_location);
+            self.service.maybe_reconcile_shard(shard, nodes);
+        }
+    }
+}
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
new file mode 100644
index 0000000000..cdb0633e2b
--- /dev/null
+++ b/storage_controller/src/tenant_shard.rs
@@ -0,0 +1,1733 @@
+use std::{
+    collections::{HashMap, HashSet},
+    sync::Arc,
+    time::Duration,
+};
+
+use crate::{
+    metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
+    persistence::TenantShardPersistence,
+    reconciler::{ReconcileUnits, ReconcilerConfig},
+    scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext},
+    service::ReconcileResultRequest,
+};
+use pageserver_api::controller_api::{
+    NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy,
+};
+use pageserver_api::{
+    models::{LocationConfig, LocationConfigMode, TenantConfig},
+    shard::{ShardIdentity, TenantShardId},
+};
+use serde::{Deserialize, Serialize};
+use tokio::task::JoinHandle;
+use tokio_util::sync::CancellationToken;
+use tracing::{instrument, Instrument};
+use utils::{
+    generation::Generation,
+    id::NodeId,
+    seqwait::{SeqWait, SeqWaitError},
+    sync::gate::GateGuard,
+};
+
+use crate::{
+    compute_hook::ComputeHook,
+    node::Node,
+    persistence::{split_state::SplitState, Persistence},
+    reconciler::{
+        attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState,
+    },
+    scheduler::{ScheduleError, Scheduler},
+    service, Sequence,
+};
+
+/// Serialization helper
+fn read_last_error<S, T>(
+    v: &std::sync::Mutex<Option<T>>,
+    serializer: S,
+) -> Result<S::Ok, S::Error>
+where
+    S: serde::ser::Serializer,
+    T: std::fmt::Display,
+{
+    serializer.collect_str(
+        &v.lock()
+            .unwrap()
+            .as_ref()
+            .map(|e| format!("{e}"))
+            .unwrap_or("".to_string()),
+    )
+}
+
+/// In-memory state for a particular tenant shard.
+///
+/// This struct implements Serialize for debugging purposes, but is _not_ persisted
+/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
+#[derive(Serialize)]
+pub(crate) struct TenantShard {
+    pub(crate) tenant_shard_id: TenantShardId,
+
+    pub(crate) shard: ShardIdentity,
+
+    // Runtime only: sequence used to coordinate updates to this object while
+    // background reconcilers may be running. A reconciler runs to a particular
+    // sequence.
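+    //
+    // For example (hypothetical values): if a reconciler was spawned at sequence 5 and the
+    // intent then changes, bumping the sequence to 6, the sequence-5 result can be recognised
+    // as stale when it arrives, and waiters on sequence 6 are only satisfied by a new run.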
+    pub(crate) sequence: Sequence,
+
+    // Latest generation number: next time we attach, increment this
+    // and use the incremented number when attaching.
+    //
+    // None represents an incompletely onboarded tenant via the [`Service::location_config`]
+    // API, where this tenant may only run in PlacementPolicy::Secondary.
+    pub(crate) generation: Option<Generation>,
+
+    // High level description of how the tenant should be set up. Provided
+    // externally.
+    pub(crate) policy: PlacementPolicy,
+
+    // Low level description of exactly which pageservers should fulfil
+    // which role. Generated by `Self::schedule`.
+    pub(crate) intent: IntentState,
+
+    // Low level description of how the tenant is configured on pageservers:
+    // if this does not match `Self::intent` then the tenant needs reconciliation
+    // with `Self::reconcile`.
+    pub(crate) observed: ObservedState,
+
+    // Tenant configuration, passed through opaquely to the pageserver. Identical
+    // for all shards in a tenant.
+    pub(crate) config: TenantConfig,
+
+    /// If a reconcile task is currently in flight, it may be joined here (it is
+    /// only safe to join if either the result has been received or the reconciler's
+    /// cancellation token has been fired)
+    #[serde(skip)]
+    pub(crate) reconciler: Option<ReconcilerHandle>,
+
+    /// If a tenant is being split, then all shards with that TenantId will have a
+    /// SplitState set, this acts as a guard against other operations such as background
+    /// reconciliation, and timeline creation.
+    pub(crate) splitting: SplitState,
+
+    /// If a tenant was enqueued for later reconcile due to hitting concurrency limit, this flag
+    /// is set. This flag is cleared when the tenant is popped off the delay queue.
+    pub(crate) delayed_reconcile: bool,
+
+    /// Optionally wait for reconciliation to complete up to a particular
+    /// sequence number.
+    #[serde(skip)]
+    pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
+
+    /// Indicates sequence number for which we have encountered an error reconciling. If
+    /// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred,
+    /// and callers should stop waiting for `waiter` and propagate the error.
+    #[serde(skip)]
+    pub(crate) error_waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
+
+    /// The most recent error from a reconcile on this tenant. This is a nested Arc
+    /// because:
+    /// - ReconcileWaiters need to Arc-clone the overall object to read it later
+    /// - ReconcileWaitError needs to use an `Arc<ReconcileError>` because we can construct
+    ///   many waiters for one shard, and the underlying error types are not Clone.
+    ///
+    /// TODO: generalize to an array of recent events
+    /// TODO: use an ArcSwap instead of a mutex for faster reads?
+    #[serde(serialize_with = "read_last_error")]
+    pub(crate) last_error: std::sync::Arc<std::sync::Mutex<Option<Arc<ReconcileError>>>>,
+
+    /// If we have a pending compute notification that for some reason we weren't able to send,
+    /// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes
+    /// and trigger a Reconciler run. This is the mechanism by which compute notifications are included in the scope
+    /// of state that we publish externally in an eventually consistent way.
+    pub(crate) pending_compute_notification: bool,
+
+    // Support/debug tool: if something is going wrong or flapping with scheduling, this may
+    // be set to a non-active state to avoid making changes while the issue is fixed.
+    scheduling_policy: ShardSchedulingPolicy,
+
+    // We should attempt to schedule this shard in the provided AZ to
+    // decrease chances of cross-AZ compute.
+    preferred_az_id: Option<String>,
+}
+
+#[derive(Default, Clone, Debug, Serialize)]
+pub(crate) struct IntentState {
+    attached: Option<NodeId>,
+    secondary: Vec<NodeId>,
+}
+
+impl IntentState {
+    pub(crate) fn new() -> Self {
+        Self {
+            attached: None,
+            secondary: vec![],
+        }
+    }
+    pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option<NodeId>) -> Self {
+        if let Some(node_id) = node_id {
+            scheduler.update_node_ref_counts(node_id, RefCountUpdate::Attach);
+        }
+        Self {
+            attached: node_id,
+            secondary: vec![],
+        }
+    }
+
+    pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option<NodeId>) {
+        if self.attached != new_attached {
+            if let Some(old_attached) = self.attached.take() {
+                scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach);
+            }
+            if let Some(new_attached) = &new_attached {
+                scheduler.update_node_ref_counts(*new_attached, RefCountUpdate::Attach);
+            }
+            self.attached = new_attached;
+        }
+    }
+
+    /// Like set_attached, but the node is from [`Self::secondary`]. This swaps the node from
+    /// secondary to attached while maintaining the scheduler's reference counts.
+    pub(crate) fn promote_attached(
+        &mut self,
+        scheduler: &mut Scheduler,
+        promote_secondary: NodeId,
+    ) {
+        // If we call this with a node that isn't in secondary, it would cause incorrect
+        // scheduler reference counting, since we assume the node is already referenced as a secondary.
+        debug_assert!(self.secondary.contains(&promote_secondary));
+
+        self.secondary.retain(|n| n != &promote_secondary);
+
+        let demoted = self.attached;
+        self.attached = Some(promote_secondary);
+
+        scheduler.update_node_ref_counts(promote_secondary, RefCountUpdate::PromoteSecondary);
+        if let Some(demoted) = demoted {
+            scheduler.update_node_ref_counts(demoted, RefCountUpdate::DemoteAttached);
+        }
+    }
+
+    pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) {
+        debug_assert!(!self.secondary.contains(&new_secondary));
+        scheduler.update_node_ref_counts(new_secondary, RefCountUpdate::AddSecondary);
+        self.secondary.push(new_secondary);
+    }
+
+    /// It is legal to call this with a node that is not currently a secondary: that is a no-op
+    pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) {
+        let index = self.secondary.iter().position(|n| *n == node_id);
+        if let Some(index) = index {
+            scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary);
+            self.secondary.remove(index);
+        }
+    }
+
+    pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) {
+        for secondary in self.secondary.drain(..) {
+            scheduler.update_node_ref_counts(secondary, RefCountUpdate::RemoveSecondary);
+        }
+    }
+
+    /// Remove the last secondary node from the list of secondaries
+    pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) {
+        if let Some(node_id) = self.secondary.pop() {
+            scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary);
+        }
+    }
+
+    pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
+        if let Some(old_attached) = self.attached.take() {
+            scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach);
+        }
+
+        self.clear_secondary(scheduler);
+    }
+
+    pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
+        let mut result = Vec::new();
+        if let Some(p) = self.attached {
+            result.push(p)
+        }
+
+        result.extend(self.secondary.iter().copied());
+
+        result
+    }
+
+    pub(crate) fn get_attached(&self) -> &Option<NodeId> {
+        &self.attached
+    }
+
+    pub(crate) fn get_secondary(&self) -> &Vec<NodeId> {
+        &self.secondary
+    }
+
+    /// If the node is in use as the attached location, demote it into
+    /// the list of secondary locations. This is used when a node goes offline,
+    /// and we want to use a different node for attachment, but not permanently
+    /// forget the location on the offline node.
+    ///
+    /// Returns true if a change was made
+    pub(crate) fn demote_attached(&mut self, scheduler: &mut Scheduler, node_id: NodeId) -> bool {
+        if self.attached == Some(node_id) {
+            self.attached = None;
+            self.secondary.push(node_id);
+            scheduler.update_node_ref_counts(node_id, RefCountUpdate::DemoteAttached);
+            true
+        } else {
+            false
+        }
+    }
+}
+
+impl Drop for IntentState {
+    fn drop(&mut self) {
+        // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler.
+        // We do not check this while panicking, to avoid polluting unit test failures or
+        // other assertions with this assertion's output. It's still wrong to leak these,
+        // but if we already have a panic then we don't need to independently flag this case.
+        if !(std::thread::panicking()) {
+            debug_assert!(self.attached.is_none() && self.secondary.is_empty());
+        }
+    }
+}
+
+#[derive(Default, Clone, Serialize, Deserialize, Debug)]
+pub(crate) struct ObservedState {
+    pub(crate) locations: HashMap<NodeId, ObservedStateLocation>,
+}
+
+/// Our latest knowledge of how this tenant is configured in the outside world.
+///
+/// Meaning:
+/// * No instance of this type exists for a node: we are certain that we have nothing configured on that
+///   node for this shard.
+/// * Instance exists with conf==None: we *might* have some state on that node, but we don't know
+///   what it is (e.g. we failed partway through configuring it)
+/// * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
+///   and that configuration will still be present unless something external interfered.
+#[derive(Clone, Serialize, Deserialize, Debug)]
+pub(crate) struct ObservedStateLocation {
+    /// If None, it means we do not know the status of this shard's location on this node, but
+    /// we know that we might have some state on this node.
+    pub(crate) conf: Option<LocationConfig>,
+}
+pub(crate) struct ReconcilerWaiter {
+    // For observability purposes, remember the ID of the shard we're
+    // waiting for.
+    pub(crate) tenant_shard_id: TenantShardId,
+
+    seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
+    error_seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
+    error: std::sync::Arc<std::sync::Mutex<Option<Arc<ReconcileError>>>>,
+    seq: Sequence,
+}
+
+pub(crate) enum ReconcilerStatus {
+    Done,
+    Failed,
+    InProgress,
+}
+
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum ReconcileWaitError {
+    #[error("Timeout waiting for shard {0}")]
+    Timeout(TenantShardId),
+    #[error("shutting down")]
+    Shutdown,
+    #[error("Reconcile error on shard {0}: {1}")]
+    Failed(TenantShardId, Arc<ReconcileError>),
+}
+
+#[derive(Eq, PartialEq, Debug)]
+pub(crate) struct ReplaceSecondary {
+    old_node_id: NodeId,
+    new_node_id: NodeId,
+}
+
+#[derive(Eq, PartialEq, Debug)]
+pub(crate) struct MigrateAttachment {
+    pub(crate) old_attached_node_id: NodeId,
+    pub(crate) new_attached_node_id: NodeId,
+}
+
+#[derive(Eq, PartialEq, Debug)]
+pub(crate) enum ScheduleOptimizationAction {
+    // Replace one of our secondary locations with a different node
+    ReplaceSecondary(ReplaceSecondary),
+    // Migrate attachment to an existing secondary location
+    MigrateAttachment(MigrateAttachment),
+}
+
+#[derive(Eq, PartialEq, Debug)]
+pub(crate) struct ScheduleOptimization {
+    // What was the reconcile sequence when we generated this optimization? The optimization
+    // should only be applied if the shard's sequence is still at this value, in case other changes
+    // happened between planning the optimization and applying it.
+    sequence: Sequence,
+
+    pub(crate) action: ScheduleOptimizationAction,
+}
+
+impl ReconcilerWaiter {
+    pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
+        tokio::select! {
+            result = self.seq_wait.wait_for_timeout(self.seq, timeout) => {
+                result.map_err(|e| match e {
+                    SeqWaitError::Timeout => ReconcileWaitError::Timeout(self.tenant_shard_id),
+                    SeqWaitError::Shutdown => ReconcileWaitError::Shutdown
+                })?;
+            },
+            result = self.error_seq_wait.wait_for(self.seq) => {
+                result.map_err(|e| match e {
+                    SeqWaitError::Shutdown => ReconcileWaitError::Shutdown,
+                    SeqWaitError::Timeout => unreachable!()
+                })?;
+
+                return Err(ReconcileWaitError::Failed(self.tenant_shard_id,
+                    self.error.lock().unwrap().clone().expect("If error_seq_wait was advanced error was set")))
+            }
+        }
+
+        Ok(())
+    }
+
+    pub(crate) fn get_status(&self) -> ReconcilerStatus {
+        if self.seq_wait.would_wait_for(self.seq).is_ok() {
+            ReconcilerStatus::Done
+        } else if self.error_seq_wait.would_wait_for(self.seq).is_ok() {
+            ReconcilerStatus::Failed
+        } else {
+            ReconcilerStatus::InProgress
+        }
+    }
+}
+
+/// Having spawned a reconciler task, the tenant shard's state will carry enough
+/// information to optionally cancel & await it later.
+pub(crate) struct ReconcilerHandle {
+    sequence: Sequence,
+    handle: JoinHandle<()>,
+    cancel: CancellationToken,
+}
+
+pub(crate) enum ReconcileNeeded {
+    /// shard either doesn't need reconciliation, or is forbidden from spawning a reconciler
+    /// in its current state (e.g. shard split in progress, or ShardSchedulingPolicy forbids it)
+    No,
+    /// shard has a reconciler running, and its intent hasn't changed since that one was
+    /// spawned: wait for the existing reconciler rather than spawning a new one.
+    WaitExisting(ReconcilerWaiter),
+    /// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`]
+    Yes,
+}
+
+/// When a reconcile task completes, it sends this result object
+/// to be applied to the primary TenantShard.
+pub(crate) struct ReconcileResult {
+    pub(crate) sequence: Sequence,
+    /// On errors, `observed` should be treated as an incomplete description
+    /// of state (i.e. any nodes present in the result should override nodes
+    /// present in the parent tenant state, but any unmentioned nodes should
+    /// not be removed from parent tenant state)
+    pub(crate) result: Result<(), ReconcileError>,
+
+    pub(crate) tenant_shard_id: TenantShardId,
+    pub(crate) generation: Option<Generation>,
+    pub(crate) observed: ObservedState,
+
+    /// Set [`TenantShard::pending_compute_notification`] from this flag
+    pub(crate) pending_compute_notification: bool,
+}
+
+impl ObservedState {
+    pub(crate) fn new() -> Self {
+        Self {
+            locations: HashMap::new(),
+        }
+    }
+}
+
+impl TenantShard {
+    pub(crate) fn new(
+        tenant_shard_id: TenantShardId,
+        shard: ShardIdentity,
+        policy: PlacementPolicy,
+    ) -> Self {
+        Self {
+            tenant_shard_id,
+            policy,
+            intent: IntentState::default(),
+            generation: Some(Generation::new(0)),
+            shard,
+            observed: ObservedState::default(),
+            config: TenantConfig::default(),
+            reconciler: None,
+            splitting: SplitState::Idle,
+            sequence: Sequence(1),
+            delayed_reconcile: false,
+            waiter: Arc::new(SeqWait::new(Sequence(0))),
+            error_waiter: Arc::new(SeqWait::new(Sequence(0))),
+            last_error: Arc::default(),
+            pending_compute_notification: false,
+            scheduling_policy: ShardSchedulingPolicy::default(),
+            preferred_az_id: None,
+        }
+    }
+
+    /// For use on startup when learning state from pageservers: generate my [`IntentState`] from my
+    /// [`ObservedState`], even if it violates my [`PlacementPolicy`]. Call [`Self::schedule`] next,
+    /// to get an intent state that complies with placement policy. The overall goal is to do scheduling
+    /// in a way that makes use of any configured locations that already exist in the outside world.
+    pub(crate) fn intent_from_observed(&mut self, scheduler: &mut Scheduler) {
+        // Choose an attached location by filtering observed locations, and then sorting to get the highest
+        // generation
+        let mut attached_locs = self
+            .observed
+            .locations
+            .iter()
+            .filter_map(|(node_id, l)| {
+                if let Some(conf) = &l.conf {
+                    if conf.mode == LocationConfigMode::AttachedMulti
+                        || conf.mode == LocationConfigMode::AttachedSingle
+                        || conf.mode == LocationConfigMode::AttachedStale
+                    {
+                        Some((node_id, conf.generation))
+                    } else {
+                        None
+                    }
+                } else {
+                    None
+                }
+            })
+            .collect::<Vec<_>>();
+
+        attached_locs.sort_by_key(|i| i.1);
+        if let Some((node_id, _gen)) = attached_locs.into_iter().last() {
+            self.intent.set_attached(scheduler, Some(*node_id));
+        }
+
+        // All remaining observed locations generate secondary intents. This includes None
+        // observations, as these may well have some local content on disk that is usable (this
+        // is an edge case that might occur if we restarted during a migration or other change)
+        //
+        // We may leave intent.attached empty if we didn't find any attached locations: [`Self::schedule`]
+        // will take care of promoting one of these secondaries to be attached.
+        self.observed.locations.keys().for_each(|node_id| {
+            if Some(*node_id) != self.intent.attached {
+                self.intent.push_secondary(scheduler, *node_id);
+            }
+        });
+    }
+
+    /// Part of [`Self::schedule`] that is used to choose exactly one node to act as the
+    /// attached pageserver for a shard.
+    ///
+    /// Returns whether we modified it, and the NodeId selected.
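+    ///
+    /// (Sketch of the behaviour, with hypothetical node IDs: an unattached shard that already
+    /// has a warm secondary on node 3 will promote node 3, rather than asking the scheduler
+    /// for a brand-new location and paying a cold start.)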
+    fn schedule_attached(
+        &mut self,
+        scheduler: &mut Scheduler,
+        context: &ScheduleContext,
+    ) -> Result<(bool, NodeId), ScheduleError> {
+        // No work to do if we already have an attached tenant
+        if let Some(node_id) = self.intent.attached {
+            return Ok((false, node_id));
+        }
+
+        if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) {
+            // Promote a secondary
+            tracing::debug!("Promoted secondary {} to attached", promote_secondary);
+            self.intent.promote_attached(scheduler, promote_secondary);
+            Ok((true, promote_secondary))
+        } else {
+            // Pick a fresh node: either we had no secondaries or none were schedulable
+            let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?;
+            tracing::debug!("Selected {} as attached", node_id);
+            self.intent.set_attached(scheduler, Some(node_id));
+            Ok((true, node_id))
+        }
+    }
+
+    pub(crate) fn schedule(
+        &mut self,
+        scheduler: &mut Scheduler,
+        context: &mut ScheduleContext,
+    ) -> Result<(), ScheduleError> {
+        let r = self.do_schedule(scheduler, context);
+
+        context.avoid(&self.intent.all_pageservers());
+        if let Some(attached) = self.intent.get_attached() {
+            context.push_attached(*attached);
+        }
+
+        r
+    }
+
+    pub(crate) fn do_schedule(
+        &mut self,
+        scheduler: &mut Scheduler,
+        context: &ScheduleContext,
+    ) -> Result<(), ScheduleError> {
+        // TODO: before scheduling new nodes, check if any existing content in
+        // self.intent refers to pageservers that are offline, and pick other
+        // pageservers if so.
+
+        // TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
+        // change their attach location.
+
+        match self.scheduling_policy {
+            ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {}
+            ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => {
+                // Warn to make it obvious why other things aren't happening/working, if we skip scheduling
+                tracing::warn!(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
+                    "Scheduling is disabled by policy {:?}", self.scheduling_policy);
+                return Ok(());
+            }
+        }
+
+        // Build the set of pageservers already in use by this tenant, to avoid scheduling
+        // more work on the same pageservers we're already using.
+        let mut modified = false;
+
+        // Add/remove nodes to fulfil policy
+        use PlacementPolicy::*;
+        match self.policy {
+            Attached(secondary_count) => {
+                let retain_secondaries = if self.intent.attached.is_none()
+                    && scheduler.node_preferred(&self.intent.secondary).is_some()
+                {
+                    // If we have no attached, and one of the secondaries is eligible to be promoted, retain
+                    // one more secondary than we usually would, as one of them will become attached further down this function.
+                    secondary_count + 1
+                } else {
+                    secondary_count
+                };
+
+                while self.intent.secondary.len() > retain_secondaries {
+                    // We have no particular preference for one secondary location over another: just
+                    // arbitrarily drop from the end
+                    self.intent.pop_secondary(scheduler);
+                    modified = true;
+                }
+
+                // Should have exactly one attached, and N secondaries
+                let (modified_attached, attached_node_id) =
+                    self.schedule_attached(scheduler, context)?;
+                modified |= modified_attached;
+
+                let mut used_pageservers = vec![attached_node_id];
+                while self.intent.secondary.len() < secondary_count {
+                    let node_id = scheduler.schedule_shard(&used_pageservers, context)?;
+                    self.intent.push_secondary(scheduler, node_id);
+                    used_pageservers.push(node_id);
+                    modified = true;
+                }
+            }
+            Secondary => {
+                if let Some(node_id) = self.intent.get_attached() {
+                    // Populate secondary by demoting the attached node
+                    self.intent.demote_attached(scheduler, *node_id);
+                    modified = true;
+                } else if self.intent.secondary.is_empty() {
+                    // Populate secondary by scheduling a fresh node
+                    let node_id = scheduler.schedule_shard(&[], context)?;
+                    self.intent.push_secondary(scheduler, node_id);
+                    modified = true;
+                }
+                while self.intent.secondary.len() > 1 {
+                    // We have no particular preference for one secondary location over another: just
+                    // arbitrarily drop from the end
+                    self.intent.pop_secondary(scheduler);
+                    modified = true;
+                }
+            }
+            Detached => {
+                // Never add locations in this mode
+                if self.intent.get_attached().is_some() || !self.intent.get_secondary().is_empty() {
+                    self.intent.clear(scheduler);
+                    modified = true;
+                }
+            }
+        }
+
+        if modified {
+            self.sequence.0 += 1;
+        }
+
+        Ok(())
+    }
+
+    /// Reschedule this tenant shard to one of its secondary locations. Returns a scheduling error
+    /// if the swap is not possible and leaves the intent state in its original state.
+    ///
+    /// Arguments:
+    /// `promote_to`: an optional secondary location of this tenant shard. If set to None, we ask
+    /// the scheduler to recommend a node
+    pub(crate) fn reschedule_to_secondary(
+        &mut self,
+        promote_to: Option<NodeId>,
+        scheduler: &mut Scheduler,
+    ) -> Result<(), ScheduleError> {
+        let promote_to = match promote_to {
+            Some(node) => node,
+            None => match scheduler.node_preferred(self.intent.get_secondary()) {
+                Some(node) => node,
+                None => {
+                    return Err(ScheduleError::ImpossibleConstraint);
+                }
+            },
+        };
+
+        assert!(self.intent.get_secondary().contains(&promote_to));
+
+        if let Some(node) = self.intent.get_attached() {
+            let demoted = self.intent.demote_attached(scheduler, *node);
+            if !demoted {
+                return Err(ScheduleError::ImpossibleConstraint);
+            }
+        }
+
+        self.intent.promote_attached(scheduler, promote_to);
+
+        // Increment the sequence number for the edge case where a
+        // reconciler is already running to avoid waiting on the
+        // current reconcile instead of spawning a new one.
+        self.sequence = self.sequence.next();
+
+        Ok(())
+    }
+
+    /// Optimize attachments: if a shard has a secondary location that is preferable to
+    /// its primary location based on soft constraints, switch that secondary location
+    /// to be attached.
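+    ///
+    /// A hysteresis example with made-up scores: an attachment on a node with affinity score 2
+    /// only migrates to a secondary scoring 0, not to one scoring 1, because the candidate must
+    /// beat the current location by more than AffinityScore(1) to avoid flapping back.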
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
+    pub(crate) fn optimize_attachment(
+        &self,
+        nodes: &HashMap<NodeId, Node>,
+        schedule_context: &ScheduleContext,
+    ) -> Option<ScheduleOptimization> {
+        let attached = (*self.intent.get_attached())?;
+        if self.intent.secondary.is_empty() {
+            // We can only do useful work if we have both attached and secondary locations: this
+            // function doesn't schedule new locations, only swaps between attached and secondaries.
+            return None;
+        }
+
+        let current_affinity_score = schedule_context.get_node_affinity(attached);
+        let current_attachment_count = schedule_context.get_node_attachments(attached);
+
+        // Generate score for each node, dropping any un-schedulable nodes.
+        let all_pageservers = self.intent.all_pageservers();
+        let mut scores = all_pageservers
+            .iter()
+            .flat_map(|node_id| {
+                let node = nodes.get(node_id);
+                if node.is_none() {
+                    None
+                } else if matches!(
+                    node.unwrap().get_scheduling(),
+                    NodeSchedulingPolicy::Filling
+                ) {
+                    // If the node is currently filling, don't count it as a candidate, to avoid
+                    // racing with the background fill.
+                    None
+                } else if matches!(node.unwrap().may_schedule(), MaySchedule::No) {
+                    None
+                } else {
+                    let affinity_score = schedule_context.get_node_affinity(*node_id);
+                    let attachment_count = schedule_context.get_node_attachments(*node_id);
+                    Some((*node_id, affinity_score, attachment_count))
+                }
+            })
+            .collect::<Vec<_>>();
+
+        // Sort precedence:
+        //  1st - prefer nodes with the lowest total affinity score
+        //  2nd - prefer nodes with the lowest number of attachments in this context
+        //  3rd - if all else is equal, sort by node ID for determinism in tests.
+        scores.sort_by_key(|i| (i.1, i.2, i.0));
+
+        if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) =
+            scores.first()
+        {
+            if attached != *preferred_node {
+                // The best alternative must be more than 1 better than us, otherwise we could end
+                // up flapping back next time we're called (e.g. there's no point migrating from
+                // a location with score 1 to a score zero, because on the next pass the situation
+                // would be the same, but in reverse).
+                if current_affinity_score > *preferred_affinity_score + AffinityScore(1)
+                    || current_attachment_count > *preferred_attachment_count + 1
+                {
+                    tracing::info!(
+                        "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})",
+                        self.intent.get_secondary()
+                    );
+                    return Some(ScheduleOptimization {
+                        sequence: self.sequence,
+                        action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
+                            old_attached_node_id: attached,
+                            new_attached_node_id: *preferred_node,
+                        }),
+                    });
+                }
+            } else {
+                tracing::debug!(
+                    "Node {} is already preferred (score {:?})",
+                    preferred_node,
+                    preferred_affinity_score
+                );
+            }
+        }
+
+        // Fall-through: we didn't find an optimization
+        None
+    }
+
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
+    pub(crate) fn optimize_secondary(
+        &self,
+        scheduler: &mut Scheduler,
+        schedule_context: &ScheduleContext,
+    ) -> Option<ScheduleOptimization> {
+        if self.intent.secondary.is_empty() {
+            // We can only do useful work if we have both attached and secondary locations: this
+            // function doesn't schedule new locations, only swaps between attached and secondaries.
+            return None;
+        }
+
+        for secondary in self.intent.get_secondary() {
+            let Some(affinity_score) = schedule_context.nodes.get(secondary) else {
+                // We're already on a node unaffected by any affinity constraints,
+                // so we won't change it.
+                continue;
+            };
+
+            // Let the scheduler suggest a node, where it would put us if we were scheduling afresh
+            // This implicitly limits the choice to nodes that are available, and prefers nodes
+            // with lower utilization.
+            let Ok(candidate_node) =
+                scheduler.schedule_shard(&self.intent.all_pageservers(), schedule_context)
+            else {
+                // A scheduling error means we have no possible candidate replacements
+                continue;
+            };
+
+            let candidate_affinity_score = schedule_context
+                .nodes
+                .get(&candidate_node)
+                .unwrap_or(&AffinityScore::FREE);
+
+            // The best alternative must be more than 1 better than us, otherwise we could end
+            // up flapping back next time we're called.
+            if *candidate_affinity_score + AffinityScore(1) < *affinity_score {
+                // If some other node is available and has a lower score than this node, then
+                // that other node is a good place to migrate to.
+                tracing::info!(
+                    "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})",
+                    self.intent.get_secondary()
+                );
+                return Some(ScheduleOptimization {
+                    sequence: self.sequence,
+                    action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary {
+                        old_node_id: *secondary,
+                        new_node_id: candidate_node,
+                    }),
+                });
+            }
+        }
+
+        None
+    }
+
+    /// Return true if the optimization was really applied: it will not be applied if the optimization's
+    /// sequence is behind this tenant shard's
+    pub(crate) fn apply_optimization(
+        &mut self,
+        scheduler: &mut Scheduler,
+        optimization: ScheduleOptimization,
+    ) -> bool {
+        if optimization.sequence != self.sequence {
+            return false;
+        }
+
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_schedule_optimization
+            .inc();
+
+        match optimization.action {
+            ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
+                old_attached_node_id,
+                new_attached_node_id,
+            }) => {
+                self.intent.demote_attached(scheduler, old_attached_node_id);
+                self.intent
+                    .promote_attached(scheduler, new_attached_node_id);
+            }
+            ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary {
+                old_node_id,
+                new_node_id,
+            }) => {
+                self.intent.remove_secondary(scheduler, old_node_id);
+                self.intent.push_secondary(scheduler, new_node_id);
+            }
+        }
+
+        true
+    }
+
+    /// Query whether the tenant's observed state for attached node matches its intent state, and if so,
+    /// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that
+    /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there.
+    ///
+    /// Reconciliation may still be needed for other aspects of state such as secondaries (see [`Self::dirty`]): this
+    /// function should not be used to decide whether to reconcile.
+    pub(crate) fn stably_attached(&self) -> Option<NodeId> {
+        if let Some(attach_intent) = self.intent.attached {
+            match self.observed.locations.get(&attach_intent) {
+                Some(loc) => match &loc.conf {
+                    Some(conf) => match conf.mode {
+                        LocationConfigMode::AttachedMulti
+                        | LocationConfigMode::AttachedSingle
+                        | LocationConfigMode::AttachedStale => {
+                            // Our intent and observed state agree that this node is in an attached state.
+                            Some(attach_intent)
+                        }
+                        // Our observed config is not an attached state
+                        _ => None,
+                    },
+                    // Our observed state is None, i.e. in flux
+                    None => None,
+                },
+                // We have no observed state for this node
+                None => None,
+            }
+        } else {
+            // Our intent is not to attach
+            None
+        }
+    }
+
+    fn dirty(&self, nodes: &Arc<HashMap<NodeId, Node>>) -> bool {
+        let mut dirty_nodes = HashSet::new();
+
+        if let Some(node_id) = self.intent.attached {
+            // Maybe panic: it is a severe bug if we try to attach while generation is null.
+            let generation = self
+                .generation
+                .expect("Attempted to enter attached state without a generation");
+
+            let wanted_conf =
+                attached_location_conf(generation, &self.shard, &self.config, &self.policy);
+            match self.observed.locations.get(&node_id) {
+                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
+                Some(_) | None => {
+                    dirty_nodes.insert(node_id);
+                }
+            }
+        }
+
+        for node_id in &self.intent.secondary {
+            let wanted_conf = secondary_location_conf(&self.shard, &self.config);
+            match self.observed.locations.get(node_id) {
+                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
+                Some(_) | None => {
+                    dirty_nodes.insert(*node_id);
+                }
+            }
+        }
+
+        for node_id in self.observed.locations.keys() {
+            if self.intent.attached != Some(*node_id) && !self.intent.secondary.contains(node_id) {
+                // We have observed state that isn't part of our intent: need to clean it up.
+                dirty_nodes.insert(*node_id);
+            }
+        }
+
+        dirty_nodes.retain(|node_id| {
+            nodes
+                .get(node_id)
+                .map(|n| n.is_available())
+                .unwrap_or(false)
+        });
+
+        !dirty_nodes.is_empty()
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
+    pub(crate) fn get_reconcile_needed(
+        &mut self,
+        pageservers: &Arc<HashMap<NodeId, Node>>,
+    ) -> ReconcileNeeded {
+        // If there are any ambiguous observed states, and the nodes they refer to are available,
+        // we should reconcile to clean them up.
+        let mut dirty_observed = false;
+        for (node_id, observed_loc) in &self.observed.locations {
+            let node = pageservers
+                .get(node_id)
+                .expect("Nodes may not be removed while referenced");
+            if observed_loc.conf.is_none() && node.is_available() {
+                dirty_observed = true;
+                break;
+            }
+        }
+
+        let active_nodes_dirty = self.dirty(pageservers);
+
+        // Even if there is no pageserver work to be done, if we have a pending notification to computes,
+        // wake up a reconciler to send it.
+        let do_reconcile =
+            active_nodes_dirty || dirty_observed || self.pending_compute_notification;
+
+        if !do_reconcile {
+            tracing::debug!("Not dirty, no reconciliation needed.");
+            return ReconcileNeeded::No;
+        }
+
+        // If we are currently splitting, then never start a reconciler task: the splitting logic
+        // requires that shards are not interfered with while it runs. Do this check here rather than
+        // up top, so that we only log this message if we would otherwise have done a reconciliation.
+        if !matches!(self.splitting, SplitState::Idle) {
+            tracing::info!("Refusing to reconcile, splitting in progress");
+            return ReconcileNeeded::No;
+        }
+
+        // Reconcile already in flight for the current sequence?
+        if let Some(handle) = &self.reconciler {
+            if handle.sequence == self.sequence {
+                tracing::info!(
+                    "Reconciliation already in progress for sequence {:?}",
+                    self.sequence,
+                );
+                return ReconcileNeeded::WaitExisting(ReconcilerWaiter {
+                    tenant_shard_id: self.tenant_shard_id,
+                    seq_wait: self.waiter.clone(),
+                    error_seq_wait: self.error_waiter.clone(),
+                    error: self.last_error.clone(),
+                    seq: self.sequence,
+                });
+            }
+        }
+
+        // Pre-checks done: finally check whether we may actually do the work
+        match self.scheduling_policy {
+            ShardSchedulingPolicy::Active
+            | ShardSchedulingPolicy::Essential
+            | ShardSchedulingPolicy::Pause => {}
+            ShardSchedulingPolicy::Stop => {
+                // We only reach this point if there is work to do and we're going to skip
+                // doing it: warn, so it is obvious why this tenant isn't doing what it ought to.
+                tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy);
+                return ReconcileNeeded::No;
+            }
+        }
+
+        ReconcileNeeded::Yes
+    }
+
+    /// Ensure the sequence number is set to a value where waiting for this value will make us wait
+    /// for the next reconcile: i.e. it is ahead of all completed or running reconcilers.
+    ///
+    /// Constructing a ReconcilerWaiter with the resulting sequence number gives the property
+    /// that the waiter will not complete until some future Reconciler is constructed and run.
+    fn ensure_sequence_ahead(&mut self) {
+        // Find the highest sequence for which a Reconciler has previously run or is currently
+        // running
+        let max_seen = std::cmp::max(
+            self.reconciler
+                .as_ref()
+                .map(|r| r.sequence)
+                .unwrap_or(Sequence(0)),
+            std::cmp::max(self.waiter.load(), self.error_waiter.load()),
+        );
+
+        if self.sequence <= max_seen {
+            self.sequence = max_seen.next();
+        }
+    }
+
+    /// Create a waiter that will wait for some future Reconciler that hasn't been spawned yet.
+    ///
+    /// This is appropriate when you can't spawn a reconciler (e.g. due to resource limits), but
+    /// you would like to wait on the next reconciler that gets spawned in the background.
+    pub(crate) fn future_reconcile_waiter(&mut self) -> ReconcilerWaiter {
+        self.ensure_sequence_ahead();
+
+        ReconcilerWaiter {
+            tenant_shard_id: self.tenant_shard_id,
+            seq_wait: self.waiter.clone(),
+            error_seq_wait: self.error_waiter.clone(),
+            error: self.last_error.clone(),
+            seq: self.sequence,
+        }
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
+    pub(crate) fn spawn_reconciler(
+        &mut self,
+        result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResultRequest>,
+        pageservers: &Arc<HashMap<NodeId, Node>>,
+        compute_hook: &Arc<ComputeHook>,
+        reconciler_config: ReconcilerConfig,
+        service_config: &service::Config,
+        persistence: &Arc<Persistence>,
+        units: ReconcileUnits,
+        gate_guard: GateGuard,
+        cancel: &CancellationToken,
+    ) -> Option<ReconcilerWaiter> {
+        // Reconcile in flight for a stale sequence?  Our sequence's task will wait for it before
+        // doing our sequence's work.
+        let old_handle = self.reconciler.take();
+
+        // Build the list of nodes from which the reconciler should detach
+        let mut detach = Vec::new();
+        for node_id in self.observed.locations.keys() {
+            if self.intent.get_attached() != &Some(*node_id)
+                && !self.intent.secondary.contains(node_id)
+            {
+                detach.push(
+                    pageservers
+                        .get(node_id)
+                        .expect("Intent references non-existent pageserver")
+                        .clone(),
+                )
+            }
+        }
+
+        // Advance the sequence before spawning a reconciler, so that sequence waiters
+        // can distinguish between before+after the reconcile completes.
+ self.ensure_sequence_ahead(); + + let reconciler_cancel = cancel.child_token(); + let reconciler_intent = TargetState::from_intent(pageservers, &self.intent); + let mut reconciler = Reconciler { + tenant_shard_id: self.tenant_shard_id, + shard: self.shard, + placement_policy: self.policy.clone(), + generation: self.generation, + intent: reconciler_intent, + detach, + reconciler_config, + config: self.config.clone(), + observed: self.observed.clone(), + compute_hook: compute_hook.clone(), + service_config: service_config.clone(), + _gate_guard: gate_guard, + _resource_units: units, + cancel: reconciler_cancel.clone(), + persistence: persistence.clone(), + compute_notify_failure: false, + }; + + let reconcile_seq = self.sequence; + + tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence); + let must_notify = self.pending_compute_notification; + let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq, + tenant_id=%reconciler.tenant_shard_id.tenant_id, + shard_id=%reconciler.tenant_shard_id.shard_slug()); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_spawn + .inc(); + let result_tx = result_tx.clone(); + let join_handle = tokio::task::spawn( + async move { + // Wait for any previous reconcile task to complete before we start + if let Some(old_handle) = old_handle { + old_handle.cancel.cancel(); + if let Err(e) = old_handle.handle.await { + // We can't do much with this other than log it: the task is done, so + // we may proceed with our work. + tracing::error!("Unexpected join error waiting for reconcile task: {e}"); + } + } + + // Early check for cancellation before doing any work + // TODO: wrap all remote API operations in cancellation check + // as well. + if reconciler.cancel.is_cancelled() { + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_complete + .inc(ReconcileCompleteLabelGroup { + status: ReconcileOutcome::Cancel, + }); + return; + } + + // Attempt to make observed state match intent state + let result = reconciler.reconcile().await; + + // If we know we had a pending compute notification from some previous action, send a notification irrespective + // of whether the above reconcile() did any work + if result.is_ok() && must_notify { + // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`] + reconciler.compute_notify().await.ok(); + } + + // Update result counter + let outcome_label = match &result { + Ok(_) => ReconcileOutcome::Success, + Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel, + Err(_) => ReconcileOutcome::Error, + }; + + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_complete + .inc(ReconcileCompleteLabelGroup { + status: outcome_label, + }); + + // Constructing result implicitly drops Reconciler, freeing any ReconcileUnits before the Service might + // try and schedule more work in response to our result. 
+                let result = ReconcileResult {
+                    sequence: reconcile_seq,
+                    result,
+                    tenant_shard_id: reconciler.tenant_shard_id,
+                    generation: reconciler.generation,
+                    observed: reconciler.observed,
+                    pending_compute_notification: reconciler.compute_notify_failure,
+                };
+
+                result_tx
+                    .send(ReconcileResultRequest::ReconcileResult(result))
+                    .ok();
+            }
+            .instrument(reconciler_span),
+        );
+
+        self.reconciler = Some(ReconcilerHandle {
+            sequence: self.sequence,
+            handle: join_handle,
+            cancel: reconciler_cancel,
+        });
+
+        Some(ReconcilerWaiter {
+            tenant_shard_id: self.tenant_shard_id,
+            seq_wait: self.waiter.clone(),
+            error_seq_wait: self.error_waiter.clone(),
+            error: self.last_error.clone(),
+            seq: self.sequence,
+        })
+    }
+
+    /// Get a waiter for any reconciliation in flight, but do not start reconciliation
+    /// if it is not already running
+    pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {
+        if self.reconciler.is_some() {
+            Some(ReconcilerWaiter {
+                tenant_shard_id: self.tenant_shard_id,
+                seq_wait: self.waiter.clone(),
+                error_seq_wait: self.error_waiter.clone(),
+                error: self.last_error.clone(),
+                seq: self.sequence,
+            })
+        } else {
+            None
+        }
+    }
+
+    /// Called when a ReconcileResult has been emitted and the service is updating
+    /// our state: if the result is from a sequence >= that of my ReconcilerHandle, then drop
+    /// the handle to indicate there is no longer a reconciliation in progress.
+    pub(crate) fn reconcile_complete(&mut self, sequence: Sequence) {
+        if let Some(reconcile_handle) = &self.reconciler {
+            if reconcile_handle.sequence <= sequence {
+                self.reconciler = None;
+            }
+        }
+    }
+
+    /// If we had any state at all referring to this node ID, drop it. Does not
+    /// attempt to reschedule.
+    ///
+    /// Returns true if we modified this shard's intent state.
+    pub(crate) fn deref_node(&mut self, node_id: NodeId) -> bool {
+        let mut intent_modified = false;
+
+        // Drop if this node was our attached intent
+        if self.intent.attached == Some(node_id) {
+            self.intent.attached = None;
+            intent_modified = true;
+        }
+
+        // Drop from the list of secondaries, and check if we modified it
+        let had_secondaries = self.intent.secondary.len();
+        self.intent.secondary.retain(|n| n != &node_id);
+        intent_modified |= self.intent.secondary.len() != had_secondaries;
+
+        debug_assert!(!self.intent.all_pageservers().contains(&node_id));
+
+        intent_modified
+    }
+
+    pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) {
+        self.scheduling_policy = p;
+    }
+
+    pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy {
+        &self.scheduling_policy
+    }
+
+    pub(crate) fn set_last_error(&mut self, sequence: Sequence, error: ReconcileError) {
+        // Ordering: always set last_error before advancing sequence, so that sequence
+        // waiters are guaranteed to see a Some value when they see an error.
+        *(self.last_error.lock().unwrap()) = Some(Arc::new(error));
+        self.error_waiter.advance(sequence);
+    }
+
+    pub(crate) fn from_persistent(
+        tsp: TenantShardPersistence,
+        intent: IntentState,
+    ) -> anyhow::Result<Self> {
+        let tenant_shard_id = tsp.get_tenant_shard_id()?;
+        let shard_identity = tsp.get_shard_identity()?;
+
+        Ok(Self {
+            tenant_shard_id,
+            shard: shard_identity,
+            sequence: Sequence::initial(),
+            generation: tsp.generation.map(|g| Generation::new(g as u32)),
+            policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
+            intent,
+            observed: ObservedState::new(),
+            config: serde_json::from_str(&tsp.config).unwrap(),
+            reconciler: None,
+            splitting: tsp.splitting,
+            waiter: Arc::new(SeqWait::new(Sequence::initial())),
+            error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
+            last_error: Arc::default(),
+            pending_compute_notification: false,
+            delayed_reconcile: false,
+            scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
+            preferred_az_id: tsp.preferred_az_id,
+        })
+    }
+
+    pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
+        TenantShardPersistence {
+            tenant_id: self.tenant_shard_id.tenant_id.to_string(),
+            shard_number: self.tenant_shard_id.shard_number.0 as i32,
+            shard_count: self.tenant_shard_id.shard_count.literal() as i32,
+            shard_stripe_size: self.shard.stripe_size.0 as i32,
+            generation: self.generation.map(|g| g.into().unwrap_or(0) as i32),
+            generation_pageserver: self.intent.get_attached().map(|n| n.0 as i64),
+            placement_policy: serde_json::to_string(&self.policy).unwrap(),
+            config: serde_json::to_string(&self.config).unwrap(),
+            splitting: SplitState::default(),
+            scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
+            preferred_az_id: self.preferred_az_id.clone(),
+        }
+    }
+
+    pub(crate) fn preferred_az(&self) -> Option<&str> {
+        self.preferred_az_id.as_deref()
+    }
+
+    pub(crate) fn set_preferred_az(&mut self, preferred_az_id: String) {
+        self.preferred_az_id = Some(preferred_az_id);
+    }
+}
+
+#[cfg(test)]
+pub(crate) mod tests {
+    use pageserver_api::{
+        controller_api::NodeAvailability,
+        shard::{ShardCount, ShardNumber},
+    };
+    use utils::id::TenantId;
+
+    use crate::scheduler::test_utils::make_test_nodes;
+
+    use super::*;
+
+    fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard {
+        let tenant_id = TenantId::generate();
+        let shard_number = ShardNumber(0);
+        let shard_count = ShardCount::new(1);
+
+        let tenant_shard_id = TenantShardId {
+            tenant_id,
+            shard_number,
+            shard_count,
+        };
+        TenantShard::new(
+            tenant_shard_id,
+            ShardIdentity::new(
+                shard_number,
+                shard_count,
+                pageserver_api::shard::ShardStripeSize(32768),
+            )
+            .unwrap(),
+            policy,
+        )
+    }
+
+    fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantShard> {
+        let tenant_id = TenantId::generate();
+
+        (0..shard_count.count())
+            .map(|i| {
+                let shard_number = ShardNumber(i);
+
+                let tenant_shard_id = TenantShardId {
+                    tenant_id,
+                    shard_number,
+                    shard_count,
+                };
+                TenantShard::new(
+                    tenant_shard_id,
+                    ShardIdentity::new(
+                        shard_number,
+                        shard_count,
+                        pageserver_api::shard::ShardStripeSize(32768),
+                    )
+                    .unwrap(),
+                    policy.clone(),
+                )
+            })
+            .collect()
+    }
+
+    /// Test the scheduling behaviors used when a tenant configured for HA is subject
+    /// to nodes being marked offline.
+    #[test]
+    fn tenant_ha_scheduling() -> anyhow::Result<()> {
+        // Start with three nodes.  Our tenant will only use two.  The third one is
+        // expected to remain unused.
+        let mut nodes = make_test_nodes(3);
+
+        let mut scheduler = Scheduler::new(nodes.values());
+        let mut context = ScheduleContext::default();
+
+        let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        tenant_shard
+            .schedule(&mut scheduler, &mut context)
+            .expect("we have enough nodes, scheduling should work");
+
+        // Expect the attached and secondary locations to initially be scheduled on different nodes
+        assert_eq!(tenant_shard.intent.secondary.len(), 1);
+        assert!(tenant_shard.intent.attached.is_some());
+
+        let attached_node_id = tenant_shard.intent.attached.unwrap();
+        let secondary_node_id = *tenant_shard.intent.secondary.iter().last().unwrap();
+        assert_ne!(attached_node_id, secondary_node_id);
+
+        // Notifying that the attached node is offline should demote it to a secondary
+        let changed = tenant_shard
+            .intent
+            .demote_attached(&mut scheduler, attached_node_id);
+        assert!(changed);
+        assert!(tenant_shard.intent.attached.is_none());
+        assert_eq!(tenant_shard.intent.secondary.len(), 2);
+
+        // Update the scheduler state to indicate the node is offline
+        nodes
+            .get_mut(&attached_node_id)
+            .unwrap()
+            .set_availability(NodeAvailability::Offline);
+        scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
+
+        // Re-scheduling the shard should promote the still-available secondary node to attached
+        tenant_shard
+            .schedule(&mut scheduler, &mut context)
+            .expect("active nodes are available");
+        assert_eq!(tenant_shard.intent.attached.unwrap(), secondary_node_id);
+
+        // The original attached node should have been retained as a secondary
+        assert_eq!(
+            *tenant_shard.intent.secondary.iter().last().unwrap(),
+            attached_node_id
+        );
+
+        tenant_shard.intent.clear(&mut scheduler);
+
+        Ok(())
+    }
+
+    #[test]
+    fn intent_from_observed() -> anyhow::Result<()> {
+        let nodes = make_test_nodes(3);
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
+
+        tenant_shard.observed.locations.insert(
+            NodeId(3),
+            ObservedStateLocation {
+                conf: Some(LocationConfig {
+                    mode: LocationConfigMode::AttachedMulti,
+                    generation: Some(2),
+                    secondary_conf: None,
+                    shard_number: tenant_shard.shard.number.0,
+                    shard_count: tenant_shard.shard.count.literal(),
+                    shard_stripe_size: tenant_shard.shard.stripe_size.0,
+                    tenant_conf: TenantConfig::default(),
+                }),
+            },
+        );
+
+        tenant_shard.observed.locations.insert(
+            NodeId(2),
+            ObservedStateLocation {
+                conf: Some(LocationConfig {
+                    mode: LocationConfigMode::AttachedStale,
+                    generation: Some(1),
+                    secondary_conf: None,
+                    shard_number: tenant_shard.shard.number.0,
+                    shard_count: tenant_shard.shard.count.literal(),
+                    shard_stripe_size: tenant_shard.shard.stripe_size.0,
+                    tenant_conf: TenantConfig::default(),
+                }),
+            },
+        );
+
+        tenant_shard.intent_from_observed(&mut scheduler);
+
+        // The attached location with the highest generation gets used as attached
+        assert_eq!(tenant_shard.intent.attached, Some(NodeId(3)));
+        // Other locations get used as secondaries
+        assert_eq!(tenant_shard.intent.secondary, vec![NodeId(2)]);
+
+        scheduler.consistency_check(nodes.values(), [&tenant_shard].into_iter())?;
+
+        tenant_shard.intent.clear(&mut scheduler);
+        Ok(())
+    }
+
+    #[test]
+    fn scheduling_mode() -> anyhow::Result<()> {
+        let nodes = make_test_nodes(3);
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
+
+        // In pause mode, schedule() shouldn't do anything
+        tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause;
+        assert!(tenant_shard
+            .schedule(&mut scheduler, &mut ScheduleContext::default())
+            .is_ok());
+        assert!(tenant_shard.intent.all_pageservers().is_empty());
+
+        // In active mode, schedule() works
+        tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active;
+        assert!(tenant_shard
+            .schedule(&mut scheduler, &mut ScheduleContext::default())
+            .is_ok());
+        assert!(!tenant_shard.intent.all_pageservers().is_empty());
+
+        tenant_shard.intent.clear(&mut scheduler);
+        Ok(())
+    }
+
+    #[test]
+    fn optimize_attachment() -> anyhow::Result<()> {
+        let nodes = make_test_nodes(3);
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
+
+        // Initially: both shards are attached to node 1, and each has a secondary location
+        // on a different node.
+        shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
+        shard_a.intent.push_secondary(&mut scheduler, NodeId(2));
+        shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1)));
+        shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
+
+        let mut schedule_context = ScheduleContext::default();
+        schedule_context.avoid(&shard_a.intent.all_pageservers());
+        schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
+        schedule_context.avoid(&shard_b.intent.all_pageservers());
+        schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
+
+        let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);
+
+        // Either shard should recognize that it has the option to switch to a secondary location where there
+        // would be no other shards from the same tenant, and request to do so.
+        assert_eq!(
+            optimization_a,
+            Some(ScheduleOptimization {
+                sequence: shard_a.sequence,
+                action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
+                    old_attached_node_id: NodeId(1),
+                    new_attached_node_id: NodeId(2)
+                })
+            })
+        );
+
+        // Note that optimizing two shards in the same tenant with the same ScheduleContext
+        // is mutually exclusive: applying one optimization invalidates the stats the other
+        // was computed from. It is the responsibility of [`Service::optimize_all`] to avoid
+        // trying to do optimizations for multiple shards in the same tenant at the same time.
+        // Generating both optimizations here is done for test purposes only.
+        let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context);
+        assert_eq!(
+            optimization_b,
+            Some(ScheduleOptimization {
+                sequence: shard_b.sequence,
+                action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
+                    old_attached_node_id: NodeId(1),
+                    new_attached_node_id: NodeId(3)
+                })
+            })
+        );
+
+        // Applying these optimizations should result in the end state proposed
+        shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
+        assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2)));
+        assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
+        shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap());
+        assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3)));
+        assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]);
+
+        shard_a.intent.clear(&mut scheduler);
+        shard_b.intent.clear(&mut scheduler);
+
+        Ok(())
+    }
+
+    #[test]
+    fn optimize_secondary() -> anyhow::Result<()> {
+        let nodes = make_test_nodes(4);
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
+
+        // Initially: the shards are attached to different nodes, but both have their
+        // secondary location on node 3.
+        shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
+        shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
+        shard_b.intent.set_attached(&mut scheduler, Some(NodeId(2)));
+        shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
+
+        let mut schedule_context = ScheduleContext::default();
+        schedule_context.avoid(&shard_a.intent.all_pageservers());
+        schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
+        schedule_context.avoid(&shard_b.intent.all_pageservers());
+        schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
+
+        let optimization_a = shard_a.optimize_secondary(&mut scheduler, &schedule_context);
+
+        // Since there is a node with no locations available, the node with two locations for the
+        // same tenant should generate an optimization to move one away
+        assert_eq!(
+            optimization_a,
+            Some(ScheduleOptimization {
+                sequence: shard_a.sequence,
+                action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary {
+                    old_node_id: NodeId(3),
+                    new_node_id: NodeId(4)
+                })
+            })
+        );
+
+        shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
+        assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
+        assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(4)]);
+
+        shard_a.intent.clear(&mut scheduler);
+        shard_b.intent.clear(&mut scheduler);
+
+        Ok(())
+    }
+
+    // Optimize til quiescent: this emulates what Service::optimize_all does, when
+    // called repeatedly in the background.
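+    // Each pass below rebuilds the ScheduleContext from scratch, applies at most one
+    // optimization, and repeats until a pass makes no changes.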
+    fn optimize_til_idle(
+        nodes: &HashMap<NodeId, Node>,
+        scheduler: &mut Scheduler,
+        shards: &mut [TenantShard],
+    ) {
+        let mut loop_n = 0;
+        loop {
+            let mut schedule_context = ScheduleContext::default();
+            let mut any_changed = false;
+
+            for shard in shards.iter() {
+                schedule_context.avoid(&shard.intent.all_pageservers());
+                if let Some(attached) = shard.intent.get_attached() {
+                    schedule_context.push_attached(*attached);
+                }
+            }
+
+            for shard in shards.iter_mut() {
+                let optimization = shard.optimize_attachment(nodes, &schedule_context);
+                if let Some(optimization) = optimization {
+                    shard.apply_optimization(scheduler, optimization);
+                    any_changed = true;
+                    break;
+                }
+
+                let optimization = shard.optimize_secondary(scheduler, &schedule_context);
+                if let Some(optimization) = optimization {
+                    shard.apply_optimization(scheduler, optimization);
+                    any_changed = true;
+                    break;
+                }
+            }
+
+            if !any_changed {
+                break;
+            }
+
+            // Assert no infinite loop
+            loop_n += 1;
+            assert!(loop_n < 1000);
+        }
+    }
+
+    /// Test the balancing behavior of shard scheduling: that it achieves a balance, and
+    /// that it converges.
+    #[test]
+    fn optimize_add_nodes() -> anyhow::Result<()> {
+        let nodes = make_test_nodes(4);
+
+        // Only show the scheduler a couple of nodes
+        let mut scheduler = Scheduler::new([].iter());
+        scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
+        scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap());
+
+        let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4));
+        let mut schedule_context = ScheduleContext::default();
+        for shard in &mut shards {
+            assert!(shard
+                .schedule(&mut scheduler, &mut schedule_context)
+                .is_ok());
+        }
+
+        // We should see an equal number of locations on the two nodes.
+        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 2);
+
+        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2);
+
+        // Add another two nodes: we should see the shards spread out when their optimize
+        // methods are called
+        scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap());
+        scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap());
+        optimize_til_idle(&nodes, &mut scheduler, &mut shards);
+
+        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);
+
+        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
+
+        assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 1);
+
+        assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 1);
+
+        for shard in shards.iter_mut() {
+            shard.intent.clear(&mut scheduler);
+        }
+
+        Ok(())
+    }
+}
diff --git a/s3_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml
similarity index 77%
rename from s3_scrubber/Cargo.toml
rename to storage_scrubber/Cargo.toml
index 4d136472e0..d19119990b 100644
--- a/s3_scrubber/Cargo.toml
+++ b/storage_scrubber/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "s3_scrubber"
+name = "storage_scrubber"
 version = "0.1.0"
 edition.workspace = true
 license.workspace = true
@@ -10,7 +10,9 @@ aws-smithy-async.workspace = true
 either.workspace = true
 tokio-rustls.workspace = true
 anyhow.workspace = true
+git-version.workspace = true
 hex.workspace = true
+humantime.workspace = true
 thiserror.workspace = true
 rand.workspace = true
 bytes.workspace = true
@@ -22,9 +24,18 @@ serde_with.workspace = true
 workspace_hack.workspace = true
 utils.workspace = true
 async-stream.workspace = true
+tokio-postgres-rustls.workspace = true
+postgres_ffi.workspace = true
 tokio-stream.workspace = true
+tokio-postgres.workspace = true
+tokio-util = { workspace = true }
 futures-util.workspace = true
 itertools.workspace = true
+camino.workspace = true
+rustls.workspace = true
+rustls-native-certs.workspace = true
+once_cell.workspace = true
+storage_controller_client.workspace = true
 tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
 chrono = { workspace = true, default-features = false, features = ["clock", "serde"] }
@@ -39,6 +50,5 @@ tracing.workspace = true
 tracing-subscriber.workspace = true
 clap.workspace = true
 tracing-appender = "0.2"
-histogram = "0.7"
 futures.workspace = true
diff --git a/s3_scrubber/README.md b/storage_scrubber/README.md
similarity index 68%
rename from s3_scrubber/README.md
rename to storage_scrubber/README.md
index 2f21b9f191..5be8541419 100644
--- a/s3_scrubber/README.md
+++ b/storage_scrubber/README.md
@@ -1,4 +1,4 @@
-# Neon S3 scrubber
+# Neon Storage Scrubber

 This tool directly accesses the S3 buckets used by the Neon `pageserver`
 and `safekeeper`, and does housekeeping such as cleaning up objects for tenants & timelines that no longer exist.
@@ -9,11 +9,13 @@ and `safekeeper`, and does housekeeping such as cleaning up objects for tenants

 #### S3

-Do `aws sso login --profile dev` to get the SSO access to the bucket to clean, get the SSO_ACCOUNT_ID for your profile (`cat ~/.aws/config` may help).
+Do `aws sso login --profile dev` to get SSO access to the bucket to clean.
+Also, set the following environment variables:

-- `SSO_ACCOUNT_ID`: Credentials id to use for accessing S3 buckets
+- `AWS_PROFILE`: Profile name to use for accessing S3 buckets (e.g. `dev`)
 - `REGION`: The region where the bucket is located.
 - `BUCKET`: Bucket name
+- `BUCKET_PREFIX` (optional): Prefix inside the bucket

 #### Console API

@@ -43,7 +45,11 @@ processing by the `purge-garbage` subcommand.

 Example:

-`env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json`
+`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=[client_key] CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json`
+
+Note that `CLOUD_ADMIN_API_TOKEN` can be obtained from https://console-stage.neon.build/app/settings/api-keys (for staging) or https://console.neon.tech/app/settings/api-keys (for production). This is not the control plane admin JWT key; the env var name is confusing. Although anyone can generate such an API key, you still need admin permission in order to access all projects in the region.
+
+Also note that `CLOUD_ADMIN_API_URL` should include the region in the admin URL, due to the control plane / console split. For example, `https://console-stage.neon.build/regions/aws-us-east-2/api/v1/admin` for the staging us-east-2 region.
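+
+For a quick sanity check of the output before purging, something like the following
+should work. This is a sketch, not part of the tool: the field names are taken from the
+`GarbageList`/`GarbageItem` structs in `storage_scrubber/src/garbage.rs`, assuming
+serde's default representation.
+
+```
+# Count garbage items by reason (DeletedInConsole / MissingInConsole / KnownBug)
+jq '.items | group_by(.reason) | map({reason: .[0].reason, count: length})' eu-west-1-garbage.json
+
+# The scrubber also records how many active tenants it saw alongside the garbage
+jq '.active_tenant_count' eu-west-1-garbage.json
+```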

 #### `purge-garbage`

@@ -59,7 +65,7 @@ to pass them on the command line

 Example:

-`env SSO_ACCOUNT_ID=123456 cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json`
+`env AWS_PROFILE=dev cargo run --release -- purge-garbage --input-path=eu-west-1-garbage.json`

 Add the `--delete` argument before `purge-garbage` to enable deletion.  This is intentionally
 not provided inline in the example above to avoid accidents.  Without the `--delete` flag
@@ -67,10 +73,12 @@ the purge command will log all the keys that it would have deleted.

 #### `scan-metadata`

-Walk objects in a pageserver S3 bucket, and report statistics on the contents.
+Walk objects in a pageserver or safekeeper S3 bucket, report statistics on the contents, and check consistency.
+Errors are logged to stderr and the summary to stdout.

+For pageserver:
 ```
-env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata
+env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver
 Timelines: 31106
 With errors: 3
@@ -82,11 +90,15 @@ Layer size bytes: min 24576, 1% 36879, 10% 36879, 50% 61471, 90% 44695551, 99% 2
 Timeline layer count: min 1, 1% 3, 10% 6, 50% 16, 90% 25, 99% 39, max 1053
 ```

+For safekeepers, `dump_db_connstr` and `dump_db_table` must be
+specified; they should point to a table with a debug dump, which will be used
+to list timelines and find their backup and start LSNs.
+
 ## Cleaning up running pageservers

 If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload due to compaction, pageserver restart, etc.).
 So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers.

-First, we need to group pageservers by buckets, `https:///admin/pageservers`` can be used for all env nodes, then `cat /storage/pageserver/data/pageserver.toml` on every node will show the bucket names and regions needed.
+First, we need to group pageservers by buckets: `https:///admin/pageservers` can be used for all env nodes, then `cat /storage/pageserver/data/pageserver.toml` on every node will show the bucket names and regions needed.
 Per bucket, for every related pageserver id, find deleted tenants:
diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
new file mode 100644
index 0000000000..15dfb101b5
--- /dev/null
+++ b/storage_scrubber/src/checks.rs
@@ -0,0 +1,514 @@
+use std::collections::{BTreeSet, HashMap, HashSet};
+
+use anyhow::Context;
+use itertools::Itertools;
+use pageserver::tenant::layer_map::LayerMap;
+use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
+use pageserver_api::shard::ShardIndex;
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info, warn};
+use utils::generation::Generation;
+use utils::id::TimelineId;
+
+use crate::cloud_admin_api::BranchData;
+use crate::metadata_stream::stream_listing;
+use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId};
+use futures_util::StreamExt;
+use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
+use pageserver::tenant::storage_layer::LayerName;
+use pageserver::tenant::IndexPart;
+use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath};
+
+pub(crate) struct TimelineAnalysis {
+    /// Anomalies detected
+    pub(crate) errors: Vec<String>,
+
+    /// Healthy-but-noteworthy, like old-versioned structures that are readable but
+    /// worth reporting for awareness that we must not remove that old version decoding
+    /// yet.
+    pub(crate) warnings: Vec<String>,
+
+    /// Keys not referenced in metadata: candidates for removal, but NOT NECESSARILY: beware
+    /// of races between reading the metadata and reading the objects.
+    pub(crate) garbage_keys: Vec<String>,
+}
+
+impl TimelineAnalysis {
+    fn new() -> Self {
+        Self {
+            errors: Vec::new(),
+            warnings: Vec::new(),
+            garbage_keys: Vec::new(),
+        }
+    }
+
+    /// Whether a timeline is healthy.
+    pub(crate) fn is_healthy(&self) -> bool {
+        self.errors.is_empty() && self.warnings.is_empty()
+    }
+}
+
+/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong).
+/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example,
+///
+/// ```plain
+/// |       |              |       |
+/// |   1   |   |   2   |  |   3   |
+/// |       |   |       |  |       |
+/// ```
+///
+/// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 1 and 2 should have
+/// the same LSN range.
+///
+/// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example,
+///
+/// ```plain
+/// |       |   | 2 |   |       |
+/// |   1   |   |---|   |   3   |
+/// |       |   | 4 |   |       |
+/// ```
+///
+/// If layer 2 and 4 contain the same single key, this is also a valid layer map.
+fn check_valid_layermap(metadata: &HashMap<LayerName, LayerFileMetadata>) -> Option<String> {
+    let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
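+    // Gather the LSN boundaries of every multi-key delta layer; single-key delta layers
+    // are exempt from the split rule (see the doc comment above), so they are skipped.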
+    let mut all_delta_layers = Vec::new();
+    for (name, _) in metadata.iter() {
+        if let LayerName::Delta(layer) = name {
+            if layer.key_range.start.next() != layer.key_range.end {
+                all_delta_layers.push(layer.clone());
+            }
+        }
+    }
+    for layer in &all_delta_layers {
+        let lsn_range = &layer.lsn_range;
+        lsn_split_point.insert(lsn_range.start);
+        lsn_split_point.insert(lsn_range.end);
+    }
+    for layer in &all_delta_layers {
+        let lsn_range = layer.lsn_range.clone();
+        let intersects = lsn_split_point.range(lsn_range).collect_vec();
+        if intersects.len() > 1 {
+            let err = format!(
+                "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
+                layer,
+                intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
+            );
+            return Some(err);
+        }
+    }
+    None
+}
+
+pub(crate) async fn branch_cleanup_and_check_errors(
+    remote_client: &GenericRemoteStorage,
+    id: &TenantShardTimelineId,
+    tenant_objects: &mut TenantObjectListing,
+    s3_active_branch: Option<&BranchData>,
+    console_branch: Option<BranchData>,
+    s3_data: Option<RemoteTimelineBlobData>,
+) -> TimelineAnalysis {
+    let mut result = TimelineAnalysis::new();
+
+    info!("Checking timeline {id}");
+
+    if let Some(s3_active_branch) = s3_active_branch {
+        info!(
+            "Checking console status for timeline for branch {:?}/{:?}",
+            s3_active_branch.project_id, s3_active_branch.id
+        );
+        match console_branch {
+            Some(_) => {
+                result.errors.push(format!("Timeline has deleted branch data in the console (id = {:?}, project_id = {:?}), recheck whether it got removed during the check",
+                    s3_active_branch.id, s3_active_branch.project_id))
+            },
+            None => {
+                result.errors.push(format!("Timeline has no branch data in the console (id = {:?}, project_id = {:?}), recheck whether it got removed during the check",
+                    s3_active_branch.id, s3_active_branch.project_id))
+            }
+        };
+    }
+
+    match s3_data {
+        Some(s3_data) => {
+            result
+                .garbage_keys
+                .extend(s3_data.unknown_keys.into_iter().map(|k| k.key.to_string()));
+
+            match s3_data.blob_data {
+                BlobDataParseResult::Parsed {
+                    index_part,
+                    index_part_generation: _index_part_generation,
+                    s3_layers: _s3_layers,
+                } => {
+                    if !IndexPart::KNOWN_VERSIONS.contains(&index_part.version()) {
+                        result
+                            .errors
+                            .push(format!("index_part.json version: {}", index_part.version()))
+                    }
+
+                    let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(3);
+                    if !newest_versions.any(|ip| ip == &index_part.version()) {
+                        info!(
+                            "index_part.json version is not latest: {}",
+                            index_part.version()
+                        );
+                    }
+
+                    if index_part.metadata.disk_consistent_lsn()
+                        != index_part.duplicated_disk_consistent_lsn()
+                    {
+                        // Tech debt: let's get rid of one of these, they are redundant
+                        // https://github.com/neondatabase/neon/issues/8343
+                        result.errors.push(format!(
+                            "Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})",
+                            index_part.metadata.disk_consistent_lsn(),
+                            index_part.duplicated_disk_consistent_lsn(),
+                        ))
+                    }
+
+                    if index_part.layer_metadata.is_empty() {
+                        if index_part.metadata.ancestor_timeline().is_none() {
+                            // The initial timeline with no ancestor should ALWAYS have layers.
+ result.errors.push( + "index_part.json has no layers (ancestor_timeline=None)" + .to_string(), + ); + } else { + // Not an error, can happen for branches with zero writes, but notice that + info!("index_part.json has no layers (ancestor_timeline exists)"); + } + } + + if let Some(err) = check_valid_layermap(&index_part.layer_metadata) { + result.errors.push(format!( + "index_part.json contains invalid layer map structure: {err}" + )); + } + + for (layer, metadata) in index_part.layer_metadata { + if metadata.file_size == 0 { + result.errors.push(format!( + "index_part.json contains a layer {} that has 0 size in its layer metadata", layer, + )) + } + + if !tenant_objects.check_ref(id.timeline_id, &layer, &metadata) { + let path = remote_layer_path( + &id.tenant_shard_id.tenant_id, + &id.timeline_id, + metadata.shard, + &layer, + metadata.generation, + ); + + // HEAD request used here to address a race condition when an index was uploaded concurrently + // with our scan. We check if the object is uploaded to S3 after taking the listing snapshot. + let response = remote_client + .head_object(&path, &CancellationToken::new()) + .await; + + if response.is_err() { + // Object is not present. + let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta()); + + let msg = format!( + "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})", + layer, + metadata.generation.get_suffix(), + metadata.shard, + is_l0, + ); + + if is_l0 { + result.warnings.push(msg); + } else { + result.errors.push(msg); + } + } + } + } + } + BlobDataParseResult::Relic => {} + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, + } => result.errors.extend( + errors + .into_iter() + .map(|error| format!("parse error: {error}")), + ), + } + } + None => result + .errors + .push("Timeline has no data on S3 at all".to_string()), + } + + if result.errors.is_empty() { + info!("No check errors found"); + } else { + warn!("Timeline metadata errors: {0:?}", result.errors); + } + + if !result.warnings.is_empty() { + warn!("Timeline metadata warnings: {0:?}", result.warnings); + } + + if !result.garbage_keys.is_empty() { + error!( + "The following keys should be removed from S3: {0:?}", + result.garbage_keys + ) + } + + result +} + +#[derive(Default)] +pub(crate) struct LayerRef { + ref_count: usize, +} + +/// Top-level index of objects in a tenant. This may be used by any shard-timeline within +/// the tenant to query whether an object exists. +#[derive(Default)] +pub(crate) struct TenantObjectListing { + shard_timelines: HashMap<(ShardIndex, TimelineId), HashMap<(LayerName, Generation), LayerRef>>, +} + +impl TenantObjectListing { + /// Having done an S3 listing of the keys within a timeline prefix, merge them into the overall + /// list of layer keys for the Tenant. + pub(crate) fn push( + &mut self, + ttid: TenantShardTimelineId, + layers: HashSet<(LayerName, Generation)>, + ) { + let shard_index = ShardIndex::new( + ttid.tenant_shard_id.shard_number, + ttid.tenant_shard_id.shard_count, + ); + let replaced = self.shard_timelines.insert( + (shard_index, ttid.timeline_id), + layers + .into_iter() + .map(|l| (l, LayerRef::default())) + .collect(), + ); + + assert!( + replaced.is_none(), + "Built from an S3 object listing, which should never repeat a key" + ); + } + + /// Having loaded a timeline index, check if a layer referenced by the index exists. If it does, + /// the layer's refcount will be incremented. 
Later, after calling this for all references in all indices
+    /// in a tenant, orphan layers may be detected by their zero refcounts.
+    ///
+    /// Returns true if the layer exists
+    pub(crate) fn check_ref(
+        &mut self,
+        timeline_id: TimelineId,
+        layer_file: &LayerName,
+        metadata: &LayerFileMetadata,
+    ) -> bool {
+        let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else {
+            return false;
+        };
+
+        let Some(layer_ref) = shard_tl.get_mut(&(layer_file.clone(), metadata.generation)) else {
+            return false;
+        };
+
+        layer_ref.ref_count += 1;
+
+        true
+    }
+
+    pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerName, Generation)> {
+        let mut result = Vec::new();
+        for ((shard_index, timeline_id), layers) in &self.shard_timelines {
+            for ((layer_file, generation), layer_ref) in layers {
+                if layer_ref.ref_count == 0 {
+                    result.push((*shard_index, *timeline_id, layer_file.clone(), *generation))
+                }
+            }
+        }
+
+        result
+    }
+}
+
+#[derive(Debug)]
+pub(crate) struct RemoteTimelineBlobData {
+    pub(crate) blob_data: BlobDataParseResult,
+
+    // Index objects that were not used when loading `blob_data`, e.g. those from old generations
+    pub(crate) unused_index_keys: Vec<ListingObject>,
+
+    // Objects whose keys were not recognized at all, i.e. not layer files, not indices
+    pub(crate) unknown_keys: Vec<ListingObject>,
+}
+
+#[derive(Debug)]
+pub(crate) enum BlobDataParseResult {
+    Parsed {
+        index_part: Box<IndexPart>,
+        index_part_generation: Generation,
+        s3_layers: HashSet<(LayerName, Generation)>,
+    },
+    /// The remains of a deleted Timeline (i.e. an initdb archive only)
+    Relic,
+    Incorrect {
+        errors: Vec<String>,
+        s3_layers: HashSet<(LayerName, Generation)>,
+    },
+}
+
+pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> {
+    match name.rsplit_once('-') {
+        // FIXME: this is gross, just use a regex?
+        Some((layer_filename, gen)) if gen.len() == 8 => {
+            let layer = layer_filename.parse::<LayerName>()?;
+            let gen =
+                Generation::parse_suffix(gen).ok_or("Malformed generation suffix".to_string())?;
+            Ok((layer, gen))
+        }
+        _ => Ok((name.parse::<LayerName>()?, Generation::none())),
+    }
+}
+
+pub(crate) async fn list_timeline_blobs(
+    remote_client: &GenericRemoteStorage,
+    id: TenantShardTimelineId,
+    root_target: &RootTarget,
+) -> anyhow::Result<RemoteTimelineBlobData> {
+    let mut s3_layers = HashSet::new();
+
+    let mut errors = Vec::new();
+    let mut unknown_keys = Vec::new();
+
+    let mut timeline_dir_target = root_target.timeline_root(&id);
+    timeline_dir_target.delimiter = String::new();
+
+    let mut index_part_keys: Vec<ListingObject> = Vec::new();
+    let mut initdb_archive: bool = false;
+
+    let prefix_str = &timeline_dir_target
+        .prefix_in_bucket
+        .strip_prefix("/")
+        .unwrap_or(&timeline_dir_target.prefix_in_bucket);
+
+    let mut stream = std::pin::pin!(stream_listing(remote_client, &timeline_dir_target));
+    while let Some(obj) = stream.next().await {
+        let (key, Some(obj)) = obj? else {
+            panic!("ListingObject not specified");
+        };
+
+        let blob_name = key.get_path().as_str().strip_prefix(prefix_str);
+        match blob_name {
+            Some(name) if name.starts_with("index_part.json") => {
+                tracing::debug!("Index key {key}");
+                index_part_keys.push(obj)
+            }
+            Some("initdb.tar.zst") => {
+                tracing::debug!("initdb archive {key}");
+                initdb_archive = true;
+            }
+            Some("initdb-preserved.tar.zst") => {
+                tracing::info!("initdb archive preserved {key}");
+            }
+            Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) {
+                Ok((new_layer, gen)) => {
+                    tracing::debug!("Parsed layer key: {new_layer} {gen:?}");
+                    s3_layers.insert((new_layer, gen));
+                }
+                Err(e) => {
+                    tracing::info!("Error parsing key {maybe_layer_name}");
+                    errors.push(
+                        format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
+                    );
+                    unknown_keys.push(obj);
+                }
+            },
+            None => {
+                tracing::warn!("Unknown key {key}");
+                errors.push(format!("S3 list response got an object with odd key {key}"));
+                unknown_keys.push(obj);
+            }
+        }
+    }
+
+    if index_part_keys.is_empty() && s3_layers.is_empty() && initdb_archive {
+        tracing::debug!(
+            "Timeline is empty apart from initdb archive: expected post-deletion state."
+        );
+        return Ok(RemoteTimelineBlobData {
+            blob_data: BlobDataParseResult::Relic,
+            unused_index_keys: index_part_keys,
+            unknown_keys: Vec::new(),
+        });
+    }
+
+    // Choose the index_part with the highest generation
+    let (index_part_object, index_part_generation) = match index_part_keys
+        .iter()
+        .filter_map(|key| {
+            // Stripping the index key to the last part, because RemotePath doesn't
+            // like absolute paths, and depending on prefix_in_bucket it's possible
+            // for the keys we read back to start with a slash.
+            let basename = key.key.get_path().as_str().rsplit_once('/').unwrap().1;
+            parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (key, g))
+        })
+        .max_by_key(|i| i.1)
+        .map(|(k, g)| (k.clone(), g))
+    {
+        Some((key, gen)) => (Some::<ListingObject>(key.to_owned()), gen),
+        None => {
+            // Legacy/missing case: one or zero index parts, which did not have a generation
+            (index_part_keys.pop(), Generation::none())
+        }
+    };
+
+    match index_part_object.as_ref() {
+        Some(selected) => index_part_keys.retain(|k| k != selected),
+        None => {
+            errors.push("S3 list response got no index_part.json file".to_string());
+        }
+    }
+
+    if let Some(index_part_object_key) = index_part_object.as_ref() {
+        let index_part_bytes =
+            download_object_with_retries(remote_client, &index_part_object_key.key)
+                .await
+                .context("index_part.json download")?;
+
+        match serde_json::from_slice(&index_part_bytes) {
+            Ok(index_part) => {
+                return Ok(RemoteTimelineBlobData {
+                    blob_data: BlobDataParseResult::Parsed {
+                        index_part: Box::new(index_part),
+                        index_part_generation,
+                        s3_layers,
+                    },
+                    unused_index_keys: index_part_keys,
+                    unknown_keys,
+                })
+            }
+            Err(index_parse_error) => errors.push(format!(
+                "index_part.json body parsing error: {index_parse_error}"
+            )),
+        }
+    }
+
+    if errors.is_empty() {
+        errors.push(
+            "Unexpected: no errors did not lead to a successfully parsed blob return".to_string(),
+        );
+    }
+
+    Ok(RemoteTimelineBlobData {
+        blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
+        unused_index_keys: index_part_keys,
+        unknown_keys,
+    })
+}
diff --git a/s3_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs
similarity index 71%
rename from s3_scrubber/src/cloud_admin_api.rs
rename to storage_scrubber/src/cloud_admin_api.rs
index 151421c84f..70b108cf23 100644
--- a/s3_scrubber/src/cloud_admin_api.rs
+++ b/storage_scrubber/src/cloud_admin_api.rs
@@ -1,15 +1,13 @@
-#![allow(unused)]
-
-use std::str::FromStr;
-use std::time::Duration;
-
 use chrono::{DateTime, Utc};
+use futures::Future;
 use hex::FromHex;
-use pageserver::tenant::Tenant;
+
 use reqwest::{header, Client, StatusCode, Url};
 use serde::Deserialize;
 use tokio::sync::Semaphore;
+use tokio_util::sync::CancellationToken;
+use utils::backoff;
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
@@ -141,7 +139,7 @@ pub struct ProjectData {
     pub region_id: String,
     pub platform_id: String,
     pub user_id: String,
-    pub pageserver_id: u64,
+    pub pageserver_id: Option<u64>,
     #[serde(deserialize_with = "from_nullable_id")]
     pub tenant: TenantId,
     pub safekeepers: Vec,
@@ -159,7 +157,7 @@ pub struct ProjectData {
     pub maintenance_set: Option,
 }

-#[derive(Debug, serde::Deserialize)]
+#[derive(Debug, Clone, serde::Deserialize)]
 pub struct BranchData {
     pub id: BranchId,
     pub created_at: DateTime<Utc>,
@@ -214,30 +212,39 @@ impl CloudAdminApiClient {
             .await
             .expect("Semaphore is not closed");

-        let response = self
-            .http_client
-            .get(self.append_url("/projects"))
-            .query(&[
-                ("tenant_id", tenant_id.to_string()),
-                ("show_deleted", "true".to_string()),
-            ])
-            .header(header::ACCEPT, "application/json")
-            .bearer_auth(&self.token)
-            .send()
-            .await
-            .map_err(|e| {
-                Error::new(
-                    "Find project for tenant".to_string(),
-                    ErrorKind::RequestSend(e),
-                )
-            })?;
+        let response = CloudAdminApiClient::with_retries(
+            || async {
+                let response = self
+                    .http_client
+                    .get(self.append_url("/projects"))
+                    .query(&[
+                        ("tenant_id", tenant_id.to_string()),
+                        ("show_deleted", "true".to_string()),
+                    ])
+                    .header(header::ACCEPT, "application/json")
+                    .bearer_auth(&self.token)
+                    .send()
+                    .await
+                    .map_err(|e| {
+                        Error::new(
+                            "Find project for tenant".to_string(),
+                            ErrorKind::RequestSend(e),
+                        )
+                    })?;
+
+                let response: AdminApiResponse<Vec<ProjectData>> =
+                    response.json().await.map_err(|e| {
+                        Error::new(
+                            "Find project for tenant".to_string(),
+                            ErrorKind::BodyRead(e),
+                        )
+                    })?;
+                Ok(response)
+            },
+            "find_tenant_project",
+        )
+        .await?;

-        let response: AdminApiResponse<Vec<ProjectData>> = response.json().await.map_err(|e| {
-            Error::new(
-                "Find project for tenant".to_string(),
-                ErrorKind::BodyRead(e),
-            )
-        })?;
         match response.data.len() {
             0 => Ok(None),
             1 => Ok(Some(
@@ -265,42 +272,34 @@ impl CloudAdminApiClient {
         const PAGINATION_LIMIT: usize = 512;
         let mut result: Vec<ProjectData> = Vec::with_capacity(PAGINATION_LIMIT);
         loop {
-            let response = self
-                .http_client
-                .get(self.append_url("/projects"))
-                .query(&[
-                    ("show_deleted", "false".to_string()),
-                    ("limit", format!("{PAGINATION_LIMIT}")),
-                    ("offset", format!("{pagination_offset}")),
-                ])
-                .header(header::ACCEPT, "application/json")
-                .bearer_auth(&self.token)
-                .send()
-                .await
-                .map_err(|e| {
-                    Error::new(
-                        "List active projects".to_string(),
-                        ErrorKind::RequestSend(e),
-                    )
-                })?;
+            let response_bytes = CloudAdminApiClient::with_retries(
+                || async {
+                    let response = self
+                        .http_client
+                        .get(self.append_url("/projects"))
+                        .query(&[
+                            ("show_deleted", "false".to_string()),
+                            ("limit", format!("{PAGINATION_LIMIT}")),
+                            ("offset", format!("{pagination_offset}")),
+                        ])
+                        .header(header::ACCEPT, "application/json")
+                        .bearer_auth(&self.token)
+                        .send()
+                        .await
+                        .map_err(|e| {
+                            Error::new(
+                                "List active projects".to_string(),
+                                ErrorKind::RequestSend(e),
+                            )
+                        })?;

-            match response.status() {
-                StatusCode::OK => {}
-                StatusCode::SERVICE_UNAVAILABLE | StatusCode::TOO_MANY_REQUESTS => {
-                    tokio::time::sleep(Duration::from_millis(500)).await;
-                    continue;
-                }
-                status => {
-                    return Err(Error::new(
-                        "List active projects".to_string(),
-                        ErrorKind::ResponseStatus(response.status()),
-                    ))
-                }
-            }
-
-            let response_bytes = response.bytes().await.map_err(|e| {
-                Error::new("List active projects".to_string(), ErrorKind::BodyRead(e))
-            })?;
+                    response.bytes().await.map_err(|e| {
+                        Error::new("List active projects".to_string(), ErrorKind::BodyRead(e))
+                    })
+                },
+                "list_projects",
+            )
+            .await?;

             let decode_result =
                 serde_json::from_slice::<AdminApiResponse<Vec<ProjectData>>>(&response_bytes);
@@ -331,6 +330,7 @@ impl CloudAdminApiClient {

     pub async fn find_timeline_branch(
         &self,
+        tenant_id: TenantId,
         timeline_id: TimelineId,
     ) -> Result<Option<BranchData>, Error> {
         let _permit = self
@@ -339,43 +339,61 @@ impl CloudAdminApiClient {
             .await
             .expect("Semaphore is not closed");

-        let response = self
-            .http_client
-            .get(self.append_url("/branches"))
-            .query(&[
-                ("timeline_id", timeline_id.to_string()),
-                ("show_deleted", "true".to_string()),
-            ])
-            .header(header::ACCEPT, "application/json")
-            .bearer_auth(&self.token)
-            .send()
-            .await
-            .map_err(|e| {
-                Error::new(
-                    "Find branch for timeline".to_string(),
-                    ErrorKind::RequestSend(e),
-                )
-            })?;
+        let response = CloudAdminApiClient::with_retries(
+            || async {
+                let response = self
+                    .http_client
+                    .get(self.append_url("/branches"))
+                    .query(&[
+                        ("timeline_id", timeline_id.to_string()),
+                        ("show_deleted", "true".to_string()),
+                    ])
+                    .header(header::ACCEPT, "application/json")
+                    .bearer_auth(&self.token)
+                    .send()
+                    .await
+                    .map_err(|e| {
+                        Error::new(
+                            "Find branch for timeline".to_string(),
+                            ErrorKind::RequestSend(e),
+                        )
+                    })?;

-        let response: AdminApiResponse<Vec<BranchData>> = response.json().await.map_err(|e| {
-            Error::new(
-                "Find branch for timeline".to_string(),
-                ErrorKind::BodyRead(e),
-            )
-        })?;
-        match response.data.len() {
-            0 => Ok(None),
-            1 => Ok(Some(
-                response
-                    .data
-                    .into_iter()
-                    .next()
-                    .expect("Should have exactly one element"),
-            )),
-            too_many => Err(Error::new(
-                format!("Find branch for timeline returned {too_many} branches instead of 0 or 1"),
+                let response: AdminApiResponse<Vec<BranchData>> =
+                    response.json().await.map_err(|e| {
+                        Error::new(
+                            "Find branch for timeline".to_string(),
+                            ErrorKind::BodyRead(e),
+                        )
+                    })?;
+                Ok(response)
+            },
+            "find_timeline_branch",
+        )
+        .await?;
+
+        let mut branches: Vec<BranchData> = response.data.into_iter().collect();
+        // Normally timeline_id is unique. However, we do have at least one case
+        // of the same timeline_id in two different projects, apparently after
+        // manual recovery. So always recheck project_id (discovered through
+        // tenant_id).
+        let project_data = match self.find_tenant_project(tenant_id).await? {
+            Some(pd) => pd,
+            None => return Ok(None),
+        };
+        branches.retain(|b| b.project_id == project_data.id);
+        if branches.len() < 2 {
+            Ok(branches.first().cloned())
+        } else {
+            Err(Error::new(
+                format!(
+                    "Find branch for timeline {}/{} returned {} branches instead of 0 or 1",
+                    tenant_id,
+                    timeline_id,
+                    branches.len()
+                ),
                 ErrorKind::UnexpectedState,
-            )),
+            ))
         }
     }

@@ -536,4 +554,15 @@ impl CloudAdminApiClient {
             .parse()
             .unwrap_or_else(|e| panic!("Could not append {subpath} to base url: {e}"))
     }
+
+    async fn with_retries<T, O, F>(op: O, description: &str) -> Result<T, Error>
+    where
+        O: FnMut() -> F,
+        F: Future<Output = Result<T, Error>>,
+    {
+        let cancel = CancellationToken::new(); // not really used
+        backoff::retry(op, |_| false, 1, 20, description, &cancel)
+            .await
+            .expect("cancellations are disabled")
+    }
 }
diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs
new file mode 100644
index 0000000000..88e36af560
--- /dev/null
+++ b/storage_scrubber/src/find_large_objects.rs
@@ -0,0 +1,115 @@
+use std::pin::pin;
+
+use futures::{StreamExt, TryStreamExt};
+use pageserver::tenant::storage_layer::LayerName;
+use remote_storage::ListingMode;
+use serde::{Deserialize, Serialize};
+
+use crate::{
+    checks::parse_layer_object_name, init_remote, metadata_stream::stream_tenants,
+    stream_objects_with_retries, BucketConfig, NodeKind,
+};
+
+#[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
+enum LargeObjectKind {
+    DeltaLayer,
+    ImageLayer,
+    Other,
+}
+
+impl LargeObjectKind {
+    fn from_key(key: &str) -> Self {
+        let fname = key.split('/').last().unwrap();
+
+        let Ok((layer_name, _generation)) = parse_layer_object_name(fname) else {
+            return LargeObjectKind::Other;
+        };
+
+        match layer_name {
+            LayerName::Image(_) => LargeObjectKind::ImageLayer,
+            LayerName::Delta(_) => LargeObjectKind::DeltaLayer,
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize, Clone)]
+pub struct LargeObject {
+    pub key: String,
+    pub size: u64,
+    kind: LargeObjectKind,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct LargeObjectListing {
+    pub objects: Vec<LargeObject>,
+}
+
+pub async fn find_large_objects(
+    bucket_config: BucketConfig,
+    min_size: u64,
+    ignore_deltas: bool,
+    concurrency: usize,
+) -> anyhow::Result<LargeObjectListing> {
+    let (remote_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
+    let tenants = pin!(stream_tenants(&remote_client, &target));
+
+    let objects_stream = tenants.map_ok(|tenant_shard_id| {
+        let mut tenant_root = target.tenant_root(&tenant_shard_id);
+        let remote_client = remote_client.clone();
+        async move {
+            let mut objects = Vec::new();
+            let mut total_objects_ctr = 0u64;
+            // We want the objects and not just common prefixes
+            tenant_root.delimiter.clear();
+            let mut objects_stream = pin!(stream_objects_with_retries(
+                &remote_client,
+                ListingMode::NoDelimiter,
+                &tenant_root
+            ));
+            while let Some(listing) = objects_stream.next().await {
+                let listing = listing?;
+                for obj in listing.keys.iter().filter(|obj| min_size <= obj.size) {
+                    let key = obj.key.to_string();
+                    let kind = LargeObjectKind::from_key(&key);
+                    if ignore_deltas && kind == LargeObjectKind::DeltaLayer {
+                        continue;
+                    }
+                    objects.push(LargeObject {
+                        key,
+                        size: obj.size,
+                        kind,
+                    })
+                }
+                total_objects_ctr += listing.keys.len() as u64;
+            }
+
+            Ok((tenant_shard_id, objects, total_objects_ctr))
+        }
+    });
+    let mut objects_stream = std::pin::pin!(objects_stream.try_buffer_unordered(concurrency));

+    let mut objects = Vec::new();
+
+    let mut tenant_ctr = 0u64;
+    let mut object_ctr = 0u64;
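+    // Drain the concurrent per-tenant scans, accumulating matches and logging progress
+    // every 100 shards.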
while let Some(res) = objects_stream.next().await { + let (tenant_shard_id, objects_slice, total_objects_ctr) = res?; + objects.extend_from_slice(&objects_slice); + + object_ctr += total_objects_ctr; + tenant_ctr += 1; + if tenant_ctr % 100 == 0 { + tracing::info!( + "Scanned {tenant_ctr} shards. objects={object_ctr}, found={}, current={tenant_shard_id}.", + objects.len() + ); + } + } + + let bucket_name = target.bucket_name(); + tracing::info!( + "Scan of {bucket_name} finished. Scanned {tenant_ctr} shards. objects={object_ctr}, found={}.", + objects.len() + ); + Ok(LargeObjectListing { objects }) +} diff --git a/s3_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs similarity index 56% rename from s3_scrubber/src/garbage.rs rename to storage_scrubber/src/garbage.rs index 93bb115883..d53611ed6e 100644 --- a/s3_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -1,34 +1,39 @@ -//! Functionality for finding and purging garbage, as in "garbage collection". Garbage means -//! S3 objects which are either not referenced by any metadata, or are referenced by a -//! control plane tenant/timeline in a deleted state. +//! Functionality for finding and purging garbage, as in "garbage collection". +//! +//! Garbage means S3 objects which are either not referenced by any metadata, +//! or are referenced by a control plane tenant/timeline in a deleted state. use std::{ collections::{HashMap, HashSet}, sync::Arc, + time::Duration, }; use anyhow::Context; -use aws_sdk_s3::{ - types::{Delete, ObjectIdentifier}, - Client, -}; -use futures_util::{pin_mut, TryStreamExt}; +use futures_util::TryStreamExt; use pageserver_api::shard::TenantShardId; +use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath}; use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; +use tokio_util::sync::CancellationToken; use utils::id::TenantId; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, - init_remote, - metadata_stream::{stream_listing, stream_tenant_timelines, stream_tenants}, - BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, TraversingDepth, + init_remote, list_objects_with_retries, + metadata_stream::{stream_tenant_timelines, stream_tenants}, + BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, }; #[derive(Serialize, Deserialize, Debug)] enum GarbageReason { DeletedInConsole, MissingInConsole, + + // The remaining data relates to a known deletion issue, and we're sure that purging this + // will not delete any real data, for example https://github.com/neondatabase/neon/pull/7928 where + // there is nothing in a tenant path apart from a heatmap file. + KnownBug, } #[derive(Serialize, Deserialize, Debug)] @@ -60,6 +65,7 @@ pub struct GarbageList { /// see garbage, we saw some active tenants too. This protects against classes of bugs /// in the scrubber that might otherwise generate a "deleted all" result. active_tenant_count: usize, + active_timeline_count: usize, } impl GarbageList { @@ -67,11 +73,21 @@ impl GarbageList { Self { items: Vec::new(), active_tenant_count: 0, + active_timeline_count: 0, node_kind, bucket_config, } } + /// If an entity has been identified as requiring purge due to a known bug, e.g. + /// a particular type of object left behind after an incomplete deletion. + fn append_buggy(&mut self, entity: GarbageEntity) { + self.items.push(GarbageItem { + entity, + reason: GarbageReason::KnownBug, + }); + } + /// Return true if appended, false if not. 
False means the result was not garbage. fn maybe_append<T>(&mut self, entity: GarbageEntity, result: Option<T>) -> bool where @@ -119,7 +135,10 @@ pub async fn find_garbage( const S3_CONCURRENCY: usize = 32; // How many concurrent API requests to make to the console API. -const CONSOLE_CONCURRENCY: usize = 128; +// +// Be careful increasing this; roughly we shouldn't have more than ~100 rps. It +// would be better to implement a real rps limiter. +const CONSOLE_CONCURRENCY: usize = 16; struct ConsoleCache { /// Set of tenants found in the control plane API @@ -135,7 +154,7 @@ async fn find_garbage_inner( node_kind: NodeKind, ) -> anyhow::Result<GarbageList> { // Construct clients for S3 and for Console API - let (s3_client, target) = init_remote(bucket_config.clone(), node_kind)?; + let (remote_client, target) = init_remote(bucket_config.clone(), node_kind).await?; let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config)); // Build a set of console-known tenants, for quickly eliminating known-active tenants without having @@ -161,7 +180,7 @@ async fn find_garbage_inner( // Enumerate Tenants in S3, and check if each one exists in Console tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket); - let tenants = stream_tenants(&s3_client, &target); + let tenants = stream_tenants(&remote_client, &target); let tenants_checked = tenants.map_ok(|t| { let api_client = cloud_admin_api_client.clone(); let console_cache = console_cache.clone(); @@ -199,12 +218,12 @@ async fn find_garbage_inner( } } }); - let tenants_checked = tenants_checked.try_buffer_unordered(CONSOLE_CONCURRENCY); + let mut tenants_checked = + std::pin::pin!(tenants_checked.try_buffer_unordered(CONSOLE_CONCURRENCY)); // Process the results of Tenant checks. If a Tenant is garbage, it goes into // the `GarbageList`. Else it goes into `active_tenants` for more detailed timeline // checks if they are enabled by the `depth` parameter. - pin_mut!(tenants_checked); let mut garbage = GarbageList::new(node_kind, bucket_config); let mut active_tenants: Vec<TenantShardId> = vec![]; let mut counter = 0; @@ -216,11 +235,71 @@ async fn find_garbage_inner( assert!(project.tenant == tenant_shard_id.tenant_id); } + // Special case: If it's missing in console, check for known bugs that would enable us to conclusively + // identify it as purge-able anyway + if console_result.is_none() { + let timelines = stream_tenant_timelines(&remote_client, &target, tenant_shard_id) + .await? + .collect::<Vec<_>>() + .await; + if timelines.is_empty() { + // No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps + let tenant_objects = list_objects_with_retries( + &remote_client, + ListingMode::WithDelimiter, + &target.tenant_root(&tenant_shard_id), + ) + .await?; + let object = tenant_objects.keys.first().unwrap(); + if object.key.get_path().as_str().ends_with("heatmap-v1.json") { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); + garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); + continue; + } else { + tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key); + } + } else { + // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial + // rollout of WAL DR in which we never deleted these.
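The comment above caps `CONSOLE_CONCURRENCY` at 16 because the scrubber has no true requests-per-second limiter. A minimal sketch of what one could look like, not part of this change (assumes tokio's `time` feature and `rps > 0`):

    use std::time::Duration;
    use tokio::time::{interval, Interval, MissedTickBehavior};

    /// One permit per tick: `acquire` resolves at most `rps` times per second.
    struct RpsLimiter {
        ticker: Interval,
    }

    impl RpsLimiter {
        fn new(rps: u32) -> Self {
            let mut ticker = interval(Duration::from_secs(1) / rps);
            // Don't fire a burst of missed ticks if callers fall behind;
            // reschedule from the current instant instead.
            ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
            Self { ticker }
        }

        /// Await this before each console API request.
        async fn acquire(&mut self) {
            self.ticker.tick().await;
        }
    }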
+ let mut any_non_initdb = false; + + for timeline_r in timelines { + let timeline = timeline_r?; + let timeline_objects = list_objects_with_retries( + &remote_client, + ListingMode::WithDelimiter, + &target.timeline_root(&timeline), + ) + .await?; + if !timeline_objects.prefixes.is_empty() { + // Sub-paths? Unexpected + any_non_initdb = true; + } else { + let object = timeline_objects.keys.first().unwrap(); + if object.key.get_path().as_str().ends_with("initdb.tar.zst") { + tracing::info!("Timeline {timeline} contains only initdb.tar.zst"); + } else { + any_non_initdb = true; + } + } + } + + if any_non_initdb { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb"); + } else { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb"); + garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); + continue; + } + } + } + if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) { tracing::debug!("Tenant {tenant_shard_id} is garbage"); } else { tracing::debug!("Tenant {tenant_shard_id} is active"); active_tenants.push(tenant_shard_id); + garbage.active_tenant_count = active_tenants.len(); } counter += 1; @@ -252,7 +331,7 @@ async fn find_garbage_inner( // Construct a stream of all timelines within active tenants let active_tenants = tokio_stream::iter(active_tenants.iter().map(Ok)); - let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, *t)); + let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, *t)); let timelines = timelines.try_buffer_unordered(S3_CONCURRENCY); let timelines = timelines.try_flatten(); @@ -261,25 +340,39 @@ async fn find_garbage_inner( let api_client = cloud_admin_api_client.clone(); async move { api_client - .find_timeline_branch(ttid.timeline_id) + .find_timeline_branch(ttid.tenant_shard_id.tenant_id, ttid.timeline_id) .await .map_err(|e| anyhow::anyhow!(e)) .map(|r| (ttid, r)) } }); - let timelines_checked = timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY); + let mut timelines_checked = + std::pin::pin!(timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY)); // Update the GarbageList with any timelines which appear not to exist. - pin_mut!(timelines_checked); + let mut active_timelines: Vec = vec![]; while let Some(result) = timelines_checked.next().await { let (ttid, console_result) = result?; if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) { tracing::debug!("Timeline {ttid} is garbage"); } else { tracing::debug!("Timeline {ttid} is active"); + active_timelines.push(ttid); + garbage.active_timeline_count = active_timelines.len(); } } + let num_garbage_timelines = garbage + .items + .iter() + .filter(|g| matches!(g.entity, GarbageEntity::Timeline(_))) + .count(); + tracing::info!( + "Found {}/{} garbage timelines in active tenants", + num_garbage_timelines, + active_timelines.len(), + ); + Ok(garbage) } @@ -304,79 +397,104 @@ impl std::fmt::Display for PurgeMode { } pub async fn get_tenant_objects( - s3_client: &Arc, - target: RootTarget, + s3_client: &GenericRemoteStorage, tenant_shard_id: TenantShardId, -) -> anyhow::Result> { +) -> anyhow::Result> { tracing::debug!("Listing objects in tenant {tenant_shard_id}"); + let tenant_root = super::remote_tenant_path(&tenant_shard_id); + // TODO: apply extra validation based on object modification time. 
Don't purge // tenants where any timeline's index_part.json has been touched recently. - let mut tenant_root = target.tenant_root(&tenant_shard_id); - - // Remove delimiter, so that object listing lists all keys in the prefix and not just - // common prefixes. - tenant_root.delimiter = String::new(); - - let key_stream = stream_listing(s3_client, &tenant_root); - key_stream.try_collect().await + let list = s3_client + .list( + Some(&tenant_root), + ListingMode::NoDelimiter, + None, + &CancellationToken::new(), + ) + .await?; + Ok(list.keys) } pub async fn get_timeline_objects( - s3_client: &Arc<Client>, - target: RootTarget, + s3_client: &GenericRemoteStorage, ttid: TenantShardTimelineId, -) -> anyhow::Result<Vec<ObjectIdentifier>> { +) -> anyhow::Result<Vec<ListingObject>> { tracing::debug!("Listing objects in timeline {ttid}"); - let mut timeline_root = target.timeline_root(&ttid); + let timeline_root = super::remote_timeline_path_id(&ttid); - // TODO: apply extra validation based on object modification time. Don't purge - // timelines whose index_part.json has been touched recently. - - // Remove delimiter, so that object listing lists all keys in the prefix and not just - // common prefixes. - timeline_root.delimiter = String::new(); - let key_stream = stream_listing(s3_client, &timeline_root); - - key_stream.try_collect().await + let list = s3_client + .list( + Some(&timeline_root), + ListingMode::NoDelimiter, + None, + &CancellationToken::new(), + ) + .await?; + Ok(list.keys) } const MAX_KEYS_PER_DELETE: usize = 1000; /// Drain a buffer of keys into DeleteObjects requests +/// +/// If `drain` is true, drains keys completely; otherwise stops when < +/// MAX_KEYS_PER_DELETE keys are left. +/// The number of deleted keys is reported via `progress_tracker`. async fn do_delete( - s3_client: &Arc<Client>, - bucket_name: &str, - keys: &mut Vec<ObjectIdentifier>, + remote_client: &GenericRemoteStorage, + keys: &mut Vec<ListingObject>, dry_run: bool, drain: bool, + progress_tracker: &mut DeletionProgressTracker, ) -> anyhow::Result<()> { + let cancel = CancellationToken::new(); while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) { let request_keys = keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len()))); + + let request_keys: Vec<RemotePath> = request_keys.into_iter().map(|o| o.key).collect(); + + let num_deleted = request_keys.len(); if dry_run { tracing::info!("Dry-run deletion of objects: "); for k in request_keys { tracing::info!(" {k:?}"); } } else { - let delete_request = s3_client - .delete_objects() - .bucket(bucket_name) - .delete(Delete::builder().set_objects(Some(request_keys)).build()?); - delete_request - .send() + remote_client + .delete_objects(&request_keys, &cancel) .await - .context("DeleteObjects request")?; + .context("deletion request")?; + progress_tracker.register(num_deleted); } } Ok(()) } +/// Simple tracker that reports progress every 10k deleted keys.
+#[derive(Default)] +struct DeletionProgressTracker { + num_deleted: usize, + last_reported_num_deleted: usize, +} + +impl DeletionProgressTracker { + fn register(&mut self, n: usize) { + self.num_deleted += n; + if self.num_deleted - self.last_reported_num_deleted > 10000 { + tracing::info!("progress: deleted {} keys", self.num_deleted); + self.last_reported_num_deleted = self.num_deleted; + } + } +} + pub async fn purge_garbage( input_path: String, mode: PurgeMode, + min_age: Duration, dry_run: bool, ) -> anyhow::Result<()> { let list_bytes = tokio::fs::read(&input_path).await?; @@ -387,13 +505,26 @@ pub async fn purge_garbage( input_path ); - let (s3_client, target) = - init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind)?; + let (remote_client, _target) = + init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?; + + assert_eq!( + &garbage_list.bucket_config.bucket, + remote_client.bucket_name().unwrap() + ); // Sanity checks on the incoming list if garbage_list.active_tenant_count == 0 { anyhow::bail!("Refusing to purge a garbage list that reports 0 active tenants"); } + if garbage_list + .items + .iter() + .any(|g| matches!(g.entity, GarbageEntity::Timeline(_))) + && garbage_list.active_timeline_count == 0 + { + anyhow::bail!("Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines"); + } let filtered_items = garbage_list .items @@ -401,6 +532,7 @@ pub async fn purge_garbage( .filter(|i| match (&mode, &i.reason) { (PurgeMode::DeletedAndMissing, _) => true, (PurgeMode::DeletedOnly, GarbageReason::DeletedInConsole) => true, + (PurgeMode::DeletedOnly, GarbageReason::KnownBug) => true, (PurgeMode::DeletedOnly, GarbageReason::MissingInConsole) => false, }); @@ -412,48 +544,77 @@ pub async fn purge_garbage( let items = tokio_stream::iter(filtered_items.map(Ok)); let get_objects_results = items.map_ok(|i| { - let s3_client = s3_client.clone(); - let target = target.clone(); + let remote_client = remote_client.clone(); async move { match i.entity { GarbageEntity::Tenant(tenant_id) => { - get_tenant_objects(&s3_client, target, tenant_id).await - } - GarbageEntity::Timeline(ttid) => { - get_timeline_objects(&s3_client, target, ttid).await + get_tenant_objects(&remote_client, tenant_id).await } + GarbageEntity::Timeline(ttid) => get_timeline_objects(&remote_client, ttid).await, } } }); - let get_objects_results = get_objects_results.try_buffer_unordered(S3_CONCURRENCY); + let mut get_objects_results = + std::pin::pin!(get_objects_results.try_buffer_unordered(S3_CONCURRENCY)); - pin_mut!(get_objects_results); let mut objects_to_delete = Vec::new(); + let mut progress_tracker = DeletionProgressTracker::default(); while let Some(result) = get_objects_results.next().await { let mut object_list = result?; + + // Extra safety check: even if a collection of objects is garbage, check max() of modification + // times before purging, so that if we incorrectly marked a live tenant as garbage then we would + // notice that its index has been written recently and would omit deleting it. 
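The modification-time check described above boils down to: take the newest object in the batch and refuse to purge unless it is older than `min_age`. A condensed sketch of the same rule (hypothetical helper over `SystemTime` mtimes):

    use std::time::{Duration, SystemTime};

    /// True only when every object in the batch is older than `min_age`.
    fn old_enough(mtimes: &[SystemTime], min_age: Duration) -> bool {
        match mtimes.iter().max() {
            None => false, // empty batches are skipped, never purged
            Some(newest) => match newest.elapsed() {
                Ok(age) => age >= min_age,
                Err(_) => false, // clock skew: refuse to purge
            },
        }
    }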
+ if object_list.is_empty() { + // Simplify subsequent code by ensuring the list always has at least one item + // Usually, this only occurs if there are parallel deletions racing us, as there are no empty prefixes + continue; + } + + let max_mtime = object_list.iter().map(|o| o.last_modified).max().unwrap(); + let age = max_mtime.elapsed(); + match age { + Err(_) => { + tracing::warn!("Bad last_modified time"); + continue; + } + Ok(a) if a < min_age => { + // Failed age check. This doesn't mean we did something wrong: a tenant might really be garbage and recently + // written, but out of an abundance of caution we still don't purge it. + tracing::info!( + "Skipping tenant with young objects {}..{}", + object_list.first().as_ref().unwrap().key, + object_list.last().as_ref().unwrap().key + ); + continue; + } + Ok(_) => { + // Passed age check + } + } + + objects_to_delete.append(&mut object_list); if objects_to_delete.len() >= MAX_KEYS_PER_DELETE { do_delete( - &s3_client, - &garbage_list.bucket_config.bucket, + &remote_client, &mut objects_to_delete, dry_run, false, + &mut progress_tracker, ) .await?; } } do_delete( - &s3_client, - &garbage_list.bucket_config.bucket, + &remote_client, &mut objects_to_delete, dry_run, true, + &mut progress_tracker, ) .await?; - tracing::info!("Fell through"); + tracing::info!("{} keys deleted in total", progress_tracker.num_deleted); Ok(()) } diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs new file mode 100644 index 0000000000..3f08cddf50 --- /dev/null +++ b/storage_scrubber/src/lib.rs @@ -0,0 +1,565 @@ +#![deny(unsafe_code)] +#![deny(clippy::undocumented_unsafe_blocks)] +pub mod checks; +pub mod cloud_admin_api; +pub mod find_large_objects; +pub mod garbage; +pub mod metadata_stream; +pub mod pageserver_physical_gc; +pub mod scan_pageserver_metadata; +pub mod scan_safekeeper_metadata; +pub mod tenant_snapshot; + +use std::env; +use std::fmt::Display; +use std::sync::Arc; +use std::time::Duration; + +use anyhow::Context; +use aws_config::retry::{RetryConfigBuilder, RetryMode}; +use aws_sdk_s3::config::Region; +use aws_sdk_s3::error::DisplayErrorContext; +use aws_sdk_s3::Client; + +use camino::{Utf8Path, Utf8PathBuf}; +use clap::ValueEnum; +use futures::{Stream, StreamExt}; +use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path}; +use pageserver::tenant::TENANTS_SEGMENT_NAME; +use pageserver_api::shard::TenantShardId; +use remote_storage::{ + GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind, + S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, +}; +use reqwest::Url; +use serde::{Deserialize, Serialize}; +use storage_controller_client::control_api; +use tokio::io::AsyncReadExt; +use tokio_util::sync::CancellationToken; +use tracing::{error, warn}; +use tracing_appender::non_blocking::WorkerGuard; +use tracing_subscriber::{fmt, prelude::*, EnvFilter}; +use utils::fs_ext; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; + +const MAX_RETRIES: usize = 20; +const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN"; + +#[derive(Debug, Clone)] +pub struct S3Target { + pub bucket_name: String, + /// This `prefix_in_bucket` is only equal to the PS/SK config of the same + /// name for the RootTarget: other instances of S3Target will have prefix_in_bucket + /// with extra parts.
+ pub prefix_in_bucket: String, + pub delimiter: String, +} + +/// Convenience for referring to timelines within a particular shard: more ergonomic +/// than using a 2-tuple. +/// +/// This is the shard-aware equivalent of TenantTimelineId. It's defined here rather +/// than somewhere more broadly exposed, because this kind of thing is rarely needed +/// in the pageserver, as all timeline objects exist in the scope of a particular +/// tenant: the scrubber is different in that it handles collections of data referring to many +/// TenantShardTimelineIds in one place. +#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub struct TenantShardTimelineId { + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, +} + +impl TenantShardTimelineId { + fn new(tenant_shard_id: TenantShardId, timeline_id: TimelineId) -> Self { + Self { + tenant_shard_id, + timeline_id, + } + } + + fn as_tenant_timeline_id(&self) -> TenantTimelineId { + TenantTimelineId::new(self.tenant_shard_id.tenant_id, self.timeline_id) + } +} + +impl Display for TenantShardTimelineId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}/{}", self.tenant_shard_id, self.timeline_id) + } +} + +#[derive(clap::ValueEnum, Debug, Clone, Copy, PartialEq, Eq)] +pub enum TraversingDepth { + Tenant, + Timeline, +} + +impl Display for TraversingDepth { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(match self { + Self::Tenant => "tenant", + Self::Timeline => "timeline", + }) + } +} + +#[derive(ValueEnum, Clone, Copy, Eq, PartialEq, Debug, Serialize, Deserialize)] +pub enum NodeKind { + Safekeeper, + Pageserver, +} + +impl NodeKind { + fn as_str(&self) -> &'static str { + match self { + Self::Safekeeper => "safekeeper", + Self::Pageserver => "pageserver", + } + } +} + +impl Display for NodeKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +impl S3Target { + pub fn with_sub_segment(&self, new_segment: &str) -> Self { + let mut new_self = self.clone(); + if new_self.prefix_in_bucket.is_empty() { + new_self.prefix_in_bucket = format!("/{}/", new_segment); + } else { + if new_self.prefix_in_bucket.ends_with('/') { + new_self.prefix_in_bucket.pop(); + } + new_self.prefix_in_bucket = + [&new_self.prefix_in_bucket, new_segment, ""].join(&new_self.delimiter); + } + new_self + } +} + +#[derive(Clone)] +pub enum RootTarget { + Pageserver(S3Target), + Safekeeper(S3Target), +} + +impl RootTarget { + pub fn tenants_root(&self) -> S3Target { + match self { + Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME), + Self::Safekeeper(root) => root.clone(), + } + } + + pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target { + match self { + Self::Pageserver(_) => self.tenants_root().with_sub_segment(&tenant_id.to_string()), + Self::Safekeeper(_) => self + .tenants_root() + .with_sub_segment(&tenant_id.tenant_id.to_string()), + } + } + + pub(crate) fn tenant_shards_prefix(&self, tenant_id: &TenantId) -> S3Target { + // Only pageserver remote storage contains tenant-shards + assert!(matches!(self, Self::Pageserver(_))); + let Self::Pageserver(root) = self else { + panic!(); + }; + + S3Target { + bucket_name: root.bucket_name.clone(), + prefix_in_bucket: format!( + "{}/{TENANTS_SEGMENT_NAME}/{tenant_id}", + root.prefix_in_bucket + ), + delimiter: root.delimiter.clone(), + } + } + + pub fn timelines_root(&self, tenant_id: &TenantShardId) -> S3Target {
match self { + Self::Pageserver(_) => self.tenant_root(tenant_id).with_sub_segment("timelines"), + Self::Safekeeper(_) => self.tenant_root(tenant_id), + } + } + + pub fn timeline_root(&self, id: &TenantShardTimelineId) -> S3Target { + self.timelines_root(&id.tenant_shard_id) + .with_sub_segment(&id.timeline_id.to_string()) + } + + /// Given RemotePath "tenants/foo/timelines/bar/layerxyz", prefix it to a literal + /// key in the S3 bucket. + pub fn absolute_key(&self, key: &RemotePath) -> String { + let root = match self { + Self::Pageserver(root) => root, + Self::Safekeeper(root) => root, + }; + + let prefix = &root.prefix_in_bucket; + if prefix.ends_with('/') { + format!("{prefix}{key}") + } else { + format!("{prefix}/{key}") + } + } + + pub fn bucket_name(&self) -> &str { + match self { + Self::Pageserver(root) => &root.bucket_name, + Self::Safekeeper(root) => &root.bucket_name, + } + } + + pub fn delimiter(&self) -> &str { + match self { + Self::Pageserver(root) => &root.delimiter, + Self::Safekeeper(root) => &root.delimiter, + } + } +} + +pub fn remote_timeline_path_id(id: &TenantShardTimelineId) -> RemotePath { + remote_timeline_path(&id.tenant_shard_id, &id.timeline_id) +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct BucketConfig { + pub region: String, + pub bucket: String, + pub prefix_in_bucket: Option, +} + +impl BucketConfig { + pub fn from_env() -> anyhow::Result { + let region = env::var("REGION").context("'REGION' param retrieval")?; + let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?; + let prefix_in_bucket = env::var("BUCKET_PREFIX").ok(); + + Ok(Self { + region, + bucket, + prefix_in_bucket, + }) + } +} + +pub struct ControllerClientConfig { + /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local` + pub controller_api: Url, + + /// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'. + pub controller_jwt: String, +} + +impl ControllerClientConfig { + pub fn build_client(self) -> control_api::Client { + control_api::Client::new(self.controller_api, Some(self.controller_jwt)) + } +} + +pub struct ConsoleConfig { + pub token: String, + pub base_url: Url, +} + +impl ConsoleConfig { + pub fn from_env() -> anyhow::Result { + let base_url: Url = env::var("CLOUD_ADMIN_API_URL") + .context("'CLOUD_ADMIN_API_URL' param retrieval")? 
+ .parse() + .context("'CLOUD_ADMIN_API_URL' param parsing")?; + + let token = env::var(CLOUD_ADMIN_API_TOKEN_ENV_VAR) + .context("'CLOUD_ADMIN_API_TOKEN' environment variable fetch")?; + + Ok(Self { base_url, token }) + } +} + +pub fn init_logging(file_name: &str) -> Option { + let stderr_logs = fmt::Layer::new() + .with_target(false) + .with_writer(std::io::stderr); + + let disable_file_logging = match std::env::var("PAGESERVER_DISABLE_FILE_LOGGING") { + Ok(s) => s == "1" || s.to_lowercase() == "true", + Err(_) => false, + }; + + if disable_file_logging { + tracing_subscriber::registry() + .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) + .with(stderr_logs) + .init(); + None + } else { + let (file_writer, guard) = + tracing_appender::non_blocking(tracing_appender::rolling::never("./logs/", file_name)); + let file_logs = fmt::Layer::new() + .with_target(false) + .with_ansi(false) + .with_writer(file_writer); + tracing_subscriber::registry() + .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) + .with(stderr_logs) + .with(file_logs) + .init(); + Some(guard) + } +} + +async fn init_s3_client(bucket_region: Region) -> Client { + let mut retry_config_builder = RetryConfigBuilder::new(); + + retry_config_builder + .set_max_attempts(Some(3)) + .set_mode(Some(RetryMode::Adaptive)); + + let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28()) + .region(bucket_region) + .retry_config(retry_config_builder.build()) + .load() + .await; + Client::new(&config) +} + +fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str { + match node_kind { + NodeKind::Pageserver => "pageserver/v1/", + NodeKind::Safekeeper => "wal/", + } +} + +fn make_root_target( + bucket_name: String, + prefix_in_bucket: String, + node_kind: NodeKind, +) -> RootTarget { + let s3_target = S3Target { + bucket_name, + prefix_in_bucket, + delimiter: "/".to_string(), + }; + match node_kind { + NodeKind::Pageserver => RootTarget::Pageserver(s3_target), + NodeKind::Safekeeper => RootTarget::Safekeeper(s3_target), + } +} + +async fn init_remote_s3( + bucket_config: BucketConfig, + node_kind: NodeKind, +) -> anyhow::Result<(Arc, RootTarget)> { + let bucket_region = Region::new(bucket_config.region); + let s3_client = Arc::new(init_s3_client(bucket_region).await); + let default_prefix = default_prefix_in_bucket(node_kind).to_string(); + + let s3_root = make_root_target( + bucket_config.bucket, + bucket_config.prefix_in_bucket.unwrap_or(default_prefix), + node_kind, + ); + + Ok((s3_client, s3_root)) +} + +async fn init_remote( + bucket_config: BucketConfig, + node_kind: NodeKind, +) -> anyhow::Result<(GenericRemoteStorage, RootTarget)> { + let endpoint = env::var("AWS_ENDPOINT_URL").ok(); + let default_prefix = default_prefix_in_bucket(node_kind).to_string(); + let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix)); + let storage = S3Config { + bucket_name: bucket_config.bucket.clone(), + bucket_region: bucket_config.region, + prefix_in_bucket, + endpoint, + concurrency_limit: DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT + .try_into() + .unwrap(), + max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + upload_storage_class: None, + }; + let storage_config = RemoteStorageConfig { + storage: RemoteStorageKind::AwsS3(storage), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + }; + + // We already pass the prefix to the remote client above + let prefix_in_root_target = String::new(); + let root_target = 
make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind); + + let client = GenericRemoteStorage::from_config(&storage_config).await?; + Ok((client, root_target)) +} + +/// Listing possibly large amounts of keys in a streaming fashion. +fn stream_objects_with_retries<'a>( + storage_client: &'a GenericRemoteStorage, + listing_mode: ListingMode, + s3_target: &'a S3Target, +) -> impl Stream> + 'a { + async_stream::stream! { + let mut trial = 0; + let cancel = CancellationToken::new(); + let prefix_str = &s3_target + .prefix_in_bucket + .strip_prefix("/") + .unwrap_or(&s3_target.prefix_in_bucket); + let prefix = RemotePath::from_string(prefix_str)?; + let mut list_stream = + storage_client.list_streaming(Some(&prefix), listing_mode, None, &cancel); + while let Some(res) = list_stream.next().await { + match res { + Err(err) => { + let yield_err = if err.is_permanent() { + true + } else { + let backoff_time = 1 << trial.min(5); + tokio::time::sleep(Duration::from_secs(backoff_time)).await; + trial += 1; + trial == MAX_RETRIES - 1 + }; + if yield_err { + yield Err(err) + .with_context(|| format!("Failed to list objects {MAX_RETRIES} times")); + break; + } + } + Ok(res) => { + trial = 0; + yield Ok(res); + } + } + } + } +} + +/// If you want to list a bounded amount of prefixes or keys. For larger numbers of keys/prefixes, +/// use [`stream_objects_with_retries`] instead. +async fn list_objects_with_retries( + remote_client: &GenericRemoteStorage, + listing_mode: ListingMode, + s3_target: &S3Target, +) -> anyhow::Result
<Listing>
{ + let cancel = CancellationToken::new(); + let prefix_str = &s3_target + .prefix_in_bucket + .strip_prefix("/") + .unwrap_or(&s3_target.prefix_in_bucket); + let prefix = RemotePath::from_string(prefix_str)?; + for trial in 0..MAX_RETRIES { + match remote_client + .list(Some(&prefix), listing_mode, None, &cancel) + .await + { + Ok(response) => return Ok(response), + Err(e) => { + if trial == MAX_RETRIES - 1 { + return Err(e) + .with_context(|| format!("Failed to list objects {MAX_RETRIES} times")); + } + warn!( + "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}", + s3_target.bucket_name, + s3_target.prefix_in_bucket, + s3_target.delimiter, + DisplayErrorContext(e), + ); + let backoff_time = 1 << trial.min(5); + tokio::time::sleep(Duration::from_secs(backoff_time)).await; + } + } + } + panic!("MAX_RETRIES is not allowed to be 0"); +} + +async fn download_object_with_retries( + remote_client: &GenericRemoteStorage, + key: &RemotePath, +) -> anyhow::Result> { + let cancel = CancellationToken::new(); + for trial in 0..MAX_RETRIES { + let mut buf = Vec::new(); + let download = match remote_client.download(key, &cancel).await { + Ok(response) => response, + Err(e) => { + error!("Failed to download object for key {key}: {e}"); + let backoff_time = 1 << trial.min(5); + tokio::time::sleep(Duration::from_secs(backoff_time)).await; + continue; + } + }; + + match tokio_util::io::StreamReader::new(download.download_stream) + .read_to_end(&mut buf) + .await + { + Ok(bytes_read) => { + tracing::debug!("Downloaded {bytes_read} bytes for object {key}"); + return Ok(buf); + } + Err(e) => { + error!("Failed to stream object body for key {key}: {e}"); + let backoff_time = 1 << trial.min(5); + tokio::time::sleep(Duration::from_secs(backoff_time)).await; + } + } + } + + anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times") +} + +async fn download_object_to_file_s3( + s3_client: &Client, + bucket_name: &str, + key: &str, + version_id: Option<&str>, + local_path: &Utf8Path, +) -> anyhow::Result<()> { + let tmp_path = Utf8PathBuf::from(format!("{local_path}.tmp")); + for _ in 0..MAX_RETRIES { + tokio::fs::remove_file(&tmp_path) + .await + .or_else(fs_ext::ignore_not_found)?; + + let mut file = tokio::fs::File::create(&tmp_path) + .await + .context("Opening output file")?; + + let request = s3_client.get_object().bucket(bucket_name).key(key); + + let request = match version_id { + Some(version_id) => request.version_id(version_id), + None => request, + }; + + let response_stream = match request.send().await { + Ok(response) => response, + Err(e) => { + error!( + "Failed to download object for key {key} version {}: {e:#}", + version_id.unwrap_or("") + ); + tokio::time::sleep(Duration::from_secs(1)).await; + continue; + } + }; + + let mut read_stream = response_stream.body.into_async_read(); + + tokio::io::copy(&mut read_stream, &mut file).await?; + + tokio::fs::rename(&tmp_path, local_path).await?; + return Ok(()); + } + + anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times") +} diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs new file mode 100644 index 0000000000..c5961753c5 --- /dev/null +++ b/storage_scrubber/src/main.rs @@ -0,0 +1,399 @@ +use anyhow::{anyhow, bail, Context}; +use camino::Utf8PathBuf; +use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse}; +use pageserver_api::shard::TenantShardId; +use reqwest::{Method, Url}; +use 
storage_controller_client::control_api; +use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; +use storage_scrubber::pageserver_physical_gc::GcMode; +use storage_scrubber::scan_pageserver_metadata::scan_pageserver_metadata; +use storage_scrubber::scan_safekeeper_metadata::DatabaseOrList; +use storage_scrubber::tenant_snapshot::SnapshotDownloader; +use storage_scrubber::{find_large_objects, ControllerClientConfig}; +use storage_scrubber::{ + init_logging, pageserver_physical_gc::pageserver_physical_gc, + scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind, + TraversingDepth, +}; + +use clap::{Parser, Subcommand}; +use utils::id::TenantId; + +use utils::{project_build_tag, project_git_version}; + +project_git_version!(GIT_VERSION); +project_build_tag!(BUILD_TAG); + +#[derive(Parser)] +#[command(author, version, about, long_about = None)] +#[command(arg_required_else_help(true))] +struct Cli { + #[command(subcommand)] + command: Command, + + #[arg(short, long, default_value_t = false)] + delete: bool, + + #[arg(long)] + /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local` + controller_api: Option, + + #[arg(long)] + /// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'. + controller_jwt: Option, +} + +#[derive(Subcommand, Debug)] +enum Command { + FindGarbage { + #[arg(short, long)] + node_kind: NodeKind, + #[arg(short, long, default_value_t=TraversingDepth::Tenant)] + depth: TraversingDepth, + #[arg(short, long, default_value_t = String::from("garbage.json"))] + output_path: String, + }, + PurgeGarbage { + #[arg(short, long)] + input_path: String, + #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)] + mode: PurgeMode, + #[arg(long = "min-age")] + min_age: humantime::Duration, + }, + #[command(verbatim_doc_comment)] + ScanMetadata { + #[arg(short, long)] + node_kind: NodeKind, + #[arg(short, long, default_value_t = false)] + json: bool, + #[arg(long = "tenant-id", num_args = 0..)] + tenant_ids: Vec, + #[arg(long = "post", default_value_t = false)] + post_to_storcon: bool, + #[arg(long, default_value = None)] + /// For safekeeper node_kind only, points to db with debug dump + dump_db_connstr: Option, + /// For safekeeper node_kind only, table in the db with debug dump + #[arg(long, default_value = None)] + dump_db_table: Option, + /// For safekeeper node_kind only, json list of timelines and their lsn info + #[arg(long, default_value = None)] + timeline_lsns: Option, + }, + TenantSnapshot { + #[arg(long = "tenant-id")] + tenant_id: TenantId, + #[arg(long = "concurrency", short = 'j', default_value_t = 8)] + concurrency: usize, + #[arg(short, long)] + output_path: Utf8PathBuf, + }, + PageserverPhysicalGc { + #[arg(long = "tenant-id", num_args = 0..)] + tenant_ids: Vec, + #[arg(long = "min-age")] + min_age: humantime::Duration, + #[arg(short, long, default_value_t = GcMode::IndicesOnly)] + mode: GcMode, + }, + FindLargeObjects { + #[arg(long = "min-size")] + min_size: u64, + #[arg(short, long, default_value_t = false)] + ignore_deltas: bool, + #[arg(long = "concurrency", short = 'j', default_value_t = 64)] + concurrency: usize, + }, + CronJob { + // PageserverPhysicalGc + #[arg(long = "min-age")] + gc_min_age: humantime::Duration, + #[arg(short, long, default_value_t = GcMode::IndicesOnly)] + gc_mode: GcMode, + // ScanMetadata + #[arg(long = "post", default_value_t = false)] + post_to_storcon: bool, + }, +} + +#[tokio::main] +async fn main() -> 
anyhow::Result<()> { + let cli = Cli::parse(); + + tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG); + + let bucket_config = BucketConfig::from_env()?; + + let command_log_name = match &cli.command { + Command::ScanMetadata { .. } => "scan", + Command::FindGarbage { .. } => "find-garbage", + Command::PurgeGarbage { .. } => "purge-garbage", + Command::TenantSnapshot { .. } => "tenant-snapshot", + Command::PageserverPhysicalGc { .. } => "pageserver-physical-gc", + Command::FindLargeObjects { .. } => "find-large-objects", + Command::CronJob { .. } => "cron-job", + }; + let _guard = init_logging(&format!( + "{}_{}_{}_{}.log", + std::env::args().next().unwrap(), + command_log_name, + bucket_config.bucket, + chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") + )); + + let controller_client = cli.controller_api.map(|controller_api| { + ControllerClientConfig { + controller_api, + // Default to no key: this is a convenience when working in a development environment + controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()), + } + .build_client() + }); + + match cli.command { + Command::ScanMetadata { + json, + tenant_ids, + node_kind, + post_to_storcon, + dump_db_connstr, + dump_db_table, + timeline_lsns, + } => { + if let NodeKind::Safekeeper = node_kind { + let db_or_list = match (timeline_lsns, dump_db_connstr) { + (Some(timeline_lsns), _) => { + let timeline_lsns = serde_json::from_str(&timeline_lsns).context("parsing timeline_lsns")?; + DatabaseOrList::List(timeline_lsns) + } + (None, Some(dump_db_connstr)) => { + let dump_db_table = dump_db_table.ok_or_else(|| anyhow::anyhow!("dump_db_table not specified"))?; + let tenant_ids = tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(); + DatabaseOrList::Database { tenant_ids, connstr: dump_db_connstr, table: dump_db_table } + } + (None, None) => anyhow::bail!("neither `timeline_lsns` specified, nor `dump_db_connstr` and `dump_db_table`"), + }; + let summary = scan_safekeeper_metadata(bucket_config.clone(), db_or_list).await?; + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); + } + if summary.is_fatal() { + bail!("Fatal scrub errors detected"); + } + if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + bail!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + ); + } + Ok(()) + } else { + scan_pageserver_metadata_cmd( + bucket_config, + controller_client.as_ref(), + tenant_ids, + json, + post_to_storcon, + ) + .await + } + } + Command::FindGarbage { + node_kind, + depth, + output_path, + } => { + let console_config = ConsoleConfig::from_env()?; + find_garbage(bucket_config, console_config, depth, node_kind, output_path).await + } + Command::PurgeGarbage { + input_path, + mode, + min_age, + } => purge_garbage(input_path, mode, min_age.into(), !cli.delete).await, + Command::TenantSnapshot { + tenant_id, + output_path, + concurrency, + } => { + let downloader = + SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency).await?; + downloader.download().await + } + Command::PageserverPhysicalGc { + tenant_ids, + min_age, + mode, + } => { + pageserver_physical_gc_cmd( + &bucket_config, + controller_client.as_ref(), + tenant_ids, + 
min_age, + mode, + ) + .await + } + Command::FindLargeObjects { + min_size, + ignore_deltas, + concurrency, + } => { + let summary = find_large_objects::find_large_objects( + bucket_config, + min_size, + ignore_deltas, + concurrency, + ) + .await?; + println!("{}", serde_json::to_string(&summary).unwrap()); + Ok(()) + } + Command::CronJob { + gc_min_age, + gc_mode, + post_to_storcon, + } => { + run_cron_job( + bucket_config, + controller_client.as_ref(), + gc_min_age, + gc_mode, + post_to_storcon, + ) + .await + } + } +} + +/// Runs the scrubber cron job. +/// 1. Do pageserver physical gc +/// 2. Scan pageserver metadata +pub async fn run_cron_job( + bucket_config: BucketConfig, + controller_client: Option<&control_api::Client>, + gc_min_age: humantime::Duration, + gc_mode: GcMode, + post_to_storcon: bool, +) -> anyhow::Result<()> { + tracing::info!(%gc_min_age, %gc_mode, "Running pageserver-physical-gc"); + pageserver_physical_gc_cmd( + &bucket_config, + controller_client, + Vec::new(), + gc_min_age, + gc_mode, + ) + .await?; + tracing::info!(%post_to_storcon, node_kind = %NodeKind::Pageserver, "Running scan-metadata"); + scan_pageserver_metadata_cmd( + bucket_config, + controller_client, + Vec::new(), + true, + post_to_storcon, + ) + .await?; + + Ok(()) +} + +pub async fn pageserver_physical_gc_cmd( + bucket_config: &BucketConfig, + controller_client: Option<&control_api::Client>, + tenant_shard_ids: Vec, + min_age: humantime::Duration, + mode: GcMode, +) -> anyhow::Result<()> { + match (controller_client, mode) { + (Some(_), _) => { + // Any mode may run when controller API is set + } + (None, GcMode::Full) => { + // The part of physical GC where we erase ancestor layers cannot be done safely without + // confirming the most recent complete shard split with the controller. Refuse to run, rather + // than doing it unsafely. + return Err(anyhow!( + "Full physical GC requires `--controller-api` and `--controller-jwt` to run" + )); + } + (None, GcMode::DryRun | GcMode::IndicesOnly) => { + // These GcModes do not require the controller to run. 
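The match above encodes the safety rule that only `GcMode::Full` needs a storage controller, since erasing ancestor layers is unsafe without confirming shard-split state. The same rule restated as a hypothetical predicate:

    /// Whether a GC mode may run without a storage controller connection.
    fn runs_without_controller(mode: GcMode) -> bool {
        match mode {
            // Read-only and index-only work never touches ancestor layers.
            GcMode::DryRun | GcMode::IndicesOnly => true,
            // Must confirm shard splits with the controller first.
            GcMode::Full => false,
        }
    }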
+ } + } + + let summary = pageserver_physical_gc( + bucket_config, + controller_client, + tenant_shard_ids, + min_age.into(), + mode, + ) + .await?; + println!("{}", serde_json::to_string(&summary).unwrap()); + Ok(()) +} + +pub async fn scan_pageserver_metadata_cmd( + bucket_config: BucketConfig, + controller_client: Option<&control_api::Client>, + tenant_shard_ids: Vec, + json: bool, + post_to_storcon: bool, +) -> anyhow::Result<()> { + if controller_client.is_none() && post_to_storcon { + return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run")); + } + match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids).await { + Err(e) => { + tracing::error!("Failed: {e}"); + Err(e) + } + Ok(summary) => { + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); + } + + if post_to_storcon { + if let Some(client) = controller_client { + let body = summary.build_health_update_request(); + client + .dispatch::( + Method::POST, + "control/v1/metadata_health/update".to_string(), + Some(body), + ) + .await?; + } + } + + if summary.is_fatal() { + tracing::error!("Fatal scrub errors detected"); + } else if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + tracing::error!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + ); + } + + Ok(()) + } + } +} diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs new file mode 100644 index 0000000000..f896cff2d5 --- /dev/null +++ b/storage_scrubber/src/metadata_stream.rs @@ -0,0 +1,164 @@ +use std::str::FromStr; + +use anyhow::{anyhow, Context}; +use async_stream::{stream, try_stream}; +use futures::StreamExt; +use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath}; +use tokio_stream::Stream; + +use crate::{ + list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target, + TenantShardTimelineId, +}; +use pageserver_api::shard::TenantShardId; +use utils::id::{TenantId, TimelineId}; + +/// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes +pub fn stream_tenants<'a>( + remote_client: &'a GenericRemoteStorage, + target: &'a RootTarget, +) -> impl Stream> + 'a { + try_stream! 
{ + let tenants_target = target.tenants_root(); + let mut tenants_stream = + std::pin::pin!(stream_objects_with_retries(remote_client, ListingMode::WithDelimiter, &tenants_target)); + while let Some(chunk) = tenants_stream.next().await { + let chunk = chunk?; + let entry_ids = chunk.prefixes.iter() + .map(|prefix| prefix.get_path().file_name().ok_or_else(|| anyhow!("no final component in path '{prefix}'"))); + for dir_name_res in entry_ids { + let dir_name = dir_name_res?; + let id = TenantShardId::from_str(dir_name)?; + yield id; + } + } + } +} + +pub async fn stream_tenant_shards<'a>( + remote_client: &'a GenericRemoteStorage, + target: &'a RootTarget, + tenant_id: TenantId, +) -> anyhow::Result> + 'a> { + let shards_target = target.tenant_shards_prefix(&tenant_id); + + let strip_prefix = target.tenants_root().prefix_in_bucket; + let prefix_str = &strip_prefix.strip_prefix("/").unwrap_or(&strip_prefix); + + tracing::info!("Listing shards in {}", shards_target.prefix_in_bucket); + let listing = + list_objects_with_retries(remote_client, ListingMode::WithDelimiter, &shards_target) + .await?; + + let tenant_shard_ids = listing + .prefixes + .iter() + .map(|prefix| prefix.get_path().as_str()) + .filter_map(|prefix| -> Option<&str> { prefix.strip_prefix(prefix_str) }) + .map(|entry_id_str| { + let first_part = entry_id_str.split('/').next().unwrap(); + + first_part + .parse::() + .with_context(|| format!("Incorrect entry id str: {first_part}")) + }) + .collect::>(); + + tracing::debug!("Yielding {} shards for {tenant_id}", tenant_shard_ids.len()); + Ok(stream! { + for i in tenant_shard_ids { + let id = i?; + yield Ok(id); + } + }) +} + +/// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered +/// using a listing. +/// +/// The listing is done before the stream is built, so that this +/// function can be used to generate concurrency on a stream using buffer_unordered. +pub async fn stream_tenant_timelines<'a>( + remote_client: &'a GenericRemoteStorage, + target: &'a RootTarget, + tenant: TenantShardId, +) -> anyhow::Result> + 'a> { + let mut timeline_ids: Vec> = Vec::new(); + let timelines_target = target.timelines_root(&tenant); + + let prefix_str = &timelines_target + .prefix_in_bucket + .strip_prefix("/") + .unwrap_or(&timelines_target.prefix_in_bucket); + + let mut objects_stream = std::pin::pin!(stream_objects_with_retries( + remote_client, + ListingMode::WithDelimiter, + &timelines_target + )); + loop { + tracing::debug!("Listing in {tenant}"); + let fetch_response = match objects_stream.next().await { + None => break, + Some(Err(e)) => { + timeline_ids.push(Err(e)); + break; + } + Some(Ok(r)) => r, + }; + + let new_entry_ids = fetch_response + .prefixes + .iter() + .filter_map(|prefix| -> Option<&str> { + prefix.get_path().as_str().strip_prefix(prefix_str) + }) + .map(|entry_id_str| { + entry_id_str + .parse::() + .with_context(|| format!("Incorrect entry id str: {entry_id_str}")) + }); + + for i in new_entry_ids { + timeline_ids.push(i); + } + } + + tracing::debug!("Yielding {} timelines for {}", timeline_ids.len(), tenant); + Ok(stream! { + for i in timeline_ids { + let id = i?; + yield Ok(TenantShardTimelineId::new(tenant, id)); + } + }) +} + +pub(crate) fn stream_listing<'a>( + remote_client: &'a GenericRemoteStorage, + target: &'a S3Target, +) -> impl Stream)>> + 'a { + let listing_mode = if target.delimiter.is_empty() { + ListingMode::NoDelimiter + } else { + ListingMode::WithDelimiter + }; + try_stream! 
{ + let mut objects_stream = std::pin::pin!(stream_objects_with_retries( + remote_client, + listing_mode, + target, + )); + while let Some(list) = objects_stream.next().await { + let list = list?; + if target.delimiter.is_empty() { + for key in list.keys { + yield (key.key.clone(), Some(key)); + } + } else { + for key in list.prefixes { + yield (key, None); + } + } + } + } +} diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs new file mode 100644 index 0000000000..c96d9cad3b --- /dev/null +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -0,0 +1,565 @@ +use std::collections::{BTreeMap, BTreeSet, HashMap}; +use std::sync::Arc; +use std::time::Duration; + +use crate::checks::{list_timeline_blobs, BlobDataParseResult}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; +use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; +use futures_util::{StreamExt, TryStreamExt}; +use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; +use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; +use pageserver::tenant::storage_layer::LayerName; +use pageserver::tenant::IndexPart; +use pageserver_api::controller_api::TenantDescribeResponse; +use pageserver_api::shard::{ShardIndex, TenantShardId}; +use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; +use reqwest::Method; +use serde::Serialize; +use storage_controller_client::control_api; +use tokio_util::sync::CancellationToken; +use tracing::{info_span, Instrument}; +use utils::generation::Generation; +use utils::id::{TenantId, TenantTimelineId}; + +#[derive(Serialize, Default)] +pub struct GcSummary { + indices_deleted: usize, + remote_storage_errors: usize, + controller_api_errors: usize, + ancestor_layers_deleted: usize, +} + +impl GcSummary { + fn merge(&mut self, other: Self) { + let Self { + indices_deleted, + remote_storage_errors, + ancestor_layers_deleted, + controller_api_errors, + } = other; + + self.indices_deleted += indices_deleted; + self.remote_storage_errors += remote_storage_errors; + self.ancestor_layers_deleted += ancestor_layers_deleted; + self.controller_api_errors += controller_api_errors; + } +} + +#[derive(clap::ValueEnum, Debug, Clone, Copy)] +pub enum GcMode { + // Delete nothing + DryRun, + + // Enable only removing old-generation indices + IndicesOnly, + + // Enable all forms of GC + Full, +} + +impl std::fmt::Display for GcMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + GcMode::DryRun => write!(f, "dry-run"), + GcMode::IndicesOnly => write!(f, "indices-only"), + GcMode::Full => write!(f, "full"), + } + } +} + +mod refs { + use super::*; + // Map of cross-shard layer references, giving a refcount for each layer in each shard that is referenced by some other + // shard in the same tenant. This is sparse! The vast majority of timelines will have no cross-shard refs, and those that + // do have cross-shard refs should eventually drop most of them via compaction. + // + // In our inner map type, the TTID in the key is shard-agnostic, and the ShardIndex in the value refers to the _ancestor + // which is referenced_. + #[derive(Default)] + pub(super) struct AncestorRefs( + BTreeMap<TenantTimelineId, HashMap<(ShardIndex, LayerName), usize>>, + ); + + impl AncestorRefs { + /// Insert references for layers discovered in a particular shard-timeline that refer to an ancestral shard-timeline.
+ pub(super) fn update( + &mut self, + ttid: TenantShardTimelineId, + layers: Vec<(LayerName, LayerFileMetadata)>, + ) { + let ttid_refs = self.0.entry(ttid.as_tenant_timeline_id()).or_default(); + for (layer_name, layer_metadata) in layers { + // Increment refcount of this layer in the ancestor shard + *(ttid_refs + .entry((layer_metadata.shard, layer_name)) + .or_default()) += 1; + } + } + + /// For a particular TTID, return the map of all ancestor layers referenced by a descendant to their refcount + /// + /// The `ShardIndex` in the result's key is the index of the _ancestor_, not the descendant. + pub(super) fn get_ttid_refcounts( + &self, + ttid: &TenantTimelineId, + ) -> Option<&HashMap<(ShardIndex, LayerName), usize>> { + self.0.get(ttid) + } + } +} + +use refs::AncestorRefs; + +// As we see shards for a tenant, accumulate knowledge needed for cross-shard GC: +// - Are there any ancestor shards? +// - Are there any refs to ancestor shards' layers? +#[derive(Default)] +struct TenantRefAccumulator { + shards_seen: HashMap<TenantId, BTreeSet<ShardIndex>>, + + // For each shard that has refs to an ancestor's layers, the set of ancestor layers referred to + ancestor_ref_shards: AncestorRefs, +} + +impl TenantRefAccumulator { + fn update(&mut self, ttid: TenantShardTimelineId, index_part: &IndexPart) { + let this_shard_idx = ttid.tenant_shard_id.to_index(); + (*self + .shards_seen + .entry(ttid.tenant_shard_id.tenant_id) + .or_default()) + .insert(this_shard_idx); + + let mut ancestor_refs = Vec::new(); + for (layer_name, layer_metadata) in &index_part.layer_metadata { + if layer_metadata.shard != this_shard_idx { + // This is a reference from this shard to a layer in an ancestor shard: we must track this + // as a marker to not GC this layer from the parent. + ancestor_refs.push((layer_name.clone(), layer_metadata.clone())); + } + } + + if !ancestor_refs.is_empty() { + tracing::info!(%ttid, "Found {} ancestor refs", ancestor_refs.len()); + self.ancestor_ref_shards.update(ttid, ancestor_refs); + } + } + + /// Consume Self and return a vector of ancestor tenant shards that should be GC'd, and a map of referenced ancestor layers to preserve + async fn into_gc_ancestors( + self, + controller_client: &control_api::Client, + summary: &mut GcSummary, + ) -> (Vec<TenantShardId>, AncestorRefs) { + let mut ancestors_to_gc = Vec::new(); + for (tenant_id, shard_indices) in self.shards_seen { + // Find the highest shard count + let latest_count = shard_indices + .iter() + .map(|i| i.shard_count) + .max() + .expect("Always at least one shard"); + + let mut shard_indices = shard_indices.iter().collect::<Vec<_>>(); + let (mut latest_shards, ancestor_shards) = { + let at = + itertools::partition(&mut shard_indices, |i| i.shard_count == latest_count); + (shard_indices[0..at].to_owned(), &shard_indices[at..]) + }; + // Sort shards, as we will later compare them with a sorted list from the controller + latest_shards.sort(); + + // Check that we have a complete view of the latest shard count: this should always be the case unless we happened + // to scan the S3 bucket halfway through a shard split. + if latest_shards.len() != latest_count.count() as usize { + // This should be extremely rare, so we warn on it.
+ tracing::warn!(%tenant_id, "Missed some shards at count {:?}: {latest_shards:?}", latest_count); + continue; + } + + // Check if we have any non-latest-count shards + if ancestor_shards.is_empty() { + tracing::debug!(%tenant_id, "No ancestor shards to clean up"); + continue; + } + + // Based on S3 view, this tenant looks like it might have some ancestor shard work to do. We + // must only do this work if the tenant is not currently being split: otherwise, it is not safe + // to GC ancestors, because if the split fails then the controller will try to attach ancestor + // shards again. + match controller_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await + { + Err(e) => { + // We were not able to learn the latest shard split state from the controller, so we will not + // do ancestor GC on this tenant. + tracing::warn!(%tenant_id, "Failed to query storage controller, will not do ancestor GC: {e}"); + summary.controller_api_errors += 1; + continue; + } + Ok(desc) => { + // We expect to see that the latest shard count matches the one we saw in S3, and that none + // of the shards indicate splitting in progress. + + let controller_indices: Vec = desc + .shards + .iter() + .map(|s| s.tenant_shard_id.to_index()) + .collect(); + if !controller_indices.iter().eq(latest_shards.iter().copied()) { + tracing::info!(%tenant_id, "Latest shards seen in S3 ({latest_shards:?}) don't match controller state ({controller_indices:?})"); + continue; + } + + if desc.shards.iter().any(|s| s.is_splitting) { + tracing::info!(%tenant_id, "One or more shards is currently splitting"); + continue; + } + + // This shouldn't be too noisy, because we only log this for tenants that have some ancestral refs. + tracing::info!(%tenant_id, "Validated state with controller: {desc:?}"); + } + } + + // GC ancestor shards + for ancestor_shard in ancestor_shards.iter().map(|idx| TenantShardId { + tenant_id, + shard_count: idx.shard_count, + shard_number: idx.shard_number, + }) { + ancestors_to_gc.push(ancestor_shard); + } + } + + (ancestors_to_gc, self.ancestor_ref_shards) + } +} + +fn is_old_enough(min_age: &Duration, key: &ListingObject, summary: &mut GcSummary) -> bool { + // Validation: we will only GC indices & layers after a time threshold (e.g. one week) so that during an incident + // it is easier to read old data for analysis, and easier to roll back shard splits without having to un-delete any objects. + let age = match key.last_modified.elapsed() { + Ok(e) => e, + Err(_) => { + tracing::warn!("Bad last_modified time: {:?}", key.last_modified); + summary.remote_storage_errors += 1; + return false; + } + }; + let old_enough = &age > min_age; + + if !old_enough { + tracing::info!( + "Skipping young object {} < {}", + humantime::format_duration(age), + humantime::format_duration(*min_age) + ); + } + + old_enough +} + +/// Same as [`is_old_enough`], but doesn't require a [`ListingObject`] passed to it. 
+async fn check_is_old_enough(
+    remote_client: &GenericRemoteStorage,
+    key: &RemotePath,
+    min_age: &Duration,
+    summary: &mut GcSummary,
+) -> Option<bool> {
+    let listing_object = remote_client
+        .head_object(key, &CancellationToken::new())
+        .await
+        .ok()?;
+    Some(is_old_enough(min_age, &listing_object, summary))
+}
+
+async fn maybe_delete_index(
+    remote_client: &GenericRemoteStorage,
+    min_age: &Duration,
+    latest_gen: Generation,
+    obj: &ListingObject,
+    mode: GcMode,
+    summary: &mut GcSummary,
+) {
+    // Validation: we will only delete things that parse cleanly
+    let basename = obj.key.get_path().file_name().unwrap();
+    let candidate_generation =
+        match parse_remote_index_path(RemotePath::from_string(basename).unwrap()) {
+            Some(g) => g,
+            None => {
+                if basename == IndexPart::FILE_NAME {
+                    // A legacy pre-generation index
+                    Generation::none()
+                } else {
+                    // A strange key: we will not delete this because we don't understand it.
+                    tracing::warn!("Bad index key");
+                    return;
+                }
+            }
+        };
+
+    // Validation: we will only delete indices more than one generation old, to avoid interfering
+    // in typical migrations, even if they are very long running.
+    if candidate_generation >= latest_gen {
+        // This shouldn't happen: when we loaded metadata, it should have selected the latest
+        // generation already, and only populated [`S3TimelineBlobData::unused_index_keys`]
+        // with older generations.
+        tracing::warn!("Deletion candidate is >= latest generation, this is a bug!");
+        return;
+    } else if candidate_generation.next() == latest_gen {
+        // Skip deleting the index from the generation immediately before the latest.
+        return;
+    }
+
+    if !is_old_enough(min_age, obj, summary) {
+        return;
+    }
+
+    if matches!(mode, GcMode::DryRun) {
+        tracing::info!("Dry run: would delete this key");
+        return;
+    }
+
+    // All validations passed: erase the object
+    match remote_client
+        .delete(&obj.key, &CancellationToken::new())
+        .await
+    {
+        Ok(_) => {
+            tracing::info!("Successfully deleted index");
+            summary.indices_deleted += 1;
+        }
+        Err(e) => {
+            tracing::warn!("Failed to delete index: {e}");
+            summary.remote_storage_errors += 1;
+        }
+    }
+}
+
+#[allow(clippy::too_many_arguments)]
+async fn gc_ancestor(
+    remote_client: &GenericRemoteStorage,
+    root_target: &RootTarget,
+    min_age: &Duration,
+    ancestor: TenantShardId,
+    refs: &AncestorRefs,
+    mode: GcMode,
+    summary: &mut GcSummary,
+) -> anyhow::Result<()> {
+    // Scan timelines in the ancestor
+    let timelines = stream_tenant_timelines(remote_client, root_target, ancestor).await?;
+    let mut timelines = std::pin::pin!(timelines);
+
+    // Build a list of keys to retain
+
+    while let Some(ttid) = timelines.next().await {
+        let ttid = ttid?;
+
+        let data = list_timeline_blobs(remote_client, ttid, root_target).await?;
+
+        let s3_layers = match data.blob_data {
+            BlobDataParseResult::Parsed {
+                index_part: _,
+                index_part_generation: _,
+                s3_layers,
+            } => s3_layers,
+            BlobDataParseResult::Relic => {
+                // Post-deletion tenant location: don't try to GC it.
+                continue;
+            }
+            BlobDataParseResult::Incorrect {
+                errors,
+                s3_layers: _, // TODO(yuchen): could still check references to these s3 layers?
+            } => {
+                // Our primary purpose isn't to report on bad data, but log this rather than skipping silently
+                tracing::warn!(
+                    "Skipping ancestor GC for timeline {ttid}, bad metadata: {errors:?}"
+                );
+                continue;
+            }
+        };
+
+        let ttid_refs = refs.get_ttid_refcounts(&ttid.as_tenant_timeline_id());
+        let ancestor_shard_index = ttid.tenant_shard_id.to_index();
+
+        for (layer_name, layer_gen) in s3_layers {
+            let ref_count = ttid_refs
+                .and_then(|m| m.get(&(ancestor_shard_index, layer_name.clone())))
+                .copied()
+                .unwrap_or(0);
+
+            if ref_count > 0 {
+                tracing::debug!(%ttid, "Ancestor layer {layer_name} has {ref_count} refs");
+                continue;
+            }
+
+            tracing::info!(%ttid, "Ancestor layer {layer_name} is not referenced");
+
+            // Build the key for the layer we are considering deleting
+            let key = root_target.absolute_key(&remote_layer_path(
+                &ttid.tenant_shard_id.tenant_id,
+                &ttid.timeline_id,
+                ancestor_shard_index,
+                &layer_name,
+                layer_gen,
+            ));
+
+            // We apply a time threshold to GCing objects that are un-referenced: this preserves our ability
+            // to roll back a shard split if we have to, by avoiding deleting ancestor layers right away
+            let path = RemotePath::from_string(key.strip_prefix("/").unwrap_or(&key)).unwrap();
+            if check_is_old_enough(remote_client, &path, min_age, summary).await != Some(true) {
+                continue;
+            }
+
+            if !matches!(mode, GcMode::Full) {
+                tracing::info!("Dry run: would delete key {key}");
+                continue;
+            }
+
+            // All validations passed: erase the object
+            match remote_client.delete(&path, &CancellationToken::new()).await {
+                Ok(_) => {
+                    tracing::info!("Successfully deleted unreferenced ancestor layer {key}");
+                    summary.ancestor_layers_deleted += 1;
+                }
+                Err(e) => {
+                    tracing::warn!("Failed to delete layer {key}: {e}");
+                    summary.remote_storage_errors += 1;
+                }
+            }
+        }
+
+        // TODO: if all the layers are gone, clean up the whole timeline dir (remove index)
+    }
+
+    Ok(())
+}
+
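The entry point that follows fans per-tenant listings out into a single stream of timelines with bounded concurrency. As a compact illustration of that stream shape, here is a runnable sketch, assuming the `futures`, `tokio` and `anyhow` crates and a hypothetical `timelines_for` listing function standing in for `stream_tenant_timelines`:

```rust
use futures::stream::{self, Stream, StreamExt, TryStreamExt};

// Hypothetical stand-in for stream_tenant_timelines: one listing future per tenant,
// resolving to a stream of that tenant's timelines.
async fn timelines_for(
    tenant: u32,
) -> anyhow::Result<impl Stream<Item = anyhow::Result<(u32, u32)>>> {
    Ok(stream::iter((0..3u32).map(move |tl| Ok((tenant, tl)))))
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    const CONCURRENCY: usize = 8;
    let tenants = stream::iter((0..4u32).map(anyhow::Ok));
    let results: Vec<(u32, u32)> = tenants
        .map_ok(timelines_for) // a stream of listing futures
        .try_buffered(CONCURRENCY) // at most CONCURRENCY listings in flight, order preserved
        .try_flatten() // a single stream of (tenant, timeline) pairs
        .try_collect()
        .await?;
    assert_eq!(results.len(), 12);
    Ok(())
}
```

`try_buffered` keeps results in input order, which is what lets later consumers assume that all shards of one tenant arrive adjacently.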
+/// Physical garbage collection: removing unused S3 objects.
+///
+/// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level
+/// (keys, layers). This type of garbage collection is about removing:
+/// - Objects that were uploaded but never referenced in the remote index (e.g. because of a shutdown between
+///   uploading a layer and uploading an index)
+/// - Index objects from historic generations
+///
+/// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and
+/// make sure that object listings don't get slowed down by large numbers of garbage objects.
+pub async fn pageserver_physical_gc(
+    bucket_config: &BucketConfig,
+    controller_client: Option<&control_api::Client>,
+    tenant_shard_ids: Vec<TenantShardId>,
+    min_age: Duration,
+    mode: GcMode,
+) -> anyhow::Result<GcSummary> {
+    let (remote_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
+
+    let tenants = if tenant_shard_ids.is_empty() {
+        futures::future::Either::Left(stream_tenants(&remote_client, &target))
+    } else {
+        futures::future::Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok)))
+    };
+
+    // How many tenants to process in parallel. We need to be mindful of pageservers
+    // accessing the same per-tenant prefixes, so use a lower setting than pageservers.
+    const CONCURRENCY: usize = 32;
+
+    // Accumulate information about each tenant for the cross-shard GC step we'll do at the end
+    let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default()));
+
+    // Generate a stream of TenantTimelineId
+    let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t));
+    let timelines = timelines.try_buffered(CONCURRENCY);
+    let timelines = timelines.try_flatten();
+
+    // Generate a stream of S3TimelineBlobData
+    async fn gc_timeline(
+        remote_client: &GenericRemoteStorage,
+        min_age: &Duration,
+        target: &RootTarget,
+        mode: GcMode,
+        ttid: TenantShardTimelineId,
+        accumulator: &Arc<std::sync::Mutex<TenantRefAccumulator>>,
+    ) -> anyhow::Result<GcSummary> {
+        let mut summary = GcSummary::default();
+        let data = list_timeline_blobs(remote_client, ttid, target).await?;
+
+        let (index_part, latest_gen, candidates) = match &data.blob_data {
+            BlobDataParseResult::Parsed {
+                index_part,
+                index_part_generation,
+                s3_layers: _s3_layers,
+            } => (index_part, *index_part_generation, data.unused_index_keys),
+            BlobDataParseResult::Relic => {
+                // Post-deletion tenant location: don't try to GC it.
+                return Ok(summary);
+            }
+            BlobDataParseResult::Incorrect {
+                errors,
+                s3_layers: _,
+            } => {
+                // Our primary purpose isn't to report on bad data, but log this rather than skipping silently
+                tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}");
+                return Ok(summary);
+            }
+        };
+
+        accumulator.lock().unwrap().update(ttid, index_part);
+
+        for key in candidates {
+            maybe_delete_index(remote_client, min_age, latest_gen, &key, mode, &mut summary)
+                .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, %key.key))
+                .await;
+        }
+
+        Ok(summary)
+    }
+
+    let mut summary = GcSummary::default();
+
+    // Drain futures for per-shard GC, populating the accumulator as a side effect
+    {
+        let timelines = timelines.map_ok(|ttid| {
+            gc_timeline(&remote_client, &min_age, &target, mode, ttid, &accumulator)
+        });
+        let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
+
+        while let Some(i) = timelines.next().await {
+            summary.merge(i?);
+        }
+    }
+
+    // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
+    let Some(client) = controller_client else {
+        tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified");
+        return Ok(summary);
+    };
+
+    let (ancestor_shards, ancestor_refs) = Arc::into_inner(accumulator)
+        .unwrap()
+        .into_inner()
+        .unwrap()
+        .into_gc_ancestors(client, &mut summary)
+        .await;
+
+    for ancestor_shard in ancestor_shards {
+        gc_ancestor(
+            &remote_client,
+            &target,
+            &min_age,
+            ancestor_shard,
+            &ancestor_refs,
+            mode,
+            &mut summary,
+        )
+        .instrument(info_span!("gc_ancestor", %ancestor_shard))
+        .await?;
+    }
+
+    Ok(summary)
+}
diff --git a/s3_scrubber/src/scan_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs
similarity index 59%
rename from s3_scrubber/src/scan_metadata.rs
rename to storage_scrubber/src/scan_pageserver_metadata.rs
index 4b63bb3884..151ef27672 100644
--- a/s3_scrubber/src/scan_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -1,21 +1,21 @@
 use std::collections::{HashMap, HashSet};
 
 use crate::checks::{
-    branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData,
-    TenantObjectListing, TimelineAnalysis,
+    branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult,
+    RemoteTimelineBlobData, TenantObjectListing, TimelineAnalysis,
 };
 use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
 use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
-use aws_sdk_s3::Client;
-use futures_util::{pin_mut, StreamExt, TryStreamExt};
-use histogram::Histogram;
+use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::remote_layer_path;
-use pageserver::tenant::IndexPart;
+use pageserver_api::controller_api::MetadataHealthUpdateRequest;
 use pageserver_api::shard::TenantShardId;
+use remote_storage::GenericRemoteStorage;
 use serde::Serialize;
 use utils::id::TenantId;
+use utils::shard::ShardCount;
 
-#[derive(Serialize)]
+#[derive(Serialize, Default)]
 pub struct MetadataSummary {
     tenant_count: usize,
     timeline_count: usize,
@@ -25,97 +25,18 @@ pub struct MetadataSummary {
     with_orphans: HashSet<TenantShardTimelineId>,
     indices_by_version: HashMap<usize, usize>,
 
-    layer_count: MinMaxHisto,
-    timeline_size_bytes: MinMaxHisto,
-    layer_size_bytes: MinMaxHisto,
-}
-
-/// A histogram plus minimum and maximum tracking
-#[derive(Serialize)]
-struct MinMaxHisto {
     #[serde(skip)]
-    histo: Histogram,
-    min: u64,
-    max: u64,
-}
-
-impl MinMaxHisto {
-    fn new() -> Self {
-        Self {
-            histo: histogram::Histogram::builder()
-                .build()
-                .expect("Bad histogram params"),
-            min: u64::MAX,
-            max: 0,
-        }
-    }
-
-    fn sample(&mut self, v: u64) -> Result<(), histogram::Error> {
-        self.min = std::cmp::min(self.min, v);
-        self.max = std::cmp::max(self.max, v);
-        let r = self.histo.increment(v, 1);
-
-        if r.is_err() {
-            tracing::warn!("Bad histogram sample: {v}");
-        }
-
-        r
-    }
-
-    fn oneline(&self) -> String {
-        let percentiles = match self.histo.percentiles(&[1.0, 10.0, 50.0, 90.0, 99.0]) {
-            Ok(p) => p,
-            Err(e) => return format!("No data: {}", e),
-        };
-
-        let percentiles: Vec<u64> = percentiles
-            .iter()
-            .map(|p| p.bucket().low() + p.bucket().high() / 2)
-            .collect();
-
-        format!(
-            "min {}, 1% {}, 10% {}, 50% {}, 90% {}, 99% {}, max {}",
-            self.min,
-            percentiles[0],
-            percentiles[1],
-            percentiles[2],
-            percentiles[3],
-            percentiles[4],
-            self.max,
-        )
-    }
+    pub(crate) healthy_tenant_shards: HashSet<TenantShardId>,
+    #[serde(skip)]
+    pub(crate) unhealthy_tenant_shards: HashSet<TenantShardId>,
 }
 
 impl MetadataSummary {
     fn new() -> Self {
-        Self {
-            tenant_count: 0,
-            timeline_count: 0,
-            timeline_shard_count: 0,
-            with_errors: HashSet::new(),
-            with_warnings: HashSet::new(),
-            with_orphans: HashSet::new(),
-            indices_by_version: HashMap::new(),
-            layer_count: MinMaxHisto::new(),
-            timeline_size_bytes: MinMaxHisto::new(),
-            layer_size_bytes: MinMaxHisto::new(),
-        }
+        Self::default()
     }
 
-    fn update_histograms(&mut self, index_part: &IndexPart) -> Result<(), histogram::Error> {
-        self.layer_count
-            .sample(index_part.layer_metadata.len() as u64)?;
-        let mut total_size: u64 = 0;
-        for meta in index_part.layer_metadata.values() {
-            total_size += meta.file_size;
-            self.layer_size_bytes.sample(meta.file_size)?;
-        }
-        self.timeline_size_bytes.sample(total_size)?;
-
-        Ok(())
-    }
-
-    fn update_data(&mut self, data: &S3TimelineBlobData) {
+    fn update_data(&mut self, data: &RemoteTimelineBlobData) {
         self.timeline_shard_count += 1;
         if let BlobDataParseResult::Parsed {
             index_part,
@@ -125,20 +46,19 @@ impl MetadataSummary {
         {
             *self
                 .indices_by_version
-                .entry(index_part.get_version())
+                .entry(index_part.version())
                 .or_insert(0) += 1;
-
-            if let Err(e) = self.update_histograms(index_part) {
-                // Value out of range? Warn that the results are untrustworthy
-                tracing::warn!(
-                    "Error updating histograms, summary stats may be wrong: {}",
-                    e
-                );
-            }
         }
     }
 
     fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
+        if analysis.is_healthy() {
+            self.healthy_tenant_shards.insert(id.tenant_shard_id);
+        } else {
+            self.healthy_tenant_shards.remove(&id.tenant_shard_id);
+            self.unhealthy_tenant_shards.insert(id.tenant_shard_id);
+        }
+
         if !analysis.errors.is_empty() {
             self.with_errors.insert(*id);
         }
@@ -169,9 +89,6 @@ With errors: {}
 With warnings: {}
 With orphan layers: {}
 Index versions: {version_summary}
-Timeline size bytes: {}
-Layer size bytes: {}
-Timeline layer count: {}
 ",
             self.tenant_count,
             self.timeline_count,
@@ -179,9 +96,6 @@ Timeline layer count: {}
             self.with_errors.len(),
             self.with_warnings.len(),
             self.with_orphans.len(),
-            self.timeline_size_bytes.oneline(),
-            self.layer_size_bytes.oneline(),
-            self.layer_count.oneline(),
         )
     }
 
@@ -192,17 +106,24 @@ Timeline layer count: {}
     pub fn is_empty(&self) -> bool {
         self.timeline_shard_count == 0
     }
+
+    pub fn build_health_update_request(&self) -> MetadataHealthUpdateRequest {
+        MetadataHealthUpdateRequest {
+            healthy_tenant_shards: self.healthy_tenant_shards.clone(),
+            unhealthy_tenant_shards: self.unhealthy_tenant_shards.clone(),
+        }
+    }
 }
 
 /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
-pub async fn scan_metadata(
+pub async fn scan_pageserver_metadata(
     bucket_config: BucketConfig,
     tenant_ids: Vec<TenantShardId>,
 ) -> anyhow::Result<MetadataSummary> {
-    let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver)?;
+    let (remote_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?;
 
     let tenants = if tenant_ids.is_empty() {
-        futures::future::Either::Left(stream_tenants(&s3_client, &target))
+        futures::future::Either::Left(stream_tenants(&remote_client, &target))
     } else {
         futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok)))
     };
@@ -212,21 +133,21 @@ pub async fn scan_metadata(
     const CONCURRENCY: usize = 32;
 
     // Generate a stream of TenantTimelineId
-    let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t));
+    let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t));
     let timelines = timelines.try_buffered(CONCURRENCY);
     let timelines = timelines.try_flatten();
 
     // Generate a stream of S3TimelineBlobData
     async fn report_on_timeline(
-        s3_client: &Client,
+        remote_client: &GenericRemoteStorage,
         target: &RootTarget,
         ttid: TenantShardTimelineId,
-    ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> {
-        let data = list_timeline_blobs(s3_client, ttid, target).await?;
+    ) -> anyhow::Result<(TenantShardTimelineId, RemoteTimelineBlobData)> {
+        let data = list_timeline_blobs(remote_client, ttid, target).await?;
         Ok((ttid, data))
     }
-    let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid));
-    let timelines = timelines.try_buffered(CONCURRENCY);
+    let timelines = timelines.map_ok(|ttid| report_on_timeline(&remote_client, &target, ttid));
+    let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
 
     // We must gather all the TenantShardTimelineId->S3TimelineBlobData for each tenant, because different
     // shards in the same tenant might refer to one another's keys if a shard split has happened.
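Because the ordered stream delivers all results for one tenant adjacently, the scan can accumulate shards per tenant and flush when the tenant id changes, which is what the hunk below implements. A small sketch of that accumulate-then-flush loop, using hypothetical simplified stand-in types:

```rust
// Simplified stand-in types: tenant ids are u32, per-timeline data is a &str.
fn analyze_tenant(tenant: u32, timelines: Vec<&'static str>) {
    // All timeline shards for `tenant` are visible at once, so cross-shard layer
    // references can be refcounted before anything is flagged as an orphan.
    println!("tenant {tenant}: {} timeline shards", timelines.len());
}

fn group_by_tenant(results: Vec<(u32, &'static str)>) {
    let mut current: Option<u32> = None;
    let mut batch: Vec<&'static str> = Vec::new();
    for (tenant, data) in results {
        if current.is_some() && current != Some(tenant) {
            // Tenant id changed: flush the accumulated batch
            analyze_tenant(current.unwrap(), std::mem::take(&mut batch));
        }
        current = Some(tenant);
        batch.push(data);
    }
    if let Some(tenant) = current {
        analyze_tenant(tenant, batch); // flush the final tenant
    }
}

fn main() {
    // Input is sorted by tenant, mirroring the listing order of the scan.
    group_by_tenant(vec![(1, "shard-0"), (1, "shard-1"), (2, "shard-0")]);
}
```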
@@ -235,33 +156,58 @@ pub async fn scan_metadata( let mut tenant_objects = TenantObjectListing::default(); let mut tenant_timeline_results = Vec::new(); - fn analyze_tenant( + async fn analyze_tenant( + remote_client: &GenericRemoteStorage, tenant_id: TenantId, summary: &mut MetadataSummary, mut tenant_objects: TenantObjectListing, - timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>, + timelines: Vec<(TenantShardTimelineId, RemoteTimelineBlobData)>, + highest_shard_count: ShardCount, ) { summary.tenant_count += 1; let mut timeline_ids = HashSet::new(); let mut timeline_generations = HashMap::new(); for (ttid, data) in timelines { - timeline_ids.insert(ttid.timeline_id); - // Stash the generation of each timeline, for later use identifying orphan layers - if let BlobDataParseResult::Parsed { - index_part: _index_part, - index_part_generation, - s3_layers: _s3_layers, - } = &data.blob_data - { - timeline_generations.insert(ttid, *index_part_generation); - } + if ttid.tenant_shard_id.shard_count == highest_shard_count { + // Only analyze `TenantShardId`s with highest shard count. - // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects` - // reference counts for layers across the tenant. - let analysis = - branch_cleanup_and_check_errors(&ttid, &mut tenant_objects, None, None, Some(data)); - summary.update_analysis(&ttid, &analysis); + // Stash the generation of each timeline, for later use identifying orphan layers + if let BlobDataParseResult::Parsed { + index_part, + index_part_generation, + s3_layers: _s3_layers, + } = &data.blob_data + { + if index_part.deleted_at.is_some() { + // skip deleted timeline. + tracing::info!("Skip analysis of {} b/c timeline is already deleted", ttid); + continue; + } + timeline_generations.insert(ttid, *index_part_generation); + } + + // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects` + // reference counts for layers across the tenant. + let analysis = branch_cleanup_and_check_errors( + remote_client, + &ttid, + &mut tenant_objects, + None, + None, + Some(data), + ) + .await; + summary.update_analysis(&ttid, &analysis); + + timeline_ids.insert(ttid.timeline_id); + } else { + tracing::info!( + "Skip analysis of {} b/c a lower shard count than {}", + ttid, + highest_shard_count.0, + ); + } } summary.timeline_count += timeline_ids.len(); @@ -309,41 +255,67 @@ pub async fn scan_metadata( // all results for the same tenant will be adjacent. We accumulate these, // and then call `analyze_tenant` to flush, when we see the next tenant ID. 
     let mut summary = MetadataSummary::new();
-    pin_mut!(timelines);
+    let mut highest_shard_count = ShardCount::MIN;
     while let Some(i) = timelines.next().await {
         let (ttid, data) = i?;
         summary.update_data(&data);
 
         match tenant_id {
-            None => tenant_id = Some(ttid.tenant_shard_id.tenant_id),
+            None => {
+                tenant_id = Some(ttid.tenant_shard_id.tenant_id);
+                highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
+            }
             Some(prev_tenant_id) => {
                 if prev_tenant_id != ttid.tenant_shard_id.tenant_id {
+                    // New tenant: analyze this tenant's timelines, clear accumulated tenant_timeline_results
                     let tenant_objects = std::mem::take(&mut tenant_objects);
                     let timelines = std::mem::take(&mut tenant_timeline_results);
-                    analyze_tenant(prev_tenant_id, &mut summary, tenant_objects, timelines);
+                    analyze_tenant(
+                        &remote_client,
+                        prev_tenant_id,
+                        &mut summary,
+                        tenant_objects,
+                        timelines,
+                        highest_shard_count,
+                    )
+                    .await;
                     tenant_id = Some(ttid.tenant_shard_id.tenant_id);
+                    highest_shard_count = ttid.tenant_shard_id.shard_count;
+                } else {
+                    highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
                 }
             }
         }
 
-        if let BlobDataParseResult::Parsed {
-            index_part: _index_part,
-            index_part_generation: _index_part_generation,
-            s3_layers,
-        } = &data.blob_data
-        {
-            tenant_objects.push(ttid, s3_layers.clone());
+        match &data.blob_data {
+            BlobDataParseResult::Parsed {
+                index_part: _index_part,
+                index_part_generation: _index_part_generation,
+                s3_layers,
+            } => {
+                tenant_objects.push(ttid, s3_layers.clone());
+            }
+            BlobDataParseResult::Relic => (),
+            BlobDataParseResult::Incorrect {
+                errors: _,
+                s3_layers,
+            } => {
+                tenant_objects.push(ttid, s3_layers.clone());
+            }
         }
         tenant_timeline_results.push((ttid, data));
     }
 
     if !tenant_timeline_results.is_empty() {
         analyze_tenant(
+            &remote_client,
             tenant_id.expect("Must be set if results are present"),
             &mut summary,
             tenant_objects,
             tenant_timeline_results,
-        );
+            highest_shard_count,
+        )
+        .await;
     }
 
     Ok(summary)
diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs
new file mode 100644
index 0000000000..15f3665fac
--- /dev/null
+++ b/storage_scrubber/src/scan_safekeeper_metadata.rs
@@ -0,0 +1,302 @@
+use std::{collections::HashSet, str::FromStr, sync::Arc};
+
+use futures::stream::{StreamExt, TryStreamExt};
+use once_cell::sync::OnceCell;
+use pageserver_api::shard::TenantShardId;
+use postgres_ffi::{XLogFileName, PG_TLI};
+use remote_storage::GenericRemoteStorage;
+use serde::Serialize;
+use tokio_postgres::types::PgLsn;
+use tracing::{debug, error, info};
+use utils::{
+    id::{TenantId, TenantTimelineId, TimelineId},
+    lsn::Lsn,
+};
+
+use crate::{
+    cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing,
+    BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId,
+};
+
+/// Generally we should ask safekeepers, but so far we use the default 16MB everywhere.
+const WAL_SEGSIZE: usize = 16 * 1024 * 1024;
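For intuition, here is a sketch of how the expected segment set falls out of an LSN range, assuming the default 16 MiB segment size above; the name layout mirrors postgres' `XLogFileName` (timeline id, then the segment number split into high and low parts):

```rust
/// Sketch: deriving the expected WAL segment files from an LSN range
/// (assumes the default 16 MiB segment size and timeline id 1).
const WAL_SEGSIZE: u64 = 16 * 1024 * 1024;
const SEGMENTS_PER_XLOGID: u64 = 0x1_0000_0000 / WAL_SEGSIZE; // 256 for 16 MiB

fn xlog_file_name(tli: u32, segno: u64) -> String {
    // Mirrors postgres' XLogFileName: TLI, then segno split into high/low parts.
    format!(
        "{:08X}{:08X}{:08X}",
        tli,
        segno / SEGMENTS_PER_XLOGID,
        segno % SEGMENTS_PER_XLOGID
    )
}

fn main() {
    let timeline_start_lsn: u64 = 0x0100_0000; // inside segment 1
    let backup_lsn: u64 = 0x0300_0000; // start of segment 3
    let expected: Vec<String> = (timeline_start_lsn / WAL_SEGSIZE..backup_lsn / WAL_SEGSIZE)
        .map(|segno| xlog_file_name(1, segno))
        .collect();
    assert_eq!(expected, ["000000010000000000000001", "000000010000000000000002"]);
}
```

The exclusive upper bound matches the check below: the segment containing `backup_lsn` itself is not yet required to be fully backed up.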
+
+#[derive(Serialize)]
+pub struct MetadataSummary {
+    timeline_count: usize,
+    with_errors: HashSet<TenantTimelineId>,
+    deleted_count: usize,
+}
+
+impl MetadataSummary {
+    fn new() -> Self {
+        Self {
+            timeline_count: 0,
+            with_errors: HashSet::new(),
+            deleted_count: 0,
+        }
+    }
+
+    pub fn summary_string(&self) -> String {
+        format!(
+            "timeline_count: {}, with_errors: {}",
+            self.timeline_count,
+            self.with_errors.len()
+        )
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.timeline_count == 0
+    }
+
+    pub fn is_fatal(&self) -> bool {
+        !self.with_errors.is_empty()
+    }
+}
+
+#[derive(serde::Deserialize)]
+pub struct TimelineLsnData {
+    tenant_id: String,
+    timeline_id: String,
+    timeline_start_lsn: Lsn,
+    backup_lsn: Lsn,
+}
+
+pub enum DatabaseOrList {
+    Database {
+        tenant_ids: Vec<TenantId>,
+        connstr: String,
+        table: String,
+    },
+    List(Vec<TimelineLsnData>),
+}
+
+/// Scan the safekeeper metadata in an S3 bucket, reporting errors and
+/// statistics.
+///
+/// It works by listing timelines along with timeline_start_lsn and backup_lsn
+/// from the debug dump in dump_db_table and verifying their S3 contents. If some WAL
+/// segments are missing, the control plane is queried before complaining, to check
+/// whether the project was deleted in the meanwhile.
+pub async fn scan_safekeeper_metadata(
+    bucket_config: BucketConfig,
+    db_or_list: DatabaseOrList,
+) -> anyhow::Result<MetadataSummary> {
+    info!(
+        "checking bucket {}, region {}",
+        bucket_config.bucket, bucket_config.region
+    );
+
+    let (remote_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?;
+    let console_config = ConsoleConfig::from_env()?;
+    let cloud_admin_api_client = CloudAdminApiClient::new(console_config);
+
+    let timelines = match db_or_list {
+        DatabaseOrList::Database {
+            tenant_ids,
+            connstr,
+            table,
+        } => load_timelines_from_db(tenant_ids, connstr, table).await?,
+        DatabaseOrList::List(list) => list,
+    };
+    info!("loaded {} timelines", timelines.len());
+
+    let checks = futures::stream::iter(timelines.into_iter().map(Ok)).map_ok(|timeline| {
+        let tenant_id = TenantId::from_str(&timeline.tenant_id).expect("failed to parse tenant_id");
+        let timeline_id =
+            TimelineId::from_str(&timeline.timeline_id).expect("failed to parse timeline_id");
+        let ttid = TenantTimelineId::new(tenant_id, timeline_id);
+        check_timeline(
+            &remote_client,
+            &target,
+            &cloud_admin_api_client,
+            ttid,
+            timeline.timeline_start_lsn,
+            timeline.backup_lsn,
+        )
+    });
+    // Run multiple check_timeline's concurrently.
+    const CONCURRENCY: usize = 32;
+    let mut timelines = checks.try_buffered(CONCURRENCY);
+
+    let mut summary = MetadataSummary::new();
+    while let Some(r) = timelines.next().await {
+        let res = r?;
+        summary.timeline_count += 1;
+        if !res.is_ok {
+            summary.with_errors.insert(res.ttid);
+        }
+        if res.is_deleted {
+            summary.deleted_count += 1;
+        }
+    }
+
+    Ok(summary)
+}
+
+struct TimelineCheckResult {
+    ttid: TenantTimelineId,
+    is_ok: bool,
+    is_deleted: bool, // timeline is deleted in cplane
+}
+
+/// List S3 and check that it has all the expected WAL for the ttid. Consistency
+/// errors are logged to stderr; returns Ok(true) if the timeline is consistent,
+/// Ok(false) if not, Err if we failed to check.
+async fn check_timeline(
+    remote_client: &GenericRemoteStorage,
+    root: &RootTarget,
+    api_client: &CloudAdminApiClient,
+    ttid: TenantTimelineId,
+    timeline_start_lsn: Lsn,
+    backup_lsn: Lsn,
+) -> anyhow::Result<TimelineCheckResult> {
+    debug!(
+        "checking ttid {}, should contain WAL [{}-{}]",
+        ttid, timeline_start_lsn, backup_lsn
+    );
+    // calculate expected segfiles
+    let expected_first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE);
+    let expected_last_segno = backup_lsn.segment_number(WAL_SEGSIZE);
+    let mut expected_segfiles: HashSet<String> = HashSet::from_iter(
+        (expected_first_segno..expected_last_segno)
+            .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE)),
+    );
+    let expected_files_num = expected_segfiles.len();
+    debug!("expecting {} files", expected_segfiles.len(),);
+
+    // now list s3 and check if it misses something
+    let ttshid =
+        TenantShardTimelineId::new(TenantShardId::unsharded(ttid.tenant_id), ttid.timeline_id);
+    let mut timeline_dir_target = root.timeline_root(&ttshid);
+    // stream_listing yields only common_prefixes if delimiter is not empty, but
+    // we need files, so unset it.
+    timeline_dir_target.delimiter = String::new();
+
+    let prefix_str = &timeline_dir_target
+        .prefix_in_bucket
+        .strip_prefix("/")
+        .unwrap_or(&timeline_dir_target.prefix_in_bucket);
+
+    let mut stream = std::pin::pin!(stream_listing(remote_client, &timeline_dir_target));
+    while let Some(obj) = stream.next().await {
+        let (key, _obj) = obj?;
+
+        let seg_name = key
+            .get_path()
+            .as_str()
+            .strip_prefix(prefix_str)
+            .expect("failed to extract segment name");
+        expected_segfiles.remove(seg_name);
+    }
+    if !expected_segfiles.is_empty() {
+        // Before complaining, check cplane: probably the timeline is already deleted.
+        let bdata = api_client
+            .find_timeline_branch(ttid.tenant_id, ttid.timeline_id)
+            .await?;
+        let deleted = match bdata {
+            Some(bdata) => bdata.deleted,
+            None => {
+                // note: should be careful with selecting proper cplane address
+                info!("ttid {} not found, assuming it is deleted", ttid);
+                true
+            }
+        };
+        if deleted {
+            // ok, branch is deleted
+            return Ok(TimelineCheckResult {
+                ttid,
+                is_ok: true,
+                is_deleted: true,
+            });
+        }
+        error!(
+            "ttid {}: missing {} files out of {}, timeline_start_lsn {}, wal_backup_lsn {}",
+            ttid,
+            expected_segfiles.len(),
+            expected_files_num,
+            timeline_start_lsn,
+            backup_lsn,
+        );
+        return Ok(TimelineCheckResult {
+            ttid,
+            is_ok: false,
+            is_deleted: false,
+        });
+    }
+    Ok(TimelineCheckResult {
+        ttid,
+        is_ok: true,
+        is_deleted: false,
+    })
+}
+
+fn load_certs() -> Result<Arc<rustls::RootCertStore>, std::io::Error> {
+    let der_certs = rustls_native_certs::load_native_certs()?;
+    let mut store = rustls::RootCertStore::empty();
+    store.add_parsable_certificates(der_certs);
+    Ok(Arc::new(store))
+}
+static TLS_ROOTS: OnceCell<Arc<rustls::RootCertStore>> = OnceCell::new();
+
+async fn load_timelines_from_db(
+    tenant_ids: Vec<TenantId>,
+    dump_db_connstr: String,
+    dump_db_table: String,
+) -> anyhow::Result<Vec<TimelineLsnData>> {
+    info!("loading from table {dump_db_table}");
+
+    // Use rustls (Neon requires TLS)
+    let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone();
+    let client_config = rustls::ClientConfig::builder()
+        .with_root_certificates(root_store)
+        .with_no_client_auth();
+    let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config);
+    let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?;
+    // The connection object performs the actual communication with the database,
+    // so spawn it off to run on its own.
+    tokio::spawn(async move {
+        if let Err(e) = connection.await {
+            eprintln!("connection error: {}", e);
+        }
+    });
+
+    let tenant_filter_clause = if !tenant_ids.is_empty() {
+        format!(
+            "and tenant_id in ({})",
+            tenant_ids
+                .iter()
+                .map(|t| format!("'{}'", t))
+                .collect::<Vec<_>>()
+                .join(", ")
+        )
+    } else {
+        "".to_owned()
+    };
+    let query = format!(
+        "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) \
+         from \"{dump_db_table}\" \
+         where not is_cancelled {tenant_filter_clause} \
+         group by tenant_id, timeline_id;"
+    );
+    info!("query is {}", query);
+    let timelines = client.query(&query, &[]).await?;
+
+    let timelines = timelines
+        .into_iter()
+        .map(|row| {
+            let tenant_id = row.get(0);
+            let timeline_id = row.get(1);
+            let timeline_start_lsn_pg: PgLsn = row.get(2);
+            let backup_lsn_pg: PgLsn = row.get(3);
+
+            TimelineLsnData {
+                tenant_id,
+                timeline_id,
+                timeline_start_lsn: Lsn(u64::from(timeline_start_lsn_pg)),
+                backup_lsn: Lsn(u64::from(backup_lsn_pg)),
+            }
+        })
+        .collect::<Vec<TimelineLsnData>>();
+    Ok(timelines)
+}
diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs
new file mode 100644
index 0000000000..bb4079b5f4
--- /dev/null
+++ b/storage_scrubber/src/tenant_snapshot.rs
@@ -0,0 +1,295 @@
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use crate::checks::{list_timeline_blobs, BlobDataParseResult, RemoteTimelineBlobData};
+use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines};
+use crate::{
+    download_object_to_file_s3, init_remote, init_remote_s3, BucketConfig, NodeKind, RootTarget,
+    TenantShardTimelineId,
+};
+use anyhow::Context;
+use async_stream::stream;
+use aws_sdk_s3::Client;
+use camino::Utf8PathBuf;
+use futures::{StreamExt, TryStreamExt};
+use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
+use pageserver::tenant::storage_layer::LayerName;
+use pageserver::tenant::IndexPart;
+use pageserver_api::shard::TenantShardId;
+use remote_storage::GenericRemoteStorage;
+use utils::generation::Generation;
+use utils::id::TenantId;
+
+pub struct SnapshotDownloader {
+    s3_client: Arc<Client>,
+    s3_root: RootTarget,
+    bucket_config: BucketConfig,
+    tenant_id: TenantId,
+    output_path: Utf8PathBuf,
+    concurrency: usize,
+}
+
+impl SnapshotDownloader {
+    pub async fn new(
+        bucket_config: BucketConfig,
+        tenant_id: TenantId,
+        output_path: Utf8PathBuf,
+        concurrency: usize,
+    ) -> anyhow::Result<Self> {
+        let (s3_client, s3_root) =
+            init_remote_s3(bucket_config.clone(), NodeKind::Pageserver).await?;
+        Ok(Self {
+            s3_client,
+            s3_root,
+            bucket_config,
+            tenant_id,
+            output_path,
+            concurrency,
+        })
+    }
+
+    async fn download_layer(
+        &self,
+        ttid: TenantShardTimelineId,
+        layer_name: LayerName,
+        layer_metadata: LayerFileMetadata,
+    ) -> anyhow::Result<(LayerName, LayerFileMetadata)> {
+        // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format. They use
+        // different layer names (remote-style has the generation suffix)
+        let local_path = self.output_path.join(format!(
+            "{}/timelines/{}/{}{}",
+            ttid.tenant_shard_id,
+            ttid.timeline_id,
+            layer_name,
+            layer_metadata.generation.get_suffix()
+        ));
+
+        // We should only be called for layers that are owned by the input TTID
+        assert_eq!(layer_metadata.shard, ttid.tenant_shard_id.to_index());
+
+        // Assumption: we always write layer files atomically, and layer files are immutable. Therefore if the file
+        // already exists on local disk, we assume it is fully correct and skip it.
+        if tokio::fs::try_exists(&local_path).await? {
+            tracing::debug!("{} already exists", local_path);
+            return Ok((layer_name, layer_metadata));
+        } else {
+            tracing::debug!("{} requires download...", local_path);
+
+            let timeline_root = self.s3_root.timeline_root(&ttid);
+            let remote_layer_path = format!(
+                "{}{}{}",
+                timeline_root.prefix_in_bucket,
+                layer_name,
+                layer_metadata.generation.get_suffix()
+            );
+
+            // List versions: the object might be deleted.
+            let versions = self
+                .s3_client
+                .list_object_versions()
+                .bucket(self.bucket_config.bucket.clone())
+                .prefix(&remote_layer_path)
+                .send()
+                .await?;
+            let Some(version) = versions.versions.as_ref().and_then(|v| v.first()) else {
+                return Err(anyhow::anyhow!("No versions found for {remote_layer_path}"));
+            };
+            download_object_to_file_s3(
+                &self.s3_client,
+                &self.bucket_config.bucket,
+                &remote_layer_path,
+                version.version_id.as_deref(),
+                &local_path,
+            )
+            .await?;
+
+            tracing::debug!("Downloaded successfully to {local_path}");
+        }
+
+        Ok((layer_name, layer_metadata))
+    }
+
+    /// Download many layers belonging to the same TTID, with some concurrency
+    async fn download_layers(
+        &self,
+        ttid: TenantShardTimelineId,
+        layers: Vec<(LayerName, LayerFileMetadata)>,
+    ) -> anyhow::Result<()> {
+        let layer_count = layers.len();
+        tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count);
+        let layers_stream = stream! {
+            for (layer_name, layer_metadata) in layers {
+                yield self.download_layer(ttid, layer_name, layer_metadata);
+            }
+        };
+
+        tokio::fs::create_dir_all(self.output_path.join(format!(
+            "{}/timelines/{}",
+            ttid.tenant_shard_id, ttid.timeline_id
+        )))
+        .await?;
+
+        let layer_results = layers_stream.buffered(self.concurrency);
+        let mut layer_results = std::pin::pin!(layer_results);
+
+        let mut err = None;
+        let mut download_count = 0;
+        while let Some(i) = layer_results.next().await {
+            download_count += 1;
+            match i {
+                Ok((layer_name, layer_metadata)) => {
+                    tracing::info!(
+                        "[{download_count}/{layer_count}] OK: {} bytes {ttid} {}",
+                        layer_metadata.file_size,
+                        layer_name
+                    );
+                }
+                Err(e) => {
+                    // Warn and continue: we will download what we can
+                    tracing::warn!("Download error: {e}");
+                    err = Some(e);
+                }
+            }
+        }
+        if let Some(e) = err {
+            tracing::warn!("Some errors occurred downloading {ttid} layers, last error: {e}");
+            Err(e)
+        } else {
+            Ok(())
+        }
+    }
+
+    async fn download_timeline(
+        &self,
+        ttid: TenantShardTimelineId,
+        index_part: Box<IndexPart>,
+        index_part_generation: Generation,
+        ancestor_layers: &mut HashMap<TenantShardTimelineId, HashMap<LayerName, LayerFileMetadata>>,
+    ) -> anyhow::Result<()> {
+        let index_bytes = serde_json::to_string(&index_part).unwrap();
+
+        let layers = index_part
+            .layer_metadata
+            .into_iter()
+            .filter_map(|(layer_name, layer_metadata)| {
+                if layer_metadata.shard.shard_count != ttid.tenant_shard_id.shard_count {
+                    // Accumulate ancestor layers for later download
+                    let ancestor_ttid = TenantShardTimelineId::new(
+                        TenantShardId {
+                            tenant_id: ttid.tenant_shard_id.tenant_id,
+                            shard_number: layer_metadata.shard.shard_number,
+                            shard_count: layer_metadata.shard.shard_count,
+                        },
+                        ttid.timeline_id,
+                    );
+                    let ancestor_ttid_layers = ancestor_layers.entry(ancestor_ttid).or_default();
+                    use std::collections::hash_map::Entry;
+                    match ancestor_ttid_layers.entry(layer_name) {
+                        Entry::Occupied(entry) => {
+                            // Descendant shards that reference a layer from an ancestor should always have
+                            // matching metadata, like their siblings, because it is read atomically during a shard split.
+                            assert_eq!(entry.get(), &layer_metadata);
+                        }
+                        Entry::Vacant(entry) => {
+                            entry.insert(layer_metadata);
+                        }
+                    }
+                    None
+                } else {
+                    Some((layer_name, layer_metadata))
+                }
+            })
+            .collect();
+
+        let download_result = self.download_layers(ttid, layers).await;
+
+        // Write index last, once all the layers it references are downloaded
+        let local_index_path = self.output_path.join(format!(
+            "{}/timelines/{}/index_part.json{}",
+            ttid.tenant_shard_id,
+            ttid.timeline_id,
+            index_part_generation.get_suffix()
+        ));
+        tokio::fs::write(&local_index_path, index_bytes)
+            .await
+            .context("writing index")?;
+
+        download_result
+    }
+
+    pub async fn download(&self) -> anyhow::Result<()> {
+        let (remote_client, target) =
+            init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?;
+
+        // Generate a stream of TenantShardId
+        let shards = stream_tenant_shards(&remote_client, &target, self.tenant_id).await?;
+        let shards: Vec<TenantShardId> = shards.try_collect().await?;
+
+        // Only read from shards that have the highest count: avoids redundantly downloading
+        // from ancestor shards.
+        let Some(shard_count) = shards.iter().map(|s| s.shard_count).max() else {
+            anyhow::bail!("No shards found");
+        };
+
+        // We will build a collection of layers in ancestor shards to download (this will only
+        // happen if this tenant has been split at some point)
+        let mut ancestor_layers: HashMap<
+            TenantShardTimelineId,
+            HashMap<LayerName, LayerFileMetadata>,
+        > = Default::default();
+
+        for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) {
+            // Generate a stream of TenantTimelineId
+            let timelines = stream_tenant_timelines(&remote_client, &target, shard).await?;
+
+            // Generate a stream of S3TimelineBlobData
+            async fn load_timeline_index(
+                remote_client: &GenericRemoteStorage,
+                target: &RootTarget,
+                ttid: TenantShardTimelineId,
+            ) -> anyhow::Result<(TenantShardTimelineId, RemoteTimelineBlobData)> {
+                let data = list_timeline_blobs(remote_client, ttid, target).await?;
+                Ok((ttid, data))
+            }
+            let timelines =
+                timelines.map_ok(|ttid| load_timeline_index(&remote_client, &target, ttid));
+            let mut timelines = std::pin::pin!(timelines.try_buffered(8));
+
+            while let Some(i) = timelines.next().await {
+                let (ttid, data) = i?;
+                match data.blob_data {
+                    BlobDataParseResult::Parsed {
+                        index_part,
+                        index_part_generation,
+                        s3_layers: _,
+                    } => {
+                        self.download_timeline(
+                            ttid,
+                            index_part,
+                            index_part_generation,
+                            &mut ancestor_layers,
+                        )
+                        .await
+                        .context("Downloading timeline")?;
+                    }
+                    BlobDataParseResult::Relic => {}
+                    BlobDataParseResult::Incorrect { .. } => {
+                        tracing::error!("Bad metadata in timeline {ttid}");
+                    }
+                };
+            }
+        }
+
+        for (ttid, layers) in ancestor_layers.into_iter() {
+            tracing::info!(
+                "Downloading {} layers from ancestor timeline {ttid}...",
+                layers.len()
+            );
+
+            self.download_layers(ttid, layers.into_iter().collect())
+                .await?;
+        }
+
+        Ok(())
+    }
+}
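A layer whose `ShardIndex` carries a different shard count than the downloading timeline's belongs to an ancestor shard, and the downloader defers it as shown above. A simplified sketch of that partitioning rule, with hypothetical stand-in types:

```rust
// Hypothetical simplified types: a layer is (name, shard_count of the shard
// that wrote it); the real code compares ShardIndex::shard_count on
// LayerFileMetadata against the downloading TenantShardId's shard count.
fn partition_layers(
    this_shard_count: u8,
    layers: Vec<(&'static str, u8)>,
) -> (Vec<&'static str>, Vec<&'static str>) {
    let mut owned = Vec::new();
    let mut ancestor = Vec::new();
    for (name, shard_count) in layers {
        if shard_count == this_shard_count {
            owned.push(name); // downloaded with this timeline
        } else {
            ancestor.push(name); // deferred; downloaded once after all shards are scanned
        }
    }
    (owned, ancestor)
}

fn main() {
    let (owned, ancestor) = partition_layers(4, vec![("L1", 4), ("L2", 2), ("L3", 4)]);
    assert_eq!(owned, ["L1", "L3"]);
    assert_eq!(ancestor, ["L2"]); // written before a 2 -> 4 split
}
```

Deferring ancestor layers means each is downloaded exactly once, even when several descendant shards reference it.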
diff --git a/test_runner/README.md b/test_runner/README.md
index 96e74659ce..73aa29d4bb 100644
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -71,20 +71,16 @@ a subdirectory for each version with naming convention `v{PG_VERSION}/`. Inside
 that dir, a `bin/postgres` binary should be present.
 
 `DEFAULT_PG_VERSION`: The version of Postgres to use. This is used to construct full path to the postgres binaries.
-Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION="14"`. Alternatively,
-you can use `--pg-version` argument.
+Format is a 2-digit major version number, i.e. `DEFAULT_PG_VERSION=16`
 
 `TEST_OUTPUT`: Set the directory where test state and test output files
 should go.
 
 `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
 
-`NEON_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as
 `RUST_LOG`: logging configuration to pass into Neon CLI
 
 Useful parameters and commands:
 
-`--pageserver-config-override=${value}` `-c` values to pass into pageserver through neon_local cli
-
 `--preserve-database-files` to preserve pageserver (layer) and safekeeper (segment) timeline files on disk
-after running a test suite. Such files might be large, so removed by default; but might be useful for debugging or creation of svg images with layer file contents.
+after running a test suite. Such files might be large, so removed by default; but might be useful for debugging or creation of svg images with layer file contents. If `NeonEnvBuilder#preserve_database_files` is set to `True` for a particular test, the whole `repo` directory will be attached to the Allure report (thus uploaded to S3) as `everything.tar.zst` for this test.
 
 Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them:
 `./scripts/pytest -s --log-cli-level=INFO ...`
@@ -95,6 +91,166 @@ Exit after the first test failure:
 `./scripts/pytest -x ...`
 
 (there are many more pytest options; run `pytest -h` to see them.)
 
+#### Running Python tests against real S3 or S3-compatible services
+
+Neon's `libs/remote_storage` supports multiple implementations of remote storage.
+At the time of writing, that is
+```rust
+pub enum RemoteStorageKind {
+    /// Storage based on local file system.
+    /// Specify a root folder to place all stored files into.
+    LocalFs(Utf8PathBuf),
+    /// AWS S3 based storage, storing all files in the S3 bucket
+    /// specified by the config
+    AwsS3(S3Config),
+    /// Azure Blob based storage, storing all files in the container
+    /// specified by the config
+    AzureContainer(AzureConfig),
+}
+```
+
+The test suite has a Python enum with the same name but a different meaning:
+
+```python
+@enum.unique
+class RemoteStorageKind(str, enum.Enum):
+    LOCAL_FS = "local_fs"
+    MOCK_S3 = "mock_s3"
+    REAL_S3 = "real_s3"
+```
+
+* `LOCAL_FS` => `LocalFs`
+* `MOCK_S3`: starts [`moto`](https://github.com/getmoto/moto)'s S3 implementation, then configures Pageserver with `AwsS3`
+* `REAL_S3` => configure `AwsS3` as detailed below
+
+When a test in the test suite needs an `AwsS3`, it is supposed to call `remote_storage.s3_storage()`.
+That function checks the env var `ENABLE_REAL_S3_REMOTE_STORAGE`:
+* If it is not set, use `MOCK_S3`
+* If it is set, use `REAL_S3`.
+
+For `REAL_S3`, the test suite creates the dict/toml representation of the `RemoteStorageKind::AwsS3` based on env vars:
+
+```rust
+pub struct S3Config {
+    // test suite env var: REMOTE_STORAGE_S3_BUCKET
+    pub bucket_name: String,
+    // test suite env var: REMOTE_STORAGE_S3_REGION
+    pub bucket_region: String,
+    // test suite determines this
+    pub prefix_in_bucket: Option<String>,
+    // no env var exists; test suite sets it for MOCK_S3, because that's how moto works
+    pub endpoint: Option<String>,
+    ...
+}
+```
+
+*Credentials* are not part of the config, but discovered by the AWS SDK.
+See the `libs/remote_storage` Rust code.
+The test suite supports two mechanisms (`remote_storage.py`):
+
+**Credential mechanism 1**: env vars `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.
+Populate the env vars with AWS access keys that you created in IAM.
+Our CI uses this mechanism.
+However, it is _not_ recommended for interactive use by developers ([learn more](https://docs.aws.amazon.com/sdkref/latest/guide/access-users.html#credentials-long-term)).
+Instead, use profiles (next section).
+
+**Credential mechanism 2**: env var `AWS_PROFILE`.
+This uses the AWS SDK's (and CLI's) profile mechanism.
+Learn more about it [in the official docs](https://docs.aws.amazon.com/sdkref/latest/guide/file-format.html).
+After configuring a profile (e.g. via the aws CLI), set the env var to its name.
+
+In conclusion, the full command line is:
+
+```bash
+# with long-term AWS access keys
+ENABLE_REAL_S3_REMOTE_STORAGE=true \
+REMOTE_STORAGE_S3_BUCKET=mybucket \
+REMOTE_STORAGE_S3_REGION=eu-central-1 \
+AWS_ACCESS_KEY_ID=... \
+AWS_SECRET_ACCESS_KEY=... \
+./scripts/pytest
+```
+
+```bash
+# with AWS_PROFILE
+ENABLE_REAL_S3_REMOTE_STORAGE=true \
+REMOTE_STORAGE_S3_BUCKET=mybucket \
+REMOTE_STORAGE_S3_REGION=eu-central-1 \
+AWS_PROFILE=... \
+./scripts/pytest
+```
+
+If you're using SSO, make sure to `aws sso login --profile $AWS_PROFILE` first.
+
+##### Minio
+
+If you want to run tests without the cloud setup, we recommend [minio](https://min.io/docs/minio/linux/index.html).
+
+```bash
+# Start in Terminal 1
+mkdir /tmp/minio_data
+minio server /tmp/minio_data --console-address 127.0.0.1:9001 --address 127.0.0.1:9000
+```
+
+In another terminal, create an `aws` CLI profile for it:
+
+```ini
+# append to ~/.aws/config
+[profile local-minio]
+services = local-minio-services
+[services local-minio-services]
+s3 =
+  endpoint_url=http://127.0.0.1:9000/
+```
+
+Now configure the credentials (this is going to write `~/.aws/credentials` for you).
+It's an interactive prompt.
+
+```bash
+# Terminal 2
+$ aws --profile local-minio configure
+AWS Access Key ID [None]: minioadmin
+AWS Secret Access Key [None]: minioadmin
+Default region name [None]:
+Default output format [None]:
+```
+
+Now create a bucket `mybucket` using the CLI.
+
+```bash
+# (don't forget to have the AWS_PROFILE env var set; or use --profile)
+aws --profile local-minio s3 mb s3://mybucket
+```
+
+(If it doesn't work, make sure you update your AWS CLI to a recent version.
+ The [service-specific endpoint feature](https://docs.aws.amazon.com/sdkref/latest/guide/feature-ss-endpoints.html)
+ that we're using is quite new.)
+
+```bash
+# with AWS_PROFILE
+ENABLE_REAL_S3_REMOTE_STORAGE=true \
+REMOTE_STORAGE_S3_BUCKET=mybucket \
+REMOTE_STORAGE_S3_REGION=doesntmatterforminio \
+AWS_PROFILE=local-minio \
+./scripts/pytest
+```
+
+NB: you can avoid the `--profile` flag by setting the `AWS_PROFILE` variable.
+Just like the AWS SDKs, the `aws` CLI is sensitive to it.
+
+#### Running Rust tests against real S3 or S3-compatible services
+
+We have some Rust tests that only run against real S3, e.g., [here](https://github.com/neondatabase/neon/blob/c18d3340b5e3c978a81c3db8b6f1e83cd9087e8a/libs/remote_storage/tests/test_real_s3.rs#L392-L397).
+
+They use the same env vars as the Python test suite (see previous section)
+but interpret them on their own.
+However, at this time, the interpretation is identical.
+
+So, the above instructions apply to the Rust tests as well.
+
 ### Writing a test
 
 Every test needs a Neon Environment, or NeonEnv to operate in. A Neon Environment
@@ -128,6 +284,21 @@ def test_foobar(neon_env_builder: NeonEnvBuilder):
     ...
 ```
 
+The env includes a default tenant and timeline. Therefore, you do not need to create your own
+tenant/timeline for testing.
+ +```python +def test_foobar2(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() # Start the environment + with env.endpoints.create_start("main") as endpoint: + # Start the compute endpoint + client = env.pageserver.http_client() # Get the pageserver client + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id) +``` + For more information about pytest fixtures, see https://docs.pytest.org/en/stable/fixture.html At the end of a test, all the nodes in the environment are automatically stopped, so you diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 200c9c3740..996ca4d652 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -2,6 +2,8 @@ pytest_plugins = ( "fixtures.pg_version", "fixtures.parametrize", "fixtures.httpserver", + "fixtures.compute_reconfigure", + "fixtures.storage_controller_proxy", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index e7959c1764..0c36cd6ef7 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -19,9 +19,9 @@ from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest from _pytest.terminal import TerminalReporter +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonPageserver -from fixtures.types import TenantId, TimelineId """ This file contains fixtures for micro-benchmarks. @@ -222,6 +222,8 @@ class NeonBenchmarker: function by the zenbenchmark fixture """ + PROPERTY_PREFIX = "neon_benchmarker_" + def __init__(self, property_recorder: Callable[[str, object], None]): # property recorder here is a pytest fixture provided by junitxml module # https://docs.pytest.org/en/6.2.x/reference.html#pytest.junitxml.record_property @@ -238,7 +240,7 @@ class NeonBenchmarker: Record a benchmark result. 
""" # just to namespace the value - name = f"neon_benchmarker_{metric_name}" + name = f"{self.PROPERTY_PREFIX}_{metric_name}" self.property_recorder( name, { @@ -249,6 +251,18 @@ class NeonBenchmarker: }, ) + @classmethod + def records( + cls, user_properties: list[tuple[str, object]] + ) -> Iterator[tuple[str, dict[str, object]]]: + """ + Yield all records related to benchmarks + """ + for property_name, recorded_property in user_properties: + if property_name.startswith(cls.PROPERTY_PREFIX): + assert isinstance(recorded_property, dict) + yield recorded_property["name"], recorded_property + @contextmanager def record_duration(self, metric_name: str) -> Iterator[None]: """ @@ -425,10 +439,11 @@ def zenbenchmark( yield benchmarker results = {} - for _, recorded_property in request.node.user_properties: + for _, recorded_property in NeonBenchmarker.records(request.node.user_properties): name = recorded_property["name"] value = str(recorded_property["value"]) - if (unit := recorded_property["unit"].strip()) != "": + unit = str(recorded_property["unit"]).strip() + if unit != "": value += f" {unit}" results[name] = value @@ -477,25 +492,23 @@ def pytest_terminal_summary( for test_report in terminalreporter.stats.get("passed", []): result_entry = [] - for _, recorded_property in test_report.user_properties: + for _, recorded_property in NeonBenchmarker.records(test_report.user_properties): if not is_header_printed: terminalreporter.section("Benchmark results", "-") is_header_printed = True - terminalreporter.write( - "{}.{}: ".format(test_report.head_line, recorded_property["name"]) - ) + terminalreporter.write(f"{test_report.head_line}.{recorded_property['name']}: ") unit = recorded_property["unit"] value = recorded_property["value"] if unit == "MB": - terminalreporter.write("{0:,.0f}".format(value), green=True) + terminalreporter.write(f"{value:,.0f}", green=True) elif unit in ("s", "ms") and isinstance(value, float): - terminalreporter.write("{0:,.3f}".format(value), green=True) + terminalreporter.write(f"{value:,.3f}", green=True) elif isinstance(value, float): - terminalreporter.write("{0:,.4f}".format(value), green=True) + terminalreporter.write(f"{value:,.4f}", green=True) else: terminalreporter.write(str(value), green=True) - terminalreporter.line(" {}".format(unit)) + terminalreporter.line(f" {unit}") result_entry.append(recorded_property) diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/common_types.py similarity index 75% rename from test_runner/fixtures/types.py rename to test_runner/fixtures/common_types.py index ea648e460d..8eda19d1e2 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/common_types.py @@ -1,10 +1,13 @@ import random from dataclasses import dataclass +from enum import Enum from functools import total_ordering -from typing import Any, Type, TypeVar, Union +from typing import Any, Dict, Type, TypeVar, Union T = TypeVar("T", bound="Id") +DEFAULT_WAL_SEG_SIZE = 16 * 1024 * 1024 + @total_ordering class Lsn: @@ -67,6 +70,21 @@ class Lsn: def as_int(self) -> int: return self.lsn_int + def segment_lsn(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> "Lsn": + return Lsn(self.lsn_int - (self.lsn_int % seg_sz)) + + def segno(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> int: + return self.lsn_int // seg_sz + + def segment_name(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> str: + segno = self.segno(seg_sz) + # The filename format is 00000001XXXXXXXX000000YY, where XXXXXXXXYY is segno in hex. 
+    # XXXXXXXX is the higher 8 hex digits of segno
+    high_bits = segno >> 8
+    # YY is the lower 2 hex digits of segno
+    low_bits = segno & 0xFF
+    return f"00000001{high_bits:08X}000000{low_bits:02X}"
+
 
 @dataclass(frozen=True)
 class Key:
@@ -126,6 +144,22 @@ class TimelineId(Id):
     def __repr__(self) -> str:
         return f'TimelineId("{self.id.hex()}")'
 
+    def __str__(self) -> str:
+        return self.id.hex()
+
+
+@dataclass
+class TenantTimelineId:
+    tenant_id: TenantId
+    timeline_id: TimelineId
+
+    @classmethod
+    def from_json(cls, d: Dict[str, Any]) -> "TenantTimelineId":
+        return TenantTimelineId(
+            tenant_id=TenantId(d["tenant_id"]),
+            timeline_id=TimelineId(d["timeline_id"]),
+        )
+
 
 # Workaround for compat with python 3.9, which does not have `typing.Self`
 TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId")
@@ -156,7 +190,14 @@ class TenantShardId:
             raise ValueError(f"Invalid TenantShardId '{input}'")
 
     def __str__(self):
-        return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}"
+        if self.shard_count > 0:
+            return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}"
+        else:
+            # Unsharded case: equivalent of Rust TenantShardId::unsharded(tenant_id)
+            return str(self.tenant_id)
+
+    def __repr__(self):
+        return self.__str__()
 
     def _tuple(self) -> tuple[TenantId, int, int]:
         return (self.tenant_id, self.shard_number, self.shard_count)
@@ -173,3 +214,9 @@ class TenantShardId:
 
     def __hash__(self) -> int:
         return hash(self._tuple())
+
+
+# TODO: Replace with `StrEnum` when we upgrade to python 3.11
+class TimelineArchivalState(str, Enum):
+    ARCHIVED = "Archived"
+    UNARCHIVED = "Unarchived"
diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index 6fbaa08512..7c4a8db36f 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -1,3 +1,5 @@
+import os
+import time
 from abc import ABC, abstractmethod
 from contextlib import _GeneratorContextManager, contextmanager
 
@@ -8,6 +10,7 @@
 import pytest
 from _pytest.fixtures import FixtureRequest
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     PgBin,
@@ -42,7 +45,11 @@ class PgCompare(ABC):
         pass
 
     @abstractmethod
-    def flush(self):
+    def flush(self, compact: bool = False, gc: bool = False):
+        pass
+
+    @abstractmethod
+    def compact(self):
         pass
 
     @abstractmethod
@@ -98,7 +105,6 @@ class NeonCompare(PgCompare):
         zenbenchmark: NeonBenchmarker,
         neon_simple_env: NeonEnv,
         pg_bin: PgBin,
-        branch_name: str,
     ):
         self.env = neon_simple_env
         self._zenbenchmark = zenbenchmark
@@ -106,18 +112,11 @@ class NeonCompare(PgCompare):
         self.pageserver_http_client = self.env.pageserver.http_client()
 
         # note that neon_simple_env now uses LOCAL_FS remote storage
-
-        # Create tenant
-        tenant_conf: Dict[str, str] = {}
-        if False:  # TODO add pytest setting for this
-            tenant_conf["trace_read_requests"] = "true"
-        self.tenant, _ = self.env.neon_cli.create_tenant(conf=tenant_conf)
-
-        # Create timeline
-        self.timeline = self.env.neon_cli.create_timeline(branch_name, tenant_id=self.tenant)
+        self.tenant = self.env.initial_tenant
+        self.timeline = self.env.initial_timeline
 
         # Start pg
-        self._pg = self.env.endpoints.create_start(branch_name, "main", self.tenant)
+        self._pg = self.env.endpoints.create_start("main", "main", self.tenant)
 
     @property
     def pg(self) -> PgProtocol:
@@ -131,13 +130,16 @@ class NeonCompare(PgCompare):
     def pg_bin(self) -> PgBin:
         return self._pg_bin
 
-    def flush(self):
+    def flush(self, compact: bool = True, gc: bool = True):
         wait_for_last_flush_lsn(self.env, self._pg, self.tenant, self.timeline)
-        self.pageserver_http_client.timeline_checkpoint(self.tenant, self.timeline)
-        self.pageserver_http_client.timeline_gc(self.tenant, self.timeline, 0)
+        self.pageserver_http_client.timeline_checkpoint(self.tenant, self.timeline, compact=compact)
+        if gc:
+            self.pageserver_http_client.timeline_gc(self.tenant, self.timeline, 0)
 
     def compact(self):
-        self.pageserver_http_client.timeline_compact(self.tenant, self.timeline)
+        self.pageserver_http_client.timeline_compact(
+            self.tenant, self.timeline, wait_until_uploaded=True
+        )
 
     def report_peak_memory_use(self):
         self.zenbenchmark.record(
@@ -155,12 +157,23 @@ class NeonCompare(PgCompare):
             "size", timeline_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER
         )
 
-        metric_filters = {"tenant_id": str(self.tenant), "timeline_id": str(self.timeline)}
+        metric_filters = {
+            "tenant_id": str(self.tenant),
+            "timeline_id": str(self.timeline),
+            "file_kind": "layer",
+            "op_kind": "upload",
+        }
+        # use `started` (not `finished`) counters here, because some callers
+        # don't wait for the upload queue to drain
         total_files = self.zenbenchmark.get_int_counter_value(
-            self.env.pageserver, "pageserver_created_persistent_files_total", metric_filters
+            self.env.pageserver,
+            "pageserver_remote_timeline_client_calls_started_total",
+            metric_filters,
         )
         total_bytes = self.zenbenchmark.get_int_counter_value(
-            self.env.pageserver, "pageserver_written_persistent_bytes_total", metric_filters
+            self.env.pageserver,
+            "pageserver_remote_timeline_client_bytes_started_total",
+            metric_filters,
         )
         self.zenbenchmark.record(
             "data_uploaded", total_bytes / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER
@@ -206,9 +219,12 @@ class VanillaCompare(PgCompare):
     def pg_bin(self) -> PgBin:
         return self._pg.pg_bin
 
-    def flush(self):
+    def flush(self, compact: bool = False, gc: bool = False):
         self.cur.execute("checkpoint")
 
+    def compact(self):
+        pass
+
     def report_peak_memory_use(self):
         pass  # TODO find something
 
@@ -257,6 +273,9 @@ class RemoteCompare(PgCompare):
         # TODO: flush the remote pageserver
         pass
 
+    def compact(self):
+        pass
+
     def report_peak_memory_use(self):
         # TODO: get memory usage from remote pageserver
         pass
@@ -275,13 +294,11 @@
 
 @pytest.fixture(scope="function")
 def neon_compare(
-    request: FixtureRequest,
     zenbenchmark: NeonBenchmarker,
     pg_bin: PgBin,
     neon_simple_env: NeonEnv,
 ) -> NeonCompare:
-    branch_name = request.node.name
-    return NeonCompare(zenbenchmark, neon_simple_env, pg_bin, branch_name)
+    return NeonCompare(zenbenchmark, neon_simple_env, pg_bin)
 
 
 @pytest.fixture(scope="function")
@@ -319,3 +336,26 @@ def neon_with_baseline(request: FixtureRequest) -> PgCompare:
     fixture = request.getfixturevalue(request.param)
     assert isinstance(fixture, PgCompare), f"test error: fixture {fixture} is not PgCompare"
     return fixture
+
+
+@pytest.fixture(scope="function", autouse=True)
+def sync_after_each_test():
+    # The fixture calls `sync(2)` after each test if the `SYNC_AFTER_EACH_TEST` env var is `true`
+    #
+    # In CI, `SYNC_AFTER_EACH_TEST` is set to `true` only for benchmarks (`test_runner/performance`)
+    # that run on self-hosted runners, because some of these tests are write-heavy enough
+    # to make it hard for subsequent processes to start within their 10s timeout
+    key = "SYNC_AFTER_EACH_TEST"
+    enabled = os.environ.get(key) == "true"
+
+    yield
+
+    if not enabled:
+        # regress test, or running locally
+        return
+
+    start = time.time()
+    # we only run benches on unices; os.sync() might not exist on windows
+    os.sync()
+    elapsed = time.time() - start
+    log.info(f"called sync after test {elapsed=}")
diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py
new file mode 100644
index 0000000000..66fc35b6aa
--- /dev/null
+++ b/test_runner/fixtures/compute_reconfigure.py
@@ -0,0 +1,73 @@
+import concurrent.futures
+from typing import Any
+
+import pytest
+from werkzeug.wrappers.request import Request
+from werkzeug.wrappers.response import Response
+
+from fixtures.common_types import TenantId
+from fixtures.log_helper import log
+
+
+class ComputeReconfigure:
+    def __init__(self, server):
+        self.server = server
+        self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach"
+        self.workloads = {}
+        self.on_notify = None
+
+    def register_workload(self, workload):
+        self.workloads[workload.tenant_id] = workload
+
+    def register_on_notify(self, fn):
+        """
+        Add some extra work during a notification, like sleeping to slow things down, or
+        logging what was notified.
+        """
+        self.on_notify = fn
+
+
+@pytest.fixture(scope="function")
+def compute_reconfigure_listener(make_httpserver):
+    """
+    This fixture exposes an HTTP listener for the storage controller to submit
+    compute notifications to us, instead of updating neon_local endpoints itself.
+
+    Although the storage controller can use neon_local directly, this causes problems when
+    the test is also concurrently modifying endpoints. Instead, configure the storage controller
+    to send notifications up to this test code, which will route all endpoint updates
+    through Workload, which has a mutex to make concurrent updates safe.
+    """
+    server = make_httpserver
+
+    self = ComputeReconfigure(server)
+
+    # Do neon_local endpoint reconfiguration in the background so that we can
+    # accept a healthy rate of calls into notify-attach.
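# A usage sketch for this fixture (illustrative; the builder attribute and
# registration hooks are defined above, the test body itself is assumed):
#
#     def test_example(neon_env_builder, compute_reconfigure_listener):
#         neon_env_builder.control_plane_compute_hook_api = (
#             compute_reconfigure_listener.control_plane_compute_hook_api
#         )
#         env = neon_env_builder.init_start()
#         # e.g. slow down each notification to widen race windows
#         compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.1))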
+    reconfigure_threads = concurrent.futures.ThreadPoolExecutor(max_workers=1)
+
+    def handler(request: Request):
+        assert request.json is not None
+        body: dict[str, Any] = request.json
+        log.info(f"notify-attach request: {body}")
+
+        if self.on_notify is not None:
+            self.on_notify(body)
+
+        try:
+            workload = self.workloads[TenantId(body["tenant_id"])]
+        except KeyError:
+            pass
+        else:
+            # This causes the endpoint to query the storage controller for its location, which
+            # is redundant since we already have it here, but this avoids extending the
+            # neon_local CLI to take full lists of locations
+            reconfigure_threads.submit(lambda workload=workload: workload.reconfigure())  # type: ignore[no-any-return]
+
+        return Response(status=200)
+
+    self.server.expect_request("/notify-attach", method="PUT").respond_with_handler(handler)
+
+    yield self
+    reconfigure_threads.shutdown()
+    server.clear()
diff --git a/test_runner/fixtures/endpoint/__init__.py b/test_runner/fixtures/endpoint/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py
new file mode 100644
index 0000000000..42f0539c19
--- /dev/null
+++ b/test_runner/fixtures/endpoint/http.py
@@ -0,0 +1,23 @@
+import requests
+from requests.adapters import HTTPAdapter
+
+
+class EndpointHttpClient(requests.Session):
+    def __init__(
+        self,
+        port: int,
+    ):
+        super().__init__()
+        self.port = port
+
+        self.mount("http://", HTTPAdapter())
+
+    def dbs_and_roles(self):
+        res = self.get(f"http://localhost:{self.port}/dbs_and_roles")
+        res.raise_for_status()
+        return res.json()
+
+    def database_schema(self, database: str):
+        res = self.get(f"http://localhost:{self.port}/database_schema?database={database}")
+        res.raise_for_status()
+        return res.text
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 7c489bda67..cda70be8da 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -4,6 +4,8 @@ from typing import Dict, List, Optional, Tuple
 from prometheus_client.parser import text_string_to_metric_families
 from prometheus_client.samples import Sample
 
+from fixtures.log_helper import log
+
 
 class Metrics:
     metrics: Dict[str, List[Sample]]
@@ -31,6 +33,60 @@ class Metrics:
         return res[0]
 
 
+class MetricsGetter:
+    """
+    Mixin for types that implement a `get_metrics` function and would like associated
+    helpers for querying the metrics
+    """
+
+    def get_metrics(self) -> Metrics:
+        raise NotImplementedError()
+
+    def get_metric_value(
+        self, name: str, filter: Optional[Dict[str, str]] = None
+    ) -> Optional[float]:
+        metrics = self.get_metrics()
+        results = metrics.query_all(name, filter=filter)
+        if not results:
+            log.info(f'could not find metric "{name}"')
+            return None
+        assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
+        return results[0].value
+
+    def get_metrics_values(
+        self, names: list[str], filter: Optional[Dict[str, str]] = None, absence_ok=False
+    ) -> Dict[str, float]:
+        """
+        When fetching multiple named metrics, it is more efficient to use this
+        than to call `get_metric_value` repeatedly.
+
+        Throws RuntimeError if no metrics matching `names` are found, or if
+        not all of `names` are found: this method is intended for loading sets
+        of metrics whose existence is coupled.
+
+        If it's expected that there may be no results for some of the metrics,
+        specify `absence_ok=True`. The returned dict will then not contain values
+        for these metrics.
+        """
+        metrics = self.get_metrics()
+        samples = []
+        for name in names:
+            samples.extend(metrics.query_all(name, filter=filter))
+
+        result = {}
+        for sample in samples:
+            if sample.name in result:
+                raise RuntimeError(f"Multiple values found for {sample.name}")
+            result[sample.name] = sample.value
+
+        if not absence_ok:
+            if len(result) != len(names):
+                log.info(f"Metrics found: {metrics.metrics}")
+                raise RuntimeError(f"could not find all metrics {' '.join(names)}")
+
+        return result
+
+
 def parse_metrics(text: str, name: str = "") -> Metrics:
     metrics = Metrics(name)
     gen = text_string_to_metric_families(text)
@@ -47,7 +103,8 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]:
 
 PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
-    "pageserver_remote_timeline_client_calls_unfinished",
+    "pageserver_remote_timeline_client_calls_started_total",
+    "pageserver_remote_timeline_client_calls_finished_total",
     "pageserver_remote_physical_size",
     "pageserver_remote_timeline_client_bytes_started_total",
     "pageserver_remote_timeline_client_bytes_finished_total",
@@ -61,8 +118,6 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
     "libmetrics_launch_timestamp",
     "libmetrics_build_info",
     "libmetrics_tracing_event_count_total",
-    "pageserver_materialized_cache_hits_total",
-    "pageserver_materialized_cache_hits_direct_total",
     "pageserver_page_cache_read_hits_total",
     "pageserver_page_cache_read_accesses_total",
     "pageserver_page_cache_size_current_bytes",
@@ -72,13 +127,14 @@ (
     "pageserver_getpage_reconstruct_seconds_sum",
     *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
     *histogram("pageserver_smgr_query_seconds_global"),
-    *histogram("pageserver_read_num_fs_layers"),
+    *histogram("pageserver_layers_visited_per_read_global"),
     *histogram("pageserver_getpage_get_reconstruct_data_seconds"),
     *histogram("pageserver_wait_lsn_seconds"),
     *histogram("pageserver_remote_operation_seconds"),
-    *histogram("pageserver_remote_timeline_client_calls_started"),
     *histogram("pageserver_io_operations_seconds"),
     "pageserver_tenant_states_count",
+    "pageserver_circuit_breaker_broken_total",
+    "pageserver_circuit_breaker_unbroken_total",
 )
 
@@ -86,15 +142,22 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_resident_physical_size",
     "pageserver_io_operations_bytes_total",
     "pageserver_last_record_lsn",
+    "pageserver_standby_horizon",
     "pageserver_smgr_query_seconds_bucket",
     "pageserver_smgr_query_seconds_count",
     "pageserver_smgr_query_seconds_sum",
+    "pageserver_archive_size",
+    "pageserver_pitr_history_size",
+    "pageserver_layer_bytes",
+    "pageserver_layer_count",
+    "pageserver_visible_physical_size",
     "pageserver_storage_operations_seconds_count_total",
     "pageserver_storage_operations_seconds_sum_total",
-    "pageserver_created_persistent_files_total",
-    "pageserver_written_persistent_bytes_total",
     "pageserver_evictions_total",
     "pageserver_evictions_with_low_residence_duration_total",
+    "pageserver_aux_file_estimated_size",
+    "pageserver_valid_lsn_lease_count",
     *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
-    # pageserver_broken_tenants_count is a leaked "metric" which is "cleared" on restart or reload
+    # "pageserver_directory_entries_count", -- only used if above a certain threshold
+    # "pageserver_broken_tenants_count" -- used only for broken tenants
 )
diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py
new file mode 100644
index 0000000000..0636cfad06
--- /dev/null
+++ b/test_runner/fixtures/neon_api.py
@@ -0,0 +1,307 @@
+from __future__ import annotations
+
+import time
+from typing import TYPE_CHECKING, cast
+
+import requests
+
+if TYPE_CHECKING:
+    from typing import Any, Dict, Literal, Optional, Union
+
+    from fixtures.pg_version import PgVersion
+
+
+def connection_parameters_to_env(params: Dict[str, str]) -> Dict[str, str]:
+    return {
+        "PGHOST": params["host"],
+        "PGDATABASE": params["database"],
+        "PGUSER": params["role"],
+        "PGPASSWORD": params["password"],
+    }
+
+
+class NeonAPI:
+    def __init__(self, neon_api_key: str, neon_api_base_url: str):
+        self.__neon_api_key = neon_api_key
+        self.__neon_api_base_url = neon_api_base_url.strip("/")
+
+    def __request(
+        self, method: Union[str, bytes], endpoint: str, **kwargs: Any
+    ) -> requests.Response:
+        if "headers" not in kwargs:
+            kwargs["headers"] = {}
+        kwargs["headers"]["Authorization"] = f"Bearer {self.__neon_api_key}"
+
+        return requests.request(method, f"{self.__neon_api_base_url}{endpoint}", **kwargs)
+
+    def create_project(
+        self,
+        pg_version: Optional[PgVersion] = None,
+        name: Optional[str] = None,
+        branch_name: Optional[str] = None,
+        branch_role_name: Optional[str] = None,
+        branch_database_name: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        data: Dict[str, Any] = {
+            "project": {
+                "branch": {},
+            },
+        }
+        if name:
+            data["project"]["name"] = name
+        if pg_version:
+            data["project"]["pg_version"] = int(pg_version)
+        if branch_name:
+            data["project"]["branch"]["name"] = branch_name
+        if branch_role_name:
+            data["project"]["branch"]["role_name"] = branch_role_name
+        if branch_database_name:
+            data["project"]["branch"]["database_name"] = branch_database_name
+
+        resp = self.__request(
+            "POST",
+            "/projects",
+            headers={
+                "Accept": "application/json",
+                "Content-Type": "application/json",
+            },
+            json=data,
+        )
+
+        assert resp.status_code == 201
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def get_project_details(self, project_id: str) -> Dict[str, Any]:
+        resp = self.__request(
+            "GET",
+            f"/projects/{project_id}",
+            headers={
+                "Accept": "application/json",
+                "Content-Type": "application/json",
+            },
+        )
+        assert resp.status_code == 200
+        return cast("Dict[str, Any]", resp.json())
+
+    def delete_project(
+        self,
+        project_id: str,
+    ) -> Dict[str, Any]:
+        resp = self.__request(
+            "DELETE",
+            f"/projects/{project_id}",
+            headers={
+                "Accept": "application/json",
+                "Content-Type": "application/json",
+            },
+        )
+
+        assert resp.status_code == 200
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def start_endpoint(
+        self,
+        project_id: str,
+        endpoint_id: str,
+    ) -> Dict[str, Any]:
+        resp = self.__request(
+            "POST",
+            f"/projects/{project_id}/endpoints/{endpoint_id}/start",
+            headers={
+                "Accept": "application/json",
+            },
+        )
+
+        assert resp.status_code == 200
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def suspend_endpoint(
+        self,
+        project_id: str,
+        endpoint_id: str,
+    ) -> Dict[str, Any]:
+        resp = self.__request(
+            "POST",
+            f"/projects/{project_id}/endpoints/{endpoint_id}/suspend",
+            headers={
+                "Accept": "application/json",
+            },
+        )
+
+        assert resp.status_code == 200
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def restart_endpoint(
+        self,
+        project_id: str,
+        endpoint_id: str,
+    ) -> Dict[str, Any]:
+        resp = self.__request(
+            "POST",
+            f"/projects/{project_id}/endpoints/{endpoint_id}/restart",
+            headers={
+                "Accept": "application/json",
+            },
+        )
+
+        assert resp.status_code == 200
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def create_endpoint(
+        self,
+        project_id: str,
+        branch_id: str,
+        endpoint_type: Literal["read_write", "read_only"],
+        settings: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        data: Dict[str, Any] = {
+            "endpoint": {
+                "branch_id": branch_id,
+            },
+        }
+
+        if endpoint_type:
+            data["endpoint"]["type"] = endpoint_type
+        if settings:
+            data["endpoint"]["settings"] = settings
+
+        resp = self.__request(
+            "POST",
+            f"/projects/{project_id}/endpoints",
+            headers={
+                "Accept": "application/json",
+                "Content-Type": "application/json",
+            },
+            json=data,
+        )
+
+        assert resp.status_code == 201
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def get_connection_uri(
+        self,
+        project_id: str,
+        branch_id: Optional[str] = None,
+        endpoint_id: Optional[str] = None,
+        database_name: str = "neondb",
+        role_name: str = "neondb_owner",
+        pooled: bool = True,
+    ) -> Dict[str, Any]:
+        resp = self.__request(
+            "GET",
+            f"/projects/{project_id}/connection_uri",
+            params={
+                "branch_id": branch_id,
+                "endpoint_id": endpoint_id,
+                "database_name": database_name,
+                "role_name": role_name,
+                "pooled": pooled,
+            },
+            headers={
+                "Accept": "application/json",
+            },
+        )
+
+        assert resp.status_code == 200
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def get_branches(self, project_id: str) -> Dict[str, Any]:
+        resp = self.__request(
+            "GET",
+            f"/projects/{project_id}/branches",
+            headers={
+                "Accept": "application/json",
+            },
+        )
+
+        assert resp.status_code == 200
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def get_endpoints(self, project_id: str) -> Dict[str, Any]:
+        resp = self.__request(
+            "GET",
+            f"/projects/{project_id}/endpoints",
+            headers={
+                "Accept": "application/json",
+            },
+        )
+
+        assert resp.status_code == 200
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def get_operations(self, project_id: str) -> Dict[str, Any]:
+        resp = self.__request(
+            "GET",
+            f"/projects/{project_id}/operations",
+            headers={
+                "Accept": "application/json",
+                "Authorization": f"Bearer {self.__neon_api_key}",
+            },
+        )
+
+        assert resp.status_code == 200
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def wait_for_operation_to_finish(self, project_id: str):
+        has_running = True
+        while has_running:
+            has_running = False
+            operations = self.get_operations(project_id)["operations"]
+            for op in operations:
+                if op["status"] in {"scheduling", "running", "cancelling"}:
+                    has_running = True
+            time.sleep(0.5)
+
+
+class NeonApiEndpoint:
+    def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: Optional[str]):
+        self.neon_api = neon_api
+        if project_id is None:
+            project = neon_api.create_project(pg_version)
+            neon_api.wait_for_operation_to_finish(project["project"]["id"])
+            self.project_id = project["project"]["id"]
+            self.endpoint_id = project["endpoints"][0]["id"]
+            self.connstr = project["connection_uris"][0]["connection_uri"]
+            self.pgbench_env = connection_parameters_to_env(
+                project["connection_uris"][0]["connection_parameters"]
+            )
+            self.is_new = True
+        else:
+            project = neon_api.get_project_details(project_id)
+            if int(project["project"]["pg_version"]) != int(pg_version):
+                raise Exception(
+                    f"A project with the provided ID exists, but it's not of the specified version (expected {pg_version}, got {project['project']['pg_version']})"
+                )
+            self.project_id = project_id
+            eps = neon_api.get_endpoints(project_id)["endpoints"]
+            self.endpoint_id = eps[0]["id"]
+            self.connstr = neon_api.get_connection_uri(
+                project_id, endpoint_id=self.endpoint_id, pooled=False
+            )["uri"]
+            pw = self.connstr.split("@")[0].split(":")[-1]
+            self.pgbench_env = {
+                "PGHOST": eps[0]["host"],
+                "PGDATABASE": "neondb",
+                "PGUSER": "neondb_owner",
+                "PGPASSWORD": pw,
+            }
+            self.is_new = False
+
+    def restart(self):
+        self.neon_api.restart_endpoint(self.project_id, self.endpoint_id)
+        self.neon_api.wait_for_operation_to_finish(self.project_id)
+
+    def get_synthetic_storage_size(self) -> int:
+        return int(
+            self.neon_api.get_project_details(self.project_id)["project"]["synthetic_storage_size"]
+        )
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index fd5e77671b..18fbbde637 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import abc
 import asyncio
+import concurrent.futures
 import filecmp
 import json
 import os
@@ -13,21 +14,38 @@
 import textwrap
 import threading
 import time
 import uuid
+from collections import defaultdict
 from contextlib import closing, contextmanager
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from datetime import datetime
+from enum import Enum
 from fcntl import LOCK_EX, LOCK_UN, flock
-from functools import cached_property
+from functools import cached_property, partial
 from itertools import chain, product
 from pathlib import Path
 from types import TracebackType
-from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
-from urllib.parse import urlparse
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    TypeVar,
+    Union,
+    cast,
+)
+from urllib.parse import quote, urlparse
 
 import asyncpg
 import backoff
+import httpx
 import jwt
 import psycopg2
+import psycopg2.sql
 import pytest
 import requests
 import toml
@@ -44,37 +62,48 @@
 from urllib3.util.retry import Retry
 
 from fixtures import overlayfs
 from fixtures.broker import NeonBroker
+from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
+from fixtures.endpoint.http import EndpointHttpClient
 from fixtures.log_helper import log
+from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pageserver.allowed_errors import (
     DEFAULT_PAGESERVER_ALLOWED_ERRORS,
-    scan_pageserver_log_for_errors,
+    DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS,
 )
+from fixtures.pageserver.common_types import IndexPartDump, LayerName, parse_layer_file_name
 from fixtures.pageserver.http import PageserverHttpClient
-from fixtures.pageserver.types import IndexPartDump
 from fixtures.pageserver.utils import (
     wait_for_last_record_lsn,
-    wait_for_upload,
 )
 from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import (
+    LocalFsStorage,
     MockS3Server,
     RemoteStorage,
     RemoteStorageKind,
     RemoteStorageUser,
     S3Storage,
     default_remote_storage,
-    remote_storage_to_toml_inline_table,
+    remote_storage_to_toml_dict,
 )
-from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
+from fixtures.safekeeper.http import SafekeeperHttpClient
+from fixtures.safekeeper.utils import are_walreceivers_absent
 from fixtures.utils import (
     ATTACHMENT_NAME_REGEX,
     allure_add_grafana_links,
     allure_attach_from_dir,
+    assert_no_errors,
     get_self_dir,
+    print_gc_result,
     subprocess_capture,
     wait_until,
 )
+from fixtures.utils import AuxFileStore as AuxFileStore  # reexport
+
+from .neon_api import NeonAPI, NeonApiEndpoint
+
+T = TypeVar("T")
 
 """
 This file contains pytest fixtures. A fixture is a test resource that can be
@@ -173,6 +202,25 @@
     yield versioned_dir
 
 
+@pytest.fixture(scope="session")
+def neon_api_key() -> str:
+    api_key = os.getenv("NEON_API_KEY")
+    if not api_key:
+        raise AssertionError("Set the NEON_API_KEY environment variable")
+
+    return api_key
+
+
+@pytest.fixture(scope="session")
+def neon_api_base_url() -> str:
+    return os.getenv("NEON_API_BASE_URL", "https://console-stage.neon.build/api/v2")
+
+
+@pytest.fixture(scope="session")
+def neon_api(neon_api_key: str, neon_api_base_url: str) -> NeonAPI:
+    return NeonAPI(neon_api_key, neon_api_base_url)
+
+
 def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "function"]:
     """Return either session or function scope, depending on TEST_SHARED_FIXTURES envvar.
@@ -354,7 +402,7 @@ class PgProtocol:
         return self.safe_psql_many([query], **kwargs)[0]
 
     def safe_psql_many(
-        self, queries: List[str], log_query=True, **kwargs: Any
+        self, queries: Iterable[str], log_query=True, **kwargs: Any
     ) -> List[List[Tuple[Any, ...]]]:
         """
         Execute queries against the node and return all rows.
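# Usage sketch for the neon_api fixtures above (illustrative; assumes the
# NEON_API_KEY environment variable is set so `neon_api_key` resolves):
#
#     def test_example(neon_api: NeonAPI, pg_version: PgVersion):
#         ep = NeonApiEndpoint(neon_api, pg_version, project_id=None)
#         try:
#             ep.restart()  # ep.pgbench_env carries PGHOST/PGUSER/... for subprocesses
#         finally:
#             neon_api.delete_project(ep.project_id)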
@@ -385,7 +433,8 @@ class PgProtocol:
 class AuthKeys:
     priv: str
 
-    def generate_token(self, *, scope: str, **token_data: str) -> str:
+    def generate_token(self, *, scope: TokenScope, **token_data: Any) -> str:
+        token_data = {key: str(val) for key, val in token_data.items()}
         token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA")
         # cast(Any, self.priv)
@@ -398,14 +447,24 @@ class AuthKeys:
         return token
 
     def generate_pageserver_token(self) -> str:
-        return self.generate_token(scope="pageserverapi")
+        return self.generate_token(scope=TokenScope.PAGE_SERVER_API)
 
     def generate_safekeeper_token(self) -> str:
-        return self.generate_token(scope="safekeeperdata")
+        return self.generate_token(scope=TokenScope.SAFEKEEPER_DATA)
 
     # generate token giving access to only one tenant
     def generate_tenant_token(self, tenant_id: TenantId) -> str:
-        return self.generate_token(scope="tenant", tenant_id=str(tenant_id))
+        return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id))
+
+
+# TODO: Replace with `StrEnum` when we upgrade to python 3.11
+class TokenScope(str, Enum):
+    ADMIN = "admin"
+    PAGE_SERVER_API = "pageserverapi"
+    GENERATIONS_API = "generations_api"
+    SAFEKEEPER_DATA = "safekeeperdata"
+    TENANT = "tenant"
+    SCRUBBER = "scrubber"
 
 
 class NeonEnvBuilder:
@@ -433,7 +492,8 @@ class NeonEnvBuilder:
         test_output_dir: Path,
         test_overlay_dir: Optional[Path] = None,
         pageserver_remote_storage: Optional[RemoteStorage] = None,
-        pageserver_config_override: Optional[str] = None,
+        # toml that will be decomposed into `--config-override` flags during `pageserver --init`
+        pageserver_config_override: Optional[str | Callable[[Dict[str, Any]], None]] = None,
         num_safekeepers: int = 1,
         num_pageservers: int = 1,
         # Use non-standard SK ids to check for various parsing bugs
@@ -446,6 +506,12 @@
         preserve_database_files: bool = False,
         initial_tenant: Optional[TenantId] = None,
         initial_timeline: Optional[TimelineId] = None,
+        pageserver_virtual_file_io_engine: Optional[str] = None,
+        pageserver_aux_file_policy: Optional[AuxFileStore] = None,
+        pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None,
+        safekeeper_extra_opts: Optional[list[str]] = None,
+        storage_controller_port_override: Optional[int] = None,
+        pageserver_io_buffer_alignment: Optional[int] = None,
     ):
         self.repo_dir = repo_dir
         self.rust_log_override = rust_log_override
@@ -469,17 +535,38 @@
         self.env: Optional[NeonEnv] = None
         self.keep_remote_storage_contents: bool = True
         self.neon_binpath = neon_binpath
+        self.neon_local_binpath = neon_binpath
         self.pg_distrib_dir = pg_distrib_dir
         self.pg_version = pg_version
         self.preserve_database_files = preserve_database_files
         self.initial_tenant = initial_tenant or TenantId.generate()
         self.initial_timeline = initial_timeline or TimelineId.generate()
-        self.scrub_on_exit = False
+        self.enable_scrub_on_exit = True
         self.test_output_dir = test_output_dir
         self.test_overlay_dir = test_overlay_dir
         self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = []
         self.config_init_force: Optional[str] = None
         self.top_output_dir = top_output_dir
+        self.control_plane_compute_hook_api: Optional[str] = None
+        self.storage_controller_config: Optional[dict[Any, Any]] = None
+
+        self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine
+
+        self.pageserver_default_tenant_config_compaction_algorithm: Optional[
+            Dict[str, Any]
+        ] = pageserver_default_tenant_config_compaction_algorithm
+        if self.pageserver_default_tenant_config_compaction_algorithm is not None:
+            log.debug(
+                f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}"
+            )
+
+        self.pageserver_aux_file_policy = pageserver_aux_file_policy
+
+        self.safekeeper_extra_opts = safekeeper_extra_opts
+
+        self.storage_controller_port_override = storage_controller_port_override
+
+        self.pageserver_io_buffer_alignment = pageserver_io_buffer_alignment
 
         assert test_name.startswith(
             "test_"
@@ -500,7 +587,7 @@
 
     def init_start(
         self,
-        initial_tenant_conf: Optional[Dict[str, str]] = None,
+        initial_tenant_conf: Optional[Dict[str, Any]] = None,
         default_remote_storage_if_missing: bool = True,
         initial_tenant_shard_count: Optional[int] = None,
         initial_tenant_shard_stripe_size: Optional[int] = None,
@@ -526,6 +613,7 @@
             timeline_id=env.initial_timeline,
             shard_count=initial_tenant_shard_count,
             shard_stripe_size=initial_tenant_shard_stripe_size,
+            aux_file_policy=self.pageserver_aux_file_policy,
         )
         assert env.initial_tenant == initial_tenant
         assert env.initial_timeline == initial_timeline
@@ -594,17 +682,11 @@
     def from_repo_dir(
         self,
         repo_dir: Path,
-        neon_binpath: Optional[Path] = None,
-        pg_distrib_dir: Optional[Path] = None,
     ) -> NeonEnv:
         """
         A simple method to import data into the current
         NeonEnvBuilder from a snapshot of a repo dir.
         """
 
-        # Setting custom `neon_binpath` and `pg_distrib_dir` is useful for compatibility tests
-        self.neon_binpath = neon_binpath or self.neon_binpath
-        self.pg_distrib_dir = pg_distrib_dir or self.pg_distrib_dir
-
         # Get the initial tenant and timeline from the snapshot config
         snapshot_config_toml = repo_dir / "config"
         with snapshot_config_toml.open("r") as f:
@@ -653,8 +735,33 @@
             self.repo_dir / "local_fs_remote_storage",
         )
 
-        if (attachments_json := Path(repo_dir / "attachments.json")).exists():
-            shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name)
+        # restore storage controller (the db is small, don't bother with overlayfs)
+        storcon_db_from_dir = repo_dir / "storage_controller_db"
+        storcon_db_to_dir = self.repo_dir / "storage_controller_db"
+        log.info(f"Copying storage_controller_db from {storcon_db_from_dir} to {storcon_db_to_dir}")
+        assert storcon_db_from_dir.is_dir()
+        assert not storcon_db_to_dir.exists()
+
+        def ignore_postgres_log(path: str, _names):
+            if Path(path) == storcon_db_from_dir:
+                return {"postgres.log"}
+            return set()
+
+        shutil.copytree(storcon_db_from_dir, storcon_db_to_dir, ignore=ignore_postgres_log)
+        assert not (storcon_db_to_dir / "postgres.log").exists()
+        # NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it.
+        # However, in this new NeonEnv, the pageservers listen on different ports, and the storage controller
+        # will currently reject re-attach requests from them because the NodeMetadata isn't identical.
+        # So, from_repo_dir patches up the storcon database.
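# For illustration, with a single pageserver (id=1) the generated patch script
# below contains statements of this shape (port and AZ values are examples):
#
#     UPDATE nodes SET listen_http_port=15001, listen_pg_port=15002 WHERE node_id = '1';
#     UPDATE nodes SET availability_zone_id='us-east-2a' WHERE node_id = '1' AND availability_zone_id IS NULL;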
+        patch_script_path = self.repo_dir / "storage_controller_db.startup.sql"
+        assert not patch_script_path.exists()
+        patch_script = ""
+        for ps in self.env.pageservers:
+            patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';"
+            # This is a temporary fix to keep the backward compat test happy,
+            # since the compat snapshot was generated with an older version of neon_local
+            patch_script += f"UPDATE nodes SET availability_zone_id='{ps.az_id}' WHERE node_id = '{ps.id}' AND availability_zone_id IS NULL;"
+        patch_script_path.write_text(patch_script)
 
         # Update the config with info about tenants and timelines
         with (self.repo_dir / "config").open("r") as f:
@@ -663,6 +770,10 @@
         config["default_tenant_id"] = snapshot_config["default_tenant_id"]
         config["branch_name_mappings"] = snapshot_config["branch_name_mappings"]
 
+        # Update the config with the new neon + postgres paths in case of a compat test
+        config["pg_distrib_dir"] = str(self.pg_distrib_dir)
+        config["neon_distrib_dir"] = str(self.neon_binpath)
+
         with (self.repo_dir / "config").open("w") as f:
             toml.dump(config, f)
 
@@ -751,6 +862,13 @@
             )
             ident_state_dir.rmdir()  # should be empty since we moved `upper` out
 
+    def disable_scrub_on_exit(self):
+        """
+        Some tests intentionally leave the remote storage contents empty or corrupt,
+        so it doesn't make sense to do the usual scrub at the end of the test.
+        """
+        self.enable_scrub_on_exit = False
+
     def overlay_cleanup_teardown(self):
         """
         Unmount the overlayfs mounts created by `self.overlay_mount()`.
@@ -776,23 +894,6 @@
         # assert all overlayfs mounts in our test directory are gone
         assert [] == list(overlayfs.iter_mounts_beneath(self.test_overlay_dir))
 
-    def enable_scrub_on_exit(self):
-        """
-        Call this if you would like the fixture to automatically run
-        s3_scrubber at the end of the test, as a bidirectional test
-        that the scrubber is working properly, and that the code within
-        the test didn't produce any invalid remote state.
-        """
-
-        if not isinstance(self.pageserver_remote_storage, S3Storage):
-            # The scrubber can't talk to e.g. LocalFS -- it needs
-            # an HTTP endpoint (mock is fine) to connect to.
-            raise RuntimeError(
-                "Cannot scrub with remote_storage={self.pageserver_remote_storage}, require an S3 endpoint"
-            )
-
-        self.scrub_on_exit = True
-
     def enable_pageserver_remote_storage(
         self,
         remote_storage_kind: RemoteStorageKind,
@@ -885,16 +986,26 @@
         if self.env:
             log.info("Cleaning up all storage and compute nodes")
             self.env.stop(
-                immediate=True,
+                immediate=False,
                 # if the test threw an exception, don't check for errors
                 # as a failing assertion would cause the cleanup below to fail
                 ps_assert_metric_no_errors=(exc_type is None),
+                # do not fail on endpoint errors to allow the rest of cleanup to proceed
+                fail_on_endpoint_errors=False,
             )
             cleanup_error = None
 
-            if self.scrub_on_exit:
+            # If we are running with S3Storage (required by the scrubber), check that whatever the test
+            # did does not generate any corruption
+            if (
+                isinstance(self.env.pageserver_remote_storage, S3Storage)
+                and self.enable_scrub_on_exit
+            ):
                 try:
-                    S3Scrubber(self.test_output_dir, self).scan_metadata()
+                    healthy, _ = self.env.storage_scrubber.scan_metadata()
+                    if not healthy:
+                        e = Exception("Remote storage metadata corrupted")
+                        cleanup_error = e
                 except Exception as e:
                     log.error(f"Error during remote storage scrub: {e}")
                     cleanup_error = e
@@ -919,6 +1030,11 @@
             for pageserver in self.env.pageservers:
                 pageserver.assert_no_errors()
 
+            for safekeeper in self.env.safekeepers:
+                safekeeper.assert_no_errors()
+
+            self.env.storage_controller.assert_no_errors()
+
         try:
             self.overlay_cleanup_teardown()
         except Exception as e:
@@ -942,7 +1058,7 @@ class NeonEnv:
     Some notable functions and fields in NeonEnv:
 
-    postgres - A factory object for creating postgres compute nodes.
+    endpoints - A factory object for creating postgres compute nodes.
 
     pageservers - An array containing objects representing the pageservers
 
@@ -960,6 +1076,7 @@ class NeonEnv:
     """
 
     BASE_PAGESERVER_ID = 1
+    storage_controller: NeonStorageController | NeonProxiedStorageController
 
     def __init__(self, config: NeonEnvBuilder):
         self.repo_dir = config.repo_dir
@@ -977,37 +1094,77 @@
         self.pg_version = config.pg_version
         # Binary path for pageserver, safekeeper, etc
         self.neon_binpath = config.neon_binpath
-        # Binary path for neon_local test-specific binaries: may be overridden
-        # after construction for compat testing
-        self.neon_local_binpath = config.neon_binpath
+        # Binary path for neon_local test-specific binaries
+        self.neon_local_binpath = config.neon_local_binpath
+        if self.neon_local_binpath is None:
+            self.neon_local_binpath = self.neon_binpath
         self.pg_distrib_dir = config.pg_distrib_dir
         self.endpoint_counter = 0
-        self.pageserver_config_override = config.pageserver_config_override
+        self.storage_controller_config = config.storage_controller_config
 
         # generate initial tenant ID here instead of letting 'neon init' generate it,
         # so that we don't need to dig it out of the config file afterwards.
         self.initial_tenant = config.initial_tenant
         self.initial_timeline = config.initial_timeline
 
-        attachment_service_port = self.port_distributor.get_port()
-        self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}"
-        self.attachment_service: NeonAttachmentService = NeonAttachmentService(
-            self, config.auth_enabled
-        )
+        # The URL for the pageserver to use as its control_plane_api config
+        if config.storage_controller_port_override is not None:
+            log.info(
+                f"Using storage controller api override {config.storage_controller_port_override}"
+            )
 
-        # Create a config file corresponding to the options
+            self.storage_controller_port = config.storage_controller_port_override
+            self.storage_controller = NeonProxiedStorageController(
+                self, config.storage_controller_port_override, config.auth_enabled
+            )
+        else:
+            # Find two adjacent ports for storage controller and its postgres DB. This
+            # loop would eventually throw from get_port() if we run out of ports (extremely
+            # unlikely): usually we find two adjacent free ports on the first iteration.
+            while True:
+                storage_controller_port = self.port_distributor.get_port()
+                storage_controller_pg_port = self.port_distributor.get_port()
+                if storage_controller_pg_port == storage_controller_port + 1:
+                    break
+
+            self.storage_controller_port = storage_controller_port
+            self.storage_controller = NeonStorageController(
+                self, storage_controller_port, config.auth_enabled
+            )
+
+            log.info(
+                f"Using generated control_plane_api: {self.storage_controller.upcall_api_endpoint()}"
+            )
+
+        self.storage_controller_api: str = self.storage_controller.api_root()
+        self.control_plane_api: str = self.storage_controller.upcall_api_endpoint()
+
+        # For testing this with a fake HTTP server, enable passing through a URL from config
+        self.control_plane_compute_hook_api = config.control_plane_compute_hook_api
+
+        self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
+        self.pageserver_aux_file_policy = config.pageserver_aux_file_policy
+        self.pageserver_io_buffer_alignment = config.pageserver_io_buffer_alignment
+
+        # Create the neon_local's `NeonLocalInitConf`
         cfg: Dict[str, Any] = {
             "default_tenant_id": str(self.initial_tenant),
             "broker": {
                 "listen_addr": self.broker.listen_addr(),
             },
-            "pageservers": [],
             "safekeepers": [],
+            "pageservers": [],
         }
 
         if self.control_plane_api is not None:
             cfg["control_plane_api"] = self.control_plane_api
 
+        if self.control_plane_compute_hook_api is not None:
+            cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api
+
+        if self.storage_controller_config is not None:
+            cfg["storage_controller"] = self.storage_controller_config
+
         # Create config for pageserver
         http_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
         pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
@@ -1025,15 +1182,37 @@
                 "listen_http_addr": f"localhost:{pageserver_port.http}",
                 "pg_auth_type": pg_auth_type,
                 "http_auth_type": http_auth_type,
+                # Default which can be overridden with `NeonEnvBuilder.pageserver_config_override`
+                "availability_zone": "us-east-2a",
             }
+            if self.pageserver_virtual_file_io_engine is not None:
+                ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
+            if config.pageserver_default_tenant_config_compaction_algorithm is not None:
+                tenant_config = ps_cfg.setdefault("tenant_config", {})
+                tenant_config[
+                    "compaction_algorithm"
+                ] = config.pageserver_default_tenant_config_compaction_algorithm
+
+            if self.pageserver_remote_storage is not None:
+                ps_cfg["remote_storage"] = remote_storage_to_toml_dict(
+                    self.pageserver_remote_storage
+                )
+
+            if config.pageserver_config_override is not None:
+                if callable(config.pageserver_config_override):
+                    config.pageserver_config_override(ps_cfg)
+                else:
+                    assert isinstance(config.pageserver_config_override, str)
+                    for o in config.pageserver_config_override.split(";"):
+                        override = toml.loads(o)
+                        for key, value in override.items():
+                            ps_cfg[key] = value
+
+            ps_cfg["io_buffer_alignment"] = self.pageserver_io_buffer_alignment
+
             # Create a corresponding NeonPageserver object
             self.pageservers.append(
-                NeonPageserver(
-                    self,
-                    ps_id,
-                    port=pageserver_port,
-                    config_override=self.pageserver_config_override,
-                )
+                NeonPageserver(self, ps_id, port=pageserver_port, az_id=ps_cfg["availability_zone"])
             )
             cfg["pageservers"].append(ps_cfg)
 
@@ -1055,39 +1234,112 @@
             if config.auth_enabled:
                 sk_cfg["auth_enabled"] = True
             if self.safekeepers_remote_storage is not None:
-                sk_cfg["remote_storage"] = self.safekeepers_remote_storage.to_toml_inline_table()
-            self.safekeepers.append(Safekeeper(env=self, id=id, port=port))
+                sk_cfg[
+                    "remote_storage"
+                ] = self.safekeepers_remote_storage.to_toml_inline_table().strip()
+            self.safekeepers.append(
+                Safekeeper(env=self, id=id, port=port, extra_opts=config.safekeeper_extra_opts)
+            )
             cfg["safekeepers"].append(sk_cfg)
 
+        # Scrubber instance for tests that use it, and for use during teardown checks
+        self.storage_scrubber = StorageScrubber(self, log_dir=config.test_output_dir)
+
         log.info(f"Config: {cfg}")
-        self.neon_cli.init(cfg, force=config.config_init_force)
+        self.neon_cli.init(
+            cfg,
+            force=config.config_init_force,
+        )
+
+    def start(self, timeout_in_seconds: Optional[int] = None):
+        # Storage controller starts first, so that pageserver /re-attach calls don't
+        # bounce through retries on startup
+        self.storage_controller.start(timeout_in_seconds=timeout_in_seconds)
+
+        # Wait for storage controller readiness to prevent unnecessary post start-up
+        # reconcile.
+        self.storage_controller.wait_until_ready()
 
-    def start(self):
         # Start up broker, pageserver and all safekeepers
-        self.broker.try_start()
+        futs = []
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=2 + len(self.pageservers) + len(self.safekeepers)
+        ) as executor:
+            futs.append(
+                executor.submit(lambda: self.broker.try_start() or None)
+            )  # The `or None` is for the linter
 
-        self.attachment_service.start()
+            for pageserver in self.pageservers:
+                futs.append(
+                    executor.submit(
+                        lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)
+                    )
+                )
 
-        for pageserver in self.pageservers:
-            pageserver.start()
+            for safekeeper in self.safekeepers:
+                futs.append(
+                    executor.submit(
+                        lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds)
+                    )
+                )
 
-        for safekeeper in self.safekeepers:
-            safekeeper.start()
+        for f in futs:
+            f.result()
 
-    def stop(self, immediate=False, ps_assert_metric_no_errors=False):
+    def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True):
         """
         After this method returns, there should be no child processes running.
+
+        Unless, of course, some stop failed; in that case, all remaining child processes are leaked.
         """
-        self.endpoints.stop_all()
+
+        # The components that commonly fail to stop get special try-except treatment:
+        # we prefer actually shutting down all the remaining processes over easier
+        # error reporting.
+
+        raise_later = None
+        try:
+            self.endpoints.stop_all(fail_on_endpoint_errors)
+        except Exception as e:
+            raise_later = e
+
+        # Stop storage controller before pageservers: we don't want it to spuriously
+        # detect a pageserver "failure" during test teardown
+        self.storage_controller.stop(immediate=immediate)
+
+        stop_later = []
+        metric_errors = []
+
         for sk in self.safekeepers:
             sk.stop(immediate=immediate)
+
         for pageserver in self.pageservers:
             if ps_assert_metric_no_errors:
-                pageserver.assert_no_metric_errors()
-            pageserver.stop(immediate=immediate)
-        self.attachment_service.stop(immediate=immediate)
+                try:
+                    pageserver.assert_no_metric_errors()
+                except Exception as e:
+                    metric_errors.append(e)
+                    log.error(f"metric validation failed on {pageserver.id}: {e}")
+            try:
+                pageserver.stop(immediate=immediate)
+            except RuntimeError:
+                stop_later.append(pageserver)
+
         self.broker.stop(immediate=immediate)
 
+        # TODO: for nice logging we need python 3.11 ExceptionGroup
+        for ps in stop_later:
+            ps.stop(immediate=True)
+
+        if raise_later is not None:
+            raise raise_later
+
+        for error in metric_errors:
+            raise error
+
+        if len(stop_later) > 0:
+            raise RuntimeError(
+                f"{len(stop_later)} out of {len(self.pageservers)} pageservers failed to stop gracefully"
+            )
+
     @property
     def pageserver(self) -> NeonPageserver:
         """
@@ -1120,10 +1372,11 @@
     def get_tenant_pageserver(self, tenant_id: Union[TenantId, TenantShardId]):
         """
         Get the NeonPageserver where this tenant shard is currently attached, according
-        to the attachment service.
+        to the storage controller.
         """
-        meta = self.attachment_service.inspect(tenant_id)
-        assert meta is not None, f"{tenant_id} attachment location not found"
+        meta = self.storage_controller.inspect(tenant_id)
+        if meta is None:
+            return None
         pageserver_id = meta[1]
         return self.get_pageserver(pageserver_id)
 
@@ -1191,6 +1444,10 @@
     neon_binpath: Path,
     pg_distrib_dir: Path,
     pg_version: PgVersion,
+    pageserver_virtual_file_io_engine: str,
+    pageserver_aux_file_policy: Optional[AuxFileStore],
+    pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]],
+    pageserver_io_buffer_alignment: Optional[int],
 ) -> Iterator[NeonEnv]:
     """
    # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
@@ -1217,9 +1474,13 @@
         pg_distrib_dir=pg_distrib_dir,
         pg_version=pg_version,
         run_id=run_id,
-        preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
+        preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")),
         test_name=request.node.name,
         test_output_dir=test_output_dir,
+        pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
+        pageserver_aux_file_policy=pageserver_aux_file_policy,
+        pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
+        pageserver_io_buffer_alignment=pageserver_io_buffer_alignment,
     ) as builder:
         env = builder.init_start()
 
@@ -1258,6 +1519,11 @@
     request: FixtureRequest,
     test_overlay_dir: Path,
     top_output_dir: Path,
+    pageserver_virtual_file_io_engine: str,
+    pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]],
+    pageserver_aux_file_policy: Optional[AuxFileStore],
+    record_property: Callable[[str, object], None],
+    pageserver_io_buffer_alignment: Optional[int],
 ) -> Iterator[NeonEnvBuilder]:
     """
     Fixture to create a Neon environment for test.
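# A typical consumption of this fixture looks like the following sketch (the
# test body is illustrative; `init_start` and `create_start` appear elsewhere
# in this patch):
#
#     def test_example(neon_env_builder: NeonEnvBuilder):
#         env = neon_env_builder.init_start(initial_tenant_shard_count=2)
#         endpoint = env.endpoints.create_start("main", "main", env.initial_tenant)
#         endpoint.safe_psql("select 1")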
@@ -1286,12 +1552,19 @@ def neon_env_builder(
         pg_version=pg_version,
         broker=default_broker,
         run_id=run_id,
-        preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
+        preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")),
+        pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
         test_name=request.node.name,
         test_output_dir=test_output_dir,
         test_overlay_dir=test_overlay_dir,
+        pageserver_aux_file_policy=pageserver_aux_file_policy,
+        pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
+        pageserver_io_buffer_alignment=pageserver_io_buffer_alignment,
     ) as builder:
         yield builder
+        # Propagate `preserve_database_files` to make it possible to use it in other fixtures,
+        # like the `test_output_dir` fixture, for attaching all database files to the Allure report.
+        record_property("preserve_database_files", builder.preserve_database_files)
 
 
 @dataclass
@@ -1356,7 +1629,6 @@ class AbstractNeonCli(abc.ABC):
         args = [bin_neon] + arguments
         log.info('Running command "{}"'.format(" ".join(args)))
-        log.info(f'Running in "{self.env.repo_dir}"')
 
         env_vars = os.environ.copy()
         env_vars["NEON_REPO_DIR"] = str(self.env.repo_dir)
@@ -1445,10 +1717,12 @@ class NeonCli(AbstractNeonCli):
         self,
         tenant_id: Optional[TenantId] = None,
         timeline_id: Optional[TimelineId] = None,
-        conf: Optional[Dict[str, str]] = None,
+        conf: Optional[Dict[str, Any]] = None,
         shard_count: Optional[int] = None,
         shard_stripe_size: Optional[int] = None,
+        placement_policy: Optional[str] = None,
         set_default: bool = False,
+        aux_file_policy: Optional[AuxFileStore] = None,
     ) -> Tuple[TenantId, TimelineId]:
         """
         Creates a new tenant, returns its id and its initial timeline's id.
@@ -1472,6 +1746,14 @@
                 product(["-c"], (f"{key}:{value}" for key, value in conf.items()))
             )
         )
+
+        if aux_file_policy is AuxFileStore.V2:
+            args.extend(["-c", "switch_aux_file_policy:v2"])
+        elif aux_file_policy is AuxFileStore.V1:
+            args.extend(["-c", "switch_aux_file_policy:v1"])
+        elif aux_file_policy is AuxFileStore.CrossValidation:
+            args.extend(["-c", "switch_aux_file_policy:cross-validation"])
+
         if set_default:
             args.append("--set-default")
 
@@ -1481,10 +1763,18 @@
         if shard_stripe_size is not None:
             args.extend(["--shard-stripe-size", str(shard_stripe_size)])
 
+        if placement_policy is not None:
+            args.extend(["--placement-policy", str(placement_policy)])
+
         res = self.raw_cli(args)
         res.check_returncode()
         return tenant_id, timeline_id
 
+    def import_tenant(self, tenant_id: TenantId):
+        args = ["tenant", "import", "--tenant-id", str(tenant_id)]
+        res = self.raw_cli(args)
+        res.check_returncode()
+
     def set_default(self, tenant_id: TenantId):
         """
         Update default tenant for future operations that require tenant_id.
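# Illustrative mapping from the new create_tenant arguments above to CLI flags
# (the concrete values here are examples, not defaults):
#
#     env.neon_cli.create_tenant(
#         shard_count=4,                      # --shard-count 4
#         shard_stripe_size=32768,            # --shard-stripe-size 32768
#         placement_policy='{"Attached":1}',  # --placement-policy ... (assumed JSON form)
#         aux_file_policy=AuxFileStore.V2,    # -c switch_aux_file_policy:v2
#     )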
@@ -1598,56 +1888,58 @@ class NeonCli(AbstractNeonCli):
 
     def init(
         self,
-        config: Dict[str, Any],
+        init_config: Dict[str, Any],
         force: Optional[str] = None,
     ) -> "subprocess.CompletedProcess[str]":
-        with tempfile.NamedTemporaryFile(mode="w+") as tmp:
-            tmp.write(toml.dumps(config))
-            tmp.flush()
+        with tempfile.NamedTemporaryFile(mode="w+") as init_config_tmpfile:
+            init_config_tmpfile.write(toml.dumps(init_config))
+            init_config_tmpfile.flush()
 
-            cmd = ["init", f"--config={tmp.name}", "--pg-version", self.env.pg_version]
+            cmd = [
+                "init",
+                f"--config={init_config_tmpfile.name}",
+            ]
 
             if force is not None:
                 cmd.extend(["--force", force])
 
-            storage = self.env.pageserver_remote_storage
-
-            append_pageserver_param_overrides(
-                params_to_update=cmd,
-                remote_storage=storage,
-                pageserver_config_override=self.env.pageserver_config_override,
-            )
-
-            s3_env_vars = None
-            if isinstance(storage, S3Storage):
-                s3_env_vars = storage.access_env_vars()
-            res = self.raw_cli(cmd, extra_env_vars=s3_env_vars)
+            res = self.raw_cli(cmd)
             res.check_returncode()
-            return res
+        return res
 
-    def attachment_service_start(self):
-        cmd = ["attachment_service", "start"]
+    def storage_controller_start(
+        self,
+        timeout_in_seconds: Optional[int] = None,
+        instance_id: Optional[int] = None,
+        base_port: Optional[int] = None,
+    ):
+        cmd = ["storage_controller", "start"]
+        if timeout_in_seconds is not None:
+            cmd.append(f"--start-timeout={timeout_in_seconds}s")
+        if instance_id is not None:
+            cmd.append(f"--instance-id={instance_id}")
+        if base_port is not None:
+            cmd.append(f"--base-port={base_port}")
         return self.raw_cli(cmd)
 
-    def attachment_service_stop(self, immediate: bool):
-        cmd = ["attachment_service", "stop"]
+    def storage_controller_stop(self, immediate: bool, instance_id: Optional[int] = None):
+        cmd = ["storage_controller", "stop"]
         if immediate:
             cmd.extend(["-m", "immediate"])
+        if instance_id is not None:
+            cmd.append(f"--instance-id={instance_id}")
         return self.raw_cli(cmd)
 
     def pageserver_start(
         self,
         id: int,
-        overrides: Tuple[str, ...] = (),
         extra_env_vars: Optional[Dict[str, str]] = None,
+        timeout_in_seconds: Optional[int] = None,
     ) -> "subprocess.CompletedProcess[str]":
-        start_args = ["pageserver", "start", f"--id={id}", *overrides]
+        start_args = ["pageserver", "start", f"--id={id}"]
+        if timeout_in_seconds is not None:
+            start_args.append(f"--start-timeout={timeout_in_seconds}s")
         storage = self.env.pageserver_remote_storage
-        append_pageserver_param_overrides(
-            params_to_update=start_args,
-            remote_storage=storage,
-            pageserver_config_override=self.env.pageserver_config_override,
-        )
 
         if isinstance(storage, S3Storage):
             s3_env_vars = storage.access_env_vars()
@@ -1664,7 +1956,10 @@
         return self.raw_cli(cmd)
 
     def safekeeper_start(
-        self, id: int, extra_opts: Optional[List[str]] = None
+        self,
+        id: int,
+        extra_opts: Optional[List[str]] = None,
+        timeout_in_seconds: Optional[int] = None,
     ) -> "subprocess.CompletedProcess[str]":
         s3_env_vars = None
         if isinstance(self.env.safekeepers_remote_storage, S3Storage):
@@ -1674,6 +1969,8 @@
             extra_opts = [f"-e={opt}" for opt in extra_opts]
         else:
             extra_opts = []
+        if timeout_in_seconds is not None:
+            extra_opts.append(f"--start-timeout={timeout_in_seconds}s")
         return self.raw_cli(
             ["safekeeper", "start", str(id), *extra_opts], extra_env_vars=s3_env_vars
         )
@@ -1698,6 +1995,7 @@
         hot_standby: bool = False,
         lsn: Optional[Lsn] = None,
         pageserver_id: Optional[int] = None,
+        allow_multiple=False,
     ) -> "subprocess.CompletedProcess[str]":
         args = [
             "endpoint",
@@ -1721,6 +2019,8 @@
             args.extend(["--hot-standby", "true"])
         if pageserver_id is not None:
             args.extend(["--pageserver-id", str(pageserver_id)])
+        if allow_multiple:
+            args.extend(["--allow-multiple"])
 
         res = self.raw_cli(args)
         res.check_returncode()
@@ -1732,11 +2032,16 @@
         safekeepers: Optional[List[int]] = None,
         remote_ext_config: Optional[str] = None,
         pageserver_id: Optional[int] = None,
+        allow_multiple=False,
+        basebackup_request_tries: Optional[int] = None,
     ) -> "subprocess.CompletedProcess[str]":
         args = [
             "endpoint",
             "start",
         ]
+        extra_env_vars = {}
+        if basebackup_request_tries is not None:
+            extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries)
 
         if remote_ext_config is not None:
             args.extend(["--remote-ext-config", remote_ext_config])
@@ -1746,8 +2051,10 @@
         args.append(endpoint_id)
         if pageserver_id is not None:
             args.extend(["--pageserver-id", str(pageserver_id)])
+        if allow_multiple:
+            args.extend(["--allow-multiple"])
 
-        res = self.raw_cli(args)
+        res = self.raw_cli(args, extra_env_vars)
         res.check_returncode()
         return res
 
@@ -1756,6 +2063,7 @@
         endpoint_id: str,
         tenant_id: Optional[TenantId] = None,
         pageserver_id: Optional[int] = None,
+        safekeepers: Optional[List[int]] = None,
         check_return_code=True,
     ) -> "subprocess.CompletedProcess[str]":
         args = ["endpoint", "reconfigure", endpoint_id]
@@ -1763,6 +2071,8 @@
             args.extend(["--tenant-id", str(tenant_id)])
         if pageserver_id is not None:
             args.extend(["--pageserver-id", str(pageserver_id)])
+        if safekeepers is not None:
+            args.extend(["--safekeepers", (",".join(map(str, safekeepers)))])
         return self.raw_cli(args, check_return_code=check_return_code)
 
     def endpoint_stop(
@@ -1770,6 +2080,7 @@
         endpoint_id: str,
         destroy=False,
         check_return_code=True,
+        mode: Optional[str] = None,
     ) -> "subprocess.CompletedProcess[str]":
         args = [
             "endpoint",
@@ -1777,6 +2088,8 @@
         ]
         if destroy:
             args.append("--destroy")
+        if mode is not None:
+            args.append(f"--mode={mode}")
         if endpoint_id is not None:
             args.append(endpoint_id)
 
@@ -1805,19 +2118,6 @@
 
         return self.raw_cli(args, check_return_code=True)
 
-    def tenant_migrate(
-        self, tenant_shard_id: TenantShardId, new_pageserver: int, timeout_secs: Optional[int]
-    ):
-        args = [
-            "tenant",
-            "migrate",
-            "--tenant-id",
-            str(tenant_shard_id),
-            "--id",
-            str(new_pageserver),
-        ]
-        return self.raw_cli(args, check_return_code=True, timeout=timeout_secs)
-
     def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]":
         return self.raw_cli(["start"], check_return_code=check_return_code)
 
@@ -1865,58 +2165,231 @@ class Pagectl(AbstractNeonCli):
         return IndexPartDump.from_json(parsed)
 
 
-class NeonAttachmentService:
-    def __init__(self, env: NeonEnv, auth_enabled):
+class LogUtils:
+    """
+    A mixin class which provides utilities for inspecting the logs of a service.
+    """
+
+    def __init__(self, logfile: Path) -> None:
+        self.logfile = logfile
+
+    def assert_log_contains(
+        self, pattern: str, offset: None | LogCursor = None
+    ) -> Tuple[str, LogCursor]:
+        """Convenient for use inside wait_until()"""
+
+        res = self.log_contains(pattern, offset=offset)
+        assert res is not None
+        return res
+
+    def log_contains(
+        self, pattern: str, offset: None | LogCursor = None
+    ) -> Optional[Tuple[str, LogCursor]]:
+        """Check that the log contains a line that matches the given regex"""
+        logfile = self.logfile
+        if not logfile.exists():
+            log.warning(f"Skipping log check: {logfile} does not exist")
+            return None
+
+        contains_re = re.compile(pattern)
+
+        # XXX: Our rust logging machinery buffers the messages, so if you
+        # call this function immediately after it's been logged, there is
+        # no guarantee it is already present in the log file. This hasn't
+        # been a problem in practice; our python tests are not fast enough
+        # to hit that race condition.
+        skip_until_line_no = 0 if offset is None else offset._line_no
+        cur_line_no = 0
+        with logfile.open("r") as f:
+            for line in f:
+                if cur_line_no < skip_until_line_no:
+                    cur_line_no += 1
+                    continue
+                elif contains_re.search(line):
+                    # found it!
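# (illustrative) the LogCursor returned below points one past the matched
# line, so a follow-up search can resume after this match; the pattern
# strings here are made-up examples:
#
#     _, cursor = pageserver.assert_log_contains("attached tenant")
#     pageserver.assert_log_contains("uploaded layer", offset=cursor)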
+ cur_line_no += 1 + return (line, LogCursor(cur_line_no)) + else: + cur_line_no += 1 + return None + + +class StorageControllerApiException(Exception): + def __init__(self, message, status_code: int): + super().__init__(message) + self.message = message + self.status_code = status_code + + +# See libs/pageserver_api/src/controller_api.rs +# for the rust definitions of the enums below +# TODO: Replace with `StrEnum` when we upgrade to python 3.11 +class PageserverAvailability(str, Enum): + ACTIVE = "Active" + UNAVAILABLE = "Unavailable" + OFFLINE = "Offline" + + +class PageserverSchedulingPolicy(str, Enum): + ACTIVE = "Active" + DRAINING = "Draining" + FILLING = "Filling" + PAUSE = "Pause" + PAUSE_FOR_RESTART = "PauseForRestart" + + +class StorageControllerLeadershipStatus(str, Enum): + LEADER = "leader" + STEPPED_DOWN = "stepped_down" + CANDIDATE = "candidate" + + +class NeonStorageController(MetricsGetter, LogUtils): + def __init__(self, env: NeonEnv, port: int, auth_enabled: bool): self.env = env + self.port: int = port + self.api: str = f"http://127.0.0.1:{port}" self.running = False self.auth_enabled = auth_enabled + self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS + self.logfile = self.env.repo_dir / "storage_controller_1" / "storage_controller.log" - def start(self): + def start( + self, + timeout_in_seconds: Optional[int] = None, + instance_id: Optional[int] = None, + base_port: Optional[int] = None, + ): assert not self.running - self.env.neon_cli.attachment_service_start() + self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port) self.running = True return self - def stop(self, immediate: bool = False) -> "NeonAttachmentService": + def stop(self, immediate: bool = False) -> "NeonStorageController": if self.running: - self.env.neon_cli.attachment_service_stop(immediate) + self.env.neon_cli.storage_controller_stop(immediate) self.running = False return self - def request(self, method, *args, **kwargs) -> requests.Response: - kwargs["headers"] = self.headers() - return requests.request(method, *args, **kwargs) + def upcall_api_endpoint(self) -> str: + return f"{self.api}/upcall/v1" - def headers(self) -> Dict[str, str]: - headers = {} + def api_root(self) -> str: + return self.api + + @staticmethod + def retryable_node_operation(op, ps_id, max_attempts, backoff): + while max_attempts > 0: + try: + op(ps_id) + return + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Operation failed ({max_attempts} attempts left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + + @staticmethod + def raise_api_exception(res: requests.Response): + try: + res.raise_for_status() + except requests.RequestException as e: + try: + msg = res.json()["msg"] + except: # noqa: E722 + msg = "" + raise StorageControllerApiException(msg, res.status_code) from e + + def assert_no_errors(self): + assert_no_errors( + self.logfile, + "storage_controller", + self.allowed_errors, + ) + + def pageserver_api(self, *args, **kwargs) -> PageserverHttpClient: + """ + The storage controller implements a subset of the pageserver REST API, for mapping + per-tenant actions into per-shard actions (e.g. timeline creation). Tests should invoke those + functions via the HttpClient, as an implicit check that these APIs remain compatible. 
+ """ + auth_token = None if self.auth_enabled: - jwt_token = self.env.auth_keys.generate_pageserver_token() + auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API) + return PageserverHttpClient(self.port, lambda: True, auth_token, *args, **kwargs) + + def request(self, method, *args, **kwargs) -> requests.Response: + resp = requests.request(method, *args, **kwargs) + NeonStorageController.raise_api_exception(resp) + + return resp + + def headers(self, scope: Optional[TokenScope]) -> Dict[str, str]: + headers = {} + if self.auth_enabled and scope is not None: + jwt_token = self.env.auth_keys.generate_token(scope=scope) headers["Authorization"] = f"Bearer {jwt_token}" return headers + def get_metrics(self) -> Metrics: + res = self.request("GET", f"{self.api}/metrics") + return parse_metrics(res.text) + + def ready(self) -> bool: + status = None + try: + resp = self.request("GET", f"{self.api}/ready") + status = resp.status_code + except StorageControllerApiException as e: + status = e.status_code + + if status == 503: + return False + elif status == 200: + return True + else: + raise RuntimeError(f"Unexpected status {status} from readiness endpoint") + + def wait_until_ready(self): + t1 = time.time() + + def storage_controller_ready(): + assert self.ready() is True + + wait_until(30, 1, storage_controller_ready) + return time.time() - t1 + def attach_hook_issue( - self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int + self, + tenant_shard_id: Union[TenantId, TenantShardId], + pageserver_id: int, + generation_override: Optional[int] = None, ) -> int: + body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id} + if generation_override is not None: + body["generation_override"] = generation_override + response = self.request( "POST", - f"{self.env.control_plane_api}/attach-hook", - json={"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id}, - headers=self.headers(), + f"{self.api}/debug/v1/attach-hook", + json=body, + headers=self.headers(TokenScope.ADMIN), ) - response.raise_for_status() gen = response.json()["gen"] assert isinstance(gen, int) return gen def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]): - response = self.request( + self.request( "POST", - f"{self.env.control_plane_api}/attach-hook", + f"{self.api}/debug/v1/attach-hook", json={"tenant_shard_id": str(tenant_shard_id), "node_id": None}, - headers=self.headers(), + headers=self.headers(TokenScope.ADMIN), ) - response.raise_for_status() def inspect(self, tenant_shard_id: Union[TenantId, TenantShardId]) -> Optional[tuple[int, int]]: """ @@ -1924,11 +2397,10 @@ class NeonAttachmentService: """ response = self.request( "POST", - f"{self.env.control_plane_api}/inspect", + f"{self.api}/debug/v1/inspect", json={"tenant_shard_id": str(tenant_shard_id)}, - headers=self.headers(), + headers=self.headers(TokenScope.ADMIN), ) - response.raise_for_status() json = response.json() log.info(f"Response: {json}") if json["attachment"]: @@ -1942,11 +2414,99 @@ class NeonAttachmentService: "node_id": int(node.id), "listen_http_addr": "localhost", "listen_http_port": node.service_port.http, + "listen_pg_addr": "localhost", + "listen_pg_port": node.service_port.pg, + "availability_zone_id": node.az_id, } log.info(f"node_register({body})") self.request( - "POST", f"{self.env.control_plane_api}/node", json=body, headers=self.headers() - ).raise_for_status() + "POST", + f"{self.api}/control/v1/node", + json=body, + 
headers=self.headers(TokenScope.ADMIN),
+        )
+
+    def node_delete(self, node_id):
+        log.info(f"node_delete({node_id})")
+        self.request(
+            "DELETE",
+            f"{self.api}/control/v1/node/{node_id}",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+
+    def node_drain(self, node_id):
+        log.info(f"node_drain({node_id})")
+        self.request(
+            "PUT",
+            f"{self.api}/control/v1/node/{node_id}/drain",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+
+    def cancel_node_drain(self, node_id):
+        log.info(f"cancel_node_drain({node_id})")
+        self.request(
+            "DELETE",
+            f"{self.api}/control/v1/node/{node_id}/drain",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+
+    def node_fill(self, node_id):
+        log.info(f"node_fill({node_id})")
+        self.request(
+            "PUT",
+            f"{self.api}/control/v1/node/{node_id}/fill",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+
+    def cancel_node_fill(self, node_id):
+        log.info(f"cancel_node_fill({node_id})")
+        self.request(
+            "DELETE",
+            f"{self.api}/control/v1/node/{node_id}/fill",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+
+    def node_status(self, node_id):
+        response = self.request(
+            "GET",
+            f"{self.api}/control/v1/node/{node_id}",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        return response.json()
+
+    def get_leader(self):
+        response = self.request(
+            "GET",
+            f"{self.api}/control/v1/leader",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        return response.json()
+
+    def node_list(self):
+        response = self.request(
+            "GET",
+            f"{self.api}/control/v1/node",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        return response.json()
+
+    def tenant_list(self):
+        response = self.request(
+            "GET",
+            f"{self.api}/debug/v1/tenant",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        return response.json()
+
+    def node_configure(self, node_id, body: dict[str, Any]):
+        log.info(f"node_configure({node_id}, {body})")
+        body["node_id"] = node_id
+        self.request(
+            "PUT",
+            f"{self.api}/control/v1/node/{node_id}/config",
+            json=body,
+            headers=self.headers(TokenScope.ADMIN),
+        )
 
     def tenant_create(
         self,
@@ -1954,13 +2514,19 @@
         shard_count: Optional[int] = None,
         shard_stripe_size: Optional[int] = None,
         tenant_config: Optional[Dict[Any, Any]] = None,
+        placement_policy: Optional[Union[Dict[Any, Any], str]] = None,
     ):
+        """
+        Use this rather than pageserver_api() when you need to include shard parameters.
+        """
         body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}
         if shard_count is not None:
             shard_params = {"count": shard_count}
             if shard_stripe_size is not None:
                 shard_params["stripe_size"] = shard_stripe_size
+            else:
+                shard_params["stripe_size"] = 32768
 
             body["shard_parameters"] = shard_params
@@ -1968,49 +2534,370 @@
             for k, v in tenant_config.items():
                 body[k] = v
 
-        response = self.request("POST", f"{self.env.control_plane_api}/tenant", json=body)
+        body["placement_policy"] = placement_policy
+
+        response = self.request(
+            "POST",
+            f"{self.api}/v1/tenant",
+            json=body,
+            headers=self.headers(TokenScope.PAGE_SERVER_API),
+        )
         response.raise_for_status()
         log.info(f"tenant_create success: {response.json()}")
 
-    def tenant_timeline_create(self, tenant_id: TenantId, timeline_id: TimelineId):
-        body: Dict[str, Any] = {"new_timeline_id": str(timeline_id)}
-
-        response = self.request(
-            "POST", f"{self.env.control_plane_api}/tenant/{tenant_id}/timeline", json=body
-        )
-        response.raise_for_status()
-        log.info(f"tenant_timeline_create success: {response.json()}")
-
     def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
-        response = self.request("GET", f"{self.env.control_plane_api}/tenant/{tenant_id}/locate")
-        response.raise_for_status()
+        """
+        :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int}
+        """
+        response = self.request(
+            "GET",
+            f"{self.api}/debug/v1/tenant/{tenant_id}/locate",
+            headers=self.headers(TokenScope.ADMIN),
+        )
         body = response.json()
         shards: list[dict[str, Any]] = body["shards"]
         return shards
 
-    def tenant_shard_split(self, tenant_id: TenantId, shard_count: int) -> list[TenantShardId]:
+    def tenant_describe(self, tenant_id: TenantId):
+        """
+        :return: tenant description, whose shard entries look like {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int, "preferred_az_id": str}
+        """
         response = self.request(
-            "PUT",
-            f"{self.env.control_plane_api}/tenant/{tenant_id}/shard_split",
-            json={"new_shard_count": shard_count},
+            "GET",
+            f"{self.api}/control/v1/tenant/{tenant_id}",
+            headers=self.headers(TokenScope.ADMIN),
         )
         response.raise_for_status()
+        return response.json()
+
+    def tenant_shard_split(
+        self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None
+    ) -> list[TenantShardId]:
+        response = self.request(
+            "PUT",
+            f"{self.api}/control/v1/tenant/{tenant_id}/shard_split",
+            json={"new_shard_count": shard_count, "new_stripe_size": shard_stripe_size},
+            headers=self.headers(TokenScope.ADMIN),
+        )
         body = response.json()
         log.info(f"tenant_shard_split success: {body}")
         shards: list[TenantShardId] = body["new_shards"]
         return shards
 
     def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int):
-        response = self.request(
+        self.request(
             "PUT",
-            f"{self.env.control_plane_api}/tenant/{tenant_shard_id}/migrate",
+            f"{self.api}/control/v1/tenant/{tenant_shard_id}/migrate",
             json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id},
+            headers=self.headers(TokenScope.ADMIN),
         )
-        response.raise_for_status()
         log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}")
         assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id
 
-    def __enter__(self) -> "NeonAttachmentService":
+    def tenant_policy_update(self, tenant_id: TenantId, body: dict[str, Any]):
+        log.info(f"tenant_policy_update({tenant_id}, {body})")
+        self.request(
+            "PUT",
+            f"{self.api}/control/v1/tenant/{tenant_id}/policy",
+            json=body,
+            headers=self.headers(TokenScope.ADMIN),
+        )
+
+    def tenant_import(self, tenant_id: TenantId):
+        self.request(
+            "POST",
+            f"{self.api}/debug/v1/tenant/{tenant_id}/import",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+
+    def reconcile_all(self):
+        r = self.request(
+            "POST",
+            f"{self.api}/debug/v1/reconcile_all",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        r.raise_for_status()
+        n = r.json()
+        log.info(f"reconcile_all waited for {n} shards")
+        return n
+
+    def reconcile_until_idle(self, timeout_secs=30):
+        start_at = time.time()
+        n = 1
+        delay_sec = 0.5
+        delay_max = 5
+        while n > 0:
+            n = self.reconcile_all()
+            if n == 0:
+                break
+            elif time.time() - start_at > timeout_secs:
+                raise RuntimeError("Timeout in reconcile_until_idle")
+            else:
+                # Don't call again right away: if we're waiting for many reconciles that
+                # are blocked on the concurrency limit, it slows things down to call
+                # reconcile_all frequently.
+                time.sleep(delay_sec)
+                delay_sec *= 2
+                delay_sec = min(delay_sec, delay_max)
+
+    def consistency_check(self):
+        """
+        Throw an exception if the service finds any inconsistencies in its state.
+        """
+        self.request(
+            "POST",
+            f"{self.api}/debug/v1/consistency_check",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        log.info("storage controller passed consistency check")
+
+    def node_registered(self, node_id: int) -> bool:
+        """
+        Returns true if the storage controller can confirm
+        that it knows of a pageserver with 'node_id'.
+        """
+        try:
+            self.node_status(node_id)
+        except StorageControllerApiException as e:
+            if e.status_code == 404:
+                return False
+            else:
+                raise e
+
+        return True
+
+    def poll_node_status(
+        self,
+        node_id: int,
+        desired_availability: Optional[PageserverAvailability],
+        desired_scheduling_policy: Optional[PageserverSchedulingPolicy],
+        max_attempts: int,
+        backoff: int,
+    ):
+        """
+        Poll the node status until it reaches 'desired_scheduling_policy' and 'desired_availability',
+        or 'max_attempts' is exhausted.
+        """
+        log.info(
+            f"Polling {node_id} for {desired_scheduling_policy} scheduling policy and {desired_availability} availability"
+        )
+        while max_attempts > 0:
+            try:
+                status = self.node_status(node_id)
+                policy = status["scheduling"]
+                availability = status["availability"]
+                if (desired_scheduling_policy is None or policy == desired_scheduling_policy) and (
+                    desired_availability is None or availability == desired_availability
+                ):
+                    return
+                else:
+                    max_attempts -= 1
+                    log.info(
+                        f"Status call returned {policy=} {availability=} ({max_attempts} attempts left)"
+                    )
+
+                    if max_attempts == 0:
+                        raise AssertionError(
+                            f"Status for {node_id=} did not reach {desired_scheduling_policy=} {desired_availability=}"
+                        )
+
+                    time.sleep(backoff)
+            except StorageControllerApiException as e:
+                max_attempts -= 1
+                log.info(f"Status call failed ({max_attempts} retries left): {e}")
+
+                if max_attempts == 0:
+                    raise e
+
+                time.sleep(backoff)
+
+    def metadata_health_update(self, healthy: List[TenantShardId], unhealthy: List[TenantShardId]):
+        body: Dict[str, Any] = {
+            "healthy_tenant_shards": [str(t) for t in healthy],
+            "unhealthy_tenant_shards": [str(t) for t in unhealthy],
+        }
+
+        self.request(
+            "POST",
+            f"{self.api}/control/v1/metadata_health/update",
+            json=body,
+            headers=self.headers(TokenScope.SCRUBBER),
+        )
+
+    def metadata_health_list_unhealthy(self):
+        response = self.request(
+            "GET",
+            f"{self.api}/control/v1/metadata_health/unhealthy",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        return response.json()
+
+    def metadata_health_list_outdated(self, duration: str):
+        body: Dict[str, Any] = {"not_scrubbed_for": duration}
+
+        response = self.request(
+            "POST",
+            f"{self.api}/control/v1/metadata_health/outdated",
+            json=body,
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        return response.json()
+
+    def metadata_health_is_healthy(self, outdated_duration: str = "1h") -> bool:
+        """Metadata is healthy if there are no unhealthy or outdated health records."""
+
+        unhealthy = self.metadata_health_list_unhealthy()
+        outdated = self.metadata_health_list_outdated(outdated_duration)
+
+        healthy = (
+            len(unhealthy["unhealthy_tenant_shards"]) == 0 and len(outdated["health_records"]) == 0
+        )
+        if not healthy:
+            log.info(f"{unhealthy=}, {outdated=}")
+        return healthy
+
+    def step_down(self):
+        log.info("Asking storage controller to step down")
+        response = self.request(
+            "PUT",
+            f"{self.api}/control/v1/step_down",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+
+
response.raise_for_status() + return response.json() + + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.request( + "PUT", + f"{self.api}/debug/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + headers=self.headers(TokenScope.ADMIN), + ) + log.info(f"Got failpoints request response code {res.status_code}") + res.raise_for_status() + + def get_tenants_placement(self) -> defaultdict[str, Dict[str, Any]]: + """ + Get the intent and observed placements of all tenants known to the storage controller. + """ + tenants = self.tenant_list() + + tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict( + lambda: { + "observed": {"attached": None, "secondary": []}, + "intent": {"attached": None, "secondary": []}, + } + ) + + for t in tenants: + for node_id, loc_state in t["observed"]["locations"].items(): + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] + in set(["AttachedSingle", "AttachedMulti", "AttachedStale"]) + ): + tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id) + + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] == "Secondary" + ): + tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append( + int(node_id) + ) + + if "attached" in t["intent"]: + tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"][ + "attached" + ] + + if "secondary" in t["intent"]: + tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][ + "secondary" + ] + + return tenant_placement + + def warm_up_all_secondaries(self): + log.info("Warming up all secondary locations") + + tenant_placement = self.get_tenants_placement() + for tid, placement in tenant_placement.items(): + assert placement["observed"]["attached"] is not None + primary_id = placement["observed"]["attached"] + + assert len(placement["observed"]["secondary"]) == 1 + secondary_id = placement["observed"]["secondary"][0] + + parsed_tid = TenantShardId.parse(tid) + self.env.get_pageserver(primary_id).http_client().tenant_heatmap_upload(parsed_tid) + self.env.get_pageserver(secondary_id).http_client().tenant_secondary_download( + parsed_tid, wait_ms=250 + ) + + def get_leadership_status(self) -> StorageControllerLeadershipStatus: + metric_values = {} + for status in StorageControllerLeadershipStatus: + metric_value = self.get_metric_value( + "storage_controller_leadership_status", filter={"status": status} + ) + metric_values[status] = metric_value + + assert list(metric_values.values()).count(1) == 1 + + for status, metric_value in metric_values.items(): + if metric_value == 1: + return status + + raise AssertionError("unreachable") + + def on_safekeeper_deploy(self, id: int, body: dict[str, Any]): + self.request( + "POST", + f"{self.api}/control/v1/safekeeper/{id}", + headers=self.headers(TokenScope.ADMIN), + json=body, + ) + + def get_safekeeper(self, id: int) -> Optional[dict[str, Any]]: + try: + response = self.request( + "GET", + f"{self.api}/control/v1/safekeeper/{id}", + headers=self.headers(TokenScope.ADMIN), + ) + json = response.json() + assert isinstance(json, dict) + return json + except StorageControllerApiException as e: + if e.status_code == 404: + return 
None
+            raise e
+
+    def set_preferred_azs(self, preferred_azs: dict[TenantShardId, str]) -> list[TenantShardId]:
+        response = self.request(
+            "PUT",
+            f"{self.api}/control/v1/preferred_azs",
+            headers=self.headers(TokenScope.ADMIN),
+            json={str(tid): az for tid, az in preferred_azs.items()},
+        )
+
+        response.raise_for_status()
+        return [TenantShardId.parse(tid) for tid in response.json()["updated"]]
+
+    def __enter__(self) -> "NeonStorageController":
         return self
 
     def __exit__(
@@ -2022,24 +2909,80 @@ class NeonAttachmentService:
         self.stop(immediate=True)
 
-class NeonPageserver(PgProtocol):
+class NeonProxiedStorageController(NeonStorageController):
+    def __init__(self, env: NeonEnv, proxy_port: int, auth_enabled: bool):
+        super(NeonProxiedStorageController, self).__init__(env, proxy_port, auth_enabled)
+        self.instances: dict[int, dict[str, Any]] = {}
+
+    def start(
+        self,
+        timeout_in_seconds: Optional[int] = None,
+        instance_id: Optional[int] = None,
+        base_port: Optional[int] = None,
+    ):
+        assert instance_id is not None and base_port is not None
+
+        self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
+        self.instances[instance_id] = {"running": True}
+
+        self.running = True
+        return self
+
+    def stop_instance(
+        self, immediate: bool = False, instance_id: Optional[int] = None
+    ) -> "NeonStorageController":
+        assert instance_id in self.instances
+        if self.instances[instance_id]["running"]:
+            self.env.neon_cli.storage_controller_stop(immediate, instance_id)
+            self.instances[instance_id]["running"] = False
+
+        self.running = any(meta["running"] for meta in self.instances.values())
+        return self
+
+    def stop(self, immediate: bool = False) -> "NeonStorageController":
+        for iid, details in self.instances.items():
+            if details["running"]:
+                self.env.neon_cli.storage_controller_stop(immediate, iid)
+                self.instances[iid]["running"] = False
+
+        self.running = False
+        return self
+
+    def assert_no_errors(self):
+        for instance_id in self.instances.keys():
+            assert_no_errors(
+                self.env.repo_dir / f"storage_controller_{instance_id}" / "storage_controller.log",
+                "storage_controller",
+                self.allowed_errors,
+            )
+
+    def log_contains(
+        self, pattern: str, offset: None | LogCursor = None
+    ) -> Optional[Tuple[str, LogCursor]]:
+        raise NotImplementedError()
+
+
+@dataclass
+class LogCursor:
+    _line_no: int
+
+
+class NeonPageserver(PgProtocol, LogUtils):
     """
     An object representing a running pageserver.
     """
 
     TEMP_FILE_SUFFIX = "___temp"
 
-    def __init__(
-        self, env: NeonEnv, id: int, port: PageserverPort, config_override: Optional[str] = None
-    ):
+    def __init__(self, env: NeonEnv, id: int, port: PageserverPort, az_id: str):
         super().__init__(host="localhost", port=port.pg, user="cloud_admin")
         self.env = env
         self.id = id
+        self.az_id = az_id
         self.running = False
         self.service_port = port
-        self.config_override = config_override
         self.version = env.get_binary_version("pageserver")
-
+        self.logfile = self.workdir / "pageserver.log"
         # After a test finishes, we will scrape the log to see if there are any
         # unexpected error messages. If your test expects an error, add it to
        # 'allowed_errors' in the test with something like:
@@ -2049,25 +2992,64 @@ class NeonPageserver(PgProtocol):
        # The entries in the list are regular expressions.
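# For example (pattern hypothetical):
#
#     env.pageserver.allowed_errors.append(".*tenant .* not found.*")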
        self.allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS)
 
-    def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path:
+    def timeline_dir(
+        self,
+        tenant_shard_id: Union[TenantId, TenantShardId],
+        timeline_id: Optional[TimelineId] = None,
+    ) -> Path:
         """Get a timeline directory's path based on the repo directory of the test environment"""
         if timeline_id is None:
-            return self.tenant_dir(tenant_id) / "timelines"
-        return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id)
+            return self.tenant_dir(tenant_shard_id) / "timelines"
+        return self.tenant_dir(tenant_shard_id) / "timelines" / str(timeline_id)
 
     def tenant_dir(
         self,
-        tenant_id: Optional[TenantId] = None,
+        tenant_shard_id: Optional[Union[TenantId, TenantShardId]] = None,
     ) -> Path:
         """Get a tenant directory's path based on the repo directory of the test environment"""
-        if tenant_id is None:
+        if tenant_shard_id is None:
             return self.workdir / "tenants"
-        return self.workdir / "tenants" / str(tenant_id)
+        return self.workdir / "tenants" / str(tenant_shard_id)
+
+    @property
+    def config_toml_path(self) -> Path:
+        return self.workdir / "pageserver.toml"
+
+    def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], T]) -> T:
+        """
+        Edit the pageserver's config toml file in place.
+        """
+        path = self.config_toml_path
+        with open(path, "r") as f:
+            config = toml.load(f)
+        res = edit_fn(config)
+        with open(path, "w") as f:
+            toml.dump(config, f)
+        return res
+
+    def patch_config_toml_nonrecursive(self, patch: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Non-recursively merge the given `patch` dict into the existing config toml, using `dict.update()`.
+        Returns the replaced values.
+        If there was no previous value, the key is mapped to None.
+        This allows restoring the original values by calling this method again with the returned dict.
+        """
+        replacements = {}
+
+        def doit(config: Dict[str, Any]):
+            while len(patch) > 0:
+                key, new = patch.popitem()
+                old = config.get(key, None)
+                config[key] = new
+                replacements[key] = old
+
+        self.edit_config_toml(doit)
+        return replacements
 
     def start(
         self,
-        overrides: Tuple[str, ...] = (),
         extra_env_vars: Optional[Dict[str, str]] = None,
+        timeout_in_seconds: Optional[int] = None,
     ) -> "NeonPageserver":
         """
         Start the page server.
@@ -2077,9 +3059,17 @@
         assert self.running is False
 
         self.env.neon_cli.pageserver_start(
-            self.id, overrides=overrides, extra_env_vars=extra_env_vars
+            self.id, extra_env_vars=extra_env_vars, timeout_in_seconds=timeout_in_seconds
         )
         self.running = True
+
+        if self.env.storage_controller.running and self.env.storage_controller.node_registered(
+            self.id
+        ):
+            self.env.storage_controller.poll_node_status(
+                self.id, PageserverAvailability.ACTIVE, None, max_attempts=20, backoff=1
+            )
+
         return self
 
     def stop(self, immediate: bool = False) -> "NeonPageserver":
@@ -2092,13 +3082,17 @@
         self.running = False
         return self
 
-    def restart(self, immediate: bool = False):
+    def restart(
+        self,
+        immediate: bool = False,
+        timeout_in_seconds: Optional[int] = None,
+    ):
         """
         High-level wrapper for restart: restarts the process and waits for
         tenant state to stabilize.
""" self.stop(immediate=immediate) - self.start() + self.start(timeout_in_seconds=timeout_in_seconds) self.quiesce_tenants() def quiesce_tenants(self): @@ -2156,18 +3150,9 @@ class NeonPageserver(PgProtocol): return self.env.repo_dir / f"pageserver_{self.id}" def assert_no_errors(self): - logfile = self.workdir / "pageserver.log" - if not logfile.exists(): - log.warning(f"Skipping log check: {logfile} does not exist") - return - - with logfile.open("r") as f: - errors = scan_pageserver_log_for_errors(f, self.allowed_errors) - - for _lineno, error in errors: - log.info(f"not allowed error: {error.strip()}") - - assert not errors + assert_no_errors( + self.workdir / "pageserver.log", f"pageserver_{self.id}", self.allowed_errors + ) def assert_no_metric_errors(self): """ @@ -2184,34 +3169,12 @@ class NeonPageserver(PgProtocol): value = self.http_client().get_metric_value(metric) assert value == 0, f"Nonzero {metric} == {value}" - def log_contains(self, pattern: str) -> Optional[str]: - """Check that the pageserver log contains a line that matches the given regex""" - logfile = self.workdir / "pageserver.log" - if not logfile.exists(): - log.warning(f"Skipping log check: {logfile} does not exist") - return None - - contains_re = re.compile(pattern) - - # XXX: Our rust logging machinery buffers the messages, so if you - # call this function immediately after it's been logged, there is - # no guarantee it is already present in the log file. This hasn't - # been a problem in practice, our python tests are not fast enough - # to hit that race condition. - with logfile.open("r") as f: - for line in f: - if contains_re.search(line): - # found it! - return line - - return None - def tenant_attach( self, tenant_id: TenantId, config: None | Dict[str, Any] = None, - config_null: bool = False, generation: Optional[int] = None, + override_storage_controller_generation: bool = False, ): """ Tenant attachment passes through here to acquire a generation number before proceeding @@ -2219,29 +3182,34 @@ class NeonPageserver(PgProtocol): """ client = self.http_client() if generation is None: - generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) + elif override_storage_controller_generation: + generation = self.env.storage_controller.attach_hook_issue( + tenant_id, self.id, generation + ) return client.tenant_attach( tenant_id, + generation, config, - config_null, - generation=generation, ) def tenant_detach(self, tenant_id: TenantId): - self.env.attachment_service.attach_hook_drop(tenant_id) + self.env.storage_controller.attach_hook_drop(tenant_id) client = self.http_client() return client.tenant_detach(tenant_id) def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs): if config["mode"].startswith("Attached") and "generation" not in config: - config["generation"] = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + config["generation"] = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) client = self.http_client() return client.tenant_location_conf(tenant_id, config, **kwargs) - def read_tenant_location_conf(self, tenant_id: TenantId) -> dict[str, Any]: - path = self.tenant_dir(tenant_id) / "config-v1" + def read_tenant_location_conf( + self, tenant_shard_id: Union[TenantId, TenantShardId] + ) -> dict[str, Any]: + path = self.tenant_dir(tenant_shard_id) / "config-v1" log.info(f"Reading location conf from {path}") bytes = open(path, 
"r").read() try: @@ -2259,42 +3227,54 @@ class NeonPageserver(PgProtocol): generation: Optional[int] = None, ) -> TenantId: if generation is None: - generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) client = self.http_client(auth_token=auth_token) - return client.tenant_create(tenant_id, conf, generation=generation) - def tenant_load(self, tenant_id: TenantId): - client = self.http_client() - return client.tenant_load( - tenant_id, generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + conf = conf or {} + + client.tenant_location_conf( + tenant_id, + { + "mode": "AttachedSingle", + "generation": generation, + "tenant_conf": conf, + "secondary_conf": None, + }, + ) + return tenant_id + + def list_layers( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ) -> list[Path]: + """ + Inspect local storage on a pageserver to discover which layer files are present. + + :return: list of relative paths to layers, from the timeline root. + """ + timeline_path = self.timeline_dir(tenant_id, timeline_id) + + def relative(p: Path) -> Path: + return p.relative_to(timeline_path) + + return sorted( + list( + map( + relative, + filter( + lambda path: path.name != "metadata" + and "ephemeral" not in path.name + and "temp" not in path.name, + timeline_path.glob("*"), + ), + ) + ) ) - -def append_pageserver_param_overrides( - params_to_update: List[str], - remote_storage: Optional[RemoteStorage], - pageserver_config_override: Optional[str] = None, -): - if remote_storage is not None: - remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage) - - params_to_update.append( - f"--pageserver-config-override=remote_storage={remote_storage_toml_table}" - ) - else: - params_to_update.append('--pageserver-config-override=remote_storage=""') - - env_overrides = os.getenv("NEON_PAGESERVER_OVERRIDES") - if env_overrides is not None: - params_to_update += [ - f"--pageserver-config-override={o.strip()}" for o in env_overrides.split(";") - ] - - if pageserver_config_override is not None: - params_to_update += [ - f"--pageserver-config-override={o.strip()}" - for o in pageserver_config_override.split(";") - ] + def layer_exists( + self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: LayerName + ) -> bool: + layers = self.list_layers(tenant_id, timeline_id) + return layer_name in [parse_layer_file_name(p.name) for p in layers] class PgBin: @@ -2319,9 +3299,21 @@ class PgBin: env.update(env_add) return env - def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None): + def _log_env(self, env: dict[str, str]) -> None: + env_s = {} + for k, v in env.items(): + if k.startswith("PG") and k != "PGPASSWORD": + env_s[k] = v + log.debug(f"Environment: {env_s}") + + def run_nonblocking( + self, + command: List[str], + env: Optional[Env] = None, + cwd: Optional[Union[str, Path]] = None, + ) -> subprocess.Popen[Any]: """ - Run one of the postgres binaries. + Run one of the postgres binaries, not waiting for it to finish The command should be in list form, e.g. ['pgbench', '-p', '55432'] @@ -2332,11 +3324,34 @@ class PgBin: If you want stdout/stderr captured to files, use `run_capture` instead. 
""" - self._fixpath(command) log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) - subprocess.run(command, env=env, cwd=cwd, check=True) + self._log_env(env) + return subprocess.Popen(command, env=env, cwd=cwd, stdout=subprocess.PIPE, text=True) + + def run( + self, + command: List[str], + env: Optional[Env] = None, + cwd: Optional[Union[str, Path]] = None, + ) -> None: + """ + Run one of the postgres binaries, waiting for it to finish + + The command should be in list form, e.g. ['pgbench', '-p', '55432'] + + All the necessary environment variables will be set. + + If the first argument (the command name) doesn't include a path (no '/' + characters present), then it will be edited to include the correct path. + + If you want stdout/stderr captured to files, use `run_capture` instead. + """ + proc = self.run_nonblocking(command, env, cwd) + proc.wait() + if proc.returncode != 0: + raise subprocess.CalledProcessError(proc.returncode, proc.args) def run_capture( self, @@ -2356,6 +3371,7 @@ class PgBin: self._fixpath(command) log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) + self._log_env(env) base_path, _, _ = subprocess_capture( self.log_dir, command, @@ -2367,12 +3383,49 @@ class PgBin: ) return base_path + def get_pg_controldata_checkpoint_lsn(self, pgdata: str) -> Lsn: + """ + Run pg_controldata on given datadir and extract checkpoint lsn. + """ + + pg_controldata_path = os.path.join(self.pg_bin_path, "pg_controldata") + cmd = f"{pg_controldata_path} -D {pgdata}" + result = subprocess.run(cmd, capture_output=True, text=True, shell=True) + checkpoint_lsn = re.findall( + "Latest checkpoint location:\\s+([0-9A-F]+/[0-9A-F]+)", result.stdout + )[0] + log.info(f"last checkpoint at {checkpoint_lsn}") + return Lsn(checkpoint_lsn) + + def take_fullbackup( + self, + pageserver: NeonPageserver, + tenant: TenantId, + timeline: TimelineId, + lsn: Lsn, + output: Path, + ): + """ + Request fullbackup from pageserver, store it at 'output'. 
+ """ + cmd = [ + "psql", + "--no-psqlrc", + pageserver.connstr(), + "-c", + f"fullbackup {tenant} {timeline} {lsn}", + "-o", + str(output), + ] + self.run_capture(cmd) + @pytest.fixture(scope="function") def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin: return PgBin(test_output_dir, pg_distrib_dir, pg_version) +# TODO make port an optional argument class VanillaPostgres(PgProtocol): def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init: bool = True): super().__init__(host="localhost", port=port, dbname="postgres") @@ -2508,6 +3561,18 @@ class RemotePostgres(PgProtocol): pass +@pytest.fixture(scope="function") +def benchmark_project_pub(neon_api: NeonAPI, pg_version: PgVersion) -> NeonApiEndpoint: + project_id = os.getenv("BENCHMARK_PROJECT_ID_PUB") + return NeonApiEndpoint(neon_api, pg_version, project_id) + + +@pytest.fixture(scope="function") +def benchmark_project_sub(neon_api: NeonAPI, pg_version: PgVersion) -> NeonApiEndpoint: + project_id = os.getenv("BENCHMARK_PROJECT_ID_SUB") + return NeonApiEndpoint(neon_api, pg_version, project_id) + + @pytest.fixture(scope="function") def remote_pg( test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion @@ -2554,9 +3619,16 @@ class PSQL: host: str = "127.0.0.1", port: int = 5432, ): - assert shutil.which(path) + search_path = None + if (d := os.getenv("POSTGRES_DISTRIB_DIR")) is not None and ( + v := os.getenv("DEFAULT_PG_VERSION") + ) is not None: + search_path = Path(d) / f"v{v}" / "bin" - self.path = path + full_path = shutil.which(path, path=search_path) + assert full_path is not None + + self.path = full_path self.database_url = f"postgres://{host}:{port}/main?options=project%3Dgeneric-project-name" async def run(self, query: Optional[str] = None) -> asyncio.subprocess.Process: @@ -2662,6 +3734,7 @@ class NeonProxy(PgProtocol): self.auth_backend = auth_backend self.metric_collection_endpoint = metric_collection_endpoint self.metric_collection_interval = metric_collection_interval + self.http_timeout_seconds = 15 self._popen: Optional[subprocess.Popen[bytes]] = None def start(self) -> NeonProxy: @@ -2700,6 +3773,7 @@ class NeonProxy(PgProtocol): *["--proxy", f"{self.host}:{self.proxy_port}"], *["--mgmt", f"{self.host}:{self.mgmt_port}"], *["--wss", f"{self.host}:{self.external_http_port}"], + *["--sql-over-http-timeout", f"{self.http_timeout_seconds}s"], *["-c", str(crt_path)], *["-k", str(key_path)], *self.auth_backend.extra_args(), @@ -2736,9 +3810,12 @@ class NeonProxy(PgProtocol): def http_query(self, query, args, **kwargs): # TODO maybe use default values if not provided - user = kwargs["user"] - password = kwargs["password"] + user = quote(kwargs["user"]) + password = quote(kwargs["password"]) expected_code = kwargs.get("expected_code") + timeout = kwargs.get("timeout") + + log.info(f"Executing http query: {query}") connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres" response = requests.post( @@ -2750,15 +3827,42 @@ class NeonProxy(PgProtocol): "Neon-Pool-Opt-In": "true", }, verify=str(self.test_output_dir / "proxy.crt"), + timeout=timeout, ) if expected_code is not None: - assert response.status_code == kwargs["expected_code"], f"response: {response.json()}" + assert response.status_code == expected_code, f"response: {response.json()}" return response.json() + async def http2_query(self, query, args, **kwargs): + # TODO maybe use default values if not provided + user = kwargs["user"] + password = kwargs["password"] + expected_code = 
kwargs.get("expected_code") + + log.info(f"Executing http2 query: {query}") + + connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres" + async with httpx.AsyncClient( + http2=True, verify=str(self.test_output_dir / "proxy.crt") + ) as client: + response = await client.post( + f"https://{self.domain}:{self.external_http_port}/sql", + json={"query": query, "params": args}, + headers={ + "Content-Type": "application/sql", + "Neon-Connection-String": connstr, + "Neon-Pool-Opt-In": "true", + }, + ) + assert response.http_version == "HTTP/2" + + if expected_code is not None: + assert response.status_code == expected_code, f"response: {response.json()}" + return response.json() + def get_metrics(self) -> str: request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics") - request_result.raise_for_status() return request_result.text @staticmethod @@ -2905,7 +4009,7 @@ def static_proxy( yield proxy -class Endpoint(PgProtocol): +class Endpoint(PgProtocol, LogUtils): """An object representing a Postgres compute endpoint managed by the control plane.""" def __init__( @@ -2918,7 +4022,6 @@ class Endpoint(PgProtocol): ): super().__init__(host="localhost", port=pg_port, user="cloud_admin", dbname="postgres") self.env = env - self.running = False self.branch_name: Optional[str] = None # dubious self.endpoint_id: Optional[str] = None # dubious, see asserts below self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA @@ -2926,9 +4029,24 @@ class Endpoint(PgProtocol): self.pg_port = pg_port self.http_port = http_port self.check_stop_result = check_stop_result + # passed to endpoint create and endpoint reconfigure self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers)) # path to conf is /endpoints//pgdata/postgresql.conf + # Semaphore is set to 1 when we start, and acquire'd back to zero when we stop + # + # We use a semaphore rather than a bool so that racing calls to stop() don't + # try and stop the same process twice, as stop() is called by test teardown and + # potentially by some __del__ chains in other threads. + self._running = threading.Semaphore(0) + + def http_client( + self, auth_token: Optional[str] = None, retries: Optional[Retry] = None + ) -> EndpointHttpClient: + return EndpointHttpClient( + port=self.http_port, + ) + def create( self, branch_name: str, @@ -2937,6 +4055,7 @@ class Endpoint(PgProtocol): lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, pageserver_id: Optional[int] = None, + allow_multiple: bool = False, ) -> "Endpoint": """ Create a new Postgres endpoint. @@ -2959,21 +4078,29 @@ class Endpoint(PgProtocol): pg_port=self.pg_port, http_port=self.http_port, pageserver_id=pageserver_id, + allow_multiple=allow_multiple, ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = os.path.join(self.env.repo_dir, path) + self.logfile = self.endpoint_path() / "compute.log" config_lines = config_lines or [] # set small 'max_replication_write_lag' to enable backpressure # and make tests more stable. config_lines = ["max_replication_write_lag=15MB"] + config_lines + self.config(config_lines) return self def start( - self, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None + self, + remote_ext_config: Optional[str] = None, + pageserver_id: Optional[int] = None, + safekeepers: Optional[List[int]] = None, + allow_multiple: bool = False, + basebackup_request_tries: Optional[int] = None, ) -> "Endpoint": """ Start the Postgres instance. 
@@ -2982,6 +4109,11 @@
         assert self.endpoint_id is not None
 
+        # If `safekeepers` is not None, remember them as active and use them
+        # in subsequent commands.
+        if safekeepers is not None:
+            self.active_safekeepers = safekeepers
+
         log.info(f"Starting postgres endpoint {self.endpoint_id}")
 
         self.env.neon_cli.endpoint_start(
@@ -2989,8 +4121,10 @@
             safekeepers=self.active_safekeepers,
             remote_ext_config=remote_ext_config,
             pageserver_id=pageserver_id,
+            allow_multiple=allow_multiple,
+            basebackup_request_tries=basebackup_request_tries,
         )
-        self.running = True
+        self._running.release(1)
 
         return self
 
@@ -3030,22 +4164,55 @@
         return self
 
-    def reconfigure(self, pageserver_id: Optional[int] = None):
-        assert self.endpoint_id is not None
-        self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id)
+    def edit_hba(self, hba: List[str]):
+        """Prepend hba lines into pg_hba.conf file."""
+        with open(os.path.join(self.pg_data_dir_path(), "pg_hba.conf"), "r+") as conf_file:
+            data = conf_file.read()
+            conf_file.seek(0)
+            conf_file.write("\n".join(hba) + "\n")
+            conf_file.write(data)
 
-    def respec(self, **kwargs):
+        if self.is_running():
+            self.safe_psql("SELECT pg_reload_conf()")
+
+    def is_running(self):
+        return self._running._value > 0
+
+    def reconfigure(
+        self, pageserver_id: Optional[int] = None, safekeepers: Optional[List[int]] = None
+    ):
+        assert self.endpoint_id is not None
+        # If `safekeepers` is not None, remember them as active and use them
+        # in subsequent commands.
+        if safekeepers is not None:
+            self.active_safekeepers = safekeepers
+        self.env.neon_cli.endpoint_reconfigure(
+            self.endpoint_id, self.tenant_id, pageserver_id, self.active_safekeepers
+        )
+
+    def respec(self, **kwargs: Any) -> None:
         """Update the endpoint.json file used by control_plane."""
         # Read config
         config_path = os.path.join(self.endpoint_path(), "endpoint.json")
         with open(config_path, "r") as f:
-            data_dict = json.load(f)
+            data_dict: dict[str, Any] = json.load(f)
 
         # Write it back updated
         with open(config_path, "w") as file:
             log.info(json.dumps(dict(data_dict, **kwargs)))
             json.dump(dict(data_dict, **kwargs), file, indent=4)
 
+    # Please note: migrations only run if pg_skip_catalog_updates is false.
+    def wait_for_migrations(self, num_migrations: int = 10):
+        with self.cursor() as cur:
+
+            def check_migrations_done():
+                cur.execute("SELECT id FROM neon_migration.migration_id")
+                migration_id: int = cur.fetchall()[0][0]
+                assert migration_id >= num_migrations
+
+            wait_until(20, 0.5, check_migrations_done)
+
     # Mock the extension part of spec passed from control plane for local testing
    # endpoint.rs adds content of this file as a part of the spec.json
     def create_remote_extension_spec(self, spec: dict[str, Any]):
@@ -3057,33 +4224,38 @@
         with open(remote_extensions_spec_path, "w") as file:
             json.dump(spec, file, indent=4)
 
-    def stop(self) -> "Endpoint":
+    def stop(self, mode: str = "fast") -> "Endpoint":
         """
         Stop the Postgres instance if it's running.
+
+        Because test teardown might try and stop an endpoint concurrently with test code
+        stopping the endpoint, this method is thread-safe.
+
         Returns self.
         """
-        if self.running:
+        running = self._running.acquire(blocking=False)
+        if running:
             assert self.endpoint_id is not None
             self.env.neon_cli.endpoint_stop(
-                self.endpoint_id, check_return_code=self.check_stop_result
+                self.endpoint_id, check_return_code=self.check_stop_result, mode=mode
             )
-            self.running = False
 
         return self
 
-    def stop_and_destroy(self) -> "Endpoint":
+    def stop_and_destroy(self, mode: str = "immediate") -> "Endpoint":
         """
         Stop the Postgres instance, then destroy the endpoint.
         Returns self.
         """
-        assert self.endpoint_id is not None
-        self.env.neon_cli.endpoint_stop(
-            self.endpoint_id, True, check_return_code=self.check_stop_result
-        )
-        self.endpoint_id = None
-        self.running = False
+        running = self._running.acquire(blocking=False)
+        if running:
+            assert self.endpoint_id is not None
+            self.env.neon_cli.endpoint_stop(
+                self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode
+            )
+            self.endpoint_id = None
 
         return self
 
@@ -3096,6 +4268,8 @@
         config_lines: Optional[List[str]] = None,
         remote_ext_config: Optional[str] = None,
         pageserver_id: Optional[int] = None,
+        allow_multiple=False,
+        basebackup_request_tries: Optional[int] = None,
     ) -> "Endpoint":
         """
         Create an endpoint, apply config, and start Postgres.
@@ -3111,7 +4285,13 @@
             hot_standby=hot_standby,
             lsn=lsn,
             pageserver_id=pageserver_id,
-        ).start(remote_ext_config=remote_ext_config, pageserver_id=pageserver_id)
+            allow_multiple=allow_multiple,
+        ).start(
+            remote_ext_config=remote_ext_config,
+            pageserver_id=pageserver_id,
+            allow_multiple=allow_multiple,
+            basebackup_request_tries=basebackup_request_tries,
+        )
 
         log.info(f"Postgres startup took {time.time() - started_at} seconds")
 
@@ -3135,6 +4315,17 @@
         assert self.pgdata_dir is not None  # please mypy
         return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024
 
+    def clear_shared_buffers(self, cursor: Optional[Any] = None):
+        """
+        Best-effort way to clear postgres buffers. Pinned buffers will not be cleared.
+
+        Might also clear LFC.
+ """ + if cursor is not None: + cursor.execute("select clear_buffer_cache()") + else: + self.safe_psql("select clear_buffer_cache()") + class EndpointFactory: """An object representing multiple compute endpoints.""" @@ -3154,6 +4345,7 @@ class EndpointFactory: config_lines: Optional[List[str]] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + basebackup_request_tries: Optional[int] = None, ) -> Endpoint: ep = Endpoint( self.env, @@ -3172,6 +4364,7 @@ class EndpointFactory: lsn=lsn, remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, + basebackup_request_tries=basebackup_request_tries, ) def create( @@ -3205,13 +4398,23 @@ class EndpointFactory: pageserver_id=pageserver_id, ) - def stop_all(self) -> "EndpointFactory": + def stop_all(self, fail_on_error=True) -> "EndpointFactory": + exception = None for ep in self.endpoints: - ep.stop() + try: + ep.stop() + except Exception as e: + log.error(f"Failed to stop endpoint {ep.endpoint_id}: {e}") + exception = e + + if fail_on_error and exception is not None: + raise exception return self - def new_replica(self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]]): + def new_replica( + self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]] = None + ): branch_name = origin.branch_name assert origin in self.endpoints assert branch_name is not None @@ -3250,7 +4453,7 @@ class SafekeeperPort: @dataclass -class Safekeeper: +class Safekeeper(LogUtils): """An object representing a running safekeeper daemon.""" env: NeonEnv @@ -3258,9 +4461,48 @@ class Safekeeper: id: int running: bool = False - def start(self, extra_opts: Optional[List[str]] = None) -> "Safekeeper": + def __init__( + self, + env: NeonEnv, + port: SafekeeperPort, + id: int, + running: bool = False, + extra_opts: Optional[List[str]] = None, + ): + self.env = env + self.port = port + self.id = id + self.running = running + self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log" + + if extra_opts is None: + # Testing defaults: enable everything, and set short timeouts so that background + # work will happen during short tests. + # **Note**: Any test that explicitly sets extra_opts will not get these defaults. + extra_opts = [ + "--enable-offload", + "--delete-offloaded-wal", + "--partial-backup-timeout", + "10s", + "--control-file-save-interval", + "1s", + "--eviction-min-resident", + "10s", + ] + + self.extra_opts = extra_opts + + def start( + self, extra_opts: Optional[List[str]] = None, timeout_in_seconds: Optional[int] = None + ) -> "Safekeeper": + if extra_opts is None: + # Apply either the extra_opts passed in, or the ones from our constructor: we do not merge the two. 
+            extra_opts = self.extra_opts
+
         assert self.running is False
-        self.env.neon_cli.safekeeper_start(self.id, extra_opts=extra_opts)
+        self.env.neon_cli.safekeeper_start(
+            self.id, extra_opts=extra_opts, timeout_in_seconds=timeout_in_seconds
+        )
         self.running = True
         # wait for wal acceptor start by checking its status
         started_at = time.time()
@@ -3280,11 +4522,16 @@
         return self
 
     def stop(self, immediate: bool = False) -> "Safekeeper":
-        log.info("Stopping safekeeper {}".format(self.id))
+        log.info(f"Stopping safekeeper {self.id}")
         self.env.neon_cli.safekeeper_stop(self.id, immediate)
         self.running = False
         return self
 
+    def assert_no_errors(self):
+        assert not self.log_contains("manager task finished prematurely")
+        assert not self.log_contains("error while acquiring WalResidentTimeline guard")
+        assert not self.log_contains("timeout while acquiring WalResidentTimeline guard")
+
     def append_logical_message(
         self, tenant_id: TenantId, timeline_id: TimelineId, request: Dict[str, Any]
     ) -> Dict[str, Any]:
@@ -3312,232 +4559,143 @@
         assert isinstance(res, dict)
         return res
 
-    def http_client(self, auth_token: Optional[str] = None) -> SafekeeperHttpClient:
+    def http_client(
+        self, auth_token: Optional[str] = None, gen_sk_wide_token: bool = True
+    ) -> SafekeeperHttpClient:
+        """
+        When auth_token is None and gen_sk_wide_token is True, creates a
+        safekeeper-wide token, which is a reasonable default.
+        """
+        if auth_token is None and gen_sk_wide_token:
+            auth_token = self.env.auth_keys.generate_safekeeper_token()
         is_testing_enabled = '"testing"' in self.env.get_binary_version("safekeeper")
         return SafekeeperHttpClient(
             port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled
         )
 
-    def data_dir(self) -> str:
-        return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}")
+    def get_timeline_start_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
+        timeline_status = self.http_client().timeline_status(tenant_id, timeline_id)
+        timeline_start_lsn = timeline_status.timeline_start_lsn
+        log.info(f"sk {self.id} timeline start LSN: {timeline_start_lsn}")
+        return timeline_start_lsn
 
-    def timeline_dir(self, tenant_id, timeline_id) -> str:
-        return os.path.join(self.data_dir(), str(tenant_id), str(timeline_id))
+    def get_flush_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
+        timeline_status = self.http_client().timeline_status(tenant_id, timeline_id)
+        flush_lsn = timeline_status.flush_lsn
+        log.info(f"sk {self.id} flush LSN: {flush_lsn}")
+        return flush_lsn
+
+    def pull_timeline(
+        self, srcs: list[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId
+    ) -> Dict[str, Any]:
+        """
+        pull_timeline from srcs to self.
+        """
+        src_https = [f"http://localhost:{sk.port.http}" for sk in srcs]
+        res = self.http_client().pull_timeline(
+            {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id), "http_hosts": src_https}
+        )
+        src_ids = [sk.id for sk in srcs]
+        log.info(f"finished pulling timeline from {src_ids} to {self.id}")
+        return res
+
+    @property
+    def data_dir(self) -> Path:
+        return self.env.repo_dir / "safekeepers" / f"sk{self.id}"
+
+    def timeline_dir(self, tenant_id, timeline_id) -> Path:
+        return self.data_dir / str(tenant_id) / str(timeline_id)
+
+    # List partially uploaded segments of this safekeeper. Works only for
+    # RemoteStorageKind.LOCAL_FS.
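# A hedged usage sketch (assumes the env was built with LOCAL_FS safekeeper
# remote storage; tenant/timeline ids assumed):
#
#     segs = env.safekeepers[0].list_uploaded_segments(tenant_id, timeline_id)
#     log.info(f"partial segments uploaded by sk1: {segs}")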
+ def list_uploaded_segments(self, tenant_id: TenantId, timeline_id: TimelineId): + tline_path = ( + self.env.repo_dir + / "local_fs_remote_storage" + / "safekeeper" + / str(tenant_id) + / str(timeline_id) + ) + assert isinstance(self.env.safekeepers_remote_storage, LocalFsStorage) + segs = self._list_segments_in_dir( + tline_path, lambda name: ".metadata" not in name and ".___temp" not in name + ) + mysegs = [s for s in segs if f"sk{self.id}" in s] + return mysegs def list_segments(self, tenant_id, timeline_id) -> List[str]: """ Get list of segment names of the given timeline. """ tli_dir = self.timeline_dir(tenant_id, timeline_id) + return self._list_segments_in_dir( + tli_dir, lambda name: not name.startswith("safekeeper.control") + ) + + def _list_segments_in_dir(self, path: Path, keep_filter: Callable[[str], bool]) -> list[str]: segments = [] - for _, _, filenames in os.walk(tli_dir): - segments.extend([f for f in filenames if not f.startswith("safekeeper.control")]) + for _, _, filenames in os.walk(path): + segments.extend([f for f in filenames if keep_filter(f)]) segments.sort() return segments - -# Walreceiver as returned by sk's timeline status endpoint. -@dataclass -class Walreceiver: - conn_id: int - state: str - - -@dataclass -class SafekeeperTimelineStatus: - acceptor_epoch: int - pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 - flush_lsn: Lsn - commit_lsn: Lsn - timeline_start_lsn: Lsn - backup_lsn: Lsn - peer_horizon_lsn: Lsn - remote_consistent_lsn: Lsn - walreceivers: List[Walreceiver] - - -@dataclass -class SafekeeperMetrics: - # These are metrics from Prometheus which uses float64 internally. - # As a consequence, values may differ from real original int64s. - flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) - commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) - - -class SafekeeperHttpClient(requests.Session): - HTTPError = requests.HTTPError - - def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): - super().__init__() - self.port = port - self.auth_token = auth_token - self.is_testing_enabled = is_testing_enabled - - if auth_token is not None: - self.headers["Authorization"] = f"Bearer {auth_token}" - - def check_status(self): - self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() - - def is_testing_enabled_or_skip(self): - if not self.is_testing_enabled: - pytest.skip("safekeeper was built without 'testing' feature") - - def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): - self.is_testing_enabled_or_skip() - - if isinstance(config_strings, tuple): - pairs = [config_strings] - else: - pairs = config_strings - - log.info(f"Requesting config failpoints: {repr(pairs)}") - - res = self.put( - f"http://localhost:{self.port}/v1/failpoints", - json=[{"name": name, "actions": actions} for name, actions in pairs], - ) - log.info(f"Got failpoints request response code {res.status_code}") - res.raise_for_status() - res_json = res.json() - assert res_json is None - return res_json - - def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]: - params = params or {} - res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) - res.raise_for_status() - res_json = json.loads(res.text) - assert isinstance(res_json, dict) - return res_json - - def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]: - res = 
self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body) - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]): - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", - json=body, - ) - res.raise_for_status() - - def timeline_digest( - self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn - ) -> Dict[str, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest", - params={ - "from_lsn": str(from_lsn), - "until_lsn": str(until_lsn), - }, - ) - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def timeline_create( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - pg_version: int, # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 - commit_lsn: Lsn, + def checkpoint_up_to( + self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn, wait_wal_removal=True ): - body = { - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), - "pg_version": pg_version, - "commit_lsn": str(commit_lsn), - } - res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body) - res.raise_for_status() + """ + Assuming pageserver(s) uploaded to s3 up to `lsn`, + 1) wait for remote_consistent_lsn and wal_backup_lsn on safekeeper to reach it. + 2) checkpoint timeline on safekeeper, which should remove WAL before this LSN; optionally wait for that. + """ + cli = self.http_client() - def timeline_status( - self, tenant_id: TenantId, timeline_id: TimelineId - ) -> SafekeeperTimelineStatus: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") - res.raise_for_status() - resj = res.json() - walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] - return SafekeeperTimelineStatus( - acceptor_epoch=resj["acceptor_state"]["epoch"], - pg_version=resj["pg_info"]["pg_version"], - flush_lsn=Lsn(resj["flush_lsn"]), - commit_lsn=Lsn(resj["commit_lsn"]), - timeline_start_lsn=Lsn(resj["timeline_start_lsn"]), - backup_lsn=Lsn(resj["backup_lsn"]), - peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]), - remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]), - walreceivers=walreceivers, - ) + target_segment_file = lsn.segment_name() - def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): - res = self.post( - f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", - json=body, - ) - res.raise_for_status() - - # only_local doesn't remove segments in the remote storage. 
- def timeline_delete( - self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False - ) -> Dict[Any, Any]: - res = self.delete( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", - params={ - "only_local": str(only_local).lower(), - }, - ) - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]: - res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def get_metrics_str(self) -> str: - request_result = self.get(f"http://localhost:{self.port}/metrics") - request_result.raise_for_status() - return request_result.text - - def get_metrics(self) -> SafekeeperMetrics: - all_metrics_text = self.get_metrics_str() - - metrics = SafekeeperMetrics() - for match in re.finditer( - r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE, - ): - metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int( - match.group(3) + def are_segments_removed(): + segments = self.list_segments(tenant_id, timeline_id) + log.info( + f"waiting for all segments before {target_segment_file} to be removed from sk {self.id}, current segments: {segments}" ) - for match in re.finditer( - r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE, - ): - metrics.commit_lsn_inexact[ - (TenantId(match.group(1)), TimelineId(match.group(2))) - ] = int(match.group(3)) - return metrics + assert all(target_segment_file <= s for s in segments) + + def are_lsns_advanced(): + stat = cli.timeline_status(tenant_id, timeline_id) + log.info( + f"waiting for remote_consistent_lsn and backup_lsn on sk {self.id} to reach {lsn}, currently remote_consistent_lsn={stat.remote_consistent_lsn}, backup_lsn={stat.backup_lsn}" + ) + assert stat.remote_consistent_lsn >= lsn and stat.backup_lsn >= lsn.segment_lsn() + + # xxx: max wait is long because we might be waiting for reconnection from + # pageserver to this safekeeper + wait_until(30, 1, are_lsns_advanced) + cli.checkpoint(tenant_id, timeline_id) + if wait_wal_removal: + wait_until(30, 1, are_segments_removed) + + def wait_until_paused(self, failpoint: str): + msg = f"at failpoint {failpoint}" + + def paused(): + log.info(f"waiting for hitting failpoint {failpoint}") + self.assert_log_contains(msg) + + wait_until(20, 0.5, paused) -class S3Scrubber: - def __init__(self, log_dir: Path, env: NeonEnvBuilder): +# TODO: Replace with `StrEnum` when we upgrade to python 3.11 +class NodeKind(str, Enum): + PAGESERVER = "pageserver" + SAFEKEEPER = "safekeeper" + + +class StorageScrubber: + def __init__(self, env: NeonEnv, log_dir: Path): self.env = env self.log_dir = log_dir - def scrubber_cli(self, args: list[str], timeout) -> str: + def scrubber_cli( + self, args: list[str], timeout, extra_env: Optional[Dict[str, str]] = None + ) -> str: assert isinstance(self.env.pageserver_remote_storage, S3Storage) s3_storage = self.env.pageserver_remote_storage @@ -3552,9 +4710,16 @@ class S3Scrubber: if s3_storage.endpoint is not None: env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint}) - base_args = [str(self.env.neon_binpath / "s3_scrubber")] + if extra_env is not None: + env.update(extra_env) + + base_args = [ + str(self.env.neon_binpath / "storage_scrubber"), + 
f"--controller-api={self.env.storage_controller.api_root()}", + ] args = base_args + args + log.info(f"Invoking scrubber command {args} with env: {env}") (output_path, stdout, status_code) = subprocess_capture( self.log_dir, args, @@ -3570,18 +4735,88 @@ class S3Scrubber: log.warning(f"Scrub environment: {env}") log.warning(f"Output at: {output_path}") - raise RuntimeError("Remote storage scrub failed") + raise RuntimeError(f"Scrubber failed while running {args}") assert stdout is not None return stdout - def scan_metadata(self) -> Any: - stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30) + def scan_metadata_safekeeper( + self, + timeline_lsns: List[Dict[str, Any]], + cloud_admin_api_url: str, + cloud_admin_api_token: str, + ) -> Tuple[bool, Any]: + extra_env = { + "CLOUD_ADMIN_API_URL": cloud_admin_api_url, + "CLOUD_ADMIN_API_TOKEN": cloud_admin_api_token, + } + return self.scan_metadata( + node_kind=NodeKind.SAFEKEEPER, timeline_lsns=timeline_lsns, extra_env=extra_env + ) + def scan_metadata( + self, + post_to_storage_controller: bool = False, + node_kind: NodeKind = NodeKind.PAGESERVER, + timeline_lsns: Optional[List[Dict[str, Any]]] = None, + extra_env: Optional[Dict[str, str]] = None, + ) -> Tuple[bool, Any]: + """ + Returns the health status and the metadata summary. + """ + args = ["scan-metadata", "--node-kind", node_kind.value, "--json"] + if post_to_storage_controller: + args.append("--post") + if timeline_lsns is not None: + args.append("--timeline-lsns") + args.append(json.dumps(timeline_lsns)) + stdout = self.scrubber_cli(args, timeout=30, extra_env=extra_env) + + try: + summary = json.loads(stdout) + # summary does not contain "with_warnings" if node_kind is the safekeeper + no_warnings = "with_warnings" not in summary or not summary["with_warnings"] + healthy = not summary["with_errors"] and no_warnings + return healthy, summary + except: + log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:") + log.error(stdout) + raise + + def tenant_snapshot(self, tenant_id: TenantId, output_path: Path): + stdout = self.scrubber_cli( + ["tenant-snapshot", "--tenant-id", str(tenant_id), "--output-path", str(output_path)], + timeout=30, + ) + log.info(f"tenant-snapshot output: {stdout}") + + def pageserver_physical_gc( + self, + min_age_secs: int, + tenant_ids: Optional[list[TenantId]] = None, + mode: Optional[str] = None, + ): + args = ["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"] + + if tenant_ids is None: + tenant_ids = [] + + for tenant_id in tenant_ids: + args.extend(["--tenant-id", str(tenant_id)]) + + if mode is not None: + args.extend(["--mode", mode]) + + stdout = self.scrubber_cli( + args, + timeout=30, + ) try: return json.loads(stdout) except: - log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:") + log.error( + "Failed to decode JSON output from `pageserver-physical_gc`. Dumping stdout:" + ) log.error(stdout) raise @@ -3634,7 +4869,7 @@ def pytest_addoption(parser: Parser): SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] - r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql|conf)" + r"config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql|conf)" ) @@ -3663,7 +4898,23 @@ def test_output_dir( yield test_dir - allure_attach_from_dir(test_dir) + # Allure artifacts creation might involve the creation of `.tar.zst` archives, + # which aren't going to be used if Allure results collection is not enabled + # (i.e. --alluredir is not set). 
+ # Skip `allure_attach_from_dir` in this case + if not request.config.getoption("--alluredir"): + return + + preserve_database_files = False + for k, v in request.node.user_properties: + # NB: the neon_env_builder fixture uses this fixture (test_output_dir). + # So, neon_env_builder's cleanup runs before here. + # The cleanup propagates NeonEnvBuilder.preserve_database_files into this user property. + if k == "preserve_database_files": + assert isinstance(v, bool) + preserve_database_files = v + + allure_attach_from_dir(test_dir, preserve_database_files) class FileAndThreadLock: @@ -3840,33 +5091,35 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]: # pg is the existing and running compute node, that we want to compare with a basebackup -def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint): +def check_restored_datadir_content( + test_output_dir: Path, + env: NeonEnv, + endpoint: Endpoint, + ignored_files: Optional[list[str]] = None, +): + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) + # Get the timeline ID. We need it for the 'basebackup' command timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0]) - # many tests already checkpoint, but do it just in case - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("CHECKPOINT") - - # wait for pageserver to catch up - wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id) # stop postgres to ensure that files won't change endpoint.stop() + # Read the shutdown checkpoint's LSN + checkpoint_lsn = pg_bin.get_pg_controldata_checkpoint_lsn(endpoint.pg_data_dir_path()) + # Take a basebackup from pageserver restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir" restored_dir_path.mkdir(exist_ok=True) - pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) psql_path = os.path.join(pg_bin.pg_bin_path, "psql") - pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"] + pageserver_id = env.storage_controller.locate(endpoint.tenant_id)[0]["node_id"] cmd = rf""" {psql_path} \ --no-psqlrc \ postgres://localhost:{env.get_pageserver(pageserver_id).service_port.pg} \ - -c 'basebackup {endpoint.tenant_id} {timeline_id}' \ + -c 'basebackup {endpoint.tenant_id} {timeline_id} {checkpoint_lsn}' \ | tar -x -C {restored_dir_path} """ @@ -3885,8 +5138,21 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint # list files we're going to compare assert endpoint.pgdata_dir pgdata_files = list_files_to_compare(Path(endpoint.pgdata_dir)) + restored_files = list_files_to_compare(restored_dir_path) + if pgdata_files != restored_files: + # filter pg_xact and multixact files which are downloaded on demand + pgdata_files = [ + f + for f in pgdata_files + if not f.startswith("pg_xact") and not f.startswith("pg_multixact") + ] + + if ignored_files: + pgdata_files = [f for f in pgdata_files if f not in ignored_files] + restored_files = [f for f in restored_files if f not in ignored_files] + # check that file sets are equal assert pgdata_files == restored_files @@ -3902,19 +5168,19 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint for f in mismatch: f1 = os.path.join(endpoint.pgdata_dir, f) f2 = os.path.join(restored_dir_path, f) - stdout_filename = "{}.filediff".format(f2) + stdout_filename = f"{f2}.filediff" with open(stdout_filename, "w") as stdout_f: - subprocess.run("xxd -b {} > {}.hex ".format(f1, f1), shell=True) - 
subprocess.run("xxd -b {} > {}.hex ".format(f2, f2), shell=True) + subprocess.run(f"xxd -b {f1} > {f1}.hex ", shell=True) + subprocess.run(f"xxd -b {f2} > {f2}.hex ", shell=True) - cmd = "diff {}.hex {}.hex".format(f1, f2) + cmd = f"diff {f1}.hex {f2}.hex" subprocess.run([cmd], stdout=stdout_f, shell=True) assert (mismatch, error) == ([], []) -def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) -> Lsn: +def logical_replication_sync(subscriber: PgProtocol, publisher: PgProtocol) -> Lsn: """Wait logical replication subscriber to sync with publisher.""" publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) while True: @@ -3931,7 +5197,7 @@ def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) - def tenant_get_shards( - env: NeonEnv, tenant_id: TenantId, pageserver_id: Optional[int] + env: NeonEnv, tenant_id: TenantId, pageserver_id: Optional[int] = None ) -> list[tuple[TenantShardId, NeonPageserver]]: """ Helper for when you want to talk to one or more pageservers, and the @@ -3939,7 +5205,7 @@ def tenant_get_shards( us to figure out the shards for a tenant. If the caller provides `pageserver_id`, it will be used for all shards, even - if the shard is indicated by attachment service to be on some other pageserver. + if the shard is indicated by storage controller to be on some other pageserver. Caller should over the response to apply their per-pageserver action to each shard @@ -3955,19 +5221,46 @@ def tenant_get_shards( TenantShardId.parse(s["shard_id"]), override_pageserver or env.get_pageserver(s["node_id"]), ) - for s in env.attachment_service.locate(tenant_id) + for s in env.storage_controller.locate(tenant_id) ] else: # Assume an unsharded tenant return [(TenantShardId(tenant_id, 0, 0), override_pageserver or env.pageserver)] +def wait_replica_caughtup(primary: Endpoint, secondary: Endpoint): + primary_lsn = Lsn( + primary.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()", log_query=False) + ) + while True: + secondary_lsn = Lsn( + secondary.safe_psql_scalar("SELECT pg_last_wal_replay_lsn()", log_query=False) + ) + caught_up = secondary_lsn >= primary_lsn + log.info(f"caughtup={caught_up}, primary_lsn={primary_lsn}, secondary_lsn={secondary_lsn}") + if caught_up: + return + time.sleep(1) + + +def log_replica_lag(primary: Endpoint, secondary: Endpoint): + last_replay_lsn = Lsn( + secondary.safe_psql_scalar("SELECT pg_last_wal_replay_lsn()", log_query=False) + ) + primary_lsn = Lsn( + primary.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()", log_query=False) + ) + lag = primary_lsn - last_replay_lsn + log.info(f"primary_lsn={primary_lsn}, replay_lsn={last_replay_lsn}, lag={lag}") + + def wait_for_last_flush_lsn( env: NeonEnv, endpoint: Endpoint, tenant: TenantId, timeline: TimelineId, pageserver_id: Optional[int] = None, + auth_token: Optional[str] = None, ) -> Lsn: """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" @@ -3981,7 +5274,7 @@ def wait_for_last_flush_lsn( f"wait_for_last_flush_lsn: waiting for {last_flush_lsn} on shard {tenant_shard_id} on pageserver {pageserver.id})" ) waited = wait_for_last_record_lsn( - pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn + pageserver.http_client(auth_token=auth_token), tenant_shard_id, timeline, last_flush_lsn ) assert waited >= last_flush_lsn @@ -3991,6 +5284,49 @@ def wait_for_last_flush_lsn( return min(results) +def flush_ep_to_pageserver( + env: NeonEnv, + ep: Endpoint, + tenant: TenantId, + 
+    timeline: TimelineId,
+    pageserver_id: Optional[int] = None,
+) -> Lsn:
+    """
+    Stop the endpoint and wait until all committed WAL reaches the pageserver
+    (last_record_lsn). This is for use by tests which want everything written so
+    far to reach the pageserver *and* expect that no more data will arrive until
+    the endpoint starts again; so, unlike wait_for_last_flush_lsn, it polls
+    safekeepers instead of compute to learn the LSN.
+
+    Returns the catch-up LSN.
+    """
+    ep.stop()
+
+    commit_lsn: Lsn = Lsn(0)
+    # In principle, in the absence of failures, polling a single sk would be enough.
+    for sk in env.safekeepers:
+        cli = sk.http_client()
+        # wait until compute connections are gone
+        wait_until(30, 0.5, partial(are_walreceivers_absent, cli, tenant, timeline))
+        commit_lsn = max(cli.get_commit_lsn(tenant, timeline), commit_lsn)
+
+    # Note: depending on the WAL filtering implementation, most shards probably
+    # won't be able to reach commit_lsn (unless gaps are also ack'ed), so this
+    # is broken in the sharded case.
+    shards = tenant_get_shards(env, tenant, pageserver_id)
+    for tenant_shard_id, pageserver in shards:
+        log.info(
+            f"flush_ep_to_pageserver: waiting for {commit_lsn} on shard {tenant_shard_id} on pageserver {pageserver.id})"
+        )
+        waited = wait_for_last_record_lsn(
+            pageserver.http_client(), tenant_shard_id, timeline, commit_lsn
+        )
+
+        assert waited >= commit_lsn
+
+    return commit_lsn
+
+
 def wait_for_wal_insert_lsn(
     env: NeonEnv,
     endpoint: Endpoint,
@@ -4028,12 +5364,77 @@ def fork_at_current_lsn(
     return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn)
 
 
+def import_timeline_from_vanilla_postgres(
+    test_output_dir: Path,
+    env: NeonEnv,
+    pg_bin: PgBin,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    branch_name: str,
+    vanilla_pg_connstr: str,
+):
+    """
+    Create a new timeline by importing an existing PostgreSQL cluster.
+
+    This works by taking a physical backup of the running PostgreSQL cluster, and importing that.
+    """
+
+    # Take a backup of the existing PostgreSQL server with pg_basebackup
+    basebackup_dir = os.path.join(test_output_dir, "basebackup")
+    base_tar = os.path.join(basebackup_dir, "base.tar")
+    wal_tar = os.path.join(basebackup_dir, "pg_wal.tar")
+    os.mkdir(basebackup_dir)
+    pg_bin.run(
+        [
+            "pg_basebackup",
+            "-F",
+            "tar",
+            "-d",
+            vanilla_pg_connstr,
+            "-D",
+            basebackup_dir,
+        ]
+    )
+
+    # Extract start_lsn and end_lsn from the backup manifest file
+    with open(os.path.join(basebackup_dir, "backup_manifest")) as f:
+        manifest = json.load(f)
+        start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"]
+        end_lsn = manifest["WAL-Ranges"][0]["End-LSN"]
+
+    # Import the backup tarballs into the pageserver
+    env.neon_cli.raw_cli(
+        [
+            "timeline",
+            "import",
+            "--tenant-id",
+            str(tenant_id),
+            "--timeline-id",
+            str(timeline_id),
+            "--branch-name",
+            branch_name,
+            "--base-lsn",
+            start_lsn,
+            "--base-tarfile",
+            base_tar,
+            "--end-lsn",
+            end_lsn,
+            "--wal-tarfile",
+            wal_tar,
+            "--pg-version",
+            env.pg_version,
+        ]
+    )
+    wait_for_last_record_lsn(env.pageserver.http_client(), tenant_id, timeline_id, Lsn(end_lsn))
+
+
 def last_flush_lsn_upload(
     env: NeonEnv,
     endpoint: Endpoint,
     tenant_id: TenantId,
     timeline_id: TimelineId,
     pageserver_id: Optional[int] = None,
+    auth_token: Optional[str] = None,
 ) -> Lsn:
     """
-    Wait for pageserver to catch to the latest flush LSN of given endpoint,
+    Wait for pageserver to catch up to the latest flush LSN of the given endpoint,
@@ -4041,15 +5442,13 @@ def last_flush_lsn_upload(
     reaching flush LSN).
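+
+    A sketch of typical usage (assuming a running `env` with an active
+    `endpoint`; names are illustrative):
+
+        lsn = last_flush_lsn_upload(env, endpoint, env.initial_tenant, env.initial_timeline)
+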
""" last_flush_lsn = wait_for_last_flush_lsn( - env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver_id + env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver_id, auth_token=auth_token ) shards = tenant_get_shards(env, tenant_id, pageserver_id) for tenant_shard_id, pageserver in shards: - ps_http = pageserver.http_client() + ps_http = pageserver.http_client(auth_token=auth_token) wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, last_flush_lsn) - # force a checkpoint to trigger upload - ps_http.timeline_checkpoint(tenant_shard_id, timeline_id) - wait_for_upload(ps_http, tenant_shard_id, timeline_id, last_flush_lsn) + ps_http.timeline_checkpoint(tenant_shard_id, timeline_id, wait_until_uploaded=True) return last_flush_lsn @@ -4064,3 +5463,75 @@ def parse_project_git_version_output(s: str) -> str: return commit raise ValueError(f"unable to parse --version output: '{s}'") + + +def generate_uploads_and_deletions( + env: NeonEnv, + *, + init: bool = True, + tenant_id: Optional[TenantId] = None, + timeline_id: Optional[TimelineId] = None, + data: Optional[str] = None, + pageserver: NeonPageserver, +): + """ + Using the environment's default tenant + timeline, generate a load pattern + that results in some uploads and some deletions to remote storage. + """ + + if tenant_id is None: + tenant_id = env.initial_tenant + assert tenant_id is not None + + if timeline_id is None: + timeline_id = env.initial_timeline + assert timeline_id is not None + + ps_http = pageserver.http_client() + + with env.endpoints.create_start( + "main", tenant_id=tenant_id, pageserver_id=pageserver.id + ) as endpoint: + if init: + endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + last_flush_lsn_upload( + env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + ) + + def churn(data): + endpoint.safe_psql_many( + [ + f""" + INSERT INTO foo (id, val) + SELECT g, '{data}' + FROM generate_series(1, 200) g + ON CONFLICT (id) DO UPDATE + SET val = EXCLUDED.val + """, + # to ensure that GC can actually remove some layers + "VACUUM foo", + ] + ) + assert tenant_id is not None + assert timeline_id is not None + # We are waiting for uploads as well as local flush, in order to avoid leaving the system + # in a state where there are "future layers" in remote storage that will generate deletions + # after a restart. + last_flush_lsn_upload( + env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + ) + + # Compaction should generate some GC-elegible layers + for i in range(0, 2): + churn(f"{i if data is None else data}") + + gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0) + print_gc_result(gc_result) + assert gc_result["layers_removed"] > 0 + + # Stop endpoint and flush all data to pageserver, then checkpoint it: this + # ensures that the pageserver is in a fully idle state: there will be no more + # background ingest, no more uploads pending, and therefore no non-determinism + # in subsequent actions like pageserver restarts. 
+        flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id)
+        ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index 74c6bddf23..f8d9a51c91 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -52,10 +52,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     ".*Error processing HTTP request: Forbidden",
     # intentional failpoints
     ".*failpoint ",
-    # FIXME: These need investigation
-    ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*",
-    ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*",
-    ".*Removing intermediate uninit mark file.*",
     # Tenant::delete_timeline() can cause any of the four following errors.
     # FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946
     ".*could not flush frozen layer.*queue is in state Stopped",  # when schedule layer upload fails because queued got closed before compaction got killed
@@ -67,10 +63,11 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     ".*query handler for 'pagestream.*failed: Timeline .* is not active",  # timeline delete in progress
     ".*task iteration took longer than the configured period.*",
     # these can happen anytime we do compactions from background task and shutdown pageserver
-    r".*ERROR.*ancestor timeline \S+ is being stopped",
+    ".*could not compact.*cancelled.*",
     # this is expected given our collaborative shutdown approach for the UploadQueue
     ".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*",
     ".*Compaction failed.*, retrying in .*: ShuttingDown",
+    ".*Compaction failed.*, retrying in .*: Other\\(timeline shutting down.*",
     # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
     ".*Error processing HTTP request: NotFound: Timeline .* was not found",
     ".*took more than expected to complete.*",
@@ -82,9 +79,39 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     # During shutdown, DownloadError::Cancelled may be logged as an error. Cleaning this
     # up is tracked in https://github.com/neondatabase/neon/issues/6096
     ".*Cancelled, shutting down.*",
+    # Open layers are only rolled at Lsn boundaries to avoid name clashes.
+    # Hence, we can overshoot the soft limit set by checkpoint distance.
+    # This is especially pronounced in tests that set small checkpoint
+    # distances.
+    ".*Flushed oversized open layer with size.*",
+    # During teardown, we stop the storage controller before the pageservers, so pageservers
+    # can experience connection errors doing background deletion queue work.
+    ".*WARN deletion backend: calling control plane generation validation API failed.*error sending request.*",
+    # Can happen when the test shuts down the storage controller while it is calling the utilization API
+    ".*WARN.*path=/v1/utilization .*request was dropped before completing",
+    # Can happen during shutdown
+    ".*scheduling deletion on drop failed: queue is in state Stopped.*",
 )
 
 
+DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
+    # Many tests will take pageservers offline, resulting in log warnings on the controller
+    # failing to connect to them.
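+    # (Both spellings below occur in practice, presumably depending on how the
+    # underlying HTTP client formats the failure.)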
+    ".*Call to node.*management API.*failed.*receive body.*",
+    ".*Call to node.*management API.*failed.*ReceiveBody.*",
+    ".*Failed to update node .+ after heartbeat round.*error sending request for url.*",
+    # Many tests will start up with a node offline
+    ".*startup_reconcile: Could not scan node.*",
+    # Tests run in dev mode
+    ".*Starting in dev mode.*",
+    # Tests that stop endpoints & use the storage controller's neon_local notification
+    # mechanism might fail (neon_local's stopping an endpoint isn't atomic wrt the storage
+    # controller's attempts to notify the endpoint).
+    ".*reconciler.*neon_local notification hook failed.*",
+    ".*reconciler.*neon_local error.*",
+]
+
+
 def _check_allowed_errors(input):
     allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS)
 
@@ -110,9 +137,10 @@ if __name__ == "__main__":
         "-i",
         "--input",
         type=argparse.FileType("r"),
-        default=sys.stdin,
-        help="Pageserver logs file. Reads from stdin if no file is provided.",
+        help="Pageserver logs file. Use '-' for stdin.",
+        required=True,
     )
+
     args = parser.parse_args()
 
     errors = _check_allowed_errors(args.input)
diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/common_types.py
similarity index 55%
rename from test_runner/fixtures/pageserver/types.py
rename to test_runner/fixtures/pageserver/common_types.py
index 72fa30a2f2..a6c327a8a0 100644
--- a/test_runner/fixtures/pageserver/types.py
+++ b/test_runner/fixtures/pageserver/common_types.py
@@ -1,7 +1,8 @@
+import re
 from dataclasses import dataclass
 from typing import Any, Dict, Tuple, Union
 
-from fixtures.types import KEY_MAX, KEY_MIN, Key, Lsn
+from fixtures.common_types import KEY_MAX, KEY_MIN, Key, Lsn
 
 
 @dataclass
@@ -11,7 +12,7 @@ class IndexLayerMetadata:
 
 
 @dataclass(frozen=True)
-class ImageLayerFileName:
+class ImageLayerName:
     lsn: Lsn
     key_start: Key
     key_end: Key
@@ -25,7 +26,7 @@ class ImageLayerFileName:
 
 
 @dataclass(frozen=True)
-class DeltaLayerFileName:
+class DeltaLayerName:
     lsn_start: Lsn
     lsn_end: Lsn
     key_start: Key
@@ -40,65 +41,57 @@ class DeltaLayerFileName:
         return ret
 
 
-LayerFileName = Union[ImageLayerFileName, DeltaLayerFileName]
+LayerName = Union[ImageLayerName, DeltaLayerName]
 
 
 class InvalidFileName(Exception):
     pass
 
 
+IMAGE_LAYER_FILE_NAME = re.compile(
+    "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})(-v1-[a-f0-9]{8})?$"
+)
+
+
 def parse_image_layer(f_name: str) -> Tuple[int, int, int]:
     """Parse an image layer file name. Return key start, key end, and snapshot lsn"""
-    parts = f_name.split("__")
-    if len(parts) != 2:
-        raise InvalidFileName(f"expecting two parts separated by '__', got: {parts}")
-    key_parts = parts[0].split("-")
-    if len(key_parts) != 2:
-        raise InvalidFileName(
-            f"expecting two key parts separated by '--' in parts[0], got: {key_parts}"
-        )
-    try:
-        return int(key_parts[0], 16), int(key_parts[1], 16), int(parts[1], 16)
-    except ValueError as e:
-        raise InvalidFileName(f"conversion error: {f_name}") from e
+
+    match = IMAGE_LAYER_FILE_NAME.match(f_name)
+    if match is None:
+        raise InvalidFileName(f"'{f_name}' is not an image layer filename")
+
+    return int(match.group(1), 16), int(match.group(2), 16), int(match.group(3), 16)
+
+
+DELTA_LAYER_FILE_NAME = re.compile(
+    "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})-([A-F0-9]{16})(-v1-[a-f0-9]{8})?$"
+)
 
 
 def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]:
     """Parse a delta layer file name.
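+    An illustrative name this accepts (synthetic values):
+    "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001000000-0000000002000000"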
     Return key start, key end, lsn start, and lsn end"""
-    parts = f_name.split("__")
-    if len(parts) != 2:
-        raise InvalidFileName(f"expecting two parts separated by '__', got: {parts}")
-    key_parts = parts[0].split("-")
-    if len(key_parts) != 2:
-        raise InvalidFileName(
-            f"expecting two key parts separated by '--' in parts[0], got: {key_parts}"
-        )
-    lsn_parts = parts[1].split("-")
-    if len(lsn_parts) != 2:
-        raise InvalidFileName(
-            f"expecting two lsn parts separated by '--' in parts[1], got: {lsn_parts}"
-        )
-    try:
-        return (
-            int(key_parts[0], 16),
-            int(key_parts[1], 16),
-            int(lsn_parts[0], 16),
-            int(lsn_parts[1], 16),
-        )
-    except ValueError as e:
-        raise InvalidFileName(f"conversion error: {f_name}") from e
+    match = DELTA_LAYER_FILE_NAME.match(f_name)
+    if match is None:
+        raise InvalidFileName(f"'{f_name}' is not a delta layer filename")
+
+    return (
+        int(match.group(1), 16),
+        int(match.group(2), 16),
+        int(match.group(3), 16),
+        int(match.group(4), 16),
+    )
 
 
-def parse_layer_file_name(file_name: str) -> LayerFileName:
+def parse_layer_file_name(file_name: str) -> LayerName:
     try:
         key_start, key_end, lsn = parse_image_layer(file_name)
-        return ImageLayerFileName(lsn=Lsn(lsn), key_start=Key(key_start), key_end=Key(key_end))
+        return ImageLayerName(lsn=Lsn(lsn), key_start=Key(key_start), key_end=Key(key_end))
     except InvalidFileName:
         pass
 
     try:
         key_start, key_end, lsn_start, lsn_end = parse_delta_layer(file_name)
-        return DeltaLayerFileName(
+        return DeltaLayerName(
             lsn_start=Lsn(lsn_start),
             lsn_end=Lsn(lsn_end),
             key_start=Key(key_start),
@@ -110,18 +103,15 @@ def parse_layer_file_name(file_name: str) -> LayerName:
     raise InvalidFileName("neither image nor delta layer")
 
 
-def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn):
+def is_future_layer(layer_file_name: LayerName, disk_consistent_lsn: Lsn):
     """
-    Determines if this layer file is considered to be in future meaning we will discard
-    these layers during timeline initialization from the given disk_consistent_lsn.
+    Determines whether this layer file is considered to be in the future, meaning we will
+    discard it during timeline initialization from the given disk_consistent_lsn.
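+
+    Judging by the checks below, lsn_end is exclusive: a delta layer with
+    lsn_end == disk_consistent_lsn + 1 still counts as on-disk, not future.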
""" - if ( - isinstance(layer_file_name, ImageLayerFileName) - and layer_file_name.lsn > disk_consistent_lsn - ): + if isinstance(layer_file_name, ImageLayerName) and layer_file_name.lsn > disk_consistent_lsn: return True elif ( - isinstance(layer_file_name, DeltaLayerFileName) + isinstance(layer_file_name, DeltaLayerName) and layer_file_name.lsn_end > disk_consistent_lsn + 1 ): return True @@ -131,7 +121,7 @@ def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn): @dataclass class IndexPartDump: - layer_metadata: Dict[LayerFileName, IndexLayerMetadata] + layer_metadata: Dict[LayerName, IndexLayerMetadata] disk_consistent_lsn: Lsn @classmethod diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 340cc9e9e3..582f9c0264 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -1,19 +1,19 @@ from __future__ import annotations -import json import time from collections import defaultdict from dataclasses import dataclass +from datetime import datetime from typing import Any, Dict, List, Optional, Set, Tuple, Union import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineArchivalState, TimelineId from fixtures.log_helper import log -from fixtures.metrics import Metrics, parse_metrics +from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pg_version import PgVersion -from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import Fn @@ -33,7 +33,7 @@ class TimelineCreate406(PageserverApiException): class TimelineCreate409(PageserverApiException): def __init__(self, res: requests.Response): assert res.status_code == 409 - super().__init__("", res.status_code) + super().__init__(res.json()["msg"], res.status_code) @dataclass @@ -55,20 +55,32 @@ class InMemoryLayerInfo: class HistoricLayerInfo: kind: str layer_file_name: str - layer_file_size: Optional[int] + layer_file_size: int lsn_start: str lsn_end: Optional[str] remote: bool + # None for image layers, true if pageserver thinks this is an L0 delta layer + l0: Optional[bool] + visible: bool @classmethod def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo: + # instead of parsing the key range lets keep the definition of "L0" in pageserver + l0_ness = d.get("l0") + assert l0_ness is None or isinstance(l0_ness, bool) + + size = d["layer_file_size"] + assert isinstance(size, int) + return HistoricLayerInfo( kind=d["kind"], layer_file_name=d["layer_file_name"], - layer_file_size=d.get("layer_file_size"), + layer_file_size=size, lsn_start=d["lsn_start"], lsn_end=d.get("lsn_end"), remote=d["remote"], + l0=l0_ness, + visible=d["access_stats"]["visible"], ) @@ -107,6 +119,9 @@ class LayerMapInfo: def image_layers(self) -> List[HistoricLayerInfo]: return [x for x in self.historic_layers if x.kind == "Image"] + def delta_l0_layers(self) -> List[HistoricLayerInfo]: + return [x for x in self.historic_layers if x.kind == "Delta" and x.l0] + def historic_by_name(self) -> Set[str]: return set(x.layer_file_name for x in self.historic_layers) @@ -124,7 +139,7 @@ class TenantConfig: ) -class PageserverHttpClient(requests.Session): +class PageserverHttpClient(requests.Session, MetricsGetter): def __init__( self, port: int, @@ -162,6 +177,21 @@ class PageserverHttpClient(requests.Session): if auth_token is not None: self.headers["Authorization"] = f"Bearer {auth_token}" + def 
without_status_retrying(self) -> PageserverHttpClient: + retries = Retry( + status=0, + connect=5, + read=False, + backoff_factor=0.2, + status_forcelist=[], + allowed_methods=None, + remove_headers_on_redirect=[], + ) + + return PageserverHttpClient( + self.port, self.is_testing_enabled_or_skip, self.auth_token, retries + ) + @property def base_url(self) -> str: return f"http://localhost:{self.port}" @@ -210,71 +240,34 @@ class PageserverHttpClient(requests.Session): assert isinstance(res_json, list) return res_json - def tenant_create( - self, - new_tenant_id: Union[TenantId, TenantShardId], - conf: Optional[Dict[str, Any]] = None, - generation: Optional[int] = None, - ) -> TenantId: - if conf is not None: - assert "new_tenant_id" not in conf.keys() - - body: Dict[str, Any] = { - "new_tenant_id": str(new_tenant_id), - **(conf or {}), - } - - if generation is not None: - body.update({"generation": generation}) - - res = self.post( - f"http://localhost:{self.port}/v1/tenant", - json=body, - ) - self.verbose_error(res) - if res.status_code == 409: - raise Exception(f"could not create tenant: already exists for id {new_tenant_id}") - new_tenant_id = res.json() - assert isinstance(new_tenant_id, str) - return TenantId(new_tenant_id) - def tenant_attach( self, tenant_id: Union[TenantId, TenantShardId], + generation: int, config: None | Dict[str, Any] = None, - config_null: bool = False, - generation: Optional[int] = None, ): - if config_null: - assert config is None - body: Any = None - else: - # null-config is prohibited by the API - config = config or {} - body = {"config": config} - if generation is not None: - body.update({"generation": generation}) + config = config or {} - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach", - data=json.dumps(body), - headers={"Content-Type": "application/json"}, + return self.tenant_location_conf( + tenant_id, + location_conf={ + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": config, + "generation": generation, + }, ) - self.verbose_error(res) - def tenant_detach(self, tenant_id: TenantId, detach_ignored=False, timeout_secs=None): - params = {} - if detach_ignored: - params["detach_ignored"] = "true" - - kwargs = {} - if timeout_secs is not None: - kwargs["timeout"] = timeout_secs - - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params, **kwargs + def tenant_detach(self, tenant_id: TenantId): + return self.tenant_location_conf( + tenant_id, + location_conf={ + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, ) - self.verbose_error(res) def tenant_reset(self, tenant_id: Union[TenantId, TenantShardId], drop_cache: bool): params = {} @@ -285,40 +278,65 @@ class PageserverHttpClient(requests.Session): self.verbose_error(res) def tenant_location_conf( - self, tenant_id: Union[TenantId, TenantShardId], location_conf=dict[str, Any], flush_ms=None + self, + tenant_id: Union[TenantId, TenantShardId], + location_conf=dict[str, Any], + flush_ms=None, + lazy: Optional[bool] = None, ): body = location_conf.copy() - body["tenant_id"] = str(tenant_id) params = {} if flush_ms is not None: params["flush_ms"] = str(flush_ms) + if lazy is not None: + params["lazy"] = "true" if lazy else "false" + res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config", json=body, params=params, ) self.verbose_error(res) + return res.json() + + def tenant_list_locations(self): + res = self.get( + 
f"http://localhost:{self.port}/v1/location_config", + ) + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json["tenant_shards"], list) + return res_json + + def tenant_get_location(self, tenant_id: TenantShardId): + res = self.get( + f"http://localhost:{self.port}/v1/location_config/{tenant_id}", + ) + self.verbose_error(res) + return res.json() def tenant_delete(self, tenant_id: Union[TenantId, TenantShardId]): res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") self.verbose_error(res) return res - def tenant_load(self, tenant_id: TenantId, generation=None): - body = None - if generation is not None: - body = {"generation": generation} - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load", json=body) - self.verbose_error(res) + def tenant_status( + self, tenant_id: Union[TenantId, TenantShardId], activate: bool = False + ) -> Dict[Any, Any]: + """ + :activate: hint the server not to accelerate activation of this tenant in response + to this query. False by default for tests, because they generally want to observed the + system rather than interfering with it. This is true by default on the server side, + because in the field if the control plane is GET'ing a tenant it's a sign that it wants + to do something with it. + """ + params = {} + if not activate: + params["activate"] = "false" - def tenant_ignore(self, tenant_id: TenantId): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore") - self.verbose_error(res) - - def tenant_status(self, tenant_id: Union[TenantId, TenantShardId]) -> Dict[Any, Any]: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}", params=params) self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) @@ -333,9 +351,21 @@ class PageserverHttpClient(requests.Session): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload") self.verbose_error(res) - def tenant_secondary_download(self, tenant_id: Union[TenantId, TenantShardId]): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download") + def tenant_secondary_download( + self, tenant_id: Union[TenantId, TenantShardId], wait_ms: Optional[int] = None + ) -> tuple[int, dict[Any, Any]]: + url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download" + if wait_ms is not None: + url = url + f"?wait_ms={wait_ms}" + res = self.post(url) self.verbose_error(res) + return (res.status_code, res.json()) + + def tenant_secondary_status(self, tenant_id: Union[TenantId, TenantShardId]): + url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/status" + res = self.get(url) + self.verbose_error(res) + return res.json() def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]): assert "tenant_id" not in config.keys() @@ -389,6 +419,28 @@ class PageserverHttpClient(requests.Session): ) return res.text + def tenant_time_travel_remote_storage( + self, + tenant_id: Union[TenantId, TenantShardId], + timestamp: datetime, + done_if_after: datetime, + shard_counts: Optional[List[int]] = None, + ): + """ + Issues a request to perform time travel operations on the remote storage + """ + + if shard_counts is None: + shard_counts = [] + body: Dict[str, Any] = { + "shard_counts": shard_counts, + } + res = self.put( + 
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/time_travel_remote_storage?travel_to={timestamp.isoformat()}Z&done_if_after={done_if_after.isoformat()}Z", + json=body, + ) + self.verbose_error(res) + def timeline_list( self, tenant_id: Union[TenantId, TenantShardId], @@ -512,16 +564,41 @@ class PageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json + def timeline_block_gc(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/block_gc", + ) + log.info(f"Got GC request response code: {res.status_code}") + self.verbose_error(res) + + def timeline_unblock_gc( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/unblock_gc", + ) + log.info(f"Got GC request response code: {res.status_code}") + self.verbose_error(res) + def timeline_compact( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, force_repartition=False, + force_image_layer_creation=False, + wait_until_uploaded=False, + enhanced_gc_bottom_most_compaction=False, ): self.is_testing_enabled_or_skip() query = {} if force_repartition: query["force_repartition"] = "true" + if force_image_layer_creation: + query["force_image_layer_creation"] = "true" + if wait_until_uploaded: + query["wait_until_uploaded"] = "true" + if enhanced_gc_bottom_most_compaction: + query["enhanced_gc_bottom_most_compaction"] = "true" log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") res = self.put( @@ -544,22 +621,53 @@ class PageserverHttpClient(requests.Session): ) self.verbose_error(res) + def timeline_archival_config( + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + state: TimelineArchivalState, + ): + config = {"state": state.value} + log.info( + f"requesting timeline archival config {config} for tenant {tenant_id} and timeline {timeline_id}" + ) + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/archival_config", + json=config, + ) + self.verbose_error(res) + def timeline_get_lsn_by_timestamp( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, - timestamp, - version: Optional[int] = None, + timestamp: datetime, + with_lease: bool = False, + **kwargs, ): log.info( - f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}" + f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}, {with_lease=}" ) - if version is None: - version_str = "" - else: - version_str = f"&version={version}" + with_lease_query = f"{with_lease=}".lower() res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}{version_str}", + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z&{with_lease_query}", + **kwargs, + ) + self.verbose_error(res) + res_json = res.json() + return res_json + + def timeline_lsn_lease( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn + ): + data = { + "lsn": str(lsn), + } + + log.info(f"Requesting lsn lease for {lsn=}, {tenant_id=}, {timeline_id=}") + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/lsn_lease", + json=data, ) 
self.verbose_error(res) res_json = res.json() @@ -576,21 +684,44 @@ class PageserverHttpClient(requests.Session): res_json = res.json() return res_json + def timeline_layer_map_info( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ): + log.info(f"Requesting layer map info of tenant {tenant_id}, timeline {timeline_id}") + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer", + ) + self.verbose_error(res) + res_json = res.json() + return res_json + def timeline_checkpoint( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, force_repartition=False, + force_image_layer_creation=False, + wait_until_uploaded=False, + compact: Optional[bool] = None, + **kwargs, ): self.is_testing_enabled_or_skip() query = {} if force_repartition: query["force_repartition"] = "true" + if force_image_layer_creation: + query["force_image_layer_creation"] = "true" + if wait_until_uploaded: + query["wait_until_uploaded"] = "true" + + if compact is not None: + query["compact"] = "true" if compact else "false" log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint", params=query, + **kwargs, ) log.info(f"Got checkpoint request response code: {res.status_code}") self.verbose_error(res) @@ -684,71 +815,33 @@ class PageserverHttpClient(requests.Session): }, ).value - def get_remote_timeline_client_metric( + def get_remote_timeline_client_queue_count( self, - metric_name: str, tenant_id: TenantId, timeline_id: TimelineId, file_kind: str, op_kind: str, - ) -> Optional[float]: - metrics = self.get_metrics() - matches = metrics.query_all( - name=metric_name, + ) -> Optional[int]: + metrics = [ + "pageserver_remote_timeline_client_calls_started_total", + "pageserver_remote_timeline_client_calls_finished_total", + ] + res = self.get_metrics_values( + metrics, filter={ "tenant_id": str(tenant_id), "timeline_id": str(timeline_id), "file_kind": str(file_kind), "op_kind": str(op_kind), }, + absence_ok=True, ) - if len(matches) == 0: - value = None - elif len(matches) == 1: - value = matches[0].value - assert value is not None - else: - assert len(matches) < 2, "above filter should uniquely identify metric" - return value - - def get_metric_value( - self, name: str, filter: Optional[Dict[str, str]] = None - ) -> Optional[float]: - metrics = self.get_metrics() - results = metrics.query_all(name, filter=filter) - if not results: - log.info(f'could not find metric "{name}"') + if len(res) != 2: return None - assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}" - return results[0].value - - def get_metrics_values( - self, names: list[str], filter: Optional[Dict[str, str]] = None - ) -> Dict[str, float]: - """ - When fetching multiple named metrics, it is more efficient to use this - than to call `get_metric_value` repeatedly. - - Throws RuntimeError if no metrics matching `names` are found, or if - not all of `names` are found: this method is intended for loading sets - of metrics whose existence is coupled. 
- """ - metrics = self.get_metrics() - samples = [] - for name in names: - samples.extend(metrics.query_all(name, filter=filter)) - - result = {} - for sample in samples: - if sample.name in result: - raise RuntimeError(f"Multiple values found for {sample.name}") - result[sample.name] = sample.value - - if len(result) != len(names): - log.info(f"Metrics found: {metrics.metrics}") - raise RuntimeError(f"could not find all metrics {' '.join(names)}") - - return result + inc, dec = [res[metric] for metric in metrics] + queue_count = int(inc) - int(dec) + assert queue_count >= 0 + return queue_count def layer_map_info( self, @@ -780,6 +873,25 @@ class PageserverHttpClient(requests.Session): continue self.download_layer(tenant_id, timeline_id, layer.layer_file_name) + def detach_ancestor( + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + batch_size: int | None = None, + **kwargs, + ) -> Set[TimelineId]: + params = {} + if batch_size is not None: + params["batch_size"] = batch_size + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor", + params=params, + **kwargs, + ) + self.verbose_error(res) + json = res.json() + return set(map(TimelineId, json["reparented_timelines"])) + def evict_layer( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str ): @@ -821,3 +933,46 @@ class PageserverHttpClient(requests.Session): self.put( f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}" ).raise_for_status() + + def timeline_wait_logical_size(self, tenant_id: TenantId, timeline_id: TimelineId) -> int: + detail = self.timeline_detail( + tenant_id, + timeline_id, + include_non_incremental_logical_size=True, + force_await_initial_logical_size=True, + ) + current_logical_size = detail["current_logical_size"] + non_incremental = detail["current_logical_size_non_incremental"] + assert current_logical_size == non_incremental + assert isinstance(current_logical_size, int) + return current_logical_size + + def top_tenants( + self, order_by: str, limit: int, where_shards_lt: int, where_gt: int + ) -> dict[Any, Any]: + res = self.post( + f"http://localhost:{self.port}/v1/top_tenants", + json={ + "order_by": order_by, + "limit": limit, + "where_shards_lt": where_shards_lt, + "where_gt": where_gt, + }, + ) + self.verbose_error(res) + return res.json() # type: ignore + + def perf_info( + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + ): + self.is_testing_enabled_or_skip() + + log.info(f"Requesting perf info: tenant {tenant_id}, timeline {timeline_id}") + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/perf_info", + ) + log.info(f"Got perf info response code: {res.status_code}") + self.verbose_error(res) + return res.json() diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index bbb4ccee5b..3e0ffabf74 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -1,18 +1,14 @@ import concurrent.futures -import time from typing import Any, Callable, Dict, Tuple import fixtures.pageserver.remote_storage +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, ) -from fixtures.pageserver.utils import ( - wait_until_tenant_state, -) from fixtures.remote_storage import 
LocalFsStorage, RemoteStorageKind
-from fixtures.types import TenantId, TimelineId
 
 
 def single_timeline(
@@ -42,44 +38,37 @@ def single_timeline(
-    log.info("detach template tenant form pageserver")
+    log.info("detach template tenant from pageserver")
     env.pageserver.tenant_detach(template_tenant)
-    env.pageserver.allowed_errors.append(
-        # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
-        ".*Dropped remote consistent LSN updates.*",
-    )
 
     log.info(f"duplicating template tenant {ncopies} times in S3")
     tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies)
 
+    # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
+    # However, on-demand downloads are quite slow ATM.
+    # => do the on-demand downloads in Python.
+    log.info("python-side on-demand download the layer files into local tenant dir")
+    tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
+    fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
+        env, tenant_timelines
+    )
+
     log.info("attach duplicated tenants to pageserver")
-    # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
-    # However, on-demand downloads are quite slow ATM.
-    # => do the on-demand downloads in Python.
     assert ps_http.tenant_list() == []
-    # make the attach fail after it created enough on-disk state to retry loading
-    # the tenant next startup, but before it can start background loops that would start download
-    ps_http.configure_failpoints(("attach-before-activate", "return"))
-    env.pageserver.allowed_errors.append(
-        ".*attach failed, setting tenant state to Broken: attach-before-activate.*"
-    )
 
-    def attach_broken(tenant):
+    def attach(tenant):
         env.pageserver.tenant_attach(
             tenant,
             config=template_config.copy(),
+            generation=100,
+            override_storage_controller_generation=True,
        )
-        time.sleep(0.1)
-        wait_until_tenant_state(ps_http, tenant, "Broken", 10)
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
-        executor.map(attach_broken, tenants)
+        executor.map(attach, tenants)
 
-    env.pageserver.stop(
-        immediate=True
-    )  # clears the failpoint as a side-effect; immediate to avoid hitting neon_local's timeout
-    tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
-    log.info("python-side on-demand download the layer files into local tenant dir")
-    fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
-        env, tenant_timelines
-    )
+    # Benchmarks will start the pageserver explicitly themselves
+    env.pageserver.stop()
 
     return env
diff --git a/test_runner/fixtures/pageserver/remote_storage.py b/test_runner/fixtures/pageserver/remote_storage.py
index e6cd9b4614..0c3612716a 100644
--- a/test_runner/fixtures/pageserver/remote_storage.py
+++ b/test_runner/fixtures/pageserver/remote_storage.py
@@ -6,13 +6,13 @@ import threading
 from pathlib import Path
 from typing import Any, List, Tuple
 
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.neon_fixtures import NeonEnv, Pagectl
-from fixtures.pageserver.types import (
+from fixtures.pageserver.common_types import (
     InvalidFileName,
     parse_layer_file_name,
 )
 from fixtures.remote_storage import LocalFsStorage
-from fixtures.types import TenantId, TimelineId
 
 
 def duplicate_one_tenant(env: NeonEnv, template_tenant: TenantId, new_tenant: TenantId):
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 6b2651e447..a74fef6a60 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -1,12 +1,17 @@
 import time
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
-from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef, ObjectTypeDef
+from mypy_boto3_s3.type_defs import (
+    DeleteObjectOutputTypeDef,
+    EmptyResponseMetadataTypeDef,
+    ListObjectsV2OutputTypeDef,
+    ObjectTypeDef,
+)
 
+from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.log_helper import log
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage
-from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.utils import wait_until
 
 
@@ -15,7 +20,7 @@ def assert_tenant_state(
     tenant: TenantId,
     expected_state: str,
     message: Optional[str] = None,
-):
+) -> None:
     tenant_status = pageserver_http.tenant_status(tenant)
     log.info(f"tenant_status: {tenant_status}")
     assert tenant_status["state"]["slug"] == expected_state, message or tenant_status
@@ -57,9 +62,7 @@ def wait_for_upload(
         )
         time.sleep(1)
     raise Exception(
-        "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format(
-            lsn, current_lsn
-        )
+        f"timed out while waiting for {tenant}/{timeline} remote_consistent_lsn to reach {lsn}, was {current_lsn}"
     )
 
 
@@ -195,39 +198,62 @@ def wait_for_last_record_lsn(
     lsn: Lsn,
 ) -> Lsn:
     """waits for pageserver to catch up to a certain lsn, returns the last observed lsn."""
-    for i in range(100):
+    for i in range(1000):
         current_lsn = last_record_lsn(pageserver_http, tenant, timeline)
         if current_lsn >= lsn:
             return current_lsn
         if i % 10 == 0:
             log.info(
-                "waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
-                    lsn, current_lsn, i + 1
-                )
+                f"{tenant}/{timeline} waiting for last_record_lsn to reach {lsn}, now {current_lsn}, iteration {i + 1}"
             )
         time.sleep(0.1)
     raise Exception(
-        "timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn)
+        f"timed out while waiting for last_record_lsn to reach {lsn}, was {current_lsn}"
     )
 
 
 def wait_for_upload_queue_empty(
     pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
 ):
+    wait_period_secs = 0.2
     while True:
         all_metrics = pageserver_http.get_metrics()
-        tl = all_metrics.query_all(
-            "pageserver_remote_timeline_client_calls_unfinished",
+        started = all_metrics.query_all(
+            "pageserver_remote_timeline_client_calls_started_total",
             {
                 "tenant_id": str(tenant_id),
                 "timeline_id": str(timeline_id),
             },
         )
-        assert len(tl) > 0
-        log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}")
-        if all(m.value == 0 for m in tl):
+        finished = all_metrics.query_all(
+            "pageserver_remote_timeline_client_calls_finished_total",
+            {
+                "tenant_id": str(tenant_id),
+                "timeline_id": str(timeline_id),
+            },
+        )
+
+        # This is `started LEFT JOIN finished`: where the labels match, subtract
+        # finished from started, which yields the queue depth for that label set.
+        remaining_labels = ["shard_id", "file_kind", "op_kind"]
+        tl: List[Tuple[Any, float]] = []
+        for s in started:
+            found = False
+            for f in finished:
+                if all([s.labels[label] == f.labels[label] for label in remaining_labels]):
+                    assert (
+                        not found
+                    ), "duplicate match, remaining_labels don't uniquely identify sample"
+                    tl.append((s.labels, int(s.value) - int(f.value)))
+                    found = True
+            if not found:
+                tl.append((s.labels, int(s.value)))
+        assert len(tl) == len(started), "something broken with join logic"
len(tl) == len(started), "something broken with join logic" + log.info(f"upload queue for {tenant_id}/{timeline_id}:") + for labels, queue_count in tl: + log.info(f" {labels}: {queue_count}") + if all(queue_count == 0 for (_, queue_count) in tl): return - time.sleep(0.2) + time.sleep(wait_period_secs) def wait_timeline_detail_404( @@ -262,7 +288,7 @@ def timeline_delete_wait_completed( iterations: int = 20, interval: Optional[float] = None, **delete_args, -): +) -> None: pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args) wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval) @@ -272,7 +298,7 @@ def assert_prefix_empty( remote_storage: Optional[RemoteStorage], prefix: Optional[str] = None, allowed_postfix: Optional[str] = None, -): +) -> None: assert remote_storage is not None response = list_prefix(remote_storage, prefix) keys = response["KeyCount"] @@ -287,7 +313,7 @@ def assert_prefix_empty( # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5322/6207777020/index.html#suites/3556ed71f2d69272a7014df6dcb02317/53b5c368b5a68865 # this seems like a mock_s3 issue log.warning( - f"contrading ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0" + f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0" ) keys = 0 elif keys != 0 and len(objects) == 0: @@ -327,7 +353,6 @@ def list_prefix( """ # For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api. assert isinstance(remote, S3Storage), "localfs is currently not supported" - assert remote.client is not None prefix_in_bucket = remote.prefix_in_bucket or "" if not prefix: @@ -346,58 +371,76 @@ def list_prefix( return response -def wait_tenant_status_404( - pageserver_http: PageserverHttpClient, - tenant_id: TenantId, - iterations: int, - interval: float = 0.250, -): - def tenant_is_missing(): - data = {} - try: - data = pageserver_http.tenant_status(tenant_id) - log.info(f"tenant status {data}") - except PageserverApiException as e: - log.debug(e) - if e.status_code == 404: - return +def remote_storage_delete_key( + remote: RemoteStorage, + key: str, +) -> DeleteObjectOutputTypeDef: + """ + Note that this function takes into account prefix_in_bucket. + """ + # For local_fs we need to use a different implementation. As we don't need local_fs, just don't support it for now. 
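The started/finished counter pair in the `wait_for_upload_queue_empty` hunk above is the usual Prometheus trick for deriving a queue depth from two monotonic counters: depth = started - finished, with a missing `finished` sample treated as zero. A minimal standalone sketch of that left join, assuming only that samples carry `labels` and `value` the way the metrics fixtures' samples do:

```python
# Sketch of the started-LEFT-JOIN-finished depth computation used above.
# `Sample` stands in for the metrics fixture's sample type (an assumption).
from dataclasses import dataclass
from typing import Dict, List, Tuple


@dataclass
class Sample:
    labels: Dict[str, str]
    value: float


def queue_depths(
    started: List[Sample],
    finished: List[Sample],
    join_on: Tuple[str, ...] = ("shard_id", "file_kind", "op_kind"),
) -> List[Tuple[Dict[str, str], int]]:
    depths = []
    for s in started:
        matches = [f for f in finished if all(s.labels[k] == f.labels[k] for k in join_on)]
        assert len(matches) <= 1, "join labels must uniquely identify a sample"
        # No finished counterpart means nothing completed yet: depth == started.
        done = int(matches[0].value) if matches else 0
        depths.append((s.labels, int(s.value) - done))
    return depths


# The queue is drained once every depth is zero:
# all(depth == 0 for _, depth in queue_depths(started, finished))
```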
+ assert isinstance(remote, S3Storage), "localfs is currently not supported" - raise RuntimeError(f"Timeline exists state {data.get('state')}") + prefix_in_bucket = remote.prefix_in_bucket or "" - wait_until(iterations, interval=interval, func=tenant_is_missing) + # real s3 tests have unique per test prefix + # mock_s3 tests use special pageserver prefix for pageserver stuff + key = "/".join((prefix_in_bucket, key)) + + response = remote.client.delete_object( + Bucket=remote.bucket_name, + Key=key, + ) + return response -def tenant_delete_wait_completed( - pageserver_http: PageserverHttpClient, - tenant_id: TenantId, - iterations: int, - ignore_errors: bool = False, -): - if not ignore_errors: - pageserver_http.tenant_delete(tenant_id=tenant_id) - else: - interval = 0.5 +def enable_remote_storage_versioning( + remote: RemoteStorage, +) -> EmptyResponseMetadataTypeDef: + """ + Enable S3 versioning for the remote storage + """ + # local_fs has no support for versioning + assert isinstance(remote, S3Storage), "localfs is currently not supported" - def delete_request_sent(): - try: - pageserver_http.tenant_delete(tenant_id=tenant_id) - except PageserverApiException as e: - log.debug(e) - if e.status_code == 404: - return - except Exception as e: - log.debug(e) + # The SDK supports enabling versioning on normal S3 as well but we don't want to change + # these settings from a test in a live bucket (also, our access isn't enough nor should it be) + assert not remote.real, "Enabling storage versioning only supported on Mock S3" - wait_until(iterations, interval=interval, func=delete_request_sent) - wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations) + # Workaround to enable self-copy until upstream bug is fixed: https://github.com/getmoto/moto/issues/7300 + remote.client.put_bucket_encryption( + Bucket=remote.bucket_name, + ServerSideEncryptionConfiguration={ + "Rules": [ + { + "ApplyServerSideEncryptionByDefault": {"SSEAlgorithm": "AES256"}, + "BucketKeyEnabled": False, + }, + ] + }, + ) + # Note that this doesn't use pagination, so list is not guaranteed to be exhaustive. + response = remote.client.put_bucket_versioning( + Bucket=remote.bucket_name, + VersioningConfiguration={ + "MFADelete": "Disabled", + "Status": "Enabled", + }, + ) + return response -MANY_SMALL_LAYERS_TENANT_CONFIG = { - "gc_period": "0s", - "compaction_period": "0s", - "checkpoint_distance": f"{1024**2}", - "image_creation_threshold": "100", -} +def many_small_layers_tenant_config() -> Dict[str, Any]: + """ + Create a new dict to avoid issues with deleting from the global value. + In Python, the global is mutable.
+ """ + return { + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": 1024**2, + "image_creation_threshold": 100, + } def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int: diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 53350138dd..e2dd51802c 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -1,50 +1,119 @@ import os -from typing import Optional +from typing import Any, Dict, Optional +import allure import pytest -from _pytest.fixtures import FixtureRequest +import toml from _pytest.python import Metafunc from fixtures.pg_version import PgVersion +from fixtures.utils import AuxFileStore """ -Dynamically parametrize tests by Postgres version and build type (debug/release/remote) +Dynamically parametrize tests by different parameters """ @pytest.fixture(scope="function", autouse=True) -def pg_version(request: FixtureRequest) -> Optional[PgVersion]: - # Do not parametrize performance tests yet, we need to prepare grafana charts first - if "test_runner/performance" in str(request.node.path): - v = os.environ.get("DEFAULT_PG_VERSION") - return PgVersion(v) - +def pg_version() -> Optional[PgVersion]: return None @pytest.fixture(scope="function", autouse=True) -def build_type(request: FixtureRequest) -> Optional[str]: - # Do not parametrize performance tests yet, we need to prepare grafana charts first - if "test_runner/performance" in str(request.node.path): - return os.environ.get("BUILD_TYPE", "").lower() - +def build_type() -> Optional[str]: return None +@pytest.fixture(scope="function", autouse=True) +def platform() -> Optional[str]: + return None + + +@pytest.fixture(scope="function", autouse=True) +def pageserver_virtual_file_io_engine() -> Optional[str]: + return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE") + + +@pytest.fixture(scope="function", autouse=True) +def pageserver_io_buffer_alignment() -> Optional[int]: + return None + + +@pytest.fixture(scope="function", autouse=True) +def pageserver_aux_file_policy() -> Optional[AuxFileStore]: + return None + + +def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]: + toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") + if toml_table is None: + return None + v = toml.loads(toml_table) + assert isinstance(v, dict) + return v + + +@pytest.fixture(scope="function", autouse=True) +def pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]: + return get_pageserver_default_tenant_config_compaction_algorithm() + + def pytest_generate_tests(metafunc: Metafunc): - # Do not parametrize performance tests yet, we need to prepare grafana charts first - if "test_runner/performance" in metafunc.definition._nodeid: - return - - if (v := os.environ.get("DEFAULT_PG_VERSION")) is None: - pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET] - else: - pg_versions = [PgVersion(v)] - - if (bt := os.environ.get("BUILD_TYPE")) is None: + if (bt := os.getenv("BUILD_TYPE")) is None: build_types = ["debug", "release"] else: build_types = [bt.lower()] metafunc.parametrize("build_type", build_types) + + if (v := os.getenv("DEFAULT_PG_VERSION")) is None: + pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET] + else: + pg_versions = [PgVersion(v)] + metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions)) + + # A hacky way to parametrize tests only 
for `pageserver_virtual_file_io_engine=std-fs` + # And do not change test name for default `pageserver_virtual_file_io_engine=tokio-epoll-uring` to keep test statistics + if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ( + "", + "tokio-epoll-uring", + ): + metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine]) + + # Same hack for pageserver_default_tenant_config_compaction_algorithm + if ( + explicit_default := get_pageserver_default_tenant_config_compaction_algorithm() + ) is not None: + metafunc.parametrize( + "pageserver_default_tenant_config_compaction_algorithm", + [explicit_default], + ids=[explicit_default["kind"]], + ) + + # For performance tests, parametrize also by platform + if ( + "test_runner/performance" in metafunc.definition._nodeid + and (platform := os.getenv("PLATFORM")) is not None + ): + metafunc.parametrize("platform", [platform.lower()]) + + +@pytest.hookimpl(hookwrapper=True, tryfirst=True) +def pytest_runtest_makereport(*args, **kwargs): + # Add test parameters to Allure report to distinguish the same tests with different parameters. + # Names have `__` prefix to avoid conflicts with `pytest.mark.parametrize` parameters + + # A mapping between `uname -m` and `RUNNER_ARCH` values. + # `RUNNER_ARCH` environment variable is set on GitHub Runners, + # possible values are X86, X64, ARM, or ARM64. + # See https://docs.github.com/en/actions/learn-github-actions/variables#default-environment-variables + uname_m = { + "aarch64": "ARM64", + "arm64": "ARM64", + "x86_64": "X64", + }.get(os.uname().machine, "UNKNOWN") + arch = os.getenv("RUNNER_ARCH", uname_m) + allure.dynamic.parameter("__arch", arch) + + yield diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index 657718da00..e12c8e5f4a 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -3,8 +3,6 @@ import os from typing import Optional import pytest -from _pytest.config import Config -from _pytest.config.argparsing import Parser """ This fixture is used to determine which version of Postgres to use for tests.
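The `pytest_generate_tests` changes above all follow one pattern: read a knob from the environment, fall back to the full matrix, and pass explicit `ids` so test names stay stable. A condensed sketch with a hypothetical `MY_KNOB` variable (not one of the real knobs in this patch):

```python
# Condensed sketch of env-driven parametrization; MY_KNOB is hypothetical.
import os

from _pytest.python import Metafunc


def pytest_generate_tests(metafunc: Metafunc):
    if "my_knob" not in metafunc.fixturenames:
        return
    if (v := os.getenv("MY_KNOB")) is not None:
        values = [v]          # pin the matrix to the externally chosen value
    else:
        values = ["a", "b"]   # default: run the full matrix
    metafunc.parametrize("my_knob", values, ids=[f"knob-{val}" for val in values])
```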
@@ -52,7 +50,7 @@ class PgVersion(str, enum.Enum): return None -DEFAULT_VERSION: PgVersion = PgVersion.V14 +DEFAULT_VERSION: PgVersion = PgVersion.V16 def skip_on_postgres(version: PgVersion, reason: str): @@ -69,15 +67,8 @@ def xfail_on_postgres(version: PgVersion, reason: str): ) -def pytest_addoption(parser: Parser): - parser.addoption( - "--pg-version", - action="store", - type=PgVersion, - help="DEPRECATED: Postgres version to use for tests", +def run_only_on_default_postgres(reason: str): + return pytest.mark.skipif( + PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is not DEFAULT_VERSION, + reason=reason, ) - - -def pytest_configure(config: Config): - if config.getoption("--pg-version"): - raise Exception("--pg-version is deprecated, use DEFAULT_PG_VERSION env var instead") diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index c0c2383feb..1b6c3c23ba 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -12,8 +12,9 @@ import boto3 import toml from mypy_boto3_s3 import S3Client +from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.log_helper import log -from fixtures.types import TenantId, TimelineId +from fixtures.pageserver.common_types import IndexPartDump TIMELINE_INDEX_PART_FILE_NAME = "index_part.json" TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json" @@ -50,7 +51,7 @@ class MockS3Server: # XXX: do not use `shell=True` or add `exec ` to the command here otherwise. # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux # if a process is started from the shell process. - self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", "s3", f"-p{port}"]) + self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", f"-p{port}"]) error = None try: return_code = self.subprocess.poll() @@ -141,11 +142,13 @@ class LocalFsStorage: with self.heatmap_path(tenant_id).open("r") as f: return json.load(f) - def to_toml_inline_table(self) -> str: - rv = { + def to_toml_dict(self) -> Dict[str, Any]: + return { "local_path": str(self.root), } - return toml.TomlEncoder().dump_inline_table(rv) + + def to_toml_inline_table(self) -> str: + return toml.TomlEncoder().dump_inline_table(self.to_toml_dict()) def cleanup(self): # no cleanup is done here, because there's NeonEnvBuilder.cleanup_local_storage which will remove everything, including localfs files @@ -160,20 +163,36 @@ class LocalFsStorage: class S3Storage: bucket_name: str bucket_region: str - access_key: str - secret_key: str + access_key: Optional[str] + secret_key: Optional[str] + aws_profile: Optional[str] prefix_in_bucket: str client: S3Client cleanup: bool """Is this MOCK_S3 (false) or REAL_S3 (true)""" real: bool endpoint: Optional[str] = None + """formatting deserialized with humantime crate, for example "1s".""" + custom_timeout: Optional[str] = None def access_env_vars(self) -> Dict[str, str]: - return { - "AWS_ACCESS_KEY_ID": self.access_key, - "AWS_SECRET_ACCESS_KEY": self.secret_key, - } + if self.aws_profile is not None: + env = { + "AWS_PROFILE": self.aws_profile, + } + # Pass through HOME env var because AWS_PROFILE needs it in order to work + home = os.getenv("HOME") + if home is not None: + env["HOME"] = home + return env + if self.access_key is not None and self.secret_key is not None: + return { + "AWS_ACCESS_KEY_ID": self.access_key, + "AWS_SECRET_ACCESS_KEY": self.secret_key, + } + raise RuntimeError( + "Either AWS_PROFILE or 
(AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY) have to be set for S3Storage" + ) def to_string(self) -> str: return json.dumps( @@ -185,7 +204,7 @@ class S3Storage: } ) - def to_toml_inline_table(self) -> str: + def to_toml_dict(self) -> Dict[str, Any]: rv = { "bucket_name": self.bucket_name, "bucket_region": self.bucket_region, @@ -197,7 +216,13 @@ class S3Storage: if self.endpoint is not None: rv["endpoint"] = self.endpoint - return toml.TomlEncoder().dump_inline_table(rv) + if self.custom_timeout is not None: + rv["timeout"] = self.custom_timeout + + return rv + + def to_toml_inline_table(self) -> str: + return toml.TomlEncoder().dump_inline_table(self.to_toml_dict()) def do_cleanup(self): if not self.cleanup: @@ -243,6 +268,51 @@ class S3Storage: log.info(f"deleted {cnt} objects from remote storage") + def tenants_path(self) -> str: + return f"{self.prefix_in_bucket}/tenants" + + def tenant_path(self, tenant_id: Union[TenantShardId, TenantId]) -> str: + return f"{self.tenants_path()}/{tenant_id}" + + def timeline_path( + self, tenant_id: Union[TenantShardId, TenantId], timeline_id: TimelineId + ) -> str: + return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}" + + def get_latest_index_key(self, index_keys: List[str]) -> str: + """ + Gets the latest index file key. + + @param index_keys: A list of index keys of different generations. + """ + + def parse_gen(index_key: str) -> int: + parts = index_key.split("index_part.json-") + return int(parts[-1], base=16) if len(parts) == 2 else -1 + + return max(index_keys, key=parse_gen) + + def download_index_part(self, index_key: str) -> IndexPartDump: + """ + Downloads the index content from remote storage. + + @param index_key: index key in remote storage. + """ + response = self.client.get_object(Bucket=self.bucket_name, Key=index_key) + body = response["Body"].read().decode("utf-8") + log.info(f"index_part.json: {body}") + return IndexPartDump.from_json(json.loads(body)) + + def heatmap_key(self, tenant_id: TenantId) -> str: + return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" + + def heatmap_content(self, tenant_id: TenantId): + r = self.client.get_object(Bucket=self.bucket_name, Key=self.heatmap_key(tenant_id)) + return json.loads(r["Body"].read().decode("utf-8")) + + def mock_remote_tenant_path(self, tenant_id: TenantId): + assert self.real is False + RemoteStorage = Union[LocalFsStorage, S3Storage] @@ -308,6 +378,7 @@ class RemoteStorageKind(str, enum.Enum): bucket_region=mock_region, access_key=access_key, secret_key=secret_key, + aws_profile=None, prefix_in_bucket="", client=client, cleanup=False, @@ -317,12 +388,11 @@ class RemoteStorageKind(str, enum.Enum): assert self == RemoteStorageKind.REAL_S3 env_access_key = os.getenv("AWS_ACCESS_KEY_ID") - assert env_access_key, "no aws access key provided" env_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY") - assert env_secret_key, "no aws access key provided" - - # session token is needed for local runs with sso auth - session_token = os.getenv("AWS_SESSION_TOKEN") + env_profile = os.getenv("AWS_PROFILE") + assert ( + env_access_key and env_secret_key + ) or env_profile, "need to specify either access key and secret access key or profile" bucket_name = bucket_name or os.getenv("REMOTE_STORAGE_S3_BUCKET") assert bucket_name is not None, "no remote storage bucket name provided" @@ -334,9 +404,6 @@ class RemoteStorageKind(str, enum.Enum): client = boto3.client( "s3", region_name=bucket_region, - aws_access_key_id=env_access_key, - aws_secret_access_key=env_secret_key, - 
aws_session_token=session_token, ) return S3Storage( @@ -344,6 +411,7 @@ class RemoteStorageKind(str, enum.Enum): bucket_region=bucket_region, access_key=env_access_key, secret_key=env_secret_key, + aws_profile=env_profile, prefix_in_bucket=prefix_in_bucket, client=client, cleanup=True, @@ -391,6 +459,13 @@ def default_remote_storage() -> RemoteStorageKind: return RemoteStorageKind.LOCAL_FS +def remote_storage_to_toml_dict(remote_storage: RemoteStorage) -> Dict[str, Any]: + if not isinstance(remote_storage, (LocalFsStorage, S3Storage)): + raise Exception("invalid remote storage type") + + return remote_storage.to_toml_dict() + + # serialize as toml inline table def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str: if not isinstance(remote_storage, (LocalFsStorage, S3Storage)): diff --git a/test_runner/fixtures/safekeeper/__init__.py b/test_runner/fixtures/safekeeper/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py new file mode 100644 index 0000000000..9bf03554e7 --- /dev/null +++ b/test_runner/fixtures/safekeeper/http.py @@ -0,0 +1,260 @@ +import json +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import pytest +import requests + +from fixtures.common_types import Lsn, TenantId, TenantTimelineId, TimelineId +from fixtures.log_helper import log +from fixtures.metrics import Metrics, MetricsGetter, parse_metrics + + +# Walreceiver as returned by sk's timeline status endpoint. +@dataclass +class Walreceiver: + conn_id: int + state: str + + +@dataclass +class SafekeeperTimelineStatus: + term: int + last_log_term: int + pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 + flush_lsn: Lsn + commit_lsn: Lsn + timeline_start_lsn: Lsn + backup_lsn: Lsn + peer_horizon_lsn: Lsn + remote_consistent_lsn: Lsn + walreceivers: List[Walreceiver] + + +class SafekeeperMetrics(Metrics): + # Helpers to get metrics from tests without hardcoding the metric names there. + # These are metrics from Prometheus which uses float64 internally. + # As a consequence, values may differ from real original int64s. 
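The `_inexact` naming in the comment above is deliberate: Prometheus exposes every sample as float64, whose 53-bit mantissa cannot represent all u64 values, and LSNs routinely exceed 2^53. A short illustration of the round-trip loss:

```python
# float64 has a 53-bit mantissa, so integers above 2**53 round silently.
lsn = 2**53 + 1
assert float(lsn) == 2**53      # the +1 is lost in the float round-trip
assert int(float(lsn)) != lsn   # hence "inexact" when read back from metrics
```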
+ + def __init__(self, m: Metrics): + self.metrics = m.metrics + + def flush_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId): + return self.query_one( + "safekeeper_flush_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)} + ).value + + def commit_lsn_inexact(self, tenant_id: TenantId, timeline_id: TimelineId): + return self.query_one( + "safekeeper_commit_lsn", {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)} + ).value + + +class SafekeeperHttpClient(requests.Session, MetricsGetter): + HTTPError = requests.HTTPError + + def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): + super().__init__() + self.port = port + self.auth_token = auth_token + self.is_testing_enabled = is_testing_enabled + + if auth_token is not None: + self.headers["Authorization"] = f"Bearer {auth_token}" + + def check_status(self): + self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + + def get_metrics_str(self) -> str: + """You probably want to use get_metrics() instead.""" + request_result = self.get(f"http://localhost:{self.port}/metrics") + request_result.raise_for_status() + return request_result.text + + def get_metrics(self) -> SafekeeperMetrics: + res = self.get_metrics_str() + return SafekeeperMetrics(parse_metrics(res)) + + def is_testing_enabled_or_skip(self): + if not self.is_testing_enabled: + pytest.skip("safekeeper was built without 'testing' feature") + + def configure_failpoints(self, config_strings: Union[Tuple[str, str], List[Tuple[str, str]]]): + self.is_testing_enabled_or_skip() + + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.put( + f"http://localhost:{self.port}/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + ) + log.info(f"Got failpoints request response code {res.status_code}") + res.raise_for_status() + res_json = res.json() + assert res_json is None + return res_json + + def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]: + res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def timeline_list(self) -> List[TenantTimelineId]: + res = self.get(f"http://localhost:{self.port}/v1/tenant/timeline") + res.raise_for_status() + resj = res.json() + return [TenantTimelineId.from_json(ttidj) for ttidj in resj] + + def timeline_create( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + pg_version: int, # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 + commit_lsn: Lsn, + ): + body = { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "pg_version": pg_version, + "commit_lsn": str(commit_lsn), + } + res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body) + res.raise_for_status() + + def timeline_status( + self, tenant_id: TenantId, timeline_id: TimelineId + ) -> SafekeeperTimelineStatus: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") + res.raise_for_status() + resj = res.json() + walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] + return SafekeeperTimelineStatus( + term=resj["acceptor_state"]["term"], + last_log_term=resj["acceptor_state"]["epoch"], + pg_version=resj["pg_info"]["pg_version"], + 
flush_lsn=Lsn(resj["flush_lsn"]), + commit_lsn=Lsn(resj["commit_lsn"]), + timeline_start_lsn=Lsn(resj["timeline_start_lsn"]), + backup_lsn=Lsn(resj["backup_lsn"]), + peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]), + remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]), + walreceivers=walreceivers, + ) + + def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: + return self.timeline_status(tenant_id, timeline_id).commit_lsn + + # only_local doesn't remove segments in the remote storage. + def timeline_delete( + self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False + ) -> Dict[Any, Any]: + res = self.delete( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", + params={ + "only_local": str(only_local).lower(), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]: + params = params or {} + res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) + res.raise_for_status() + res_json = json.loads(res.text) + assert isinstance(res_json, dict) + return res_json + + def debug_dump_timeline( + self, timeline_id: TimelineId, params: Optional[Dict[str, str]] = None + ) -> Any: + params = params or {} + params["timeline_id"] = str(timeline_id) + dump = self.debug_dump(params) + return dump["timelines"][0] + + def get_partial_backup(self, timeline_id: TimelineId) -> Any: + dump = self.debug_dump_timeline(timeline_id, {"dump_control_file": "true"}) + return dump["control_file"]["partial_backup"] + + def get_eviction_state(self, timeline_id: TimelineId) -> Any: + dump = self.debug_dump_timeline(timeline_id, {"dump_control_file": "true"}) + return dump["control_file"]["eviction_state"] + + def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]: + res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", + json=body, + ) + res.raise_for_status() + + def patch_control_file( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + patch: Dict[str, Any], + ) -> Dict[str, Any]: + res = self.patch( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file", + json={ + "updates": patch, + "apply_fields": list(patch.keys()), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint", + json={}, + ) + res.raise_for_status() + + def timeline_digest( + self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn + ) -> Dict[str, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest", + params={ + "from_lsn": str(from_lsn), + "until_lsn": str(until_lsn), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def backup_partial_reset(self, tenant_id: TenantId, timeline_id: TimelineId): + res = self.post( + 
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/backup_partial_reset", + json={}, + ) + res.raise_for_status() + return res.json() + + def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): + res = self.post( + f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", + json=body, + ) + res.raise_for_status() diff --git a/test_runner/fixtures/safekeeper/utils.py b/test_runner/fixtures/safekeeper/utils.py new file mode 100644 index 0000000000..0e4b5d7883 --- /dev/null +++ b/test_runner/fixtures/safekeeper/utils.py @@ -0,0 +1,11 @@ +from fixtures.common_types import TenantId, TimelineId +from fixtures.log_helper import log +from fixtures.safekeeper.http import SafekeeperHttpClient + + +def are_walreceivers_absent( + sk_http_cli: SafekeeperHttpClient, tenant_id: TenantId, timeline_id: TimelineId +): + status = sk_http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") + return len(status.walreceivers) == 0 diff --git a/test_runner/fixtures/storage_controller_proxy.py b/test_runner/fixtures/storage_controller_proxy.py new file mode 100644 index 0000000000..3477f8b1f2 --- /dev/null +++ b/test_runner/fixtures/storage_controller_proxy.py @@ -0,0 +1,73 @@ +import re +from typing import Any, Optional + +import pytest +import requests +from pytest_httpserver import HTTPServer +from werkzeug.datastructures import Headers +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + +from fixtures.log_helper import log + + +class StorageControllerProxy: + def __init__(self, server: HTTPServer): + self.server: HTTPServer = server + self.listen: str = f"http://{server.host}:{server.port}" + self.routing_to: Optional[str] = None + + def route_to(self, storage_controller_api: str): + self.routing_to = storage_controller_api + + def port(self) -> int: + return self.server.port + + def upcall_api_endpoint(self) -> str: + return f"{self.listen}/upcall/v1" + + +def proxy_request(method: str, url: str, **kwargs) -> requests.Response: + return requests.request(method, url, **kwargs) + + +@pytest.fixture(scope="function") +def storage_controller_proxy(make_httpserver): + """ + Proxies requests into the storage controller to the currently + selected storage controller instance via `StorageControllerProxy.route_to`. + + This fixture is intended for tests that need to run multiple instances + of the storage controller at the same time. 
+ """ + server = make_httpserver + + self = StorageControllerProxy(server) + + log.info(f"Storage controller proxy listening on {self.listen}") + + def handler(request: Request): + if self.route_to is None: + log.info(f"Storage controller proxy has no routing configured for {request.url}") + return Response("Routing not configured", status=503) + + route_to_url = f"{self.routing_to}{request.path}" + + log.info(f"Routing {request.url} to {route_to_url}") + + args: dict[str, Any] = {"headers": request.headers} + if request.is_json: + args["json"] = request.json + + response = proxy_request(request.method, route_to_url, **args) + + headers = Headers() + for key, value in response.headers.items(): + headers.add(key, value) + + return Response(response.content, headers=headers, status=response.status_code) + + self.server.expect_request(re.compile(".*")).respond_with_handler(handler) + + yield self + server.clear() diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 91f33e1196..80f1c9e4e3 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,20 +1,27 @@ import contextlib +import enum import json import os import re import subprocess +import tarfile import threading import time +from hashlib import sha256 from pathlib import Path from typing import ( + IO, TYPE_CHECKING, Any, Callable, Dict, + Iterable, List, Optional, + Set, Tuple, TypeVar, + Union, ) from urllib.parse import urlencode @@ -23,14 +30,14 @@ import zstandard from psycopg2.extensions import cursor from fixtures.log_helper import log -from fixtures.pageserver.types import ( +from fixtures.pageserver.common_types import ( parse_delta_layer, parse_image_layer, ) if TYPE_CHECKING: from fixtures.neon_fixtures import PgBin -from fixtures.types import TimelineId +from fixtures.common_types import TimelineId Fn = TypeVar("Fn", bound=Callable[..., Any]) @@ -189,7 +196,7 @@ def query_scalar(cur: cursor, query: str) -> Any: # Traverse directory to get total size. -def get_dir_size(path: str) -> int: +def get_dir_size(path: Path) -> int: """Return size in bytes.""" totalbytes = 0 for root, _dirs, files in os.walk(path): @@ -233,9 +240,18 @@ ATTACHMENT_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] ) -def allure_attach_from_dir(dir: Path): +def allure_attach_from_dir(dir: Path, preserve_database_files: bool = False): """Attach all non-empty files from `dir` that matches `ATTACHMENT_NAME_REGEX` to Allure report""" + if preserve_database_files: + zst_file = dir.with_suffix(".tar.zst") + with zst_file.open("wb") as zst: + cctx = zstandard.ZstdCompressor() + with cctx.stream_writer(zst) as compressor: + with tarfile.open(fileobj=compressor, mode="w") as tar: + tar.add(dir, arcname="") + allure.attach.file(zst_file, "everything.tar.zst", "application/zstd", "tar.zst") + for attachment in Path(dir).glob("**/*"): if ATTACHMENT_NAME_REGEX.fullmatch(attachment.name) and attachment.stat().st_size > 0: name = str(attachment.relative_to(dir)) @@ -369,7 +385,15 @@ def start_in_background( return spawned_process -def wait_until(number_of_iterations: int, interval: float, func: Fn): +WaitUntilRet = TypeVar("WaitUntilRet") + + +def wait_until( + number_of_iterations: int, + interval: float, + func: Callable[[], WaitUntilRet], + show_intermediate_error=False, +) -> WaitUntilRet: """ Wait until 'func' returns successfully, without exception. Returns the last return value from the function. 
@@ -379,14 +403,28 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn): try: res = func() except Exception as e: - log.info("waiting for %s iteration %s failed", func, i + 1) + log.info("waiting for %s iteration %s failed: %s", func, i + 1, e) last_exception = e + if show_intermediate_error: + log.info(e) time.sleep(interval) continue return res raise Exception("timed out while waiting for %s" % func) from last_exception +def assert_eq(a, b) -> None: + assert a == b + + +def assert_gt(a, b) -> None: + assert a > b + + +def assert_ge(a, b) -> None: + assert a >= b + + def run_pg_bench_small(pg_bin: "PgBin", connstr: str): """ Fast way to populate data. @@ -430,3 +468,148 @@ def humantime_to_ms(humantime: str) -> float: ) return round(total_ms, 3) + + +def scan_log_for_errors(input: Iterable[str], allowed_errors: List[str]) -> List[Tuple[int, str]]: + # FIXME: this duplicates test_runner/fixtures/pageserver/allowed_errors.py + error_or_warn = re.compile(r"\s(ERROR|WARN)") + errors = [] + for lineno, line in enumerate(input, start=1): + if len(line) == 0: + continue + + if error_or_warn.search(line): + # Is this a torn log line? This happens when force-killing a process and restarting + # Example: "2023-10-25T09:38:31.752314Z WARN deletion executo2023-10-25T09:38:31.875947Z INFO version: git-env:0f9452f76e8ccdfc88291bccb3f53e3016f40192" + if re.match("\\d{4}-\\d{2}-\\d{2}T.+\\d{4}-\\d{2}-\\d{2}T.+INFO version.+", line): + continue + + # It's an ERROR or WARN. Is it in the allow-list? + for a in allowed_errors: + if re.match(a, line): + break + else: + errors.append((lineno, line)) + return errors + + +def assert_no_errors(log_file, service, allowed_errors): + if not log_file.exists(): + log.warning(f"Skipping {service} log check: {log_file} does not exist") + return + + with log_file.open("r") as f: + errors = scan_log_for_errors(f, allowed_errors) + + for _lineno, error in errors: + log.info(f"not allowed {service} error: {error.strip()}") + + assert not errors, f"First log error on {service}: {errors[0]}\nHint: use scripts/check_allowed_errors.sh to test any new allowed_error you add" + + +@enum.unique +class AuxFileStore(str, enum.Enum): + V1 = "v1" + V2 = "v2" + CrossValidation = "cross-validation" + + def __repr__(self) -> str: + return f"'aux-{self.value}'" + + def __str__(self) -> str: + return f"'aux-{self.value}'" + + +def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: Set[str]): + """ + This is essentially: + + lines=$(comm -3 \ + <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \ + <(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \ + | wc -l) + [ "$lines" = "0" ] + + But in a more mac friendly fashion. 
+ """ + started_at = time.time() + + def hash_extracted(reader: Union[IO[bytes], None]) -> bytes: + assert reader is not None + digest = sha256(usedforsecurity=False) + while True: + buf = reader.read(64 * 1024) + if not buf: + break + digest.update(buf) + return digest.digest() + + def build_hash_list(p: Path) -> List[Tuple[str, bytes]]: + with tarfile.open(p) as f: + matching_files = (info for info in f if info.isreg() and info.name not in skip_files) + ret = list( + map(lambda info: (info.name, hash_extracted(f.extractfile(info))), matching_files) + ) + ret.sort(key=lambda t: t[0]) + return ret + + left_list, right_list = map(build_hash_list, [left, right]) + + assert len(left_list) == len( + right_list + ), f"unexpected number of files on tar files, {len(left_list)} != {len(right_list)}" + + mismatching = set() + + for left_tuple, right_tuple in zip(left_list, right_list): + left_path, left_hash = left_tuple + right_path, right_hash = right_tuple + assert ( + left_path == right_path + ), f"file count matched, expected these to be same paths: {left_path}, {right_path}" + if left_hash != right_hash: + mismatching.add(left_path) + + assert len(mismatching) == 0, f"files with hash mismatch: {mismatching}" + + elapsed = time.time() - started_at + log.info(f"assert_pageserver_backups_equal completed in {elapsed}s") + + +class PropagatingThread(threading.Thread): + _target: Any + _args: Any + _kwargs: Any + """ + Simple Thread wrapper with join() propagating the possible exception in the thread. + """ + + def run(self): + self.exc = None + try: + self.ret = self._target(*self._args, **self._kwargs) + except BaseException as e: + self.exc = e + + def join(self, timeout=None): + super(PropagatingThread, self).join(timeout) + if self.exc: + raise self.exc + return self.ret + + +def human_bytes(amt: float) -> str: + """ + Render a bytes amount into nice IEC bytes string. + """ + + suffixes = ["", "Ki", "Mi", "Gi"] + + last = suffixes[-1] + + for name in suffixes: + if amt < 1024 or name == last: + return f"{int(round(amt))} {name}B" + amt = amt / 1024 + + raise RuntimeError("unreachable") diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index 30def1194d..065a78bf9b 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -1,5 +1,7 @@ -from typing import Optional +import threading +from typing import Any, Optional +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -8,8 +10,11 @@ from fixtures.neon_fixtures import ( tenant_get_shards, wait_for_last_flush_lsn, ) -from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload -from fixtures.types import TenantId, TimelineId +from fixtures.pageserver.utils import wait_for_last_record_lsn + +# neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex +# to ensure we don't do that: this enables running lots of Workloads in parallel safely. 
+ENDPOINT_LOCK = threading.Lock() class Workload: @@ -21,28 +26,53 @@ class Workload: - reads, checking we get the right data (`validate`) """ - def __init__(self, env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): + def __init__( + self, + env: NeonEnv, + tenant_id: TenantId, + timeline_id: TimelineId, + branch_name: Optional[str] = None, + endpoint_opts: Optional[dict[str, Any]] = None, + ): self.env = env self.tenant_id = tenant_id self.timeline_id = timeline_id self.table = "foo" + # By default, use the default branch name for initial tenant in NeonEnv + self.branch_name = branch_name or "main" + self.expect_rows = 0 self.churn_cursor = 0 self._endpoint: Optional[Endpoint] = None + self._endpoint_opts = endpoint_opts or {} + + def reconfigure(self): + """ + Request the endpoint to reconfigure based on location reported by storage controller + """ + if self._endpoint is not None: + with ENDPOINT_LOCK: + self._endpoint.reconfigure() def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint: - if self._endpoint is None: - self._endpoint = self.env.endpoints.create( - "main", - tenant_id=self.tenant_id, - pageserver_id=pageserver_id, - endpoint_id="ep-workload", - ) - self._endpoint.start(pageserver_id=pageserver_id) - else: - self._endpoint.reconfigure(pageserver_id=pageserver_id) + # We may be running alongside other Workloads for different tenants. Full TTID is + # obnoxiously long for use here, but a cut-down version is still unique enough for tests. + endpoint_id = f"ep-workload-{str(self.tenant_id)[0:4]}-{str(self.timeline_id)[0:4]}" + + with ENDPOINT_LOCK: + if self._endpoint is None: + self._endpoint = self.env.endpoints.create( + self.branch_name, + tenant_id=self.tenant_id, + pageserver_id=pageserver_id, + endpoint_id=endpoint_id, + **self._endpoint_opts, + ) + self._endpoint.start(pageserver_id=pageserver_id) + else: + self._endpoint.reconfigure(pageserver_id=pageserver_id) connstring = self._endpoint.safe_psql( "SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'" @@ -51,9 +81,13 @@ class Workload: return self._endpoint - def __del__(self): + def stop(self): if self._endpoint is not None: self._endpoint.stop() + self._endpoint = None + + def __del__(self): + self.stop() def init(self, pageserver_id: Optional[int] = None): endpoint = self.endpoint(pageserver_id) @@ -64,7 +98,7 @@ class Workload: self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id ) - def write_rows(self, n, pageserver_id: Optional[int] = None): + def write_rows(self, n, pageserver_id: Optional[int] = None, upload: bool = True): endpoint = self.endpoint(pageserver_id) start = self.expect_rows end = start + n - 1 @@ -78,11 +112,14 @@ class Workload: """ ) - return last_flush_lsn_upload( - self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id - ) + if upload: + return last_flush_lsn_upload( + self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id + ) + else: + return False - def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True): + def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True, ingest=True): assert self.expect_rows >= n max_iters = 10 @@ -120,33 +157,34 @@ class Workload: ] ) - for tenant_shard_id, pageserver in tenant_get_shards( - self.env, self.tenant_id, pageserver_id - ): - last_flush_lsn = wait_for_last_flush_lsn( - self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id - ) - ps_http = pageserver.http_client() - 
wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) + if ingest: + # Wait for written data to be ingested by the pageserver + for tenant_shard_id, pageserver in tenant_get_shards( + self.env, self.tenant_id, pageserver_id + ): + last_flush_lsn = wait_for_last_flush_lsn( + self.env, + endpoint, + self.tenant_id, + self.timeline_id, + pageserver_id=pageserver_id, + ) + ps_http = pageserver.http_client() + wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) - if upload: - # force a checkpoint to trigger upload - ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id) - wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) - log.info(f"Churn: waiting for remote LSN {last_flush_lsn}") - else: - log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") + if upload: + # Wait for written data to be uploaded to S3 (force a checkpoint to trigger upload) + ps_http.timeline_checkpoint( + tenant_shard_id, self.timeline_id, wait_until_uploaded=True + ) + log.info(f"Churn: waiting for remote LSN {last_flush_lsn}") + else: + log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") def validate(self, pageserver_id: Optional[int] = None): endpoint = self.endpoint(pageserver_id) - result = endpoint.safe_psql_many( - [ - "select clear_buffer_cache()", - f""" - SELECT COUNT(*) FROM {self.table} - """, - ] - ) + endpoint.clear_shared_buffers() + result = endpoint.safe_psql(f"SELECT COUNT(*) FROM {self.table}") log.info(f"validate({self.expect_rows}): {result}") - assert result == [[("",)], [(self.expect_rows,)]] + assert result == [(self.expect_rows,)] diff --git a/test_runner/logical_repl/README.md b/test_runner/logical_repl/README.md new file mode 100644 index 0000000000..8eca056dda --- /dev/null +++ b/test_runner/logical_repl/README.md @@ -0,0 +1,22 @@ +# Logical replication tests + +## Clickhouse + +```bash +export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb + +docker compose -f clickhouse/docker-compose.yml up -d +pytest -m remote_cluster -k test_clickhouse +docker compose -f clickhouse/docker-compose.yml down +``` + +## Debezium + +```bash +export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb + +docker compose -f debezium/docker-compose.yml up -d +pytest -m remote_cluster -k test_debezium +docker compose -f debezium/docker-compose.yml down + +``` \ No newline at end of file diff --git a/test_runner/logical_repl/clickhouse/docker-compose.yml b/test_runner/logical_repl/clickhouse/docker-compose.yml new file mode 100644 index 0000000000..e00038b811 --- /dev/null +++ b/test_runner/logical_repl/clickhouse/docker-compose.yml @@ -0,0 +1,9 @@ +services: + clickhouse: + image: clickhouse/clickhouse-server + user: "101:101" + container_name: clickhouse + hostname: clickhouse + ports: + - 127.0.0.1:8123:8123 + - 127.0.0.1:9000:9000 diff --git a/test_runner/logical_repl/debezium/docker-compose.yml b/test_runner/logical_repl/debezium/docker-compose.yml new file mode 100644 index 0000000000..fee127a2fd --- /dev/null +++ b/test_runner/logical_repl/debezium/docker-compose.yml @@ -0,0 +1,24 @@ +services: + zookeeper: + image: quay.io/debezium/zookeeper:2.7 + kafka: + image: quay.io/debezium/kafka:2.7 + environment: + ZOOKEEPER_CONNECT: "zookeeper:2181" + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 + KAFKA_BROKER_ID: 1 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_JMX_PORT: 9991 + ports: + - 127.0.0.1:9092:9092 
+ debezium: + image: quay.io/debezium/connect:2.7 + environment: + BOOTSTRAP_SERVERS: kafka:9092 + GROUP_ID: 1 + CONFIG_STORAGE_TOPIC: debezium-config + OFFSET_STORAGE_TOPIC: debezium-offset + STATUS_STORAGE_TOPIC: debezium-status + DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector + ports: + - 127.0.0.1:8083:8083 diff --git a/test_runner/logical_repl/test_clickhouse.py b/test_runner/logical_repl/test_clickhouse.py new file mode 100644 index 0000000000..c5ed9bc8af --- /dev/null +++ b/test_runner/logical_repl/test_clickhouse.py @@ -0,0 +1,82 @@ +""" +Test the logical replication in Neon with ClickHouse as a consumer +""" + +import hashlib +import os +import time + +import clickhouse_connect +import psycopg2 +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import RemotePostgres +from fixtures.utils import wait_until + + +def query_clickhouse( + client, + query: str, + digest: str, +) -> None: + """ + Run the query on the client and check the result hash against the expected digest; + raise ValueError on mismatch + """ + log.debug("Query: %s", query) + res = client.query(query) + log.debug(res.result_rows) + m = hashlib.sha1() + m.update(repr(tuple(res.result_rows)).encode()) + hash_res = m.hexdigest() + log.debug("Hash: %s", hash_res) + if hash_res == digest: + return + raise ValueError("Hash mismatch") + + +@pytest.mark.remote_cluster +def test_clickhouse(remote_pg: RemotePostgres): + """ + Test the logical replication with ClickHouse as a client + """ + clickhouse_host = "clickhouse" if ("CI" in os.environ) else "127.0.0.1" + conn_options = remote_pg.conn_options() + conn = psycopg2.connect(remote_pg.connstr()) + cur = conn.cursor() + cur.execute("DROP TABLE IF EXISTS table1") + cur.execute("CREATE TABLE table1 (id integer primary key, column1 varchar(10));") + cur.execute("INSERT INTO table1 (id, column1) VALUES (1, 'abc'), (2, 'def');") + conn.commit() + client = clickhouse_connect.get_client(host=clickhouse_host) + client.command("SET allow_experimental_database_materialized_postgresql=1") + client.command( + "CREATE DATABASE db1_postgres ENGINE = " + f"MaterializedPostgreSQL('{conn_options['host']}', " + f"'{conn_options['dbname']}', " + f"'{conn_options['user']}', '{conn_options['password']}') " + "SETTINGS materialized_postgresql_tables_list = 'table1';" + ) + wait_until( + 120, + 0.5, + lambda: query_clickhouse( + client, + "select * from db1_postgres.table1 order by 1", + "ee600d8f7cd05bd0b169fa81f44300a9dd10085a", + ), + ) + cur.execute("INSERT INTO table1 (id, column1) VALUES (3, 'ghi'), (4, 'jkl');") + conn.commit() + wait_until( + 120, + 0.5, + lambda: query_clickhouse( + client, + "select * from db1_postgres.table1 order by 1", + "9eba2daaf7e4d7d27ac849525f68b562ab53947d", + ), + ) + log.debug("Sleeping before the final check that Neon is still alive") + time.sleep(3) + cur.execute("SELECT 1") diff --git a/test_runner/logical_repl/test_debezium.py b/test_runner/logical_repl/test_debezium.py new file mode 100644 index 0000000000..5426a06ca1 --- /dev/null +++ b/test_runner/logical_repl/test_debezium.py @@ -0,0 +1,190 @@ +""" +Test the logical replication in Neon with Debezium as a consumer +""" + +import json +import os +import time + +import psycopg2 +import pytest +import requests +from fixtures.log_helper import log +from fixtures.neon_fixtures import RemotePostgres +from fixtures.utils import wait_until + + +class DebeziumAPI: + """ + The class for Debezium API calls + """ + + def __init__(self): + self.__host = "debezium" if ("CI" in
os.environ) else "127.0.0.1" + self.__base_url = f"http://{self.__host}:8083" + self.__connectors_url = f"{self.__base_url}/connectors" + + def __request(self, method, addurl="", **kwargs): + return requests.request( + method, + self.__connectors_url + addurl, + headers={"Accept": "application/json", "Content-type": "application/json"}, + timeout=60, + **kwargs, + ) + + def create_pg_connector(self, remote_pg: RemotePostgres, dbz_conn_name: str): + """ + Create a Postgres connector in debezium + """ + conn_options = remote_pg.conn_options() + payload = { + "name": dbz_conn_name, + "config": { + "connector.class": "io.debezium.connector.postgresql.PostgresConnector", + "tasks.max": "1", + "database.hostname": conn_options["host"], + "database.port": "5432", + "database.user": conn_options["user"], + "database.password": conn_options["password"], + "database.dbname": conn_options["dbname"], + "plugin.name": "pgoutput", + "topic.prefix": "dbserver1", + "schema.include.list": "inventory", + }, + } + return self.__request("POST", json=payload) + + def list_connectors(self): + """ + Returns a list of all connectors existent in Debezium. + """ + resp = self.__request("GET") + assert resp.ok + return json.loads(resp.text) + + def del_connector(self, connector): + """ + Deletes the specified connector + """ + return self.__request("DELETE", f"/{connector}") + + +@pytest.fixture(scope="function") +def debezium(remote_pg: RemotePostgres): + """ + Prepare the Debezium API handler, connection + """ + conn = psycopg2.connect(remote_pg.connstr()) + cur = conn.cursor() + cur.execute("DROP SCHEMA IF EXISTS inventory CASCADE") + cur.execute("CREATE SCHEMA inventory") + cur.execute( + "CREATE TABLE inventory.customers (" + "id SERIAL NOT NULL PRIMARY KEY," + "first_name character varying(255) NOT NULL," + "last_name character varying(255) NOT NULL," + "email character varying(255) NOT NULL)" + ) + conn.commit() + dbz = DebeziumAPI() + assert len(dbz.list_connectors()) == 0 + dbz_conn_name = "inventory-connector" + resp = dbz.create_pg_connector(remote_pg, dbz_conn_name) + log.debug("%s %s %s", resp.status_code, resp.ok, resp.text) + assert resp.status_code == 201 + assert len(dbz.list_connectors()) == 1 + from kafka import KafkaConsumer + + consumer = KafkaConsumer( + "dbserver1.inventory.customers", + bootstrap_servers=["kafka:9092"], + auto_offset_reset="earliest", + enable_auto_commit=False, + ) + yield conn, consumer + resp = dbz.del_connector(dbz_conn_name) + assert resp.status_code == 204 + + +def get_kafka_msg(consumer, ts_ms, before=None, after=None) -> None: + """ + Gets the message from Kafka and checks its validity + Arguments: + consumer: the consumer object + ts_ms: timestamp in milliseconds of the change of db, the corresponding message must have + the later timestamp + before: a dictionary, if not None, the before field from the kafka message must + have the same values for the same keys + after: a dictionary, if not None, the after field from the kafka message must + have the same values for the same keys + """ + msg = consumer.poll() + assert msg, "Empty message" + for val in msg.values(): + r = json.loads(val[-1].value) + log.info(r["payload"]) + assert ts_ms < r["payload"]["ts_ms"], "Incorrect timestamp" + for param, pname in ((before, "before"), (after, "after")): + if param is not None: + for k, v in param.items(): + assert r["payload"][pname][k] == v, f"{pname} mismatches" + + +@pytest.mark.remote_cluster +def test_debezium(debezium): + """ + Test the logical replication having Debezium 
as a subscriber + """ + conn, consumer = debezium + cur = conn.cursor() + ts_ms = time.time() * 1000 + log.info("Insert 1 ts_ms: %s", ts_ms) + cur.execute( + "insert into inventory.customers (first_name, last_name, email) " + "values ('John', 'Dow','johndow@example.com')" + ) + conn.commit() + wait_until( + 100, + 0.5, + lambda: get_kafka_msg( + consumer, + ts_ms, + after={"first_name": "John", "last_name": "Dow", "email": "johndow@example.com"}, + ), + show_intermediate_error=True, + ) + ts_ms = time.time() * 1000 + log.info("Insert 2 ts_ms: %s", ts_ms) + cur.execute( + "insert into inventory.customers (first_name, last_name, email) " + "values ('Alex', 'Row','alexrow@example.com')" + ) + conn.commit() + wait_until( + 100, + 0.5, + lambda: get_kafka_msg( + consumer, + ts_ms, + after={"first_name": "Alex", "last_name": "Row", "email": "alexrow@example.com"}, + ), + show_intermediate_error=True, + ) + ts_ms = time.time() * 1000 + log.info("Update ts_ms: %s", ts_ms) + cur.execute("update inventory.customers set first_name = 'Alexander' where id = 2") + conn.commit() + wait_until( + 100, + 0.5, + lambda: get_kafka_msg( + consumer, + ts_ms, + after={"first_name": "Alexander"}, + ), + show_intermediate_error=True, + ) + time.sleep(3) + cur.execute("select 1") diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index 7ad65821d4..70d75a6dcf 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -7,7 +7,7 @@ easier to see if you have compile errors without scrolling up. You may also need to run `./scripts/pysync`. Then run the tests -`DEFAULT_PG_VERSION=15 NEON_BIN=./target/release poetry run pytest test_runner/performance` +`DEFAULT_PG_VERSION=16 NEON_BIN=./target/release poetry run pytest test_runner/performance` Some handy pytest flags for local development: - `-x` tells pytest to stop on first error diff --git a/test_runner/performance/pageserver/README.md b/test_runner/performance/pageserver/README.md index fdd09cd946..56ffad9963 100644 --- a/test_runner/performance/pageserver/README.md +++ b/test_runner/performance/pageserver/README.md @@ -11,6 +11,6 @@ It supports mounting snapshots using overlayfs, which improves iteration time. Here's a full command line. 
``` -RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=15 BUILD_TYPE=release \ +RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=16 BUILD_TYPE=release \ ./scripts/pytest test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py ```` diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py index 3fb28ace46..8d781c1609 100644 --- a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py +++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py @@ -14,7 +14,7 @@ from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking """ Usage: -DEFAULT_PG_VERSION=15 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \ +DEFAULT_PG_VERSION=16 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \ ./scripts/pytest --timeout 0 test_runner/performance/pageserver/interactive/test_many_small_tenants.py """ @@ -55,10 +55,6 @@ def setup_env( } template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely - ".*Dropped remote consistent LSN updates.*", - ) env.pageserver.tenant_attach(template_tenant, config) ep = env.endpoints.create_start("main", tenant_id=template_tenant) ep.safe_psql("create table foo(b text)") diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py new file mode 100644 index 0000000000..8b934057e4 --- /dev/null +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -0,0 +1,188 @@ +import asyncio +import json +from pathlib import Path +from typing import Any, Dict, Tuple + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.utils import get_scale_for_db, humantime_to_ms + +from performance.pageserver.util import ( + setup_pageserver_with_tenants, +) + + +@pytest.mark.parametrize("duration", [30]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) +@pytest.mark.parametrize("n_tenants", [10]) +@pytest.mark.timeout(1000) +def test_basebackup_with_high_slru_count( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + n_tenants: int, + pgbench_scale: int, + duration: int, +): + def record(metric, **kwargs): + zenbenchmark.record(metric_name=f"pageserver_basebackup.{metric}", **kwargs) + + params: Dict[str, Tuple[Any, Dict[str, Any]]] = {} + + # params from fixtures + params.update( + { + "n_tenants": (n_tenants, {"unit": ""}), + "pgbench_scale": (pgbench_scale, {"unit": ""}), + "duration": (duration, {"unit": "s"}), + } + ) + + # configure cache sizes like in prod + page_cache_size = 16384 + max_file_descriptors = 500000 + neon_env_builder.pageserver_config_override = ( + f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}" + ) + params.update( + { + "pageserver_config_override.page_cache_size": ( + page_cache_size * 8192, + {"unit": "byte"}, + ), + 
"pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}), + } + ) + + for param, (value, kwargs) in params.items(): + record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs) + + n_txns = 500000 + + def setup_wrapper(env: NeonEnv): + return setup_tenant_template(env, n_txns) + + env = setup_pageserver_with_tenants( + neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper + ) + run_benchmark(env, pg_bin, record, duration) + + +def setup_tenant_template(env: NeonEnv, n_txns: int): + config = { + "gc_period": "0s", # disable periodic gc + "checkpoint_timeout": "10 years", + "compaction_period": "0s", # disable periodic compaction + "compaction_threshold": 10, + "compaction_target_size": 134217728, + "checkpoint_distance": 268435456, + "image_creation_threshold": 3, + } + + template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) + env.pageserver.tenant_detach(template_tenant) + env.pageserver.tenant_attach(template_tenant, config) + + ps_http = env.pageserver.http_client() + + with env.endpoints.create_start( + "main", tenant_id=template_tenant, config_lines=["shared_buffers=1MB"] + ) as ep: + rels = 10 + + asyncio.run(run_updates(ep, n_txns, rels)) + + wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) + ps_http.timeline_checkpoint(template_tenant, template_timeline) + ps_http.timeline_compact(template_tenant, template_timeline) + + return (template_tenant, template_timeline, config) + + +# Takes about 5 minutes and produces tenants with around 300 SLRU blocks +# of 8 KiB each. +async def run_updates(ep: Endpoint, n_txns: int, workers_count: int): + workers = [] + for i in range(workers_count): + workers.append(asyncio.create_task(run_update_loop_worker(ep, n_txns, i))) + + await asyncio.gather(*workers) + + +async def run_update_loop_worker(ep: Endpoint, n_txns: int, idx: int): + table = f"t_{idx}" + conn = await ep.connect_async() + await conn.execute(f"CREATE TABLE {table} (pk integer PRIMARY KEY, x integer)") + await conn.execute(f"ALTER TABLE {table} SET (autovacuum_enabled = false)") + await conn.execute(f"INSERT INTO {table} VALUES (1, 0)") + await conn.execute( + f""" + CREATE PROCEDURE updating{table}() as + $$ + DECLARE + i integer; + BEGIN + FOR i IN 1..{n_txns} LOOP + UPDATE {table} SET x = x + 1 WHERE pk=1; + COMMIT; + END LOOP; + END + $$ LANGUAGE plpgsql + """ + ) + await conn.execute("SET statement_timeout=0") + await conn.execute(f"call updating{table}()") + + +def run_benchmark(env: NeonEnv, pg_bin: PgBin, record, duration_secs: int): + ps_http = env.pageserver.http_client() + cmd = [ + str(env.neon_binpath / "pagebench"), + "basebackup", + "--mgmt-api-endpoint", + ps_http.base_url, + "--page-service-connstring", + env.pageserver.connstr(password=None), + "--gzip-probability", + "1", + "--runtime", + f"{duration_secs}s", + # don't specify the targets explicitly, let pagebench auto-discover them + ] + + log.info(f"command: {' '.join(cmd)}") + basepath = pg_bin.run_capture(cmd, with_command_header=False) + results_path = Path(basepath + ".stdout") + log.info(f"Benchmark results at: {results_path}") + + with open(results_path, "r") as f: + results = json.load(f) + log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") + + total = results["total"] + metric = "request_count" + record( + metric, + metric_value=total[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "latency_mean" + record( + metric, + 
metric_value=humantime_to_ms(total[metric]), + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) + + metric = "latency_percentiles" + for k, v in total[metric].items(): + record( + f"{metric}.{k}", + metric_value=humantime_to_ms(v), + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) diff --git a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py new file mode 100644 index 0000000000..9ad6e7907c --- /dev/null +++ b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py @@ -0,0 +1,179 @@ +import json +from pathlib import Path +from typing import Any, Dict, Tuple + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + flush_ep_to_pageserver, +) +from fixtures.remote_storage import s3_storage +from fixtures.utils import humantime_to_ms + + +@pytest.mark.parametrize("duration", [30]) +@pytest.mark.parametrize("io_engine", ["tokio-epoll-uring", "std-fs"]) +@pytest.mark.parametrize("concurrency_per_target", [1, 10, 100]) +@pytest.mark.timeout(1000) +def test_download_churn( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + io_engine: str, + concurrency_per_target: int, + duration: int, +): + def record(metric, **kwargs): + zenbenchmark.record(metric_name=f"pageserver_ondemand_download_churn.{metric}", **kwargs) + + params: Dict[str, Tuple[Any, Dict[str, Any]]] = {} + + # params from fixtures + params.update( + { + # we don't capture `duration`, but instead use the `runtime` output field from pagebench + } + ) + + # configure cache sizes like in prod + page_cache_size = 16384 + max_file_descriptors = 500000 + neon_env_builder.pageserver_config_override = ( + f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}" + ) + params.update( + { + "pageserver_config_override.page_cache_size": ( + page_cache_size * 8192, + {"unit": "byte"}, + ), + "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}), + } + ) + + for param, (value, kwargs) in params.items(): + record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs) + + # Setup env + env = setup_env(neon_env_builder, pg_bin) + env.pageserver.allowed_errors.append( + f".*path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" + ) + + run_benchmark(env, pg_bin, record, io_engine, concurrency_per_target, duration) + + +def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + # We configure tenant conf such that SQL query below produces a lot of layers. + # We don't care what's in the layers really, we just care that layers are created. 
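+    # checkpoint_distance is set to bytes_per_layer below, so roughly one new L0
+    # layer is rolled per ~10 MiB of data written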
+ bytes_per_layer = 10 * (1024**2) + env = neon_env_builder.init_start( + initial_tenant_conf={ + "pitr_interval": "1000d", # let's not make it get in the way + "gc_period": "0s", # disable periodic gc to avoid noise + "compaction_period": "0s", # disable L0=>L1 compaction + "checkpoint_timeout": "10years", # rely solely on checkpoint_distance + "checkpoint_distance": bytes_per_layer, # 10M instead of 256M to create more smaller layers + "image_creation_threshold": 100000, # don't create image layers ever + } + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + client = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + ep.safe_psql("CREATE TABLE data (random_text text)") + bytes_per_row = 512 # make big enough so WAL record size doesn't dominate + desired_layers = 300 + desired_bytes = bytes_per_layer * desired_layers + nrows = desired_bytes / bytes_per_row + ep.safe_psql( + f"INSERT INTO data SELECT lpad(i::text, {bytes_per_row}, '0') FROM generate_series(1, {int(nrows)}) as i", + options="-c statement_timeout=0", + ) + flush_ep_to_pageserver(env, ep, tenant_id, timeline_id) + + client.timeline_checkpoint(tenant_id, timeline_id, compact=False, wait_until_uploaded=True) + + return env + + +def run_benchmark( + env: NeonEnv, + pg_bin: PgBin, + record, + io_engine: str, + concurrency_per_target: int, + duration_secs: int, +): + ps_http = env.pageserver.http_client() + cmd = [ + str(env.neon_binpath / "pagebench"), + "ondemand-download-churn", + "--mgmt-api-endpoint", + ps_http.base_url, + "--runtime", + f"{duration_secs}s", + "--set-io-engine", + f"{io_engine}", + "--concurrency-per-target", + f"{concurrency_per_target}", + # don't specify the targets explicitly, let pagebench auto-discover them + ] + + log.info(f"command: {' '.join(cmd)}") + basepath = pg_bin.run_capture(cmd, with_command_header=False) + results_path = Path(basepath + ".stdout") + log.info(f"Benchmark results at: {results_path}") + + with open(results_path, "r") as f: + results = json.load(f) + log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") + + metric = "downloads_count" + record( + metric, + metric_value=results[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "downloads_bytes" + record( + metric, + metric_value=results[metric], + unit="byte", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "evictions_count" + record( + metric, + metric_value=results[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "timeline_restarts" + record( + metric, + metric_value=results[metric], + unit="", + report=MetricReport.LOWER_IS_BETTER, + ) + + metric = "runtime" + record( + metric, + metric_value=humantime_to_ms(results[metric]) / 1000, + unit="s", + report=MetricReport.TEST_PARAM, + ) diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 1ed7e577b9..949813c984 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -1,8 +1,8 @@ import json +import os from pathlib import Path from typing import Any, Dict, Tuple -import fixtures.pageserver.many_tenants as many_tenants import pytest from fixtures.benchmark_fixture import MetricReport, 
NeonBenchmarker
 from fixtures.log_helper import log
@@ -14,32 +14,78 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.utils import get_scale_for_db, humantime_to_ms

-from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
+from performance.pageserver.util import (
+    setup_pageserver_with_tenants,
+)
+
+# The following tests use pagebench "getpage at latest LSN" to characterize the throughput of the pageserver.
+# Originally there was a single test named `test_pageserver_max_throughput_getpage_at_latest_lsn`,
+# so you still see some references to this name in the code.
+# To avoid recreating the snapshots for each test, we continue to use the name `max_throughput_latest_lsn`
+# for some files and metrics.

 # For reference, the space usage of the snapshots:
-# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots
-# 137G    /instance_store/test_output/shared-snapshots
-# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots/*
-# 1.8G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-13
-# 1.1G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-6
-# 8.5G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-13
-# 5.1G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-6
-# 76G     /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-13
-# 46G     /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6
-@pytest.mark.parametrize("duration", [30])
-@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]])
-@pytest.mark.parametrize("n_tenants", [1, 10])
-@pytest.mark.timeout(
-    10000
-)  # TODO: this value is just "a really high number"; have this per instance type
-def test_pageserver_max_throughput_getpage_at_latest_lsn(
+# sudo du -hs /instance_store/neon/test_output/shared-snapshots/*
+# 416G    /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-500-13
+@pytest.mark.parametrize("duration", [60 * 60])
+@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)])
+@pytest.mark.parametrize("n_tenants", [500])
+@pytest.mark.timeout(10000)
+@pytest.mark.skipif(
+    os.getenv("CI", "false") == "true",
+    reason="This test needs a lot of resources and should run on dedicated HW, not in github action runners as part of CI",
+)
+def test_pageserver_characterize_throughput_with_n_tenants(
     neon_env_builder: NeonEnvBuilder,
     zenbenchmark: NeonBenchmarker,
     pg_bin: PgBin,
     n_tenants: int,
     pgbench_scale: int,
     duration: int,
+):
+    setup_and_run_pagebench_benchmark(
+        neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, 1
+    )
+
+
+# For reference, the space usage of the snapshots:
+# sudo du -hs /instance_store/neon/test_output/shared-snapshots/*
+# 19G     /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-1-136
+@pytest.mark.parametrize("duration", [20 * 60])
+@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(2048)])
+# we use 1 client to characterize latencies, and 64 clients to characterize throughput/scalability
+# we use 64 clients because typically for a high number of connections we recommend the connection pooler,
+# which by default uses 64 connections
+@pytest.mark.parametrize("n_clients", [1, 64])
+@pytest.mark.parametrize("n_tenants", [1])
+@pytest.mark.timeout(2400)
+@pytest.mark.skipif(
+    os.getenv("CI", "false") == "true",
+    reason="This test needs a lot of resources
and should run on dedicated HW, not in github action runners as part of CI", +) +def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + n_tenants: int, + pgbench_scale: int, + duration: int, + n_clients: int, +): + setup_and_run_pagebench_benchmark( + neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, n_clients + ) + + +def setup_and_run_pagebench_benchmark( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + n_tenants: int, + pgbench_scale: int, + duration: int, + n_clients: int, ): def record(metric, **kwargs): zenbenchmark.record( @@ -54,6 +100,7 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn( "n_tenants": (n_tenants, {"unit": ""}), "pgbench_scale": (pgbench_scale, {"unit": ""}), "duration": (duration, {"unit": "s"}), + "n_clients": (n_clients, {"unit": ""}), } ) @@ -75,12 +122,85 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn( for param, (value, kwargs) in params.items(): record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs) - env = setup_pageserver_with_pgbench_tenants(neon_env_builder, pg_bin, n_tenants, pgbench_scale) - run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration) + + def setup_wrapper(env: NeonEnv): + return setup_tenant_template(env, pg_bin, pgbench_scale) + + env = setup_pageserver_with_tenants( + neon_env_builder, + f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}", + n_tenants, + setup_wrapper, + # https://github.com/neondatabase/neon/issues/8070 + timeout_in_seconds=60, + ) + + env.pageserver.allowed_errors.append( + # https://github.com/neondatabase/neon/issues/6925 + # https://github.com/neondatabase/neon/issues/6390 + # https://github.com/neondatabase/neon/issues/6724 + r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" + ) + + run_pagebench_benchmark(env, pg_bin, record, duration, n_clients) -def run_benchmark_max_throughput_latest_lsn( - env: NeonEnv, pg_bin: PgBin, record, duration_secs: int +def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): + """ + Set up a template tenant which will be replicated by the test infra. + It's a pgbench tenant, initialized to a certain scale, and treated afterwards + with a repeat application of (pgbench simple-update workload, checkpoint, compact). + """ + # use a config that makes production of on-disk state timing-insensitive + # as we ingest data into the tenant. 
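+    # (periodic gc and compaction stay disabled; we invoke timeline_checkpoint and
+    # timeline_compact explicitly after each pgbench step below)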
+ config = { + "gc_period": "0s", # disable periodic gc + "checkpoint_timeout": "10 years", + "compaction_period": "0s", # disable periodic compaction + "compaction_threshold": 10, + "compaction_target_size": 134217728, + "checkpoint_distance": 268435456, + "image_creation_threshold": 3, + } + template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) + env.pageserver.tenant_detach(template_tenant) + env.pageserver.tenant_attach(template_tenant, config) + ps_http = env.pageserver.http_client() + with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: + pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()]) + wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) + ps_http.timeline_checkpoint(template_tenant, template_timeline) + ps_http.timeline_compact(template_tenant, template_timeline) + for _ in range( + 0, 17 + ): # some prime number to avoid potential resonances with the "_threshold" variables from the config + # the L0s produced by this appear to have size ~5MiB + num_txns = 10_000 + pg_bin.run_capture( + ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()] + ) + wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) + ps_http.timeline_checkpoint(template_tenant, template_timeline) + ps_http.timeline_compact(template_tenant, template_timeline) + # for reference, the output at scale=6 looked like so (306M total) + # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59 + # total 306M + # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829 + # 4.5M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919 + # 33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71 + # 36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791 + # 16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1 + # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9 + # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639 + # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799 + # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19 + # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021 + + return (template_tenant, template_timeline, config) + + +def run_pagebench_benchmark( + env: NeonEnv, pg_bin: PgBin, record, duration_secs: int, n_clients: int ): """ Benchmark `env.pageserver` for max throughput @ latest LSN and record results in `zenbenchmark`. 
@@ -94,6 +214,8 @@ def run_benchmark_max_throughput_latest_lsn( ps_http.base_url, "--page-service-connstring", env.pageserver.connstr(password=None), + "--num-clients", + str(n_clients), "--runtime", f"{duration_secs}s", # don't specify the targets explicitly, let pagebench auto-discover them @@ -133,78 +255,3 @@ def run_benchmark_max_throughput_latest_lsn( unit="ms", report=MetricReport.LOWER_IS_BETTER, ) - - -def setup_pageserver_with_pgbench_tenants( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, - n_tenants: int, - scale: int, -) -> NeonEnv: - """ - Utility function to set up a pageserver with a given number of identical tenants. - Each tenant is a pgbench tenant, initialize to a certain scale, and treated afterwards - with a repeat application of (pgbench simple-update workload, checkpoint, compact). - """ - - def setup_template(env: NeonEnv): - # use a config that makes production of on-disk state timing-insensitive - # as we ingest data into the tenant. - config = { - "gc_period": "0s", # disable periodic gc - "checkpoint_timeout": "10 years", - "compaction_period": "0s", # disable periodic compaction - "compaction_threshold": 10, - "compaction_target_size": 134217728, - "checkpoint_distance": 268435456, - "image_creation_threshold": 3, - } - template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) - env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely - ".*Dropped remote consistent LSN updates.*", - ) - env.pageserver.tenant_attach(template_tenant, config) - ps_http = env.pageserver.http_client() - with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: - pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()]) - wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) - ps_http.timeline_checkpoint(template_tenant, template_timeline) - ps_http.timeline_compact(template_tenant, template_timeline) - for _ in range( - 0, 17 - ): # some prime number to avoid potential resonances with the "_threshold" variables from the config - # the L0s produced by this appear to have size ~5MiB - num_txns = 10_000 - pg_bin.run_capture( - ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()] - ) - wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) - ps_http.timeline_checkpoint(template_tenant, template_timeline) - ps_http.timeline_compact(template_tenant, template_timeline) - # for reference, the output at scale=6 looked like so (306M total) - # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59 - # total 306M - # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829 - # 4.5M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919 - # 33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71 - # 36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791 - # 16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1 - # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9 - # 6.0M 
000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639 - # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799 - # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19 - # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021 - - return (template_tenant, template_timeline, config) - - def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv: - return many_tenants.single_timeline(neon_env_builder, setup_template, n_tenants) - - env = neon_env_builder.build_and_use_snapshot( - f"max_throughput_latest_lsn-{n_tenants}-{scale}", doit - ) - env.start() - ensure_pageserver_ready_for_benchmarking(env, n_tenants) - return env diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index 45eb652362..88296a7fbd 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -2,8 +2,15 @@ Utilities used by all code in this sub-directory """ +from typing import Any, Callable, Dict, Optional, Tuple + +import fixtures.pageserver.many_tenants as many_tenants +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, +) from fixtures.pageserver.utils import wait_until_all_tenants_state @@ -15,7 +22,7 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): log.info("wait for all tenants to become active") wait_until_all_tenants_state( - ps_http, "Active", iterations=n_tenants, period=1, http_error_ok=False + ps_http, "Active", iterations=10 + n_tenants, period=1, http_error_ok=False ) # ensure all layers are resident for predictiable performance @@ -27,3 +34,23 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): assert not layer.remote log.info("ready") + + +def setup_pageserver_with_tenants( + neon_env_builder: NeonEnvBuilder, + name: str, + n_tenants: int, + setup: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]], + timeout_in_seconds: Optional[int] = None, +) -> NeonEnv: + """ + Utility function to set up a pageserver with a given number of identical tenants. 
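+    The `setup` callable builds the template tenant/timeline once; the resulting
+    pageserver state is snapshotted under `name` and reused by subsequent runs.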
+    """
+
+    def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
+        return many_tenants.single_timeline(neon_env_builder, setup, n_tenants)
+
+    env = neon_env_builder.build_and_use_snapshot(name, doit)
+    env.start(timeout_in_seconds=timeout_in_seconds)
+    ensure_pageserver_ready_for_benchmarking(env, n_tenants)
+    return env
diff --git a/test_runner/performance/pgvector/HNSW_build.sql b/test_runner/performance/pgvector/HNSW_build.sql
new file mode 100644
index 0000000000..9e6918b755
--- /dev/null
+++ b/test_runner/performance/pgvector/HNSW_build.sql
@@ -0,0 +1,47 @@
+
+\set ECHO queries
+\timing
+
+-- prepare test table
+DROP TABLE IF EXISTS hnsw_test_table;
+CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA;
+INSERT INTO hnsw_test_table SELECT * FROM documents;
+CREATE INDEX ON hnsw_test_table (_id); -- needed later for random tuple queries
+-- tune index build params
+SET max_parallel_maintenance_workers = 7;
+SET maintenance_work_mem = '8GB';
+-- create HNSW index for the supported distance metrics
+CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_cosine_ops);
+CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_ip_ops);
+CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops);
+CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops);
+CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops);
+-- note: in a second psql session we can monitor the progress of the index build phases using
+-- the following query:
+-- SELECT phase, round(100.0 * blocks_done / nullif(blocks_total, 0), 1) AS "%" FROM pg_stat_progress_create_index;
+
+-- show all indexes built on the table
+SELECT
+    idx.relname AS index_name,
+    tbl.relname AS table_name,
+    am.amname AS access_method,
+    a.attname AS column_name,
+    opc.opcname AS operator_class
+FROM
+    pg_index i
+JOIN
+    pg_class idx ON idx.oid = i.indexrelid
+JOIN
+    pg_class tbl ON tbl.oid = i.indrelid
+JOIN
+    pg_am am ON am.oid = idx.relam
+JOIN
+    pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey)
+JOIN
+    pg_opclass opc ON opc.oid = i.indclass[0]
+WHERE
+    tbl.relname = 'hnsw_test_table'
+    AND a.attname = 'embeddings';
+
+-- show table sizes
+\dt+
diff --git a/test_runner/performance/pgvector/IVFFLAT_build.sql b/test_runner/performance/pgvector/IVFFLAT_build.sql
new file mode 100644
index 0000000000..338980831a
--- /dev/null
+++ b/test_runner/performance/pgvector/IVFFLAT_build.sql
@@ -0,0 +1,52 @@
+
+\set ECHO queries
+\timing
+
+-- prepare test table
+DROP TABLE IF EXISTS ivfflat_test_table;
+CREATE TABLE ivfflat_test_table AS TABLE documents WITH NO DATA;
+INSERT INTO ivfflat_test_table SELECT * FROM documents;
+CREATE INDEX ON ivfflat_test_table (_id); -- needed later for random tuple queries
+-- tune index build params
+SET max_parallel_maintenance_workers = 7;
+SET maintenance_work_mem = '8GB';
+-- create ivfflat index for the supported distance metrics
+-- the formula for lists is (# rows) / 1000, or sqrt(# rows) if # rows > 1 million
+-- we have 1 million embeddings of vector size 1536 in column embeddings of table documents
+-- so we use 1000 lists
+CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_l2_ops) WITH (lists = 1000);
+CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_ip_ops) WITH (lists = 1000);
+CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_cosine_ops) WITH (lists = 1000);
+CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings::halfvec(1536) halfvec_l2_ops) WITH (lists = 1000);
+CREATE INDEX ON ivfflat_test_table
+    USING ivfflat ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops) WITH (lists = 1000);
+
+\d ivfflat_test_table
+
+
+-- show all indexes built on the table
+SELECT
+    idx.relname AS index_name,
+    tbl.relname AS table_name,
+    am.amname AS access_method,
+    a.attname AS column_name,
+    opc.opcname AS operator_class
+FROM
+    pg_index i
+JOIN
+    pg_class idx ON idx.oid = i.indexrelid
+JOIN
+    pg_class tbl ON tbl.oid = i.indrelid
+JOIN
+    pg_am am ON am.oid = idx.relam
+JOIN
+    pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey)
+JOIN
+    pg_opclass opc ON opc.oid = i.indclass[0]
+WHERE
+    tbl.relname = 'ivfflat_test_table'
+    AND a.attname = 'embeddings';
+-- show table sizes
+\dt+
+
+
diff --git a/test_runner/performance/pgvector/README.md b/test_runner/performance/pgvector/README.md
new file mode 100644
index 0000000000..83495d270a
--- /dev/null
+++ b/test_runner/performance/pgvector/README.md
@@ -0,0 +1,55 @@
+# Source of the dataset for pgvector tests
+
+This readme was copied from https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-1M
+
+## Download the parquet files
+
+```bash
+brew install git-lfs
+git-lfs clone https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-1M
+```
+
+## Load into postgres:
+
+see loaddata.py in this directory
+
+## Rest of dataset card as on huggingface
+
+---
+dataset_info:
+  features:
+  - name: _id
+    dtype: string
+  - name: title
+    dtype: string
+  - name: text
+    dtype: string
+  - name: text-embedding-3-large-1536-embedding
+    sequence: float64
+  splits:
+  - name: train
+    num_bytes: 12679725776
+    num_examples: 1000000
+  download_size: 9551862565
+  dataset_size: 12679725776
+configs:
+- config_name: default
+  data_files:
+  - split: train
+    path: data/train-*
+license: mit
+task_categories:
+- feature-extraction
+language:
+- en
+size_categories:
+- 1M<n<10M
+---
diff --git a/test_runner/performance/pgvector/loaddata.py b/test_runner/performance/pgvector/loaddata.py
new file mode 100644
--- /dev/null
+++ b/test_runner/performance/pgvector/loaddata.py
+import sys
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import psycopg2
+from pgvector.psycopg2 import register_vector
+from psycopg2.extras import execute_values
+
+
+def print_usage():
+    print("Usage: loaddata.py <connection string> <directory with parquet files>")
+
+
+def main(conn_str, directory_path):
+    # Connection to PostgreSQL
+    with psycopg2.connect(conn_str) as conn:
+        with conn.cursor() as cursor:
+            # Run SQL statements
+            cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")
+            register_vector(conn)
+            cursor.execute("DROP TABLE IF EXISTS documents;")
+            cursor.execute(
+                """
+                CREATE TABLE documents (
+                    _id TEXT PRIMARY KEY,
+                    title TEXT,
+                    text TEXT,
+                    embeddings vector(1536) -- text-embedding-3-large-1536-embedding (OpenAI)
+                );
+                """
+            )
+            conn.commit()
+
+            # List and sort Parquet files
+            parquet_files = sorted(Path(directory_path).glob("*.parquet"))
+
+            for file in parquet_files:
+                print(f"Loading {file} into PostgreSQL")
+                df = pd.read_parquet(file)
+
+                print(df.head())
+
+                data_list = [
+                    (
+                        row["_id"],
+                        row["title"],
+                        row["text"],
+                        np.array(row["text-embedding-3-large-1536-embedding"]),
+                    )
+                    for index, row in df.iterrows()
+                ]
+                # Use execute_values to perform batch insertion
+                execute_values(
+                    cursor,
+                    "INSERT INTO documents (_id, title, text, embeddings) VALUES %s",
+                    data_list,
+                )
+                # Commit after we insert all embeddings
+                conn.commit()
+
+                print(f"Loaded {file} into PostgreSQL")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print_usage()
+        sys.exit(1)
+
+    conn_str = sys.argv[1]
+    directory_path = sys.argv[2]
+    main(conn_str, directory_path)
diff --git a/test_runner/performance/pgvector/pgbench_custom_script_pgvector_halfvec_queries.sql b/test_runner/performance/pgvector/pgbench_custom_script_pgvector_halfvec_queries.sql
new file mode 100644
index 0000000000..70d0c18149
--- /dev/null
+++ b/test_runner/performance/pgvector/pgbench_custom_script_pgvector_halfvec_queries.sql
@@ -0,0 +1,13 @@
+-- run with pooled connection
+-- pgbench -T 300 -c 100 -j20 -f pgbench_custom_script_pgvector_halfvec_queries.sql "postgresql://neondb_owner:@ep-floral-thunder-w1gzhaxi-pooler.eu-west-1.aws.neon.build/neondb?sslmode=require"
+
+with x (x) as (
+    select "embeddings" as x
+    from halfvec_test_table
+    TABLESAMPLE SYSTEM (1)
+    LIMIT 1
+)
+SELECT title, "embeddings" <=> (select x from x) as distance
+FROM halfvec_test_table
+ORDER BY 2
+LIMIT 30;
\ No newline at end of file
diff --git a/test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql b/test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql
new file mode 100644
index 0000000000..886ae9645b
--- /dev/null
+++ b/test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql
@@ -0,0 +1,10 @@
+with x (x) as (
+    select "embeddings" as x
+    from hnsw_test_table
+    TABLESAMPLE SYSTEM (1)
+    LIMIT 1
+)
+SELECT title, "embeddings" <=> (select x from x) as distance
+FROM hnsw_test_table
+ORDER BY 2
+LIMIT 30;
diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py
index 6edcb8f1f2..b3866f1813 100644
--- a/test_runner/performance/test_branch_creation.py
+++ b/test_runner/performance/test_branch_creation.py
@@ -1,4 +1,5 @@
 import random
+import re
 import statistics
 import threading
 import time
@@ -7,11 +8,14 @@ from contextlib import closing
 from typing import List

 import pytest
-from fixtures.benchmark_fixture import MetricReport
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.common_types import Lsn
 from fixtures.compare_fixtures import NeonCompare
 from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonPageserver
 from fixtures.pageserver.utils import wait_for_last_record_lsn
-from fixtures.types import Lsn
+from fixtures.utils import wait_until
+from prometheus_client.samples import Sample


 def _record_branch_creation_durations(neon_compare: NeonCompare, durs: List[float]):
@@ -74,7 +78,7 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int)
         p = random.randint(0, i)

         timer = timeit.default_timer()
-        env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p), tenant_id=tenant)
+        env.neon_cli.create_branch(f"b{i + 1}", f"b{p}", tenant_id=tenant)
         dur = timeit.default_timer() - timer

         log.info(f"Creating branch b{i+1} took {dur}s")
@@ -89,11 +93,17 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int)
     _record_branch_creation_durations(neon_compare, branch_creation_durations)


-@pytest.mark.parametrize("n_branches", [1024])
-# Test measures the latency of branch creation when creating a lot of branches.
-def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int):
+@pytest.mark.parametrize("n_branches", [500, 1024])
+@pytest.mark.parametrize("shape", ["one_ancestor", "random"])
+def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: str):
+    """
+    Test measures the latency of branch creation when creating a lot of branches.
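+    With shape="random", each new branch forks from a randomly chosen existing
+    branch; with shape="one_ancestor", every branch forks directly from b0.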
+ """ env = neon_compare.env + # seed the prng so we will measure the same structure every time + rng = random.Random("2024-02-29") + env.neon_cli.create_branch("b0") endpoint = env.endpoints.create_start("b0") @@ -102,15 +112,105 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): branch_creation_durations = [] for i in range(n_branches): - # random a source branch - p = random.randint(0, i) + if shape == "random": + parent = f"b{rng.randint(0, i)}" + elif shape == "one_ancestor": + parent = "b0" + else: + raise RuntimeError(f"unimplemented shape: {shape}") + timer = timeit.default_timer() - env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p)) + # each of these uploads to remote storage before completion + env.neon_cli.create_branch(f"b{i + 1}", parent) dur = timeit.default_timer() - timer branch_creation_durations.append(dur) _record_branch_creation_durations(neon_compare, branch_creation_durations) + endpoint.stop_and_destroy() + + with neon_compare.record_duration("shutdown"): + # this sleeps 100ms between polls + env.pageserver.stop() + + startup_line = "INFO version: git(-env)?:" + + # find the first line of the log file so we can find the next start later + _, first_start = wait_until(5, 1, lambda: env.pageserver.assert_log_contains(startup_line)) + + # start without gc so we can time compaction with less noise; use shorter + # period for compaction so it starts earlier + def patch_default_tenant_config(config): + tenant_config = config.get("tenant_config", {}) + tenant_config["compaction_period"] = "3s" + tenant_config["gc_period"] = "0s" + config["tenant_config"] = tenant_config + + env.pageserver.edit_config_toml(patch_default_tenant_config) + env.pageserver.start( + # this does print more than we want, but the number should be comparable between runs + extra_env_vars={ + "RUST_LOG": f"[compaction_loop{{tenant_id={env.initial_tenant}}}]=debug,info" + }, + ) + + _, second_start = wait_until( + 5, 1, lambda: env.pageserver.assert_log_contains(startup_line, first_start) + ) + env.pageserver.quiesce_tenants() + + wait_and_record_startup_metrics(env.pageserver, neon_compare.zenbenchmark, "restart_after") + + # wait for compaction to complete, which most likely has already done so multiple times + msg, _ = wait_until( + 30, + 1, + lambda: env.pageserver.assert_log_contains( + f".*tenant_id={env.initial_tenant}.*: compaction iteration complete.*", second_start + ), + ) + needle = re.search(" elapsed_ms=([0-9]+)", msg) + assert needle is not None, "failed to find the elapsed time" + duration = int(needle.group(1)) / 1000.0 + neon_compare.zenbenchmark.record("compaction", duration, "s", MetricReport.LOWER_IS_BETTER) + + +def wait_and_record_startup_metrics( + pageserver: NeonPageserver, target: NeonBenchmarker, prefix: str +): + """ + Waits until all startup metrics have non-zero values on the pageserver, then records them on the target + """ + + client = pageserver.http_client() + + expected_labels = set( + [ + "background_jobs_can_start", + "complete", + "initial", + "initial_tenant_load", + "initial_tenant_load_remote", + ] + ) + + def metrics_are_filled() -> List[Sample]: + m = client.get_metrics() + samples = m.query_all("pageserver_startup_duration_seconds") + # we should not have duplicate labels + matching = [ + x for x in samples if x.labels.get("phase") in expected_labels and x.value > 0.0 + ] + assert len(matching) == len(expected_labels) + return matching + + samples = wait_until(10, 1, metrics_are_filled) + + for sample in samples: + 
phase = sample.labels["phase"] + name = f"{prefix}.{phase}" + target.record(name, sample.value, "s", MetricReport.LOWER_IS_BETTER) + # Test measures the branch creation time when branching from a timeline with a lot of relations. # diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 72173dc2a7..69df7974b9 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -1,10 +1,10 @@ from contextlib import closing from fixtures.benchmark_fixture import MetricReport +from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare, PgCompare -from fixtures.pageserver.utils import wait_tenant_status_404 +from fixtures.log_helper import log from fixtures.pg_version import PgVersion -from fixtures.types import Lsn # @@ -29,8 +29,8 @@ def test_bulk_insert(neon_with_baseline: PgCompare): # Run INSERT, recording the time and I/O it takes with env.record_pageserver_writes("pageserver_writes"): with env.record_duration("insert"): - cur.execute("insert into huge values (generate_series(1, 5000000), 0);") - env.flush() + cur.execute("insert into huge values (generate_series(1, 20000000), 0);") + env.flush(compact=False, gc=False) env.report_peak_memory_use() env.report_size() @@ -48,6 +48,9 @@ def test_bulk_insert(neon_with_baseline: PgCompare): if isinstance(env, NeonCompare): measure_recovery_time(env) + with env.record_duration("compaction"): + env.compact() + def measure_recovery_time(env: NeonCompare): client = env.env.pageserver.http_client() @@ -56,22 +59,23 @@ def measure_recovery_time(env: NeonCompare): # Delete the Tenant in the pageserver: this will drop local and remote layers, such that # when we "create" the Tenant again, we will replay the WAL from the beginning. # - # This is a "weird" thing to do, and can confuse the attachment service as we're re-using + # This is a "weird" thing to do, and can confuse the storage controller as we're re-using # the same tenant ID for a tenant that is logically different from the pageserver's point # of view, but the same as far as the safekeeper/WAL is concerned. To work around that, # we will explicitly create the tenant in the same generation that it was previously # attached in. 
-    attach_status = env.env.attachment_service.inspect(tenant_shard_id=env.tenant)
+    attach_status = env.env.storage_controller.inspect(tenant_shard_id=env.tenant)
     assert attach_status is not None
     (attach_gen, _) = attach_status

     client.tenant_delete(env.tenant)
-    wait_tenant_status_404(client, env.tenant, iterations=60, interval=0.5)
     env.env.pageserver.tenant_create(tenant_id=env.tenant, generation=attach_gen)

     # Measure recovery time
     with env.record_duration("wal_recovery"):
+        log.info("Entering recovery...")
         client.timeline_create(pg_version, env.tenant, env.timeline)

         # Flush, which will also wait for lsn to catch up
-        env.flush()
+        env.flush(compact=False, gc=False)
+        log.info("Finished recovery.")
diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py
index 326c4f5c6f..3c6f0b0131 100644
--- a/test_runner/performance/test_compaction.py
+++ b/test_runner/performance/test_compaction.py
@@ -2,6 +2,7 @@ from contextlib import closing

 import pytest
 from fixtures.compare_fixtures import NeonCompare
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import wait_for_last_flush_lsn

@@ -56,3 +57,98 @@ def test_compaction(neon_compare: NeonCompare):
         pageserver_http.timeline_compact(tenant_id, timeline_id)

     neon_compare.report_size()
+
+
+def test_compaction_l0_memory(neon_compare: NeonCompare):
+    """
+    Generate a large stack of L0s pending compaction into L1s, and
+    measure the pageserver's peak RSS while doing so.
+    """
+
+    env = neon_compare.env
+    pageserver_http = env.pageserver.http_client()
+
+    tenant_id, timeline_id = env.neon_cli.create_tenant(
+        conf={
+            # Initially disable compaction so that we will build up a stack of L0s
+            "compaction_period": "0s",
+            "gc_period": "0s",
+        }
+    )
+    neon_compare.tenant = tenant_id
+    neon_compare.timeline = timeline_id
+
+    endpoint = env.endpoints.create_start(
+        "main", tenant_id=tenant_id, config_lines=["shared_buffers=512MB"]
+    )
+
+    # Read tenant effective config and assert on checkpoint_distance and compaction_threshold,
+    # as we do want to test with the defaults (to be the same as in the field), but this test's
+    # workload size makes assumptions about them.
+    #
+    # If these assertions fail, it probably means we changed the default.
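+    # (defaults assumed below: checkpoint_distance = 256 MiB, compaction_threshold = 10)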
+ tenant_conf = pageserver_http.tenant_config(tenant_id) + assert tenant_conf.effective_config["checkpoint_distance"] == 256 * 1024 * 1024 + assert tenant_conf.effective_config["compaction_threshold"] == 10 + + # Aim to write about 20 L0s, so that we will hit the limit on how many + # to compact at once + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + for i in range(200): + cur.execute(f"create table tbl{i} (i int, j int);") + cur.execute(f"insert into tbl{i} values (generate_series(1, 1000), 0);") + for j in range(100): + cur.execute(f"update tbl{i} set j = {j};") + + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + endpoint.stop() + + # Check we have generated the L0 stack we expected + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + initial_l0s = len(layers.delta_l0_layers()) + initial_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers()) + log.info(f"l0s before compaction {initial_l0s} ({initial_l0s_size})") + + def rss_hwm(): + v = pageserver_http.get_metric_value("libmetrics_maxrss_kb") + assert v is not None + assert v > 0 + return v * 1024 + + before = rss_hwm() + pageserver_http.timeline_compact(tenant_id, timeline_id) + after = rss_hwm() + + log.info(f"RSS across compaction: {before} -> {after} (grew {after - before})") + + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + final_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers()) + log.info(f"l0s after compaction {len(layers.delta_l0_layers())} ({final_l0s_size})") + + assert after > before # If we didn't use some memory the test is probably buggy + compaction_mapped_rss = after - before + + # During L0 compaction, we require as much memory as the physical size of what we compacted, and then some, + # because the key->value mapping in L0s compaction is exhaustive, non-streaming, and does not de-duplicate + # repeated references to the same key. + # + # To be fixed in https://github.com/neondatabase/neon/issues/8184, after which + # this memory estimate can be revised far downwards to something that doesn't scale + # linearly with the layer sizes. 
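+    # i.e. we estimate peak extra RSS at 1.5x the number of L0 bytes compacted away in this pass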
+ MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.5 + + # If we find that compaction is using more memory, this may indicate a regression + assert compaction_mapped_rss < MEMORY_ESTIMATE + + # If we find that compaction is using <0.5 the expected memory then: + # - maybe we made a big efficiency improvement, in which case update the test + # - maybe something is functionally wrong with the test and it's not driving the system as expected + assert compaction_mapped_rss > MEMORY_ESTIMATE / 2 + + # We should have compacted some but not all of the l0s, based on the limit on how much + # l0 to compact in one go + assert len(layers.delta_l0_layers()) > 0 + assert len(layers.delta_l0_layers()) < initial_l0s + + # The pageserver should have logged when it hit the compaction size limit + env.pageserver.assert_log_contains(".*hit max delta layer size limit.*") diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index cf9e4808fc..9861259c16 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -1,19 +1,13 @@ +import json + import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -@pytest.mark.timeout(10000) -def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): - """ - Test that GC is able to collect all old layers even if them are forming - "stairs" and there are not three delta layers since last image layer. - - Information about image layers needed to collect old layers should - be propagated by GC to compaction task which should take in in account - when make a decision which new image layers needs to be created. 
- """ +def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, mode: str): + assert mode == "normal" or mode == "with_snapshots" env = neon_env_builder.init_start() client = env.pageserver.http_client() @@ -26,7 +20,7 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma "checkpoint_distance": f"{1024 ** 2}", "compaction_target_size": f"{1024 ** 2}", # set PITR interval to be small, so we can do GC - "pitr_interval": "10 s", + "pitr_interval": "60 s", # "compaction_threshold": "3", # "image_creation_threshold": "2", } @@ -67,6 +61,17 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"] log.info(f"Physical storage size {physical_size}") + if mode == "with_snapshots": + if step == n_steps / 2: + env.neon_cli.create_branch("child") + + max_num_of_deltas_above_image = 0 + max_total_num_of_deltas = 0 + for key_range in client.perf_info(tenant_id, timeline_id): + max_total_num_of_deltas = max(max_total_num_of_deltas, key_range["total_num_of_deltas"]) + max_num_of_deltas_above_image = max( + max_num_of_deltas_above_image, key_range["num_of_deltas_above_image"] + ) MB = 1024 * 1024 zenbenchmark.record("logical_size", logical_size // MB, "Mb", MetricReport.LOWER_IS_BETTER) @@ -74,3 +79,97 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma zenbenchmark.record( "physical/logical ratio", physical_size / logical_size, "", MetricReport.LOWER_IS_BETTER ) + zenbenchmark.record( + "max_total_num_of_deltas", max_total_num_of_deltas, "", MetricReport.LOWER_IS_BETTER + ) + zenbenchmark.record( + "max_num_of_deltas_above_image", + max_num_of_deltas_above_image, + "", + MetricReport.LOWER_IS_BETTER, + ) + + client.timeline_compact(tenant_id, timeline_id, enhanced_gc_bottom_most_compaction=True) + tline_detail = client.timeline_detail(tenant_id, timeline_id) + logical_size = tline_detail["current_logical_size"] + physical_size = tline_detail["current_physical_size"] + + max_num_of_deltas_above_image = 0 + max_total_num_of_deltas = 0 + for key_range in client.perf_info(tenant_id, timeline_id): + max_total_num_of_deltas = max(max_total_num_of_deltas, key_range["total_num_of_deltas"]) + max_num_of_deltas_above_image = max( + max_num_of_deltas_above_image, key_range["num_of_deltas_above_image"] + ) + zenbenchmark.record( + "logical_size_after_bottom_most_compaction", + logical_size // MB, + "Mb", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + "physical_size_after_bottom_most_compaction", + physical_size // MB, + "Mb", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + "physical/logical ratio after bottom_most_compaction", + physical_size / logical_size, + "", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + "max_total_num_of_deltas_after_bottom_most_compaction", + max_total_num_of_deltas, + "", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + "max_num_of_deltas_above_image_after_bottom_most_compaction", + max_num_of_deltas_above_image, + "", + MetricReport.LOWER_IS_BETTER, + ) + + with endpoint.cursor() as cur: + cur.execute("SELECT * FROM t") # ensure data is not corrupted + + layer_map_path = env.repo_dir / "layer-map.json" + log.info(f"Writing layer map to {layer_map_path}") + with layer_map_path.open("w") as f: + f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id))) + + +@pytest.mark.timeout(10000) +def 
test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): + """ + Test that GC is able to collect all old layers even if them are forming + "stairs" and there are not three delta layers since last image layer. + + Information about image layers needed to collect old layers should + be propagated by GC to compaction task which should take in in account + when make a decision which new image layers needs to be created. + + NB: this test demonstrates the problem. The source tree contained the + `gc_feedback` mechanism for about 9 months, but, there were problems + with it and it wasn't enabled at runtime. + This PR removed the code: https://github.com/neondatabase/neon/pull/6863 + + And the bottom-most GC-compaction epic resolves the problem. + https://github.com/neondatabase/neon/issues/8002 + """ + gc_feedback_impl(neon_env_builder, zenbenchmark, "normal") + + +@pytest.mark.timeout(10000) +def test_gc_feedback_with_snapshots( + neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker +): + """ + Compared with `test_gc_feedback`, we create a branch without written data (=snapshot) in the middle + of the benchmark, and the bottom-most compaction should collect as much garbage as possible below the GC + horizon. Ideally, there should be images (in an image layer) covering the full range at the branch point, + and images covering the full key range (in a delta layer) at the GC horizon. + """ + gc_feedback_impl(neon_env_builder, zenbenchmark, "with_snapshots") diff --git a/test_runner/performance/test_hot_page.py b/test_runner/performance/test_hot_page.py index d9785dd87e..5e97c7cddf 100644 --- a/test_runner/performance/test_hot_page.py +++ b/test_runner/performance/test_hot_page.py @@ -16,20 +16,34 @@ from pytest_lazyfixture import lazy_fixture ) def test_hot_page(env: PgCompare): # Update the same page many times, then measure read performance - num_writes = 1000000 with closing(env.pg.connect()) as conn: with conn.cursor() as cur: cur.execute("drop table if exists t, f;") + num_writes = 1000000 - # Write many updates to the same row + # Use a PL/pgSQL block to perform many updates to the same row + # without depending on the latency between database client and postgres + # server + # - however a single staement should not run into a timeout so we increase it + cur.execute("SET statement_timeout = '4h';") with env.record_duration("write"): - cur.execute("create table t (i integer);") - cur.execute("insert into t values (0);") - for i in range(num_writes): - cur.execute(f"update t set i = {i};") + cur.execute( + f""" + DO $$ + BEGIN + create table t (i integer); + insert into t values (0); - # Write 3-4 MB to evict t from compute cache + FOR j IN 1..{num_writes} LOOP + update t set i = j; + END LOOP; + END $$; + """ + ) + + # Write ca 350 MB to evict t from compute shared buffers (128 MB) + # however it will still be in LFC, so I do not really understand the point of this test cur.execute("create table f (i integer);") cur.execute("insert into f values (generate_series(1,100000));") diff --git a/test_runner/performance/test_hot_table.py b/test_runner/performance/test_hot_table.py index 5fcffc8afb..9a78c92ec0 100644 --- a/test_runner/performance/test_hot_table.py +++ b/test_runner/performance/test_hot_table.py @@ -16,8 +16,8 @@ from pytest_lazyfixture import lazy_fixture ) def test_hot_table(env: PgCompare): # Update a small table many times, then measure read performance - num_rows = 100000 # Slightly larger than shared buffers size TODO validate - num_writes = 
1000000 + num_rows = 100000 # initial table size only about 4 MB + num_writes = 10000000 # write approximately 349 MB blocks > 128 MB shared_buffers num_reads = 10 with closing(env.pg.connect()) as conn: @@ -28,8 +28,21 @@ def test_hot_table(env: PgCompare): with env.record_duration("write"): cur.execute("create table t (i integer primary key);") cur.execute(f"insert into t values (generate_series(1,{num_rows}));") - for i in range(num_writes): - cur.execute(f"update t set i = {i + num_rows} WHERE i = {i};") + # PL/pgSQL block to perform updates (and avoid latency between client and server) + # - however a single staement should not run into a timeout so we increase it + cur.execute("SET statement_timeout = '4h';") + cur.execute( + f""" + DO $$ + DECLARE + r integer := {num_rows}; + BEGIN + FOR j IN 1..{num_writes} LOOP + UPDATE t SET i = j + r WHERE i = j; + END LOOP; + END $$; + """ + ) # Read the table with env.record_duration("read"): diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 6bd0d85fa2..bc6d9de346 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -1,31 +1,31 @@ import time -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver -# -# Benchmark searching the layer map, when there are a lot of small layer files. -# def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): - env = neon_env_builder.init_start() + """Benchmark searching the layer map, when there are a lot of small layer files.""" + + env = neon_env_builder.init_configs() n_iters = 10 n_records = 100000 + env.start() + # We want to have a lot of lot of layer files to exercise the layer map. Disable # GC, and make checkpoint_distance very small, so that we get a lot of small layer # files. - tenant, _ = env.neon_cli.create_tenant( + tenant, timeline = env.neon_cli.create_tenant( conf={ "gc_period": "0s", - "checkpoint_distance": "8192", + "checkpoint_distance": "16384", "compaction_period": "1 s", "compaction_threshold": "1", - "compaction_target_size": "8192", + "compaction_target_size": "16384", } ) - env.neon_cli.create_timeline("test_layer_map", tenant_id=tenant) - endpoint = env.endpoints.create_start("test_layer_map", tenant_id=tenant) + endpoint = env.endpoints.create_start("main", tenant_id=tenant) cur = endpoint.connect().cursor() cur.execute("create table t(x integer)") for _ in range(n_iters): @@ -33,6 +33,12 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): time.sleep(1) cur.execute("vacuum t") + with zenbenchmark.record_duration("test_query"): cur.execute("SELECT count(*) from t") assert cur.fetchone() == (n_iters * n_records,) + + flush_ep_to_pageserver(env, endpoint, tenant, timeline) + env.pageserver.http_client().timeline_checkpoint( + tenant, timeline, compact=False, wait_until_uploaded=True + ) diff --git a/test_runner/performance/test_lazy_startup.py b/test_runner/performance/test_lazy_startup.py new file mode 100644 index 0000000000..e929bd4d05 --- /dev/null +++ b/test_runner/performance/test_lazy_startup.py @@ -0,0 +1,106 @@ +import pytest +import requests +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.neon_fixtures import NeonEnvBuilder + + +# Start and measure duration with huge SLRU segments. +# This test is similar to test_startup_simple, but it creates huge number of transactions +# and records containing this XIDs. 
+# Autovacuum is disabled for the table to prevent CLOG truncation.
+#
+# This test runs pretty quickly and can be informative when used in combination
+# with emulated network delay. Some useful delay commands:
+#
+# 1. Add 2msec delay to all localhost traffic
+# `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec`
+#
+# 2. Test that it works (you should see 4ms ping)
+# `ping localhost`
+#
+# 3. Revert back to normal
+# `sudo tc qdisc del dev lo root netem`
+#
+# NOTE this test might not represent the real startup time because the basebackup
+# for a large database might be larger if there's a lot of transaction metadata,
+# or safekeepers might need more syncing, or there might be more operations to
+# apply during the config step, like more users, databases, or extensions. By default
+# we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this
+# test we only load neon.
+@pytest.mark.timeout(1800)
+@pytest.mark.parametrize("slru", ["lazy", "eager"])
+def test_lazy_startup(slru: str, neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    lazy_slru_download = "true" if slru == "lazy" else "false"
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            "lazy_slru_download": lazy_slru_download,
+        }
+    )
+
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant)
+    with endpoint.cursor() as cur:
+        cur.execute("CREATE TABLE t (pk integer PRIMARY KEY, x integer)")
+        cur.execute("ALTER TABLE t SET (autovacuum_enabled = false)")
+        cur.execute("INSERT INTO t VALUES (1, 0)")
+        cur.execute(
+            """
+            CREATE PROCEDURE updating() as
+            $$
+            DECLARE
+               i integer;
+            BEGIN
+               FOR i IN 1..1000000 LOOP
+                  UPDATE t SET x = x + 1 WHERE pk=1;
+                  COMMIT;
+               END LOOP;
+            END
+            $$ LANGUAGE plpgsql
+            """
+        )
+        cur.execute("SET statement_timeout=0")
+        cur.execute("call updating()")
+
+    endpoint.stop()
+
+    # We do two iterations so we can see if the second startup is faster. It should
+    # be because the compute node should already be configured with roles, databases,
+    # extensions, etc. from the first run.
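+    # i == 0 measures the first (cold) start; i == 1 measures a restart, which should be
+    # cheaper thanks to the respec(skip_pg_catalog_updates=True) call at the end of the loop.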
+ for i in range(2): + # Start + with zenbenchmark.record_duration(f"{slru}_{i}_start"): + endpoint.start() + + with zenbenchmark.record_duration(f"{slru}_{i}_select"): + sum = endpoint.safe_psql("select sum(x) from t")[0][0] + assert sum == 1000000 + + # Get metrics + metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json() + durations = { + "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec", + "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers", + "sync_sk_check_ms": f"{slru}_{i}_sync_sk_check", + "basebackup_ms": f"{slru}_{i}_basebackup", + "start_postgres_ms": f"{slru}_{i}_start_postgres", + "config_ms": f"{slru}_{i}_config", + "total_startup_ms": f"{slru}_{i}_total_startup", + } + for key, name in durations.items(): + value = metrics[key] + zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER) + + basebackup_bytes = metrics["basebackup_bytes"] + zenbenchmark.record( + f"{slru}_{i}_basebackup_bytes", + basebackup_bytes, + "bytes", + report=MetricReport.LOWER_IS_BETTER, + ) + + # Stop so we can restart + endpoint.stop() + + # Imitate optimizations that console would do for the second start + endpoint.respec(skip_pg_catalog_updates=True) diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index b799f7248f..077f73ac06 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -1,10 +1,23 @@ +from __future__ import annotations + import time +from typing import TYPE_CHECKING +import psycopg2 +import psycopg2.extras import pytest +from fixtures.benchmark_fixture import MetricReport +from fixtures.common_types import Lsn from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, PgBin, logical_replication_sync +from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync + +if TYPE_CHECKING: + from fixtures.benchmark_fixture import NeonBenchmarker + from fixtures.neon_api import NeonApiEndpoint + from fixtures.neon_fixtures import NeonEnv, PgBin +@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2]) @pytest.mark.timeout(1000) def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg): env = neon_simple_env @@ -25,7 +38,6 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg vanilla_pg.safe_psql("truncate table pgbench_history") connstr = endpoint.connstr().replace("'", "''") - print(f"connstr='{connstr}'") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") # Wait logical replication channel to be established @@ -41,3 +53,295 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg sum_master = endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0] sum_replica = vanilla_pg.safe_psql("select sum(abalance) from pgbench_accounts")[0][0] assert sum_master == sum_replica + + +def check_pgbench_still_running(pgbench, label=""): + rc = pgbench.poll() + if rc is not None: + raise RuntimeError(f"{label} pgbench terminated early with return code {rc}") + + +def measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=600): + start = time.time() + pub_cur.execute("SELECT pg_current_wal_flush_lsn()") + pub_lsn = Lsn(pub_cur.fetchall()[0][0]) + while (time.time() - start) < timeout_sec: + sub_cur.execute("SELECT latest_end_lsn FROM pg_catalog.pg_stat_subscription") + res = sub_cur.fetchall()[0][0] + if res: + 
log.info(f"subscriber_lsn={res}") + sub_lsn = Lsn(res) + log.info(f"Subscriber LSN={sub_lsn}, publisher LSN={pub_lsn}") + if sub_lsn >= pub_lsn: + return time.time() - start + time.sleep(0.5) + raise TimeoutError(f"Logical replication sync took more than {timeout_sec} sec") + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_subscriber_lag( + pg_bin: PgBin, + benchmark_project_pub: NeonApiEndpoint, + benchmark_project_sub: NeonApiEndpoint, + zenbenchmark: NeonBenchmarker, +): + """ + Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects + on subscriber. Periodically restarts subscriber while still running the inserts, and + measures how long sync takes after restart. + """ + test_duration_min = 60 + sync_interval_min = 5 + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + pub_env = benchmark_project_pub.pgbench_env + sub_env = benchmark_project_sub.pgbench_env + pub_connstr = benchmark_project_pub.connstr + sub_connstr = benchmark_project_sub.connstr + + if benchmark_project_pub.is_new: + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + if benchmark_project_sub.is_new: + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'") + pub_exists = len(pub_cur.fetchall()) != 0 + + if not pub_exists: + pub_cur.execute("CREATE PUBLICATION pub1 FOR TABLE pgbench_accounts, pgbench_history") + + sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'") + sub_exists = len(sub_cur.fetchall()) != 0 + if not sub_exists: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + sub_cur.execute(f"CREATE SUBSCRIPTION sub1 CONNECTION '{pub_connstr}' PUBLICATION pub1") + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + + pub_conn.close() + sub_conn.close() + + zenbenchmark.record("initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) + try: + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") + + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + sub_workload.terminate() + benchmark_project_sub.restart() + + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + + # Measure storage to make sure replication information isn't bloating storage + sub_storage = benchmark_project_sub.get_synthetic_storage_size() + pub_storage = benchmark_project_pub.get_synthetic_storage_size() + zenbenchmark.record("sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER) + 
zenbenchmark.record("pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER) + finally: + sub_workload.terminate() + finally: + pub_workload.terminate() + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_publisher_restart( + pg_bin: PgBin, + benchmark_project_pub: NeonApiEndpoint, + benchmark_project_sub: NeonApiEndpoint, + zenbenchmark: NeonBenchmarker, +): + """ + Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects + on subscriber. Periodically restarts publisher (to exercise on-demand WAL download), and + measures how long sync takes after restart. + """ + test_duration_min = 60 + sync_interval_min = 5 + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + pub_env = benchmark_project_pub.pgbench_env + sub_env = benchmark_project_sub.pgbench_env + pub_connstr = benchmark_project_pub.connstr + sub_connstr = benchmark_project_sub.connstr + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'") + pub_exists = len(pub_cur.fetchall()) != 0 + + if not pub_exists: + pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history") + + sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'") + sub_exists = len(sub_cur.fetchall()) != 0 + if not sub_exists: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1") + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + sub_conn.close() + + zenbenchmark.record("initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) + try: + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") + + pub_workload.terminate() + benchmark_project_pub.restart() + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], + env=pub_env, + ) + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + + # Measure storage to make sure replication information isn't bloating storage + sub_storage = benchmark_project_sub.get_synthetic_storage_size() + pub_storage = benchmark_project_pub.get_synthetic_storage_size() + zenbenchmark.record("sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER) + zenbenchmark.record("pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER) + finally: + sub_workload.terminate() + finally: + pub_workload.terminate() + + +@pytest.mark.remote_cluster 
+@pytest.mark.timeout(2 * 60 * 60)
+def test_snap_files(
+    pg_bin: PgBin,
+    benchmark_project_pub: NeonApiEndpoint,
+    zenbenchmark: NeonBenchmarker,
+):
+    """
+    Creates a node with a logical replication slot and initializes pgbench. Then runs
+    pgbench inserts while generating large numbers of snapfiles. Then restarts
+    the node and tries to peek the replication changes.
+    """
+    test_duration_min = 60
+    test_interval_min = 5
+    pgbench_duration = f"-T{test_duration_min * 60 * 2}"
+
+    env = benchmark_project_pub.pgbench_env
+    connstr = benchmark_project_pub.connstr
+
+    with psycopg2.connect(connstr) as conn:
+        conn.autocommit = True
+        with conn.cursor() as cur:
+            cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'")
+            is_super = cur.fetchall()[0][0]
+            assert is_super, "This benchmark won't work if we don't have superuser"
+
+    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env)
+
+    conn = psycopg2.connect(connstr)
+    conn.autocommit = True
+    cur = conn.cursor()
+    cur.execute("ALTER SYSTEM SET neon.logical_replication_max_snap_files = -1")
+
+    with psycopg2.connect(connstr) as conn:
+        conn.autocommit = True
+        with conn.cursor() as cur:
+            cur.execute("SELECT pg_reload_conf()")
+
+    with psycopg2.connect(connstr) as conn:
+        conn.autocommit = True
+        with conn.cursor() as cur:
+            cur.execute(
+                """
+                DO $$
+                BEGIN
+                   IF EXISTS (
+                         SELECT 1
+                         FROM pg_replication_slots
+                         WHERE slot_name = 'slotter'
+                   ) THEN
+                      PERFORM pg_drop_replication_slot('slotter');
+                   END IF;
+                END $$;
+                """
+            )
+            cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')")
+
+    workload = pg_bin.run_nonblocking(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=env)
+    try:
+        start = time.time()
+        prev_measurement = time.time()
+        while time.time() - start < test_duration_min * 60:
+            with psycopg2.connect(connstr) as conn:
+                with conn.cursor() as cur:
+                    cur.execute(
+                        "SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s"
+                    )
+                    check_pgbench_still_running(workload)
+                    cur.execute(
+                        "SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())"
+                    )
+
+            # Measure storage
+            if time.time() - prev_measurement > test_interval_min * 60:
+                storage = benchmark_project_pub.get_synthetic_storage_size()
+                zenbenchmark.record("storage", storage, "B", MetricReport.LOWER_IS_BETTER)
+                prev_measurement = time.time()
+            time.sleep(test_interval_min * 60 / 3)
+
+    finally:
+        workload.terminate()
diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py
index 8a9509ea44..aaa2f8fec2 100644
--- a/test_runner/performance/test_perf_olap.py
+++ b/test_runner/performance/test_perf_olap.py
@@ -100,6 +100,30 @@ QUERIES: Tuple[LabelledQuery, ...] = (
 )
 # fmt: on
 
+# A list of pgvector HNSW index builds to run.
+# Please do not alter the label for the query, as it is used to identify it.
+#
+# Disable auto formatting for the list of queries so that it's easier to read
+# fmt: off
+PGVECTOR_QUERIES: Tuple[LabelledQuery, ...] = (
+    LabelledQuery("PGVPREP", r"ALTER EXTENSION VECTOR UPDATE;"),
+    LabelledQuery("PGV0", r"DROP TABLE IF EXISTS hnsw_test_table;"),
+    LabelledQuery("PGV1", r"CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA;"),
+    LabelledQuery("PGV2", r"INSERT INTO hnsw_test_table SELECT * FROM documents;"),
+    LabelledQuery("PGV3", r"CREATE INDEX ON hnsw_test_table (_id);"),
+    LabelledQuery("PGV4", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_cosine_ops);"),
+    LabelledQuery("PGV5", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_ip_ops);"),
+    LabelledQuery("PGV6", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops);"),
+    LabelledQuery("PGV7", r"CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops);"),
+    LabelledQuery("PGV8", r"CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops);"),
+    LabelledQuery("PGV9", r"DROP TABLE IF EXISTS halfvec_test_table;"),
+    LabelledQuery("PGV10", r"CREATE TABLE halfvec_test_table (_id text NOT NULL, title text, text text, embeddings halfvec(1536), PRIMARY KEY (_id));"),
+    LabelledQuery("PGV11", r"INSERT INTO halfvec_test_table (_id, title, text, embeddings) SELECT _id, title, text, embeddings::halfvec FROM documents;"),
+    LabelledQuery("PGV12", r"CREATE INDEX documents_half_precision_hnsw_idx ON halfvec_test_table USING hnsw (embeddings halfvec_cosine_ops) WITH (m = 64, ef_construction = 128);"),
+)
+# fmt: on
+
+
 EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)"
 
@@ -245,3 +269,18 @@ def test_clickbench_collect_pg_stat_statements(remote_compare: RemoteCompare):
     log.info("Collecting pg_stat_statements")
     query = LabelledQuery("Q_COLLECT_PG_STAT_STATEMENTS", r"SELECT * from pg_stat_statements;")
     run_psql(remote_compare, query, times=1, explain=False)
+
+
+@pytest.mark.parametrize("query", PGVECTOR_QUERIES)
+@pytest.mark.remote_cluster
+def test_pgvector_indexing(query: LabelledQuery, remote_compare: RemoteCompare):
+    """
+    A pgvector test that exercises HNSW index build performance and parallelism.
+
+    The DB is prepared manually in advance.
+    See
+    - test_runner/performance/pgvector/README.md
+    - test_runner/performance/pgvector/loaddata.py
+    - test_runner/performance/pgvector/HNSW_build.sql
+    """
+    run_psql(remote_compare, query, times=1, explain=False)
diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py
index 2b8760dff2..6eaa29e4f8 100644
--- a/test_runner/performance/test_perf_pgbench.py
+++ b/test_runner/performance/test_perf_pgbench.py
@@ -17,6 +17,8 @@ class PgBenchLoadType(enum.Enum):
     INIT = "init"
     SIMPLE_UPDATE = "simple-update"
     SELECT_ONLY = "select-only"
+    PGVECTOR_HNSW = "pgvector-hnsw"
+    PGVECTOR_HALFVEC = "pgvector-halfvec"
 
 
 def utc_now_timestamp() -> int:
@@ -132,6 +134,46 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P
             password=password,
         )
 
+    if workload_type == PgBenchLoadType.PGVECTOR_HNSW:
+        # Run the pgvector HNSW query workload
+        run_pgbench(
+            env,
+            "pgvector-hnsw",
+            [
+                "pgbench",
+                "-f",
+                "test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql",
+                "-c100",
+                "-j20",
+                f"-T{duration}",
+                "-P2",
+                "--protocol=prepared",
+                "--progress-timestamp",
+                connstr,
+            ],
+            password=password,
+        )
+
+    if workload_type == PgBenchLoadType.PGVECTOR_HALFVEC:
+        # Run the pgvector halfvec query workload
+        run_pgbench(
+            env,
+            "pgvector-halfvec",
+            [
+                "pgbench",
+                "-f",
+                "test_runner/performance/pgvector/pgbench_custom_script_pgvector_halfvec_queries.sql",
+                "-c100",
+                "-j20",
+                f"-T{duration}",
+                "-P2",
+                "--protocol=prepared",
+                "--progress-timestamp",
+                connstr,
+            ],
+            password=password,
+        )
+
     env.report_size()
diff --git a/test_runner/performance/test_perf_pgvector_queries.py b/test_runner/performance/test_perf_pgvector_queries.py
new file mode 100644
index 0000000000..bb3db16305
--- /dev/null
+++ b/test_runner/performance/test_perf_pgvector_queries.py
@@ -0,0 +1,24 @@
+import pytest
+from fixtures.compare_fixtures import PgCompare
+
+from performance.test_perf_pgbench import PgBenchLoadType, get_durations_matrix, run_test_pgbench
+
+
+# The following test runs on an existing database that has pgvector extension installed
+# and a table with 1 million embedding vectors loaded and indexed with HNSW.
+#
+# Run this pgbench test against an existing remote Postgres cluster with the necessary setup.
+@pytest.mark.parametrize("duration", get_durations_matrix())
+@pytest.mark.remote_cluster
+def test_pgbench_remote_pgvector_hnsw(remote_compare: PgCompare, duration: int):
+    run_test_pgbench(remote_compare, 1, duration, PgBenchLoadType.PGVECTOR_HNSW)
+
+
+# The following test runs on an existing database that has pgvector extension installed
+# and a table with 1 million embedding vectors loaded and indexed with halfvec.
+#
+# Run this pgbench test against an existing remote Postgres cluster with the necessary setup.
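+# (Both tests delegate to run_test_pgbench; only the PgBenchLoadType, and hence the
+# custom pgbench script used, differs.)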
+@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_pgbench_remote_pgvector_halfvec(remote_compare: PgCompare, duration: int): + run_test_pgbench(remote_compare, 1, duration, PgBenchLoadType.PGVECTOR_HALFVEC) diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py new file mode 100644 index 0000000000..7e16197211 --- /dev/null +++ b/test_runner/performance/test_physical_replication.py @@ -0,0 +1,296 @@ +from __future__ import annotations + +import csv +import os +import subprocess +import time +import traceback +from pathlib import Path +from typing import TYPE_CHECKING + +import psycopg2 +import psycopg2.extras +import pytest +from fixtures.benchmark_fixture import MetricReport +from fixtures.common_types import Lsn +from fixtures.log_helper import log +from fixtures.neon_api import connection_parameters_to_env +from fixtures.pg_version import PgVersion + +if TYPE_CHECKING: + from typing import Any, List, Optional + + from fixtures.benchmark_fixture import NeonBenchmarker + from fixtures.neon_api import NeonAPI + from fixtures.neon_fixtures import PgBin + + +# Granularity of ~0.5 sec +def measure_replication_lag(master, replica, timeout_sec=600): + start = time.time() + master.execute("SELECT pg_current_wal_flush_lsn()") + master_lsn = Lsn(master.fetchall()[0][0]) + while (time.time() - start) < timeout_sec: + replica.execute("select pg_last_wal_replay_lsn()") + replica_lsn = replica.fetchall()[0][0] + if replica_lsn: + if Lsn(replica_lsn) >= master_lsn: + return time.time() - start + time.sleep(0.5) + raise TimeoutError(f"Replication sync took more than {timeout_sec} sec") + + +def check_pgbench_still_running(pgbench): + rc = pgbench.poll() + if rc is not None: + raise RuntimeError(f"Pgbench terminated early with return code {rc}") + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_ro_replica_lag( + pg_bin: PgBin, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + test_duration_min = 60 + sync_interval_min = 10 + + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + project = neon_api.create_project(pg_version) + project_id = project["project"]["id"] + neon_api.wait_for_operation_to_finish(project_id) + error_occurred = False + try: + branch_id = project["branch"]["id"] + master_connstr = project["connection_uris"][0]["connection_uri"] + master_env = connection_parameters_to_env( + project["connection_uris"][0]["connection_parameters"] + ) + + replica = neon_api.create_endpoint( + project_id, + branch_id, + endpoint_type="read_only", + settings={"pg_settings": {"hot_standby_feedback": "on"}}, + ) + replica_env = master_env.copy() + replica_env["PGHOST"] = replica["endpoint"]["host"] + neon_api.wait_for_operation_to_finish(project_id) + + replica_connstr = neon_api.get_connection_uri( + project_id, + endpoint_id=replica["endpoint"]["id"], + )["uri"] + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=master_env) + + master_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], + env=master_env, + ) + try: + replica_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=replica_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + check_pgbench_still_running(master_workload) + check_pgbench_still_running(replica_workload) + time.sleep(sync_interval_min * 60) + with psycopg2.connect(master_connstr) as 
conn_master, psycopg2.connect(
+                        replica_connstr
+                    ) as conn_replica:
+                        with conn_master.cursor() as cur_master, conn_replica.cursor() as cur_replica:
+                            lag = measure_replication_lag(cur_master, cur_replica)
+                    log.info(f"Replica lagged behind master by {lag} seconds")
+                    zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
+            finally:
+                replica_workload.terminate()
+        finally:
+            master_workload.terminate()
+    except Exception as e:
+        error_occurred = True
+        log.error(f"Caught exception: {e}")
+        log.error(traceback.format_exc())
+    finally:
+        assert not error_occurred  # Fail the test if an error occurred
+        neon_api.delete_project(project_id)
+
+
+def report_pgbench_aggregate_intervals(
+    output_dir: Path,
+    prefix: str,
+    zenbenchmark: NeonBenchmarker,
+):
+    for filename in os.listdir(output_dir):
+        if filename.startswith(prefix):
+            # The file will be in the form <prefix>_<node>.<suffix>.
+            # So we first lop off the .<suffix>, and then lop off the prefix and the _
+            node = filename.split(".")[0][len(prefix) + 1 :]
+            with open(output_dir / filename) as f:
+                reader = csv.reader(f, delimiter=" ")
+                for line in reader:
+                    num_transactions = int(line[1])
+                    if num_transactions == 0:
+                        continue
+                    sum_latency = int(line[2])
+                    sum_lag = int(line[3])
+                    zenbenchmark.record(
+                        f"{node}_num_txns", num_transactions, "txns", MetricReport.HIGHER_IS_BETTER
+                    )
+                    zenbenchmark.record(
+                        f"{node}_avg_latency",
+                        sum_latency / num_transactions,
+                        "s",
+                        MetricReport.LOWER_IS_BETTER,
+                    )
+                    zenbenchmark.record(
+                        f"{node}_avg_lag",
+                        sum_lag / num_transactions,
+                        "s",
+                        MetricReport.LOWER_IS_BETTER,
+                    )
+
+
+@pytest.mark.remote_cluster
+@pytest.mark.timeout(2 * 60 * 60)
+def test_replication_start_stop(
+    pg_bin: PgBin,
+    test_output_dir: Path,
+    neon_api: NeonAPI,
+    pg_version: PgVersion,
+    zenbenchmark: NeonBenchmarker,
+):
+    """
+    Cycles through different configurations of read replicas being enabled and disabled. The whole time,
+    there's a pgbench read/write workload going on the master. For each replica, we either turn it
+    on or off, and see how long it takes to catch up after some set amount of time of replicating
+    the pgbench.
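+    Each pgbench run writes aggregated per-interval logs, which are reported at the end
+    via report_pgbench_aggregate_intervals.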
+ """ + + prefix = "pgbench_agg" + num_replicas = 2 + configuration_test_time_sec = 10 * 60 + pgbench_duration = f"-T{2 ** num_replicas * configuration_test_time_sec}" + error_occurred = False + + project = neon_api.create_project(pg_version) + project_id = project["project"]["id"] + neon_api.wait_for_operation_to_finish(project_id) + try: + branch_id = project["branch"]["id"] + master_connstr = project["connection_uris"][0]["connection_uri"] + master_env = connection_parameters_to_env( + project["connection_uris"][0]["connection_parameters"] + ) + + replicas = [] + for _ in range(num_replicas): + replicas.append( + neon_api.create_endpoint( + project_id, + branch_id, + endpoint_type="read_only", + settings={"pg_settings": {"hot_standby_feedback": "on"}}, + ) + ) + neon_api.wait_for_operation_to_finish(project_id) + + replica_connstr = [ + neon_api.get_connection_uri( + project_id, + endpoint_id=replicas[i]["endpoint"]["id"], + )["uri"] + for i in range(num_replicas) + ] + replica_env = [master_env.copy() for _ in range(num_replicas)] + for i in range(num_replicas): + replica_env[i]["PGHOST"] = replicas[i]["endpoint"]["host"] + + pg_bin.run_capture(["pgbench", "-i", "-s10"], env=master_env) + + # Sync replicas + with psycopg2.connect(master_connstr) as conn_master: + with conn_master.cursor() as cur_master: + for i in range(num_replicas): + conn_replica = psycopg2.connect(replica_connstr[i]) + measure_replication_lag(cur_master, conn_replica.cursor()) + + master_pgbench = pg_bin.run_nonblocking( + [ + "pgbench", + "-c10", + pgbench_duration, + "-Mprepared", + "--log", + f"--log-prefix={test_output_dir}/{prefix}_master", + f"--aggregate-interval={configuration_test_time_sec}", + ], + env=master_env, + ) + replica_pgbench: List[Optional[subprocess.Popen[Any]]] = [None for _ in range(num_replicas)] + + # Use the bits of iconfig to tell us which configuration we are on. For example + # a iconfig of 2 is 10 in binary, indicating replica 0 is suspended and replica 1 is + # alive. 
+        for iconfig in range((1 << num_replicas) - 1, -1, -1):
+
+            def replica_enabled(ireplica: int, iconfig: int = iconfig):
+                return bool((iconfig >> ireplica) & 1)
+
+            # Change configuration
+            for ireplica in range(num_replicas):
+                if replica_enabled(ireplica) and replica_pgbench[ireplica] is None:
+                    replica_pgbench[ireplica] = pg_bin.run_nonblocking(
+                        [
+                            "pgbench",
+                            "-c10",
+                            "-S",
+                            pgbench_duration,
+                            "--log",
+                            f"--log-prefix={test_output_dir}/{prefix}_replica_{ireplica}",
+                            f"--aggregate-interval={configuration_test_time_sec}",
+                        ],
+                        env=replica_env[ireplica],
+                    )
+                elif not replica_enabled(ireplica) and replica_pgbench[ireplica] is not None:
+                    pgb = replica_pgbench[ireplica]
+                    assert pgb is not None
+                    pgb.terminate()
+                    pgb.wait()
+                    replica_pgbench[ireplica] = None
+
+                    neon_api.suspend_endpoint(
+                        project_id,
+                        replicas[ireplica]["endpoint"]["id"],
+                    )
+                    neon_api.wait_for_operation_to_finish(project_id)
+
+            time.sleep(configuration_test_time_sec)
+
+            with psycopg2.connect(master_connstr) as conn_master:
+                with conn_master.cursor() as cur_master:
+                    for ireplica in range(num_replicas):
+                        replica_conn = psycopg2.connect(replica_connstr[ireplica])
+                        lag = measure_replication_lag(cur_master, replica_conn.cursor())
+                        zenbenchmark.record(
+                            f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER
+                        )
+                        log.info(
+                            f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}"
+                        )
+        master_pgbench.terminate()
+    except Exception as e:
+        error_occurred = True
+        log.error(f"Caught exception {e}")
+        log.error(traceback.format_exc())
+    finally:
+        assert not error_occurred
+        neon_api.delete_project(project_id)
+        # Only report results if we didn't error out
+        report_pgbench_aggregate_intervals(test_output_dir, prefix, zenbenchmark)
diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py
new file mode 100644
index 0000000000..9cd83f0959
--- /dev/null
+++ b/test_runner/performance/test_sharding_autosplit.py
@@ -0,0 +1,280 @@
+import concurrent.futures
+import re
+from pathlib import Path
+
+import pytest
+from fixtures.common_types import TenantId, TimelineId
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    PgBin,
+    tenant_get_shards,
+)
+
+
+@pytest.mark.timeout(600)
+def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    """
+    Check that sharding, including auto-splitting, "just works" under pgbench workloads.
+
+    This is not a benchmark, but it lives in the same place as benchmarks in order to be run
+    on a dedicated node that can sustain some significant throughput.
+
+    Other tests validate the details of shard splitting, error cases etc. This test is
+    the sanity check that it all really works as expected with realistic amounts of data
+    and under load.
+
+    Success conditions:
+    - Tenants auto-split when their capacity grows
+    - Client workloads are not interrupted while that happens
+    """
+
+    neon_env_builder.num_pageservers = 8
+    neon_env_builder.storage_controller_config = {
+        # Split tenants at 500MB: it's up to the storage controller how it interprets this (logical
+        # sizes, physical sizes, etc). We will write this much data logically, therefore other sizes
+        # will reliably be greater.
+        "split_threshold": 1024 * 1024 * 500
+    }
+
+    tenant_conf = {
+        # We want layer rewrites to happen as soon as possible (this is the most stressful
+        # case for the system), so set PITR interval to something tiny.
+        "pitr_interval": "5s",
+        # Scaled down thresholds. We will run at ~1GB scale but would like to emulate
+        # the behavior of a system running at ~100GB scale.
+        "checkpoint_distance": f"{1024 * 1024}",
+        "compaction_threshold": "1",
+        "compaction_target_size": f"{1024 * 1024}",
+        "image_creation_threshold": "2",
+        "image_layer_creation_check_threshold": "0",
+    }
+
+    env = neon_env_builder.init_start()
+
+    for ps in env.pageservers:
+        ps.allowed_errors.extend(
+            [
+                # We shut down pageservers while they might have some compaction work going on
+                ".*Compaction failed.*shutting down.*"
+            ]
+        )
+
+    env.storage_controller.allowed_errors.extend(
+        [
+            # The neon_local functionality for updating computes is flaky for unknown reasons
+            ".*Local notification hook failed.*",
+            ".*Marking shard.*for notification retry.*",
+            ".*Failed to notify compute.*",
+        ]
+    )
+
+    # Total tenants
+    tenant_count = 4
+
+    # Transaction rate: we set this rather than running at full-speed because we
+    # might run on a slow node that doesn't cope well with many full-speed pgbenches running concurrently.
+    transaction_rate = 100
+
+    class TenantState:
+        def __init__(self, timeline_id, endpoint):
+            self.timeline_id = timeline_id
+            self.endpoint = endpoint
+
+    # Create tenants
+    tenants = {}
+    for tenant_id in set(TenantId.generate() for _i in range(0, tenant_count)):
+        timeline_id = TimelineId.generate()
+        env.neon_cli.create_tenant(tenant_id, timeline_id, conf=tenant_conf)
+        endpoint = env.endpoints.create("main", tenant_id=tenant_id)
+        tenants[tenant_id] = TenantState(timeline_id, endpoint)
+        endpoint.start()
+
+    def run_pgbench_init(endpoint):
+        pg_bin.run_capture(
+            [
+                "pgbench",
+                "-s50",
+                "-i",
+                f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres",
+            ]
+        )
+
+    def check_pgbench_output(out_path: str):
+        """
+        When we run pgbench, we want not just an absence of errors, but also continuous evidence
+        of I/O progressing: our shard splitting and migration should not interrupt the benchmark.
+        """
+        matched_lines = 0
+        stderr = Path(f"{out_path}.stderr").read_text()
+
+        low_watermark = None
+
+        # Apply this as a threshold for what we consider an unacceptable interruption to I/O
+        min_tps = transaction_rate // 10
+
+        for line in stderr.split("\n"):
+            match = re.match(r"progress: ([0-9\.]+) s, ([0-9\.]+) tps, .* ([0-9]+) failed", line)
+            if match is None:
+                # Fall back to older-version pgbench output (omits failure count)
+                match = re.match(r"progress: ([0-9\.]+) s, ([0-9\.]+) tps, .*", line)
+                if match is None:
+                    continue
+                else:
+                    (_time, tps) = match.groups()
+                    tps = float(tps)
+                    failed = 0
+            else:
+                (_time, tps, failed) = match.groups()  # type: ignore
+                tps = float(tps)
+                failed = int(failed)
+
+            matched_lines += 1
+
+            if failed > 0:
+                raise RuntimeError(
+                    f"pgbench on tenant {endpoint.tenant_id} run at {out_path} has failed > 0"
+                )
+
+            if low_watermark is None or low_watermark > tps:
+                low_watermark = tps
+
+            # Temporarily disabled: have seen some 0 tps regions on Hetzner runners, but not
+            # at the same time as a shard split.
+            # if tps < min_tps:
+            #     raise RuntimeError(
+            #         f"pgbench on tenant {endpoint.tenant_id} run at {out_path} has tps < {min_tps}"
+            #     )
+
+        log.info(f"Checked {matched_lines} progress lines, lowest TPS was {low_watermark}")
+
+        if matched_lines == 0:
+            raise RuntimeError(f"pgbench output at {out_path} contained no progress lines")
+
+    def run_pgbench_main(endpoint):
+        out_path = pg_bin.run_capture(
+            [
+                "pgbench",
+                "-s50",
+                "-T",
+                "180",
+                "-R",
+                f"{transaction_rate}",
+                "-P",
+                "1",
+                f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres",
+            ]
+        )
+
+        check_pgbench_output(out_path)
+
+    def run_pgbench_read(endpoint):
+        out_path = pg_bin.run_capture(
+            [
+                "pgbench",
+                "-s50",
+                "-T",
+                "30",
+                "-R",
+                f"{transaction_rate}",
+                "-S",
+                "-P",
+                "1",
+                f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres",
+            ]
+        )
+
+        check_pgbench_output(out_path)
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads:
+        pgbench_futs = []
+        for tenant_state in tenants.values():
+            fut = pgbench_threads.submit(run_pgbench_init, tenant_state.endpoint)
+            pgbench_futs.append(fut)
+
+        log.info("Waiting for pgbench inits")
+        for fut in pgbench_futs:
+            fut.result()
+
+        pgbench_futs = []
+        for tenant_state in tenants.values():
+            fut = pgbench_threads.submit(run_pgbench_main, tenant_state.endpoint)
+            pgbench_futs.append(fut)
+
+        log.info("Waiting for pgbench read/write pass")
+        for fut in pgbench_futs:
+            fut.result()
+
+    def assert_all_split():
+        for tenant_id in tenants.keys():
+            shards = tenant_get_shards(env, tenant_id)
+            assert len(shards) == 8
+
+    # This is not a wait_until, because we wanted the splits to happen _while_ pgbench is running: otherwise
+    # this test is not properly doing its job of validating that splits work nicely under load.
+    assert_all_split()
+
+    env.storage_controller.assert_log_contains(".*Successful auto-split.*")
+
+    # Log timeline sizes, useful for debug, and implicitly validates that the shards
+    # are available in the places the controller thinks they should be.
+    for tenant_id, tenant_state in tenants.items():
+        (shard_zero_id, shard_zero_ps) = tenant_get_shards(env, tenant_id)[0]
+        timeline_info = shard_zero_ps.http_client().timeline_detail(
+            shard_zero_id, tenant_state.timeline_id
+        )
+        log.info(f"{shard_zero_id} timeline: {timeline_info}")
+
+    # Run compaction for all tenants, restart endpoint so that on subsequent reads we will
+    # definitely hit pageserver for reads. This compaction pass is expected to drop unwanted
+    # layers but not do any rewrites (we're still in the same generation)
+    for tenant_id, tenant_state in tenants.items():
+        tenant_state.endpoint.stop()
+        for shard_id, shard_ps in tenant_get_shards(env, tenant_id):
+            shard_ps.http_client().timeline_gc(shard_id, tenant_state.timeline_id, gc_horizon=None)
+            shard_ps.http_client().timeline_compact(shard_id, tenant_state.timeline_id)
+        tenant_state.endpoint.start()
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads:
+        pgbench_futs = []
+        for tenant_state in tenants.values():
+            fut = pgbench_threads.submit(run_pgbench_read, tenant_state.endpoint)
+            pgbench_futs.append(fut)
+
+        log.info("Waiting for pgbench read pass")
+        for fut in pgbench_futs:
+            fut.result()
+
+    env.storage_controller.consistency_check()
+
+    # Restart the storage controller
+    env.storage_controller.stop()
+    env.storage_controller.start()
+
+    env.storage_controller.consistency_check()
+
+    # Restart all pageservers
+    for ps in env.pageservers:
+        ps.stop()
+        ps.start()
+
+    # Freshen gc_info in Timeline, so that when compaction runs in the background in the
+    # subsequent pgbench period, the last_gc_cutoff is updated and enables the conditions for a rewrite to pass.
+    for tenant_id, tenant_state in tenants.items():
+        for shard_id, shard_ps in tenant_get_shards(env, tenant_id):
+            shard_ps.http_client().timeline_gc(shard_id, tenant_state.timeline_id, gc_horizon=None)
+
+    # One last check that data remains readable after everything has restarted
+    with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads:
+        pgbench_futs = []
+        for tenant_state in tenants.values():
+            fut = pgbench_threads.submit(run_pgbench_read, tenant_state.endpoint)
+            pgbench_futs.append(fut)
+
+        log.info("Waiting for pgbench read pass")
+        for fut in pgbench_futs:
+            fut.result()
+
+    # Assert that some rewrites happened
+    # TODO: uncomment this after https://github.com/neondatabase/neon/pull/7531 is merged
+    # assert any(ps.log_contains(".*Rewriting layer after shard split.*") for ps in env.pageservers)
diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py
new file mode 100644
index 0000000000..297aedfbed
--- /dev/null
+++ b/test_runner/performance/test_storage_controller_scale.py
@@ -0,0 +1,302 @@
+import concurrent.futures
+import random
+import time
+from collections import defaultdict
+
+import pytest
+from fixtures.common_types import TenantId, TenantShardId, TimelineId
+from fixtures.compute_reconfigure import ComputeReconfigure
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PageserverAvailability,
+    PageserverSchedulingPolicy,
+)
+from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pg_version import PgVersion
+
+
+def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[str, int]:
+    """
+    Get the number of shards attached to each node.
+    This function takes into account the intersection of the intent and the observed state.
+    If they do not match, it asserts out.
+ """ + tenant_placement = env.storage_controller.get_tenants_placement() + log.info(f"{tenant_placement=}") + + matching = { + tid: tenant_placement[tid]["intent"]["attached"] + for tid in tenant_placement + if tenant_placement[tid]["intent"]["attached"] + == tenant_placement[tid]["observed"]["attached"] + } + assert len(matching) == total_shards + + attached_per_node: defaultdict[str, int] = defaultdict(int) + for node_id in matching.values(): + attached_per_node[node_id] += 1 + + return attached_per_node + + +def assert_consistent_balanced_attachments(env: NeonEnv, total_shards): + attached_per_node = get_consistent_node_shard_counts(env, total_shards) + + min_shard_count = min(attached_per_node.values()) + max_shard_count = max(attached_per_node.values()) + + flake_factor = 5 / 100 + assert max_shard_count - min_shard_count <= int(total_shards * flake_factor) + + +@pytest.mark.timeout(3600) # super long running test: should go down as we optimize +def test_storage_controller_many_tenants( + neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure +): + """ + Check that we cope well with a not-totally-trivial number of tenants. + + This is checking for: + - Obvious concurrency bugs from issuing many tenant creations/modifications + concurrently. + - Obvious scaling bugs like O(N^2) scaling that would be so slow that even + a basic test starts failing from slowness. + + This is _not_ a comprehensive scale test: just a basic sanity check that + we don't fall over for a thousand shards. + """ + + neon_env_builder.num_pageservers = 5 + neon_env_builder.storage_controller_config = { + # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. + # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to + # guard against regressions in restart time. + "max_offline": "30s", + "max_warming_up": "300s", + } + neon_env_builder.control_plane_compute_hook_api = ( + compute_reconfigure_listener.control_plane_compute_hook_api + ) + + # A small sleep on each call into the notify hook, to simulate the latency of doing a database write + compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) + + env = neon_env_builder.init_configs() + neon_env_builder.start() + + # We will intentionally stress reconciler concurrrency, which triggers a warning when lots + # of shards are hitting the delayed path. + env.storage_controller.allowed_errors.extend( + [ + # We will intentionally stress reconciler concurrrency, which triggers a warning when lots + # of shards are hitting the delayed path. + ".*Many shards are waiting to reconcile", + # We will create many timelines concurrently, so they might get slow enough to trip the warning + # that timeline creation is holding a lock too long. + ".*Shared lock by TimelineCreate.*was held.*", + ] + ) + + for ps in env.pageservers: + # Storage controller is allowed to drop pageserver requests when the cancellation token + # for a Reconciler fires. 
+        ps.allowed_errors.append(".*request was dropped before completing.*")
+
+    # Total tenants
+    tenant_count = 4000
+
+    # Shards per tenant
+    shard_count = 2
+    stripe_size = 1024
+
+    total_shards = tenant_count * shard_count
+
+    tenants = set(TenantId.generate() for _i in range(0, tenant_count))
+
+    virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
+
+    def check_memory():
+        # Shards should be cheap in memory, as we will have very many of them
+        expect_memory_per_shard = 128 * 1024
+
+        rss = env.storage_controller.get_metric_value("process_resident_memory_bytes")
+        assert rss is not None
+        log.info(f"Resident memory: {rss} ({rss / (shard_count * tenant_count)} per shard)")
+        assert rss < expect_memory_per_shard * shard_count * tenant_count
+
+    # We use a fixed seed to make the test somewhat reproducible: we want a randomly
+    # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run.
+    rng = random.Random(1234)
+
+    # Issue more concurrent operations than the storage controller's reconciler concurrency
+    # semaphore permits, to ensure that we are stressing it.
+    api_concurrency = 135
+
+    # We will create tenants directly via API, not via neon_local, to avoid any false
+    # serialization of operations in neon_local (it e.g. loads/saves a config file on each call)
+    with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor:
+        futs = []
+        t1 = time.time()
+        for tenant_id in tenants:
+            f = executor.submit(
+                env.storage_controller.tenant_create,
+                tenant_id,
+                shard_count,
+                stripe_size,
+                # Upload heatmaps fast, so that secondary downloads happen promptly, enabling
+                # the controller's optimization migrations to proceed promptly.
+                tenant_config={"heatmap_period": "10s"},
+                placement_policy={"Attached": 1},
+            )
+            futs.append(f)
+
+        # Wait for creations to finish
+        for f in futs:
+            f.result()
+        log.info(
+            f"Created {len(tenants)} tenants in {time.time() - t1}, {len(tenants) / (time.time() - t1)}/s"
+        )
+
+        run_ops = api_concurrency * 4
+        assert run_ops < len(tenants)
+        op_tenants = list(tenants)[0:run_ops]
+
+        # Generate a mixture of operations and dispatch them all concurrently
+        futs = []
+        for tenant_id in op_tenants:
+            op = rng.choice([0, 1, 2])
+            if op == 0:
+                # A fan-out write operation to all shards in a tenant (timeline creation)
+                f = executor.submit(
+                    virtual_ps_http.timeline_create,
+                    PgVersion.NOT_SET,
+                    tenant_id,
+                    TimelineId.generate(),
+                )
+            elif op == 1:
+                # A reconciler operation: migrate a shard.
+                shard_number = rng.randint(0, shard_count - 1)
+                tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count)
+
+                # Migrate it to its secondary location
+                desc = env.storage_controller.tenant_describe(tenant_id)
+                dest_ps_id = desc["shards"][shard_number]["node_secondary"][0]
+
+                f = executor.submit(
+                    env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id
+                )
+            elif op == 2:
+                # A passthrough read to shard zero
+                f = executor.submit(virtual_ps_http.tenant_status, tenant_id)
+
+            futs.append(f)
+
+        # Wait for mixed ops to finish
+        for f in futs:
+            f.result()
+
+    # Some of the operations above (notably migrations) might leave the controller in a state where it has
+    # some work to do, for example optimizing shard placement after we do a random migration. Wait for the system
+    # to reach a quiescent state before doing the following checks.
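+    # (reconcile_until_idle is assumed to run reconciliation passes repeatedly until none are pending.)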
+ env.storage_controller.reconcile_until_idle() + + env.storage_controller.consistency_check() + check_memory() + + # This loop waits for reconcile_all to indicate no pending work, and then calls it once more to time + # how long the call takes when idle: this iterates over shards while doing no I/O and should be reliably fast: if + # it isn't, that's a sign that we have made some algorithmic mistake (e.g. O(N**2) scheduling) + # + # We do not require that the system is quiescent already here, although at present in this point in the test + # that may be the case. + while True: + t1 = time.time() + reconcilers = env.storage_controller.reconcile_all() + if reconcilers == 0: + # Time how long a no-op background reconcile takes: this measures how long it takes to + # loop over all the shards looking for work to do. + runtime = time.time() - t1 + log.info(f"No-op call to reconcile_all took {runtime}s") + assert runtime < 1 + break + + # Restart the storage controller + env.storage_controller.stop() + env.storage_controller.start() + + # See how long the controller takes to pass its readiness check. This should be fast because + # all the nodes are online: offline pageservers are the only thing that's allowed to delay + # startup. + readiness_period = env.storage_controller.wait_until_ready() + assert readiness_period < 5 + + # Consistency check is safe here: the storage controller's restart should not have caused any reconcilers + # to run, as it was in a stable state before restart. If it did, that's a bug. + env.storage_controller.consistency_check() + check_memory() + + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts before rolling restart: {shard_counts}") + + assert_consistent_balanced_attachments(env, total_shards) + + # Restart pageservers gracefully: this exercises the /re-attach pageserver API + # and the storage controller drain and fill API + for ps in env.pageservers: + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 + ) + + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=24, + backoff=5, + ) + + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") + # Assert that we've drained the node + assert shard_counts[str(ps.id)] == 0 + # Assert that those shards actually went somewhere + assert sum(shard_counts.values()) == total_shards + + ps.restart() + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=24, + backoff=1, + ) + + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 + ) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=24, + backoff=5, + ) + + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") + + assert_consistent_balanced_attachments(env, total_shards) + + env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() + + # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, + # as they were not offline long enough to trigger any scheduling 
changes. + env.storage_controller.consistency_check() + check_memory() + + # Stop the storage controller before tearing down fixtures, because it otherwise might log + # errors trying to call our `ComputeReconfigure`. + env.storage_controller.stop() diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 7eb244d378..c824e60c29 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -2,14 +2,14 @@ import statistics import threading import time import timeit -from typing import Any, Callable, List +from typing import Any, Callable, Generator, List import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare from fixtures.log_helper import log -from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin -from fixtures.types import Lsn +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, flush_ep_to_pageserver from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix @@ -20,7 +20,7 @@ from performance.test_perf_pgbench import get_durations_matrix, get_scales_matri # For example, to build a `NeonCompare` interface, the corresponding fixture's param should have # a format of `neon_{safekeepers_enable_fsync}`. # Note that, here "_" is used to separate builder parameters. -def pg_compare(request) -> PgCompare: +def pg_compare(request) -> Generator[PgCompare, None, None]: x = request.param.split("_") if x[0] == "vanilla": @@ -28,7 +28,7 @@ def pg_compare(request) -> PgCompare: fixture = request.getfixturevalue("vanilla_compare") assert isinstance(fixture, VanillaCompare) - return fixture + yield fixture else: assert ( len(x) == 2 @@ -47,10 +47,15 @@ def pg_compare(request) -> PgCompare: neon_env_builder.safekeepers_enable_fsync = x[1] == "on" env = neon_env_builder.init_start() - env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) - branch_name = request.node.name - return NeonCompare(zenbenchmark, env, pg_bin, branch_name) + cmp = NeonCompare(zenbenchmark, env, pg_bin) + + yield cmp + + flush_ep_to_pageserver(env, cmp._pg, cmp.tenant, cmp.timeline) + env.pageserver.http_client().timeline_checkpoint( + cmp.tenant, cmp.timeline, compact=False, wait_until_uploaded=True + ) def start_heavy_write_workload(env: PgCompare, n_tables: int, scale: int, num_iters: int): diff --git a/test_runner/pg_clients/csharp/npgsql/Dockerfile b/test_runner/pg_clients/csharp/npgsql/Dockerfile index b23eb2e5eb..71717a6006 100644 --- a/test_runner/pg_clients/csharp/npgsql/Dockerfile +++ b/test_runner/pg_clients/csharp/npgsql/Dockerfile @@ -1,4 +1,4 @@ -FROM mcr.microsoft.com/dotnet/sdk:7.0 AS build +FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build WORKDIR /source COPY *.csproj . @@ -7,7 +7,7 @@ RUN dotnet restore COPY . . RUN dotnet publish -c release -o /app --no-restore -FROM mcr.microsoft.com/dotnet/runtime:7.0 +FROM mcr.microsoft.com/dotnet/runtime:8.0 WORKDIR /app COPY --from=build /app . 
diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj index bb4427f2c4..edf2a01337 100644 --- a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj +++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj @@ -2,13 +2,13 @@ Exe - net7.0 + net8.0 enable enable - + diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile index 74eb9bdc32..7c2b1b40e0 100644 --- a/test_runner/pg_clients/java/jdbc/Dockerfile +++ b/test_runner/pg_clients/java/jdbc/Dockerfile @@ -1,10 +1,10 @@ -FROM openjdk:20 +FROM openjdk:22 WORKDIR /source COPY . . WORKDIR /app -RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.6.0.jar && \ +RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.7.2.jar && \ javac -d /app /source/Example.java CMD ["java", "-cp", "/app/postgresql.jar:.", "Example"] diff --git a/test_runner/pg_clients/python/asyncpg/Dockerfile b/test_runner/pg_clients/python/asyncpg/Dockerfile index 8b6d56b8fb..f2cc37a7bb 100644 --- a/test_runner/pg_clients/python/asyncpg/Dockerfile +++ b/test_runner/pg_clients/python/asyncpg/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11 +FROM python:3.12 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/python/asyncpg/requirements.txt b/test_runner/pg_clients/python/asyncpg/requirements.txt index b33c21474c..61972959a9 100644 --- a/test_runner/pg_clients/python/asyncpg/requirements.txt +++ b/test_runner/pg_clients/python/asyncpg/requirements.txt @@ -1 +1 @@ -asyncpg==0.27.0 +asyncpg==0.29.0 diff --git a/test_runner/pg_clients/python/pg8000/Dockerfile b/test_runner/pg_clients/python/pg8000/Dockerfile index ebef1f9059..ee1de20da5 100644 --- a/test_runner/pg_clients/python/pg8000/Dockerfile +++ b/test_runner/pg_clients/python/pg8000/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11 +FROM python:3.12 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt index a8407c3cb0..099a4ade2c 100644 --- a/test_runner/pg_clients/python/pg8000/requirements.txt +++ b/test_runner/pg_clients/python/pg8000/requirements.txt @@ -1,2 +1,2 @@ -pg8000==1.29.8 +pg8000==1.31.2 scramp>=1.4.3 diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 3ac0f16e4b..354fc15745 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "addr2line" -version = "0.21.0" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678" dependencies = [ "gimli", ] @@ -19,9 +19,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "async-trait" -version = "0.1.74" +version = "0.1.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a66537f1bb974b254c98ed142ff995236e81b9d0fe4db0575f46612cb15eb0f9" +checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" dependencies = [ "proc-macro2", "quote", @@ -30,15 +30,15 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.1.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" [[package]] name = "backtrace" -version = "0.3.69" +version = "0.3.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a" dependencies = [ "addr2line", "cc", @@ -51,9 +51,9 @@ dependencies = [ [[package]] name = "base64" -version = "0.21.4" +version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ba43ea6f343b788c8764558649e08df62f86c6ef251fdaeb1ffd010a9ae50a2" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" [[package]] name = "bitflags" @@ -63,9 +63,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.1" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" [[package]] name = "block-buffer" @@ -78,9 +78,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.14.0" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "byteorder" @@ -90,18 +90,15 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.5.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" +checksum = 
"514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" [[package]] name = "cc" -version = "1.0.83" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" -dependencies = [ - "libc", -] +checksum = "ac367972e516d45567c7eafc73d24e1c193dcf200a8d94e9db7b3d38b349572d" [[package]] name = "cfg-if" @@ -111,9 +108,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "core-foundation" -version = "0.9.3" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" dependencies = [ "core-foundation-sys", "libc", @@ -121,15 +118,15 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.4" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" [[package]] name = "cpufeatures" -version = "0.2.9" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" dependencies = [ "libc", ] @@ -157,12 +154,12 @@ dependencies = [ [[package]] name = "errno" -version = "0.3.5" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -173,15 +170,9 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" [[package]] name = "fastrand" -version = "2.0.1" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" - -[[package]] -name = "finl_unicode" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" [[package]] name = "foreign-types" @@ -200,9 +191,9 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" [[package]] name = "futures" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" +checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" dependencies = [ "futures-channel", "futures-core", @@ -215,9 +206,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" dependencies = [ "futures-core", "futures-sink", @@ -225,15 +216,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.28" +version = "0.3.30" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" [[package]] name = "futures-executor" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" +checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" dependencies = [ "futures-core", "futures-task", @@ -242,15 +233,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" [[package]] name = "futures-macro" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", @@ -259,21 +250,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" [[package]] name = "futures-task" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" [[package]] name = "futures-util" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" dependencies = [ "futures-channel", "futures-core", @@ -299,9 +290,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.10" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", @@ -310,9 +301,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.28.0" +version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0" +checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" [[package]] name = "hmac" @@ -325,36 +316,30 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.64" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" dependencies = [ "wasm-bindgen", ] -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - [[package]] name = "libc" -version = "0.2.149" 
+version = "0.2.155" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" [[package]] name = "linux-raw-sys" -version = "0.4.10" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "lock_api" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" dependencies = [ "autocfg", "scopeguard", @@ -362,9 +347,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.20" +version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" [[package]] name = "md-5" @@ -378,37 +363,36 @@ dependencies = [ [[package]] name = "memchr" -version = "2.6.4" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "miniz_oxide" -version = "0.7.1" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" dependencies = [ "adler", ] [[package]] name = "mio" -version = "0.8.8" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "wasi", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] name = "native-tls" -version = "0.2.11" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" dependencies = [ - "lazy_static", "libc", "log", "openssl", @@ -422,26 +406,26 @@ dependencies = [ [[package]] name = "object" -version = "0.32.1" +version = "0.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cf5f9dd3933bd50a9e1f149ec995f39ae2c496d31fd772c1fd45ebc27e902b0" +checksum = "576dfe1fc8f9df304abb159d767a29d0476f7750fbf8aa7ad07816004a207434" dependencies = [ "memchr", ] [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.60" +version = "0.10.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800" +checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1" 
dependencies = [ - "bitflags 2.4.1", + "bitflags 2.6.0", "cfg-if", "foreign-types", "libc", @@ -469,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.96" +version = "0.9.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f" +checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6" dependencies = [ "cc", "libc", @@ -481,9 +465,9 @@ dependencies = [ [[package]] name = "parking_lot" -version = "0.12.1" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" dependencies = [ "lock_api", "parking_lot_core", @@ -491,22 +475,22 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.9" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.4.1", + "redox_syscall 0.5.2", "smallvec", - "windows-targets", + "windows-targets 0.52.5", ] [[package]] name = "percent-encoding" -version = "2.3.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "phf" @@ -528,9 +512,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.13" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" +checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" [[package]] name = "pin-utils" @@ -540,9 +524,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.27" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" +checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" [[package]] name = "postgres-native-tls" @@ -594,18 +578,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.69" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.33" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] @@ -642,20 +626,20 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.3.5" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +checksum = 
"4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" dependencies = [ "bitflags 1.3.2", ] [[package]] name = "redox_syscall" -version = "0.4.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.6.0", ] [[package]] @@ -670,30 +654,30 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustix" -version = "0.38.19" +version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "745ecfa778e66b2b63c88a61cb36e0eea109e803b0b86bf9879fbc77c70e86ed" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.6.0", "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "schannel" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c3733bf4cf7ea0880754e19cb5a462007c4a8c1914bff372ccc95b464f1df88" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" dependencies = [ - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -704,11 +688,11 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "security-framework" -version = "2.9.2" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" +checksum = "c627723fd09706bacdb5cf41499e95098555af3c3c29d014dc3c458ef6be11c0" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.6.0", "core-foundation", "core-foundation-sys", "libc", @@ -717,9 +701,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.9.1" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +checksum = "317936bbbd05227752583946b9e66d7ce3b489f84e11a94a510b4437fef407d7" dependencies = [ "core-foundation-sys", "libc", @@ -753,42 +737,42 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.1" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "socket2" -version = "0.5.4" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4031e820eb552adee9295814c0ced9e5cf38ddf1e8b7d566d6de8e2538ea989e" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "stringprep" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb41d74e231a107a1b4ee36bd1214b11285b77768d2e3824aedafa988fd36ee6" +checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1" dependencies = [ - "finl_unicode", "unicode-bidi", 
"unicode-normalization", + "unicode-properties", ] [[package]] name = "subtle" -version = "2.5.0" +version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.38" +version = "2.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b" +checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9" dependencies = [ "proc-macro2", "quote", @@ -797,22 +781,21 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.8.0" +version = "3.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ "cfg-if", "fastrand", - "redox_syscall 0.3.5", "rustix", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "c55115c6fbe2d2bef26eb09ad74bde02d8255476fc0c7b515ef09fbb35742d82" dependencies = [ "tinyvec_macros", ] @@ -825,9 +808,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.33.0" +version = "1.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f38200e3ef7995e5ef13baec2f432a6da0aa9ac495b2c0e8f3b7eec2c92d653" +checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" dependencies = [ "backtrace", "bytes", @@ -836,14 +819,14 @@ dependencies = [ "pin-project-lite", "socket2", "tokio-macros", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] name = "tokio-macros" -version = "2.1.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" +checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", @@ -888,35 +871,15 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.9" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d68074620f57a0b21594d9735eb2e98ab38b17f80d3fcb189fca266771ca60d" +checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" dependencies = [ "bytes", "futures-core", "futures-sink", "pin-project-lite", "tokio", - "tracing", -] - -[[package]] -name = "tracing" -version = "0.1.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" -dependencies = [ - "pin-project-lite", - "tracing-core", -] - -[[package]] -name = "tracing-core" -version = "0.1.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" -dependencies = [ - "once_cell", ] [[package]] @@ -927,9 +890,9 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "unicode-bidi" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" [[package]] name = "unicode-ident" @@ -939,13 +902,19 @@ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-normalization" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-properties" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4259d9d4425d9f0661581b804cb85fe66a4c631cadd8f490d1c13a35d5d9291" + [[package]] name = "vcpkg" version = "0.2.15" @@ -965,10 +934,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] -name = "wasm-bindgen" -version = "0.2.87" +name = "wasite" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" +checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" + +[[package]] +name = "wasm-bindgen" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -976,9 +951,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" dependencies = [ "bumpalo", "log", @@ -991,9 +966,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1001,9 +976,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", @@ -1014,15 +989,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" [[package]] name = "web-sys" -version = "0.3.64" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" dependencies = [ "js-sys", "wasm-bindgen", @@ -1030,11 +1005,12 @@ dependencies = [ [[package]] name = "whoami" -version = "1.4.1" +version = "1.5.1" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22fc3756b8a9133049b26c7f61ab35416c130e8c09b660f5b3958b446f52cc50" +checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9" dependencies = [ - "wasm-bindgen", + "redox_syscall 0.4.1", + "wasite", "web-sys", ] @@ -1044,7 +1020,16 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets", + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.5", ] [[package]] @@ -1053,13 +1038,29 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" +dependencies = [ + "windows_aarch64_gnullvm 0.52.5", + "windows_aarch64_msvc 0.52.5", + "windows_i686_gnu 0.52.5", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.5", + "windows_x86_64_gnu 0.52.5", + "windows_x86_64_gnullvm 0.52.5", + "windows_x86_64_msvc 0.52.5", ] [[package]] @@ -1068,38 +1069,86 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" + [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" +[[package]] +name = "windows_i686_gnu" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" + [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" +[[package]] +name = "windows_i686_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml index 6f100aafd5..27d01810bd 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml @@ -7,9 +7,9 @@ publish = false # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -native-tls = "0.2.11" +native-tls = "0.2.12" postgres-native-tls = "0.5.0" -tokio = { version = "1.33", features=["rt", "macros"] } +tokio = { version = "1.38", features=["rt", "macros"] } tokio-postgres = "0.7.10" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile index 1d3709803e..3e214de785 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile +++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.73 +FROM rust:1.79 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile index 9538cf4ed4..6006e61ee2 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile @@ -1,11 +1,11 @@ -FROM swift:5.8 AS build +FROM swift:5.10 AS build RUN apt-get -q update && apt-get -q install -y libssl-dev WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.8 +FROM swift:5.10 WORKDIR /app COPY --from=build /source/.build/release . 
CMD ["/app/PostgresClientKitExample"] diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved index 767443a9dd..6e8613095f 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved @@ -1,4 +1,5 @@ { + "originHash" : "8eff8c577ba246ce7824d3434839acefced2b1a1d2b1ad700554502538a50558", "pins" : [ { "identity" : "bluesocket", @@ -18,15 +19,6 @@ "version" : "2.0.2" } }, - { - "identity" : "openssl", - "kind" : "remoteSourceControl", - "location" : "https://github.com/Kitura/OpenSSL.git", - "state" : { - "revision" : "5dc8cb4f971135c17343e3c6df4f28904a0600e2", - "version" : "2.3.1" - } - }, { "identity" : "postgresclientkit", "kind" : "remoteSourceControl", @@ -37,5 +29,5 @@ } } ], - "version" : 2 + "version" : 3 } diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift index 48320dd023..a66d09c542 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift @@ -1,4 +1,4 @@ -// swift-tools-version:5.8 +// swift-tools-version:5.10 import PackageDescription let package = Package( diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile index 61e1d1bba6..d6815fbb5f 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile @@ -1,10 +1,10 @@ -FROM swift:5.8 AS build +FROM swift:5.10 AS build WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.8 +FROM swift:5.10 WORKDIR /app COPY --from=build /source/.build/release . 
CMD ["/app/PostgresNIOExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved index 9f13106011..0e5dfdafcb 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved @@ -1,12 +1,22 @@ { + "originHash" : "11b5dcece349a3e56a7a9a7d0af6d0f5b83dff321b43124a01b158ed7aac5302", "pins" : [ { "identity" : "postgres-nio", "kind" : "remoteSourceControl", "location" : "https://github.com/vapor/postgres-nio.git", "state" : { - "revision" : "061a0836d7c1887e04a975d1d2eaa2ef5fd7dfab", - "version" : "1.16.0" + "revision" : "5c268768890b062803a49f1358becc478f954265", + "version" : "1.21.5" + } + }, + { + "identity" : "swift-async-algorithms", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-async-algorithms.git", + "state" : { + "revision" : "da4e36f86544cdf733a40d59b3a2267e3a7bbf36", + "version" : "1.0.0" } }, { @@ -14,8 +24,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-atomics.git", "state" : { - "revision" : "6c89474e62719ddcc1e9614989fff2f68208fe10", - "version" : "1.1.0" + "revision" : "cd142fd2f64be2100422d658e7411e39489da985", + "version" : "1.2.0" } }, { @@ -41,8 +51,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-log.git", "state" : { - "revision" : "32e8d724467f8fe623624570367e3d50c5638e46", - "version" : "1.5.2" + "revision" : "e97a6fcb1ab07462881ac165fdbb37f067e205d5", + "version" : "1.5.4" } }, { @@ -50,8 +60,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-metrics.git", "state" : { - "revision" : "9b39d811a83cf18b79d7d5513b06f8b290198b10", - "version" : "2.3.3" + "revision" : "971ba26378ab69c43737ee7ba967a896cb74c0d1", + "version" : "2.4.1" } }, { @@ -59,8 +69,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio.git", "state" : { - "revision" : "6213ba7a06febe8fef60563a4a7d26a4085783cf", - "version" : "2.54.0" + "revision" : "635b2589494c97e48c62514bc8b37ced762e0a62", + "version" : "2.63.0" } }, { @@ -68,8 +78,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio-ssl.git", "state" : { - "revision" : "e866a626e105042a6a72a870c88b4c531ba05f83", - "version" : "2.24.0" + "revision" : "7c381eb6083542b124a6c18fae742f55001dc2b5", + "version" : "2.26.0" } }, { @@ -77,10 +87,28 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio-transport-services.git", "state" : { - "revision" : "41f4098903878418537020075a4d8a6e20a0b182", - "version" : "1.17.0" + "revision" : "6cbe0ed2b394f21ab0d46b9f0c50c6be964968ce", + "version" : "1.20.1" + } + }, + { + "identity" : "swift-service-lifecycle", + "kind" : "remoteSourceControl", + "location" : "https://github.com/swift-server/swift-service-lifecycle.git", + "state" : { + "revision" : "d58e6bf2b1ae2884cf204a8b5bcaaa7aae3c1ff0", + "version" : "2.6.0" + } + }, + { + "identity" : "swift-system", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-system.git", + "state" : { + "revision" : "025bcb1165deab2e20d4eaba79967ce73013f496", + "version" : "1.2.1" } } ], - "version" : 2 + "version" : 3 } diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift index a80590daa2..20bb10f76c 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift 
+++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift @@ -1,10 +1,10 @@ -// swift-tools-version:5.8 +// swift-tools-version:5.10 import PackageDescription let package = Package( name: "PostgresNIOExample", dependencies: [ - .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.16.0") + .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.21.5") ], targets: [ .executableTarget( diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile index 07e98c586b..45e8753f7e 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile +++ b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile @@ -1,4 +1,4 @@ -FROM node:20 +FROM node:22 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json index 4cedf56acd..19311808b6 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json @@ -5,24 +5,24 @@ "packages": { "": { "dependencies": { - "postgresql-client": "2.5.9" + "postgresql-client": "2.11.0" } }, "node_modules/doublylinked": { - "version": "2.5.2", - "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.2.tgz", - "integrity": "sha512-TDh0XfQWWDrfvGdAN0hLNIdkTXlw04nVCO5B/37ie4dV0yw1iT9ZrZ6tD+q/0SwXxeI/u6TF9Mxgd7s5/XYV6A==", + "version": "2.5.4", + "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.4.tgz", + "integrity": "sha512-jBCKDnFkEHJRjQvYEl5N9VngRV8ypHgw6a52OK4VN57eV2r2rYvgOx9uABdY78INNoW7S6auULp+KBVm/jfYqw==", "engines": { "node": ">= 10.0" } }, "node_modules/lightning-pool": { - "version": "4.2.1", - "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-4.2.1.tgz", - "integrity": "sha512-/pUIoGD3nzTH/wI4TYiJM3cLPeUOzGMTfFeBRuxaOAnwL0LZfwvqn5YFqsfyF98M0C3UXxWgfTz+Lu6okkno+g==", + "version": "4.2.2", + "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-4.2.2.tgz", + "integrity": "sha512-KW0Df0IbjNLxy5wAsdErTKYtHGwefLRQseHNksEctyaL7gtRwJT0nqLa2uiRdNYDwKSnZtqOjSjUNtfxmfH1qw==", "dependencies": { - "doublylinked": "^2.5.2", - "putil-promisify": "^1.8.6" + "doublylinked": "^2.5.3", + "putil-promisify": "^1.10.1" } }, "node_modules/obuf": { @@ -42,48 +42,47 @@ } }, "node_modules/postgresql-client": { - "version": "2.5.9", - "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.9.tgz", - "integrity": "sha512-s+kgTN6TfWLzehEyxw4Im4odnxVRCbZ0DEJzWS6SLowPAmB2m1/DOiOvZC0+ZVoi5AfbGE6SBqFxKguSyVAXZg==", + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.11.0.tgz", + "integrity": "sha512-QSPHcWVaiBG+JyASaDojOXvhRmsc2n8j2COdIjUDENFAtFls16Zy240asY2ENzZRQJUMAA8vpR8w4SAdI8jdbw==", + "license": "MIT", "dependencies": { - "doublylinked": "^2.5.2", - "lightning-pool": "^4.2.1", + "doublylinked": "^2.5.4", + "lightning-pool": "^4.2.2", "postgres-bytea": "^3.0.0", - "power-tasks": "^1.7.0", - "putil-merge": "^3.10.3", - "putil-promisify": "^1.10.0", + "power-tasks": "^1.7.3", + "putil-merge": "^3.12.1", + "putil-promisify": "^1.10.1", "putil-varhelpers": "^1.6.5" }, "engines": { - "node": ">=16.0", - "npm": ">=7.0.0" + "node": ">=16.0" } }, "node_modules/power-tasks": { - "version": "1.7.0", - "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.7.0.tgz", - "integrity": 
"sha512-rndZXCDxhuIDjPUJJvQwBDHaYagCkjvbPF/NA+omh/Ef4rAI9KtnvdA0k98dyiGpn1zXOpc6c2c0JWzg/xAhJg==", + "version": "1.7.3", + "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.7.3.tgz", + "integrity": "sha512-EnkjLfaX4PxFYHbUWyWzlE4I8SgctaW9jx4qQXrVRoELlqBXrxIMtuhHzRwsHv2qs1tO7efOcZa6/wDCdCjRfA==", "dependencies": { - "doublylinked": "^2.5.2", - "strict-typed-events": "^2.3.1" + "doublylinked": "^2.5.4", + "strict-typed-events": "^2.3.3" }, "engines": { - "node": ">=14.0", - "npm": ">=7.0.0" + "node": ">=16.0" } }, "node_modules/putil-merge": { - "version": "3.10.3", - "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.10.3.tgz", - "integrity": "sha512-B18CYi0/SmBYl9+fgowYWkgzJM/8XcLSeafHrFrGzwySQuOzLW0sOGx0CdFVp9zqaxgLctexUdGoSPpm6CPM6A==", + "version": "3.12.1", + "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.12.1.tgz", + "integrity": "sha512-4clPyRkJPrd5zl98AP7I3JamyXbx0ixe2CnfvGwoTyWSr7Kslcv8weoKjfU4BMBifkWIRL54l4OrNe97pYcDwQ==", "engines": { "node": ">= 10.0" } }, "node_modules/putil-promisify": { - "version": "1.10.0", - "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.10.0.tgz", - "integrity": "sha512-zYPoAoMxmf8pC+I75kRkYkVMwU4ZbZl82aTGema175bmhQ06BEJuuOlzOy1buQK9G+hCyQ+BFpzMTKAJhD8rZw==", + "version": "1.10.1", + "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.10.1.tgz", + "integrity": "sha512-1jm0egJNrj5eBDRj15Cg08RNHDV91OVEHeeYjAFRcs663PXxFokndxcJAGbaO6CSErCTp8eTgC8vuOF+fvXIAA==", "engines": { "node": ">= 14.0" } @@ -97,21 +96,21 @@ } }, "node_modules/strict-typed-events": { - "version": "2.3.1", - "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.3.1.tgz", - "integrity": "sha512-Z1h8KpVbrVg34Vwy/VwTD/tS9tFebH2h1Kvw4xnPkKpkISMwUpnqwU44rMfkKMpXbFCybIgDt7ARoCGTzURZhQ==", + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.3.3.tgz", + "integrity": "sha512-Vc8/N5giCVpO2n5BCskqDD9ns7RkdEq0pFd4yQk1ROULusJDbjORNvbtyEPxxK7Xqn9/NdW8XHLxv/PvUTgFsA==", "dependencies": { - "putil-promisify": "^1.8.5", - "ts-gems": "^2.2.0" + "putil-promisify": "^1.10.1", + "ts-gems": "^3.1.0" }, "engines": { "node": ">=16.0" } }, "node_modules/ts-gems": { - "version": "2.4.0", - "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.4.0.tgz", - "integrity": "sha512-SdugYAXoWvbqrxLodIObzxhEKacDxh5LfAJIiIkiH7q5thvuuCzdmkdTVQYf7uEDrEpPhfx4tokDMamdO3be9A==" + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-3.1.1.tgz", + "integrity": "sha512-Li1Z44FnxN06c1lBwFepb932jPYT+4eOvOmoiC30lOTkvOJOERr9xZFg3UA9y19OYO9CrW3ZSqNL66DUSuwFTw==" } } } diff --git a/test_runner/pg_clients/typescript/postgresql-client/package.json b/test_runner/pg_clients/typescript/postgresql-client/package.json index 12703ce89f..d2bba23d29 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package.json @@ -1,6 +1,6 @@ { "type": "module", "dependencies": { - "postgresql-client": "2.5.9" + "postgresql-client": "2.11.0" } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile index 07e98c586b..45e8753f7e 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile +++ b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile @@ -1,4 +1,4 @@ -FROM node:20 +FROM node:22 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json index 72cc452817..7f3f7f2e84 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json @@ -5,100 +5,142 @@ "packages": { "": { "dependencies": { - "@neondatabase/serverless": "0.4.18", - "ws": "8.13.0" + "@neondatabase/serverless": "0.9.4", + "ws": "8.17.1" } }, "node_modules/@neondatabase/serverless": { - "version": "0.4.18", - "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.4.18.tgz", - "integrity": "sha512-2TZnIyRGC/+0fjZ8TKCzaSTPUD94PM7NBGuantGZbUrbWyqBwGnUoRtdZAQ95qBKVHqORLVfymlv2NE+HQMFeA==", + "version": "0.9.4", + "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.4.tgz", + "integrity": "sha512-D0AXgJh6xkf+XTlsO7iwE2Q1w8981E1cLCPAALMU2YKtkF/1SF6BiAzYARZFYo175ON+b1RNIy9TdSFHm5nteg==", + "license": "MIT", "dependencies": { - "@types/pg": "8.6.6" + "@types/pg": "8.11.6" } }, "node_modules/@types/node": { - "version": "18.16.3", - "resolved": "https://registry.npmjs.org/@types/node/-/node-18.16.3.tgz", - "integrity": "sha512-OPs5WnnT1xkCBiuQrZA4+YAV4HEJejmHneyraIaxsbev5yCEr6KMwINNFP9wQeFIw8FWcoTqF3vQsa5CDaI+8Q==" + "version": "20.14.9", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.9.tgz", + "integrity": "sha512-06OCtnTXtWOZBJlRApleWndH4JsRVs1pDCc8dLSQp+7PpUpX3ePdHyeNSFTeSe7FtKyQkrlPvHwJOW3SLd8Oyg==", + "license": "MIT", + "dependencies": { + "undici-types": "~5.26.4" + } }, "node_modules/@types/pg": { - "version": "8.6.6", - "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.6.6.tgz", - "integrity": "sha512-O2xNmXebtwVekJDD+02udOncjVcMZQuTEQEMpKJ0ZRf5E7/9JJX3izhKUcUifBkyKpljyUM6BTgy2trmviKlpw==", + "version": "8.11.6", + "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.11.6.tgz", + "integrity": "sha512-/2WmmBXHLsfRqzfHW7BNZ8SbYzE8OSk7i3WjFYvfgRHj7S1xj+16Je5fUKv3lVdVzk/zn9TXOqf+avFCFIE0yQ==", + "license": "MIT", "dependencies": { "@types/node": "*", "pg-protocol": "*", - "pg-types": "^2.2.0" + "pg-types": "^4.0.1" } }, + "node_modules/obuf": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/obuf/-/obuf-1.1.2.tgz", + "integrity": "sha512-PX1wu0AmAdPqOL1mWhqmlOd8kOIZQwGZw6rh7uby9fTc5lhaOWFLX3I6R1hrF9k3zUY40e6igsLGkDXK92LJNg==", + "license": "MIT" + }, "node_modules/pg-int8": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/pg-int8/-/pg-int8-1.0.1.tgz", "integrity": "sha512-WCtabS6t3c8SkpDBUlb1kjOs7l66xsGdKpIPZsg4wR+B3+u9UAum2odSsF9tnvxg80h4ZxLWMy4pRjOsFIqQpw==", + "license": "ISC", "engines": { "node": ">=4.0.0" } }, - "node_modules/pg-protocol": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.6.0.tgz", - "integrity": "sha512-M+PDm637OY5WM307051+bsDia5Xej6d9IR4GwJse1qA1DIhiKlksvrneZOYQq42OM+spubpcNYEo2FcKQrDk+Q==" - }, - "node_modules/pg-types": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-2.2.0.tgz", - "integrity": "sha512-qTAAlrEsl8s4OiEQY69wDvcMIdQN6wdz5ojQiOy6YRMuynxenON0O5oCpJI6lshc6scgAY8qvJ2On/p+CXY0GA==", - "dependencies": { - "pg-int8": "1.0.1", - "postgres-array": "~2.0.0", - "postgres-bytea": "~1.0.0", - "postgres-date": "~1.0.4", - "postgres-interval": "^1.1.0" - }, + "node_modules/pg-numeric": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/pg-numeric/-/pg-numeric-1.0.2.tgz", + "integrity": 
"sha512-BM/Thnrw5jm2kKLE5uJkXqqExRUY/toLHda65XgFTBTFYZyopbKjBe29Ii3RbkvlsMoFwD+tHeGaCjjv0gHlyw==", + "license": "ISC", "engines": { "node": ">=4" } }, + "node_modules/pg-protocol": { + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.6.1.tgz", + "integrity": "sha512-jPIlvgoD63hrEuihvIg+tJhoGjUsLPn6poJY9N5CnlPd91c2T18T/9zBtLxZSb1EhYxBRoZJtzScCaWlYLtktg==", + "license": "MIT" + }, + "node_modules/pg-types": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-4.0.2.tgz", + "integrity": "sha512-cRL3JpS3lKMGsKaWndugWQoLOCoP+Cic8oseVcbr0qhPzYD5DWXK+RZ9LY9wxRf7RQia4SCwQlXk0q6FCPrVng==", + "license": "MIT", + "dependencies": { + "pg-int8": "1.0.1", + "pg-numeric": "1.0.2", + "postgres-array": "~3.0.1", + "postgres-bytea": "~3.0.0", + "postgres-date": "~2.1.0", + "postgres-interval": "^3.0.0", + "postgres-range": "^1.1.1" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/postgres-array": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-2.0.0.tgz", - "integrity": "sha512-VpZrUqU5A69eQyW2c5CA1jtLecCsN2U/bD6VilrFDWq5+5UIEVO7nazS3TEcHf1zuPYO/sqGvUvW62g86RXZuA==", + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-3.0.2.tgz", + "integrity": "sha512-6faShkdFugNQCLwucjPcY5ARoW1SlbnrZjmGl0IrrqewpvxvhSLHimCVzqeuULCbG0fQv7Dtk1yDbG3xv7Veog==", + "license": "MIT", "engines": { - "node": ">=4" + "node": ">=12" } }, "node_modules/postgres-bytea": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-1.0.0.tgz", - "integrity": "sha512-xy3pmLuQqRBZBXDULy7KbaitYqLcmxigw14Q5sj8QBVLqEwXfeybIKVWiqAXTlcvdvb0+xkOtDbfQMOf4lST1w==", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-3.0.0.tgz", + "integrity": "sha512-CNd4jim9RFPkObHSjVHlVrxoVQXz7quwNFpz7RY1okNNme49+sVyiTvTRobiLV548Hx/hb1BG+iE7h9493WzFw==", + "license": "MIT", + "dependencies": { + "obuf": "~1.1.2" + }, "engines": { - "node": ">=0.10.0" + "node": ">= 6" } }, "node_modules/postgres-date": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-1.0.7.tgz", - "integrity": "sha512-suDmjLVQg78nMK2UZ454hAG+OAW+HQPZ6n++TNDUX+L0+uUlLywnoxJKDou51Zm+zTCjrCl0Nq6J9C5hP9vK/Q==", + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-2.1.0.tgz", + "integrity": "sha512-K7Juri8gtgXVcDfZttFKVmhglp7epKb1K4pgrkLxehjqkrgPhfG6OO8LHLkfaqkbpjNRnra018XwAr1yQFWGcA==", + "license": "MIT", "engines": { - "node": ">=0.10.0" + "node": ">=12" } }, "node_modules/postgres-interval": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-1.2.0.tgz", - "integrity": "sha512-9ZhXKM/rw350N1ovuWHbGxnGh/SNJ4cnxHiM0rxE4VN41wsg8P8zWn9hv/buK00RP4WvlOyr/RBDiptyxVbkZQ==", - "dependencies": { - "xtend": "^4.0.0" - }, + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-3.0.0.tgz", + "integrity": "sha512-BSNDnbyZCXSxgA+1f5UU2GmwhoI0aU5yMxRGO8CdFEcY2BQF9xm/7MqKnYoM1nJDk8nONNWDk9WeSmePFhQdlw==", + "license": "MIT", "engines": { - "node": ">=0.10.0" + "node": ">=12" } }, + "node_modules/postgres-range": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/postgres-range/-/postgres-range-1.1.4.tgz", + "integrity": "sha512-i/hbxIE9803Alj/6ytL7UHQxRvZkI9O4Sy+J3HGc4F4oo/2eQAjTSNJ0bfxyse3bH0nuVesCk+3IRLaMtG3H6w==", + "license": "MIT" + }, + 
"node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "license": "MIT" + }, "node_modules/ws": { - "version": "8.13.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.13.0.tgz", - "integrity": "sha512-x9vcZYTrFPC7aSIbj7sRCYo7L/Xb8Iy+pW0ng0wt2vCJv7M9HOMy0UoN3rr+IFC7hb7vXoqS+P9ktyLLLhO+LA==", + "version": "8.17.1", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.17.1.tgz", + "integrity": "sha512-6XQFvXTkbfUOZOKKILFG1PDK2NDQs4azKQl26T0YS5CxqWLgXajbPZ+h4gZekJyRqFU8pvnbAbbs/3TgRPy+GQ==", "engines": { "node": ">=10.0.0" }, @@ -114,14 +156,6 @@ "optional": true } } - }, - "node_modules/xtend": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", - "integrity": "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==", - "engines": { - "node": ">=0.4" - } } } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/package.json b/test_runner/pg_clients/typescript/serverless-driver/package.json index 840c7a5c4c..f791d184c5 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package.json @@ -1,7 +1,7 @@ { "type": "module", "dependencies": { - "@neondatabase/serverless": "0.4.18", - "ws": "8.13.0" + "@neondatabase/serverless": "0.9.4", + "ws": "8.17.1" } } diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index 0e390ba9e5..f83b44a7ad 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -1,6 +1,6 @@ +from fixtures.common_types import TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import TimelineId from fixtures.utils import query_scalar @@ -20,7 +20,9 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): } ) - pageserver_http.configure_failpoints(("flush-frozen-pausable", "sleep(10000)")) + failpoint = "flush-frozen-pausable" + + pageserver_http.configure_failpoints((failpoint, "sleep(10000)")) endpoint_branch0 = env.endpoints.create_start("main", tenant_id=tenant) branch0_cur = endpoint_branch0.connect().cursor() @@ -45,7 +47,6 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): # Create branch1. env.neon_cli.create_branch("branch1", "main", tenant_id=tenant, ancestor_start_lsn=lsn_100) endpoint_branch1 = env.endpoints.create_start("branch1", tenant_id=tenant) - log.info("postgres is running on 'branch1' branch") branch1_cur = endpoint_branch1.connect().cursor() branch1_timeline = TimelineId(query_scalar(branch1_cur, "SHOW neon.timeline_id")) @@ -68,7 +69,6 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): # Create branch2. 
env.neon_cli.create_branch("branch2", "branch1", tenant_id=tenant, ancestor_start_lsn=lsn_200) endpoint_branch2 = env.endpoints.create_start("branch2", tenant_id=tenant) - log.info("postgres is running on 'branch2' branch") branch2_cur = endpoint_branch2.connect().cursor() branch2_timeline = TimelineId(query_scalar(branch2_cur, "SHOW neon.timeline_id")) @@ -98,3 +98,5 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): assert query_scalar(branch1_cur, "SELECT count(*) FROM foo") == 200000 assert query_scalar(branch2_cur, "SELECT count(*) FROM foo") == 300000 + + pageserver_http.configure_failpoints((failpoint, "off")) diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index ed389b1aa2..bb337d9cc1 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -2,13 +2,13 @@ from dataclasses import dataclass from typing import Generator, Optional import pytest +from fixtures.common_types import TenantId from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, ) -from fixtures.pageserver.http import PageserverApiException, TenantConfig +from fixtures.pageserver.http import TenantConfig from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.types import TenantId from fixtures.utils import wait_until @@ -17,9 +17,11 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv: neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() - # eviction might be the first one after an attach to access the layers - env.pageserver.allowed_errors.append( - ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction" + env.pageserver.allowed_errors.extend( + [ + # eviction might be the first one after an attach to access the layers + ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction", + ] ) assert isinstance(env.pageserver_remote_storage, LocalFsStorage) return env @@ -54,19 +56,16 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N env.pageserver.allowed_errors.extend( [ - # This fixture detaches the tenant, and tests using it will tend to re-attach it - # shortly after. 
There may be un-processed deletion_queue validations from the - # initial attachment - ".*Dropped remote consistent LSN updates.*", # This fixture is for tests that will intentionally generate 400 responses ".*Error processing HTTP request: Bad request", ] ) - def log_contains_bad_request(): - env.pageserver.log_contains(".*Error processing HTTP request: Bad request") - - wait_until(50, 0.1, log_contains_bad_request) + wait_until( + 50, + 0.1, + lambda: env.pageserver.assert_log_contains(".*Error processing HTTP request: Bad request"), + ) def test_null_body(negative_env: NegativeTests): @@ -77,8 +76,8 @@ def test_null_body(negative_env: NegativeTests): tenant_id = negative_env.tenant_id ps_http = env.pageserver.http_client() - res = ps_http.post( - f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", + res = ps_http.put( + f"{ps_http.base_url}/v1/tenant/{tenant_id}/location_config", data=b"null", headers={"Content-Type": "application/json"}, ) @@ -94,35 +93,16 @@ def test_null_config(negative_env: NegativeTests): tenant_id = negative_env.tenant_id ps_http = env.pageserver.http_client() - res = ps_http.post( - f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", - data=b'{"config": null}', + res = ps_http.put( + f"{ps_http.base_url}/v1/tenant/{tenant_id}/location_config", + json={"mode": "AttachedSingle", "generation": 1, "tenant_conf": None}, headers={"Content-Type": "application/json"}, ) assert res.status_code == 400 -def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests): - """ - If we send a config with unknown keys, the request should be rejected with status 400. - """ - - env = negative_env.neon_env - tenant_id = negative_env.tenant_id - - config_with_unknown_keys = { - "compaction_period": "1h", - "this_key_does_not_exist": "some value", - } - - with pytest.raises(PageserverApiException) as e: - env.pageserver.tenant_attach(tenant_id, config=config_with_unknown_keys) - assert e.type == PageserverApiException - assert e.value.status_code == 400 - - @pytest.mark.parametrize("content_type", [None, "application/json"]) -def test_no_config(positive_env: NeonEnv, content_type: Optional[str]): +def test_empty_config(positive_env: NeonEnv, content_type: Optional[str]): """ When the 'config' body attribute is omitted, the request should be accepted and the tenant should use the default configuration @@ -136,11 +116,13 @@ def test_no_config(positive_env: NeonEnv, content_type: Optional[str]): ps_http.tenant_detach(tenant_id) assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()] - body = {"generation": env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)} - - ps_http.post( - f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", - json=body, + ps_http.put( + f"{ps_http.base_url}/v1/tenant/{tenant_id}/location_config", + json={ + "mode": "AttachedSingle", + "generation": env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id), + "tenant_conf": {}, + }, headers=None if content_type else {"Content-Type": "application/json"}, ).raise_for_status() @@ -160,23 +142,36 @@ def test_fully_custom_config(positive_env: NeonEnv): "compaction_target_size": 1048576, "checkpoint_distance": 10000, "checkpoint_timeout": "13m", + "compaction_algorithm": { + "kind": "tiered", + }, "eviction_policy": { "kind": "LayerAccessThreshold", "period": "20s", "threshold": "23h", }, "evictions_low_residence_duration_metric_threshold": "2days", - "gc_feedback": True, "gc_horizon": 23 * (1024 * 1024), "gc_period": "2h 13m", "heatmap_period": "10m", 
"image_creation_threshold": 7, "pitr_interval": "1m", "lagging_wal_timeout": "23m", + "lazy_slru_download": True, "max_lsn_wal_lag": 230000, "min_resident_size_override": 23, - "trace_read_requests": True, + "timeline_get_throttle": { + "task_kinds": ["PageRequestHandler"], + "initial": 0, + "refill_interval": "1s", + "refill_amount": 1000, + "max": 1000, + }, "walreceiver_connect_timeout": "13m", + "image_layer_creation_check_threshold": 1, + "switch_aux_file_policy": "cross-validation", + "lsn_lease_length": "1m", + "lsn_lease_length_for_ts": "5s", } ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index bd87ff3efd..780c0e1602 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -4,13 +4,13 @@ from pathlib import Path import psycopg2 import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, PgProtocol, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient -from fixtures.types import TenantId, TimelineId def assert_client_authorized(env: NeonEnv, http_client: PageserverHttpClient): @@ -105,7 +105,7 @@ def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder): # The neon_local tool generates one key pair at a hardcoded path by default. # As a preparation for our test, move the public key of the key pair into a # directory at the same location as the hardcoded path by: - # 1. moving the the file at `configured_pub_key_path` to a temporary location + # 1. moving the file at `configured_pub_key_path` to a temporary location # 2. creating a new directory at `configured_pub_key_path` # 3. moving the file from the temporary location into the newly created directory configured_pub_key_path = Path(env.repo_dir) / "auth_public_key.pem" @@ -211,7 +211,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): def check_pageserver(expect_success: bool, **conn_kwargs): check_connection( env.pageserver, - f"get_last_record_rlsn {env.initial_tenant} {timeline_id}", + f"pagestream_v2 {env.initial_tenant} {env.initial_timeline}", expect_success, **conn_kwargs, ) @@ -225,9 +225,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): check_pageserver(True, password=pageserver_token) - env.pageserver.allowed_errors.append( - ".*SafekeeperData scope makes no sense for Pageserver.*" - ) + env.pageserver.allowed_errors.append(".*JWT scope '.+' is ineligible for Pageserver auth.*") check_pageserver(False, password=safekeeper_token) def check_safekeeper(expect_success: bool, **conn_kwargs): diff --git a/test_runner/regress/test_aux_files.py b/test_runner/regress/test_aux_files.py new file mode 100644 index 0000000000..5328aef156 --- /dev/null +++ b/test_runner/regress/test_aux_files.py @@ -0,0 +1,76 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + AuxFileStore, + NeonEnvBuilder, + logical_replication_sync, +) + + +def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main") + client = env.pageserver.http_client() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + tenant_config = client.tenant_config(tenant_id).effective_config + tenant_config["switch_aux_file_policy"] = AuxFileStore.V2 + client.set_tenant_config(tenant_id, tenant_config) + # aux file v2 is enabled on the write path, so for 
now, it should be unset (or null)
+    assert (
+        client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)["last_aux_file_policy"]
+        is None
+    )
+
+    pg_conn = endpoint.connect()
+    cur = pg_conn.cursor()
+
+    cur.execute("create table t(pk integer primary key, payload integer)")
+    cur.execute(
+        "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));"
+    )
+    cur.execute("create publication pub1 for table t, replication_example")
+
+    # Now start the subscriber; aux files will be created at this point. TODO: find better ways of testing aux files (e.g., neon_test_utils)
+    # instead of going through the full logical replication process.
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)")
+    vanilla_pg.safe_psql(
+        "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120), testcolumn1 int, testcolumn2 int, testcolumn3 int);"
+    )
+    connstr = endpoint.connstr().replace("'", "''")
+    log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
+    vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
+
+    # Wait for the logical replication channel to be established
+    logical_replication_sync(vanilla_pg, endpoint)
+    vanilla_pg.stop()
+    endpoint.stop()
+
+    with env.pageserver.http_client() as client:
+        # aux file v2 flag should be enabled at this point
+        assert (
+            client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"]
+            == AuxFileStore.V2
+        )
+    with env.pageserver.http_client() as client:
+        tenant_config = client.tenant_config(tenant_id).effective_config
+        tenant_config["switch_aux_file_policy"] = "V1"
+        client.set_tenant_config(tenant_id, tenant_config)
+        # the flag should still be enabled
+        assert (
+            client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
+                "last_aux_file_policy"
+            ]
+            == AuxFileStore.V2
+        )
+    env.pageserver.restart()
+    with env.pageserver.http_client() as client:
+        # aux file v2 flag should be persisted
+        assert (
+            client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
+                "last_aux_file_policy"
+            ]
+            == AuxFileStore.V2
+        )
diff --git a/test_runner/regress/test_backpressure.py b/test_runner/regress/test_backpressure.py
index bc3faf9271..819912dd05 100644
--- a/test_runner/regress/test_backpressure.py
+++ b/test_runner/regress/test_backpressure.py
@@ -107,7 +107,6 @@ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder):
     # which is needed for backpressure_lsns() to work
     endpoint.respec(skip_pg_catalog_updates=False)
     endpoint.start()
-    log.info("postgres is running on 'test_backpressure' branch")
 
     # setup check thread
     check_stop_event = threading.Event()
diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py
index ba0624c730..392b73c1f7 100644
--- a/test_runner/regress/test_bad_connection.py
+++ b/test_runner/regress/test_bad_connection.py
@@ -1,31 +1,47 @@
 import random
 import time
 
+import psycopg2.errors
+import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
 
 
+@pytest.mark.timeout(600)
 def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
-    env.pageserver.allowed_errors.append(".*simulated connection error.*")
+    env.pageserver.allowed_errors.append(".*simulated connection error.*")  # this is never hit
+    # the real reason (Simulated Connection Error) is on the next line, and we cannot filter this out.
+    env.pageserver.allowed_errors.append(
+        ".*ERROR error in page_service connection task: Postgres query error"
+    )
+
+    # Enable failpoint before starting everything else up so that we exercise the retry
+    # on fetching basebackup
     pageserver_http = env.pageserver.http_client()
+    pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)"))
+
     env.neon_cli.create_branch("test_compute_pageserver_connection_stress")
     endpoint = env.endpoints.create_start("test_compute_pageserver_connection_stress")
 
-    # Enable failpoint after starting everything else up so that loading initial
-    # basebackup doesn't fail
-    pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)"))
-
     pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 
+    def execute_retry_on_timeout(query):
+        while True:
+            try:
+                cur.execute(query)
+                return
+            except psycopg2.errors.QueryCanceled:
+                log.info(f"Query '{query}' timed out - retrying")
+
     # Create table, and insert some rows. Make it big enough that it doesn't fit in
     # shared_buffers, otherwise the SELECT after restart will just return the answer
     # from shared_buffers without hitting the page server, which defeats the point
     # of this test.
-    cur.execute("CREATE TABLE foo (t text)")
-    cur.execute(
+    execute_retry_on_timeout("CREATE TABLE foo (t text)")
+    execute_retry_on_timeout(
         """
         INSERT INTO foo
             SELECT 'long string to consume some space' || g
@@ -34,7 +50,7 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder):
     )
 
     # Verify that the table is larger than shared_buffers
-    cur.execute(
+    execute_retry_on_timeout(
         """
         select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
         from pg_settings where name = 'shared_buffers'
@@ -45,16 +61,20 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder):
     log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
     assert int(row[0]) < int(row[1])
 
-    cur.execute("SELECT count(*) FROM foo")
+    execute_retry_on_timeout("SELECT count(*) FROM foo")
     assert cur.fetchone() == (100000,)
 
     end_time = time.time() + 30
     times_executed = 0
     while time.time() < end_time:
         if random.random() < 0.5:
-            cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')")
+            execute_retry_on_timeout("INSERT INTO foo VALUES ('stas'), ('heikki')")
         else:
-            cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10")
+            execute_retry_on_timeout("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10")
             cur.fetchall()
         times_executed += 1
     log.info(f"Workload executed {times_executed} times")
+
+    # do a graceful shutdown, which would have caught the allowed_errors before
+    # https://github.com/neondatabase/neon/pull/8632
+    env.pageserver.stop()
diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py
index bdc944f352..f2e3855c12 100644
--- a/test_runner/regress/test_branch_and_gc.py
+++ b/test_runner/regress/test_branch_and_gc.py
@@ -2,10 +2,10 @@ import threading
 import time
 
 import pytest
+from fixtures.common_types import Lsn, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv
 from fixtures.pageserver.http import TimelineCreate406
-from fixtures.types import Lsn, TimelineId
 from fixtures.utils import query_scalar
 
 
@@ -65,8 +65,8 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str):
             "compaction_period": "1 s",
             "compaction_threshold": "2",
             "image_creation_threshold": "1",
-            # set PITR interval to be small, so we can do GC
-            "pitr_interval": "1 s",
+            #
Disable PITR, this test will set an explicit space-based GC limit + "pitr_interval": "0 s", } ) @@ -120,12 +120,12 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): env = neon_simple_env pageserver_http_client = env.pageserver.http_client() - env.pageserver.allowed_errors.extend( - [ - ".*invalid branch start lsn: less than latest GC cutoff.*", - ".*invalid branch start lsn: less than planned GC cutoff.*", - ] - ) + error_regexes = [ + ".*invalid branch start lsn: less than latest GC cutoff.*", + ".*invalid branch start lsn: less than planned GC cutoff.*", + ] + env.pageserver.allowed_errors.extend(error_regexes) + env.storage_controller.allowed_errors.extend(error_regexes) # Disable background GC but set the `pitr_interval` to be small, so GC can delete something tenant, _ = env.neon_cli.create_tenant( diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index 9879254897..0a5336f5a2 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -1,8 +1,8 @@ import pytest +from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.http import TimelineCreate406 -from fixtures.types import Lsn, TimelineId from fixtures.utils import print_gc_result, query_scalar @@ -11,17 +11,18 @@ from fixtures.utils import print_gc_result, query_scalar # def test_branch_behind(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) - env.pageserver.allowed_errors.extend( - [".*invalid branch start lsn.*", ".*invalid start lsn .* for ancestor timeline.*"] - ) + error_regexes = [ + ".*invalid branch start lsn.*", + ".*invalid start lsn .* for ancestor timeline.*", + ] + env.pageserver.allowed_errors.extend(error_regexes) + env.storage_controller.allowed_errors.extend(error_regexes) # Branch at the point where only 100 rows were inserted branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind") endpoint_main = env.endpoints.create_start("test_branch_behind") - log.info("postgres is running on 'test_branch_behind' branch") main_cur = endpoint_main.connect().cursor() diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 9a0b91b54e..fc74707639 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -1,9 +1,11 @@ import random import threading import time +from concurrent.futures import ThreadPoolExecutor from typing import List import pytest +from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -13,11 +15,9 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import wait_until_tenant_active -from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException -from requests.exceptions import RetryError # Test branch creation @@ -84,11 +84,11 @@ def test_branching_with_pgbench( threads = [] if ty == "cascade": - env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(i), 
tenant_id=tenant) + env.neon_cli.create_branch(f"b{i + 1}", f"b{i}", tenant_id=tenant) else: - env.neon_cli.create_branch("b{}".format(i + 1), "b0", tenant_id=tenant) + env.neon_cli.create_branch(f"b{i + 1}", "b0", tenant_id=tenant) - endpoints.append(env.endpoints.create_start("b{}".format(i + 1), tenant_id=tenant)) + endpoints.append(env.endpoints.create_start(f"b{i + 1}", tenant_id=tenant)) threads.append( threading.Thread(target=run_pgbench, args=(endpoints[-1].connstr(),), daemon=True) @@ -150,7 +150,7 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.pageserver.allowed_errors.extend( [ ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*", - ".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading", + ".*page_service_conn_main.*: query handler for 'basebackup .* ERROR: Not found: Timeline", ] ) ps_http = env.pageserver.http_client() @@ -175,10 +175,12 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.neon_cli.map_branch(initial_branch, env.initial_tenant, env.initial_timeline) - with pytest.raises(RuntimeError, match="is not active, state: Loading"): - env.endpoints.create_start(initial_branch, tenant_id=env.initial_tenant) + with pytest.raises(RuntimeError, match="ERROR: Not found: Timeline"): + env.endpoints.create_start( + initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2 + ) + ps_http.configure_failpoints(("before-upload-index-pausable", "off")) finally: - # FIXME: paused uploads bother shutdown env.pageserver.stop(immediate=True) t.join() @@ -192,8 +194,11 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder env = neon_env_builder.init_configs() env.start() - env.pageserver.allowed_errors.append( - ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*" + env.pageserver.allowed_errors.extend( + [ + ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*", + ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: .*Cannot branch off the timeline that's not present in pageserver.*", + ] ) ps_http = env.pageserver.http_client() @@ -215,7 +220,10 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder branch_id = TimelineId.generate() - with pytest.raises(RetryError, match="too many 503 error responses"): + with pytest.raises( + PageserverApiException, + match="Cannot branch off the timeline that's not present in pageserver", + ): ps_http.timeline_create( env.pg_version, env.initial_tenant, @@ -347,6 +355,92 @@ def test_non_uploaded_branch_is_deleted_after_restart(neon_env_builder: NeonEnvB ps_http.timeline_detail(env.initial_tenant, branch_id) +def test_duplicate_creation(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_configs() + env.start() + env.pageserver.tenant_create(env.initial_tenant) + + success_timeline = TimelineId.generate() + log.info(f"Creating timeline {success_timeline}") + ps_http = env.pageserver.http_client() + success_result = ps_http.timeline_create( + env.pg_version, env.initial_tenant, success_timeline, timeout=60 + ) + + ps_http.configure_failpoints(("timeline-creation-after-uninit", "pause")) + + def start_creating_timeline(): + log.info(f"Creating (expect failure) timeline {env.initial_timeline}") + with pytest.raises(RequestException): + ps_http.timeline_create( + env.pg_version, 
env.initial_tenant, env.initial_timeline, timeout=60
+            )
+
+    t = threading.Thread(target=start_creating_timeline)
+    try:
+        t.start()
+
+        wait_until_paused(env, "timeline-creation-after-uninit")
+
+        # While timeline creation is in progress, trying to create a timeline
+        # again with the same ID should return 409
+        with pytest.raises(
+            PageserverApiException, match="creation of timeline with the given ID is in progress"
+        ):
+            ps_http.timeline_create(
+                env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60
+            )
+
+        # Creation of a timeline already successfully created is idempotent, and is not impeded by some
+        # other timeline creation with a different TimelineId being stuck.
+        repeat_result = ps_http.timeline_create(
+            env.pg_version, env.initial_tenant, success_timeline, timeout=60
+        )
+        # remote_consistent_lsn_visible will be published only after we've
+        # confirmed the generation, which is not part of what we await during
+        # timeline creation (uploads). Mask it out here to avoid flakiness.
+        del success_result["remote_consistent_lsn_visible"]
+        del repeat_result["remote_consistent_lsn_visible"]
+        assert repeat_result == success_result
+    finally:
+        env.pageserver.stop(immediate=True)
+        t.join()
+
+    # now without a failpoint
+    env.pageserver.start()
+
+    wait_until_tenant_active(ps_http, env.initial_tenant)
+
+    with pytest.raises(PageserverApiException, match="not found"):
+        ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
+
+    # The one successfully created timeline should still be there.
+    assert len(ps_http.timeline_list(tenant_id=env.initial_tenant)) == 1
+
+
+def test_branching_while_stuck_find_gc_cutoffs(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+
+    client = env.pageserver.http_client()
+
+    failpoint = "Timeline::find_gc_cutoffs-pausable"
+
+    client.configure_failpoints((failpoint, "pause"))
+
+    with ThreadPoolExecutor(max_workers=1) as exec:
+        completion = exec.submit(client.timeline_gc, env.initial_tenant, env.initial_timeline, None)
+
+        wait_until_paused(env, failpoint)
+
+        env.neon_cli.create_branch(
+            tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch"
+        )
+
+        client.configure_failpoints((failpoint, "off"))
+
+        completion.result()
+
+
 def wait_until_paused(env: NeonEnv, failpoint: str):
     found = False
     msg = f"at failpoint {failpoint}"
diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py
index b046ed7f1b..5ec9a22ba1 100644
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -3,6 +3,7 @@ import os
 from typing import List, Tuple
 
 import pytest
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -11,7 +12,6 @@ from fixtures.neon_fixtures import (
     wait_for_last_flush_lsn,
 )
 from fixtures.pg_version import PgVersion
-from fixtures.types import TenantId, TimelineId
 
 
 # Test restarting page server, while safekeeper and compute node keep
@@ -21,13 +21,13 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
 
     env.pageserver.allowed_errors.extend(
         [
-            ".*get_value_reconstruct_data for layer .*",
+            ".*get_values_reconstruct_data for layer .*",
             ".*could not find data for key.*",
             ".*is not active. Current state: Broken.*",
             ".*will not become active.
Current state: Broken.*", ".*failed to load metadata.*", ".*load failed.*load local timeline.*", - ".*layer loading failed permanently: load layer: .*", + ".*: layer load failed, assuming permanent failure:.*", ] ) @@ -51,14 +51,8 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): (tenant0, timeline0, pg0) = tenant_timelines[0] log.info(f"Timeline {tenant0}/{timeline0} is left intact") - (tenant1, timeline1, pg1) = tenant_timelines[1] - metadata_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/metadata" - with open(metadata_path, "w") as f: - f.write("overwritten with garbage!") - log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled") - - (tenant2, timeline2, pg2) = tenant_timelines[2] - timeline_path = f"{env.pageserver.workdir}/tenants/{tenant2}/timelines/{timeline2}/" + (tenant1, timeline1, pg1) = tenant_timelines[2] + timeline_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/" for filename in os.listdir(timeline_path): if filename.startswith("00000"): # Looks like a layer file. Corrupt it @@ -67,7 +61,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): with open(p, "wb") as f: f.truncate(0) f.truncate(size) - log.info(f"Timeline {tenant2}/{timeline2} got its local layer files spoiled") + log.info(f"Timeline {tenant1}/{timeline1} got its local layer files spoiled") env.pageserver.start() @@ -75,19 +69,15 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): pg0.start() assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100 - # Tenant with corrupt local metadata works: remote storage is authoritative for metadata - pg1.start() - assert pg1.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100 - # Second timeline will fail during basebackup, because the local layer file is corrupt. # It will fail when we try to read (and reconstruct) a page from it, ergo the error message. 
# (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match="get_value_reconstruct_data for layer ") as err: - pg2.start() + with pytest.raises(Exception, match="get_values_reconstruct_data for layer ") as err: + pg1.start() log.info( - f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}" + f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}" ) @@ -204,7 +194,7 @@ def test_timeline_init_break_before_checkpoint_recreate( assert timeline_id == new_timeline_id -def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilder): +def test_timeline_create_break_after_dir_creation(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -214,9 +204,9 @@ def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilde old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) initial_timeline_dirs = [d for d in timelines_dir.iterdir()] - # Introduce failpoint when creating a new timeline uninit mark, before any other files were created - pageserver_http.configure_failpoints(("after-timeline-uninit-mark-creation", "return")) - with pytest.raises(Exception, match="after-timeline-uninit-mark-creation"): + # Introduce failpoint when creating a new timeline, right after creating its directory + pageserver_http.configure_failpoints(("after-timeline-dir-creation", "return")) + with pytest.raises(Exception, match="after-timeline-dir-creation"): _ = pageserver_http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate()) # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index adb67a579e..34791e5988 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -3,9 +3,16 @@ import asyncio from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.remote_storage import RemoteStorageKind +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response -def test_change_pageserver(neon_env_builder: NeonEnvBuilder): +def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver): + """ + A relatively low level test of reconfiguring a compute's pageserver at runtime. Usually this + is all done via the storage controller, but this test will disable the storage controller's compute + notifications, and instead update endpoints directly. + """ num_connections = 3 neon_env_builder.num_pageservers = 2 @@ -14,14 +21,24 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): ) env = neon_env_builder.init_start() - for pageserver in env.pageservers: - # This test dual-attaches a tenant, one of the pageservers will therefore - # be running with a stale generation. 
- pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") + neon_env_builder.control_plane_compute_hook_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + ) + + def ignore_notify(request: Request): + # This test does direct updates to compute configuration: disable the storage controller's notification + log.info(f"Ignoring storage controller compute notification: {request.json}") + return Response(status=200) + + make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler( + ignore_notify + ) env.neon_cli.create_branch("test_change_pageserver") endpoint = env.endpoints.create_start("test_change_pageserver") + # Put this tenant into a dual-attached state + assert env.get_tenant_pageserver(env.initial_tenant) == env.pageservers[0] alt_pageserver_id = env.pageservers[1].id env.pageservers[1].tenant_attach(env.initial_tenant) @@ -77,6 +94,7 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): env.pageservers[ 0 ].stop() # Stop the old pageserver just to make sure we're reading from the new one + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) execute("SELECT count(*) FROM foo") assert fetchone() == (100000,) @@ -85,11 +103,12 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): # the endpoint. Whereas the previous reconfiguration was like a healthy migration, this # is more like what happens in an unexpected pageserver failure. # - # Since we're dual-attached, need to tip-off attachment service to treat the one we're + # Since we're dual-attached, need to tip-off storage controller to treat the one we're # about to start as the attached pageserver - env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[0].id) env.pageservers[0].start() env.pageservers[1].stop() + env.storage_controller.node_configure(env.pageservers[1].id, {"availability": "Offline"}) + env.storage_controller.reconcile_until_idle() endpoint.reconfigure(pageserver_id=env.pageservers[0].id) @@ -97,10 +116,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): assert fetchone() == (100000,) env.pageservers[0].stop() - # Since we're dual-attached, need to tip-off attachment service to treat the one we're - # about to start as the attached pageserver - env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[1].id) env.pageservers[1].start() + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) + env.storage_controller.reconcile_until_idle() # Test a (former) bug where a child process spins without updating its connection string # by executing a query separately. This query will hang until we issue the reconfigure. 
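(The migration sequence this test drives, condensed to its fixture calls: a sketch for orientation only, using just the calls that appear in the hunks above, not a runnable test by itself.)

    # Fail over from pageserver 0 to pageserver 1, then back again:
    env.pageservers[0].stop()
    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"})
    endpoint.reconfigure(pageserver_id=env.pageservers[1].id)  # reads are now served by ps 1
    env.pageservers[0].start()
    env.pageservers[1].stop()
    env.storage_controller.node_configure(env.pageservers[1].id, {"availability": "Offline"})
    env.storage_controller.reconcile_until_idle()  # let the controller settle before failing back
    endpoint.reconfigure(pageserver_id=env.pageservers[0].id)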
diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py
index f22eca02cc..26e6e336b9 100644
--- a/test_runner/regress/test_clog_truncate.py
+++ b/test_runner/regress/test_clog_truncate.py
@@ -25,7 +25,6 @@ def test_clog_truncate(neon_simple_env: NeonEnv):
     ]
     endpoint = env.endpoints.create_start("test_clog_truncate", config_lines=config)
-    log.info("postgres is running on test_clog_truncate branch")
 
     # Install extension containing function needed for test
     endpoint.safe_psql("CREATE EXTENSION neon_test_utils")
@@ -62,7 +61,6 @@ def test_clog_truncate(neon_simple_env: NeonEnv):
         "test_clog_truncate_new", "test_clog_truncate", ancestor_start_lsn=lsn_after_truncation
     )
     endpoint2 = env.endpoints.create_start("test_clog_truncate_new")
-    log.info("postgres is running on test_clog_truncate_new branch")
 
     # check that new node doesn't contain truncated segment
     pg_xact_0000_path_new = os.path.join(endpoint2.pg_xact_dir_path(), "0000")
diff --git a/test_runner/regress/test_combocid.py b/test_runner/regress/test_combocid.py
new file mode 100644
index 0000000000..41907b1f20
--- /dev/null
+++ b/test_runner/regress/test_combocid.py
@@ -0,0 +1,153 @@
+from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
+
+
+def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
+    env = neon_env_builder.init_start()
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            "shared_buffers='1MB'",
+        ],
+    )
+
+    conn = endpoint.connect()
+    cur = conn.cursor()
+    n_records = 1000
+
+    cur.execute("CREATE EXTENSION neon_test_utils")
+
+    cur.execute("create table t(id integer, val integer)")
+
+    cur.execute("begin")
+    cur.execute("insert into t values (1, 0)")
+    cur.execute("insert into t values (2, 0)")
+    cur.execute(f"insert into t select g, 0 from generate_series(3,{n_records}) g")
+
+    # Open a cursor and scroll it halfway through
+    cur.execute("DECLARE c1 NO SCROLL CURSOR WITHOUT HOLD FOR SELECT * FROM t")
+    cur.execute("fetch 500 from c1")
+    rows = cur.fetchall()
+    assert len(rows) == 500
+
+    # Perform the specified operation
+    cur.execute(op)
+
+    # Clear the cache, so that we exercise reconstructing the pages
+    # from WAL
+    endpoint.clear_shared_buffers()
+
+    # Check that the cursor opened earlier still works. If the
+    # combocids are not restored correctly, it won't.
+ cur.execute("fetch all from c1") + rows = cur.fetchall() + assert len(rows) == 500 + + cur.execute("rollback") + flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline) + env.pageserver.http_client().timeline_checkpoint( + env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True + ) + + +def test_combocid_delete(neon_env_builder: NeonEnvBuilder): + do_combocid_op(neon_env_builder, "delete from t") + + +def test_combocid_update(neon_env_builder: NeonEnvBuilder): + do_combocid_op(neon_env_builder, "update t set val=val+1") + + +def test_combocid_lock(neon_env_builder: NeonEnvBuilder): + do_combocid_op(neon_env_builder, "select * from t for update") + + +def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "shared_buffers='1MB'", + ], + ) + + conn = endpoint.connect() + cur = conn.cursor() + n_records = 1000 + + cur.execute("CREATE EXTENSION neon_test_utils") + + cur.execute("create table t(id integer, val integer)") + file_path = f"{endpoint.pg_data_dir_path()}/t.csv" + cur.execute(f"insert into t select g, 0 from generate_series(1,{n_records}) g") + cur.execute(f"copy t to '{file_path}'") + cur.execute("truncate table t") + + cur.execute("begin") + cur.execute(f"copy t from '{file_path}'") + + # Open a cursor that scroll it halfway through + cur.execute("DECLARE c1 NO SCROLL CURSOR WITHOUT HOLD FOR SELECT * FROM t") + cur.execute("fetch 500 from c1") + rows = cur.fetchall() + assert len(rows) == 500 + + # Delete all the rows. Because all of the rows were inserted earlier in the + # same transaction, all the rows will get a combocid. + cur.execute("delete from t") + # Clear the cache, so that we exercise reconstructing the pages + # from WAL + endpoint.clear_shared_buffers() + + # Check that the cursor opened earlier still works. If the + # combocids are not restored correctly, it won't. 
+ cur.execute("fetch all from c1") + rows = cur.fetchall() + assert len(rows) == 500 + + cur.execute("rollback") + + flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline) + env.pageserver.http_client().timeline_checkpoint( + env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True + ) + + +def test_combocid(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main") + + conn = endpoint.connect() + cur = conn.cursor() + n_records = 100000 + + cur.execute("create table t(id integer, val integer)") + cur.execute(f"insert into t values (generate_series(1,{n_records}), 0)") + + cur.execute("begin") + + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + + cur.execute("delete from t") + assert cur.rowcount == n_records + cur.execute("delete from t") + assert cur.rowcount == 0 + + cur.execute(f"insert into t values (generate_series(1,{n_records}), 0)") + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + + cur.execute("rollback") + + flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline) + env.pageserver.http_client().timeline_checkpoint( + env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True + ) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py new file mode 100644 index 0000000000..be787e0642 --- /dev/null +++ b/test_runner/regress/test_compaction.py @@ -0,0 +1,413 @@ +import enum +import json +import os +import time +from typing import Optional + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + generate_uploads_and_deletions, +) +from fixtures.pageserver.http import PageserverApiException +from fixtures.utils import wait_until +from fixtures.workload import Workload + +AGGRESIVE_COMPACTION_TENANT_CONF = { + # Disable gc and compaction. The test runs compaction manually. + "gc_period": "0s", + "compaction_period": "0s", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024**2, + # Compact small layers + "compaction_target_size": 1024**2, + "image_creation_threshold": 2, +} + + +@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder): + """ + This is a smoke test that compaction kicks in. The workload repeatedly churns + a small number of rows and manually instructs the pageserver to run compaction + between iterations. At the end of the test validate that the average number of + layers visited to gather reconstruct data for a given key is within the empirically + observed bounds. + """ + + # Effectively disable the page cache to rely only on image layers + # to shorten reads. 
+ neon_env_builder.pageserver_config_override = """ +page_cache_size=10 +""" + + env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 10000 + churn_rounds = 100 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + for i in range(1, churn_rounds + 1): + if i % 10 == 0: + log.info(f"Running churn round {i}/{churn_rounds} ...") + + workload.churn_rows(row_count, env.pageserver.id) + ps_http.timeline_compact(tenant_id, timeline_id) + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + + log.info("Checking layer access metrics ...") + + layer_access_metric_names = [ + "pageserver_layers_visited_per_read_global_sum", + "pageserver_layers_visited_per_read_global_count", + "pageserver_layers_visited_per_read_global_bucket", + "pageserver_layers_visited_per_vectored_read_global_sum", + "pageserver_layers_visited_per_vectored_read_global_count", + "pageserver_layers_visited_per_vectored_read_global_bucket", + ] + + metrics = env.pageserver.http_client().get_metrics() + for name in layer_access_metric_names: + layer_access_metrics = metrics.query_all(name) + log.info(f"Got metrics: {layer_access_metrics}") + + non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum") + non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count") + if non_vectored_count.value != 0: + non_vectored_average = non_vectored_sum.value / non_vectored_count.value + else: + non_vectored_average = 0 + vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") + vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") + if vectored_count.value > 0: + assert vectored_sum.value > 0 + vectored_average = vectored_sum.value / vectored_count.value + else: + # special case: running local tests with default legacy configuration + assert vectored_sum.value == 0 + vectored_average = 0 + + log.info(f"{non_vectored_average=} {vectored_average=}") + + # The upper bound for average number of layer visits below (8) + # was chosen empirically for this workload. + assert non_vectored_average < 8 + assert vectored_average < 8 + + +# Stripe sizes in number of pages. +TINY_STRIPES = 16 +LARGE_STRIPES = 32768 + + +@pytest.mark.parametrize( + "shard_count,stripe_size", [(None, None), (4, TINY_STRIPES), (4, LARGE_STRIPES)] +) +def test_sharding_compaction( + neon_env_builder: NeonEnvBuilder, stripe_size: int, shard_count: Optional[int] +): + """ + Use small stripes, small layers, and small compaction thresholds to exercise how compaction + and image layer generation interacts with sharding. + + We are looking for bugs that might emerge from the way sharding uses sparse layer files that + only contain some of the keys in the key range covered by the layer, such as errors estimating + the size of layers that might result in too-small layer files. 
+ """ + + compaction_target_size = 128 * 1024 + + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{compaction_target_size}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly: we want to exercise image layer creation in this test. + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": 0, + } + + # Disable compression, as we can't estimate the size of layers with compression enabled + # TODO: implement eager layer cutting during compaction + neon_env_builder.pageserver_config_override = "image_compression='disabled'" + + neon_env_builder.num_pageservers = 1 if shard_count is None else shard_count + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + initial_tenant_shard_stripe_size=stripe_size, + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(64) + for _i in range(0, 10): + # Each of these does some writes then a checkpoint: because we set image_creation_threshold to 1, + # these should result in image layers each time we write some data into a shard, and also shards + # recieving less data hitting their "empty image layer" path (wherre they should skip writing the layer, + # rather than asserting) + workload.churn_rows(64) + + # Assert that we got some image layers: this is important because this test's purpose is to exercise the sharding changes + # to Timeline::create_image_layers, so if we weren't creating any image layers we wouldn't be doing our job. + shard_has_image_layers = [] + for shard in env.storage_controller.locate(tenant_id): + pageserver = env.get_pageserver(shard["node_id"]) + shard_id = shard["shard_id"] + layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id) + image_layer_sizes = {} + for layer in layer_map.historic_layers: + if layer.kind == "Image": + image_layer_sizes[layer.layer_file_name] = layer.layer_file_size + + # Pageserver should assert rather than emit an empty layer file, but double check here + assert layer.layer_file_size > 0 + + shard_has_image_layers.append(len(image_layer_sizes) > 1) + log.info(f"Shard {shard_id} image layer sizes: {json.dumps(image_layer_sizes, indent=2)}") + + if stripe_size == TINY_STRIPES: + # Checking the average size validates that our keyspace partitioning is properly respecting sharding: if + # it was not, we would tend to get undersized layers because the partitioning would overestimate the physical + # data in a keyrange. 
+            #
+            # We only do this check with tiny stripes, because large stripes may not give all shards enough
+            # data to have statistically significant image layers
+            avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes)
+            log.info(f"Shard {shard_id} average image layer size: {avg_size}")
+            assert avg_size > compaction_target_size / 2
+
+    if stripe_size == TINY_STRIPES:
+        # Expect writes were scattered across all pageservers: they should all have compacted some image layers
+        assert all(shard_has_image_layers)
+    else:
+        # With large stripes, it is expected that most of our writes went to one pageserver, so we just require
+        # that at least one of them has some image layers.
+        assert any(shard_has_image_layers)
+
+    # Assert that everything is still readable
+    workload.validate()
+
+
+class CompactionAlgorithm(str, enum.Enum):
+    LEGACY = "legacy"
+    TIERED = "tiered"
+
+
+@pytest.mark.parametrize(
+    "compaction_algorithm", [CompactionAlgorithm.LEGACY, CompactionAlgorithm.TIERED]
+)
+def test_uploads_and_deletions(
+    neon_env_builder: NeonEnvBuilder,
+    compaction_algorithm: CompactionAlgorithm,
+):
+    """
+    :param compaction_algorithm: the compaction algorithm to use.
+    """
+
+    tenant_conf = {
+        # small checkpointing and compaction targets to ensure we generate many upload operations
+        "checkpoint_distance": f"{128 * 1024}",
+        "compaction_threshold": "1",
+        "compaction_target_size": f"{128 * 1024}",
+        # no PITR horizon, we specify the horizon when we request on-demand GC
+        "pitr_interval": "0s",
+        # disable background compaction and GC. We invoke it manually when we want it to happen.
+        "gc_period": "0s",
+        "compaction_period": "0s",
+        # create image layers eagerly, so that GC can remove some layers
+        "image_creation_threshold": "1",
+        "image_layer_creation_check_threshold": "0",
+        "compaction_algorithm": json.dumps({"kind": compaction_algorithm.value}),
+    }
+    env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
+
+    # TODO remove these allowed errors
+    # https://github.com/neondatabase/neon/issues/7707
+    # https://github.com/neondatabase/neon/issues/7759
+    allowed_errors = [
+        ".*/checkpoint.*rename temporary file as correct path for.*",  # EEXIST
+        ".*delta layer created with.*duplicate values.*",
+        ".*assertion failed: self.lsn_range.start <= lsn.*",
+        ".*HTTP request handler task panicked: task.*panicked.*",
+    ]
+    if compaction_algorithm == CompactionAlgorithm.TIERED:
+        env.pageserver.allowed_errors.extend(allowed_errors)
+
+    try:
+        generate_uploads_and_deletions(env, pageserver=env.pageserver)
+    except PageserverApiException as e:
+        log.info(f"Obtained PageserverApiException: {e}")
+
+    # The errors occur flakily; no single one of them is guaranteed to occur,
+    # but at least one of them does.
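+    # Hence, assert only that at least one of the allowed errors made it into the log: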
+    if compaction_algorithm == CompactionAlgorithm.TIERED:
+        found_allowed_error = any(env.pageserver.log_contains(e) for e in allowed_errors)
+        if not found_allowed_error:
+            raise Exception("None of the allowed_errors occurred in the log")
+
+
+def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder):
+    """
+    Check that repeated failures in compaction result in the circuit breaker breaking
+    """
+    TENANT_CONF = {
+        # Very frequent runs to rack up failures quickly
+        "compaction_period": "100ms",
+        # Small checkpoint distance to create many layers
+        "checkpoint_distance": 1024 * 128,
+        # Compact small layers
+        "compaction_target_size": 1024 * 128,
+        "image_creation_threshold": 1,
+    }
+
+    FAILPOINT = "delta-layer-writer-fail-before-finish"
+    BROKEN_LOG = ".*Circuit breaker broken!.*"
+
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+
+    workload = Workload(env, env.initial_tenant, env.initial_timeline)
+    workload.init()
+
+    # Set a failpoint that will prevent compaction from succeeding
+    env.pageserver.http_client().configure_failpoints((FAILPOINT, "return"))
+
+    # Write some data to trigger compaction
+    workload.write_rows(1024, upload=False)
+    workload.write_rows(1024, upload=False)
+    workload.write_rows(1024, upload=False)
+
+    def assert_broken():
+        env.pageserver.assert_log_contains(BROKEN_LOG)
+        assert (
+            env.pageserver.http_client().get_metric_value("pageserver_circuit_breaker_broken_total")
+            or 0
+        ) == 1
+        assert (
+            env.pageserver.http_client().get_metric_value(
+                "pageserver_circuit_breaker_unbroken_total"
+            )
+            or 0
+        ) == 0
+
+    # Wait for enough failures to break the circuit breaker
+    # This wait is fairly long because we back off on compaction failures, so 5 retries take ~30s
+    wait_until(60, 1, assert_broken)
+
+    # Sleep for a while, during which time we expect that compaction will _not_ be retried
+    time.sleep(10)
+
+    assert (
+        env.pageserver.http_client().get_metric_value("pageserver_circuit_breaker_broken_total")
+        or 0
+    ) == 1
+    assert (
+        env.pageserver.http_client().get_metric_value("pageserver_circuit_breaker_unbroken_total")
+        or 0
+    ) == 0
+    assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*")
+
+
+@pytest.mark.parametrize("enabled", [True, False])
+def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool):
+    tenant_conf = {
+        # small checkpointing and compaction targets to ensure we generate many upload operations
+        "checkpoint_distance": f"{128 * 1024}",
+        "compaction_threshold": "1",
+        "compaction_target_size": f"{128 * 1024}",
+        # no PITR horizon, we specify the horizon when we request on-demand GC
+        "pitr_interval": "0s",
+        # disable background compaction and GC. We invoke it manually when we want it to happen.
+ "gc_period": "0s", + "compaction_period": "0s", + # create image layers as eagerly as possible + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", + } + + # Explicitly enable/disable compression, rather than using default + if enabled: + neon_env_builder.pageserver_config_override = "image_compression='zstd'" + else: + neon_env_builder.pageserver_config_override = "image_compression='disabled'" + + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + pageserver = env.pageserver + ps_http = env.pageserver.http_client() + with env.endpoints.create_start( + "main", tenant_id=tenant_id, pageserver_id=pageserver.id + ) as endpoint: + endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + # Generate around 800k worth of easily compressible data to store + for v in range(100): + endpoint.safe_psql( + f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))" + ) + # run compaction to create image layers + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + + layer_map = ps_http.layer_map_info(tenant_id, timeline_id) + image_layer_count = 0 + delta_layer_count = 0 + for layer in layer_map.historic_layers: + if layer.kind == "Image": + image_layer_count += 1 + elif layer.kind == "Delta": + delta_layer_count += 1 + assert image_layer_count > 0 + assert delta_layer_count > 0 + + log.info(f"images: {image_layer_count}, deltas: {delta_layer_count}") + + bytes_in = pageserver.http_client().get_metric_value( + "pageserver_compression_image_in_bytes_total" + ) + bytes_out = pageserver.http_client().get_metric_value( + "pageserver_compression_image_out_bytes_total" + ) + assert bytes_in is not None + assert bytes_out is not None + log.info(f"Compression ratio: {bytes_out/bytes_in} ({bytes_out} in, {bytes_out} out)") + + if enabled: + # We are writing high compressible repetitive plain text, expect excellent compression + EXPECT_RATIO = 0.2 + assert bytes_out / bytes_in < EXPECT_RATIO + else: + # Nothing should be compressed if we disabled it. 
+        assert bytes_out >= bytes_in
+
+    # Destroy the endpoint and create a new one to reset the caches
+    with env.endpoints.create_start(
+        "main", tenant_id=tenant_id, pageserver_id=pageserver.id
+    ) as endpoint:
+        for v in range(100):
+            res = endpoint.safe_psql(
+                f"SELECT count(*) FROM foo WHERE id={v} and val=repeat('abcde{v:0>3}', 500)"
+            )
+            assert res[0][0] == 1
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 1a1425f069..467e5b1734 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -1,27 +1,29 @@
 import os
+import re
 import shutil
 import subprocess
 import tempfile
+from dataclasses import dataclass
 from pathlib import Path
 from typing import List, Optional
 
 import pytest
 import toml
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
     PgBin,
+    flush_ep_to_pageserver,
 )
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
-    wait_for_last_record_lsn,
-    wait_for_upload,
 )
 from fixtures.pg_version import PgVersion
-from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn
+from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage
+from fixtures.workload import Workload
 
 #
 # A test suite that helps prevent unintentionally breaking backward or forward compatibility between Neon releases.
@@ -40,7 +42,7 @@ from fixtures.types import Lsn
 #
 # How to run `test_backward_compatibility` locally:
 #
-# export DEFAULT_PG_VERSION=15
+# export DEFAULT_PG_VERSION=16
 # export BUILD_TYPE=release
 # export CHECK_ONDISK_DATA_COMPATIBILITY=true
 # export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION}
@@ -62,7 +64,7 @@ from fixtures.types import Lsn
 #
 # How to run `test_forward_compatibility` locally:
 #
-# export DEFAULT_PG_VERSION=15
+# export DEFAULT_PG_VERSION=16
 # export BUILD_TYPE=release
 # export CHECK_ONDISK_DATA_COMPATIBILITY=true
 # export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE}
@@ -112,11 +114,6 @@ def test_create_snapshot(
     env = neon_env_builder.init_start()
     endpoint = env.endpoints.create_start("main")
 
-    # FIXME: Is this expected?
- env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - pg_bin.run_capture(["pgbench", "--initialize", "--scale=10", endpoint.connstr()]) pg_bin.run_capture(["pgbench", "--time=60", "--progress=2", endpoint.connstr()]) pg_bin.run_capture( @@ -128,16 +125,15 @@ def test_create_snapshot( timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] pageserver_http = env.pageserver.http_client() - lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn) + flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) env.endpoints.stop_all() for sk in env.safekeepers: sk.stop() env.pageserver.stop() + env.storage_controller.stop() # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it compatibility_snapshot_dir = ( @@ -145,7 +141,16 @@ def test_create_snapshot( ) if compatibility_snapshot_dir.exists(): shutil.rmtree(compatibility_snapshot_dir) - shutil.copytree(test_output_dir, compatibility_snapshot_dir) + + shutil.copytree( + test_output_dir, + compatibility_snapshot_dir, + ignore=shutil.ignore_patterns("pg_dynshmem"), + ) + + +# check_neon_works does recovery from WAL => the compatibility snapshot's WAL is old => will log this warning +ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*" @check_ondisk_data_compatibility_if_enabled @@ -172,6 +177,7 @@ def test_backward_compatibility( try: neon_env_builder.num_safekeepers = 3 env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo") + env.pageserver.allowed_errors.append(ingest_lag_log_line) neon_env_builder.start() check_neon_works( @@ -180,6 +186,9 @@ def test_backward_compatibility( sql_dump_path=compatibility_snapshot_dir / "dump.sql", repo_dir=env.repo_dir, ) + + env.pageserver.assert_log_contains(ingest_lag_log_line) + except Exception: if breaking_changes_allowed: pytest.xfail( @@ -226,19 +235,48 @@ def test_forward_compatibility( try: neon_env_builder.num_safekeepers = 3 + + # Use previous version's production binaries (pageserver, safekeeper, pg_distrib_dir, etc.). + # But always use the current version's neon_local binary. + # This is because we want to test the compatibility of the data format, not the compatibility of the neon_local CLI. 
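+    # (The self-assignment of neon_local_binpath below is a deliberate no-op, spelling out
+    # that it stays on the current build.)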
+ neon_env_builder.neon_binpath = compatibility_neon_bin + neon_env_builder.pg_distrib_dir = compatibility_postgres_distrib_dir + neon_env_builder.neon_local_binpath = neon_env_builder.neon_local_binpath + env = neon_env_builder.from_repo_dir( compatibility_snapshot_dir / "repo", - neon_binpath=compatibility_neon_bin, - pg_distrib_dir=compatibility_postgres_distrib_dir, ) + # there may be an arbitrary number of unrelated tests run between create_snapshot and here + env.pageserver.allowed_errors.append(ingest_lag_log_line) + + # not using env.pageserver.version because it was initialized before + prev_pageserver_version_str = env.get_binary_version("pageserver") + prev_pageserver_version_match = re.search( + "Neon page server git-env:(.*) failpoints: (.*), features: (.*)", + prev_pageserver_version_str, + ) + if prev_pageserver_version_match is not None: + prev_pageserver_version = prev_pageserver_version_match.group(1) + else: + raise AssertionError( + "cannot find git hash in the version string: " + prev_pageserver_version_str + ) + + # does not include logs from previous runs + assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version) + neon_env_builder.start() + # ensure the specified pageserver is running + assert env.pageserver.log_contains("git-env:" + prev_pageserver_version) + check_neon_works( env, test_output_dir=test_output_dir, sql_dump_path=compatibility_snapshot_dir / "dump.sql", repo_dir=env.repo_dir, ) + except Exception: if breaking_changes_allowed: pytest.xfail( @@ -252,9 +290,10 @@ def test_forward_compatibility( def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path): ep = env.endpoints.create_start("main") + connstr = ep.connstr() + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) - connstr = ep.connstr() pg_bin.run_capture( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"] ) @@ -271,6 +310,9 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r timeline_id = env.initial_timeline pg_version = env.pg_version + # Stop endpoint while we recreate timeline + flush_ep_to_pageserver(env, ep, tenant_id, timeline_id) + try: pageserver_http.timeline_preserve_initdb_archive(tenant_id, timeline_id) except PageserverApiException as e: @@ -295,6 +337,9 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r existing_initdb_timeline_id=timeline_id, ) + # Timeline exists again: restart the endpoint + ep.start() + pg_bin.run_capture( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"] ) @@ -314,6 +359,11 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r assert not dump_from_wal_differs, "dump from WAL differs" assert not initial_dump_differs, "initial dump differs" + flush_ep_to_pageserver(env, ep, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint( + tenant_id, timeline_id, compact=False, wait_until_uploaded=True + ) + def dump_differs( first: Path, second: Path, output: Path, allowed_diffs: Optional[List[str]] = None @@ -373,3 +423,132 @@ def dump_differs( break return differs + + +@dataclass +class HistoricDataSet: + name: str + tenant_id: TenantId + pg_version: PgVersion + url: str + + def __str__(self): + return self.name + + +HISTORIC_DATA_SETS = [ + # From before we enabled image layer compression. 
+ # - IndexPart::LATEST_VERSION 7 + # - STORAGE_FORMAT_VERSION 3 + HistoricDataSet( + "2024-07-18", + TenantId("17bf64a53509714687664b3a84e9b3ba"), + PgVersion.V16, + "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2024-07-18-pgv16.tar.zst", + ), +] + + +@pytest.mark.parametrize("dataset", HISTORIC_DATA_SETS) +@pytest.mark.xdist_group("compatibility") +def test_historic_storage_formats( + neon_env_builder: NeonEnvBuilder, + test_output_dir: Path, + pg_version: PgVersion, + dataset: HistoricDataSet, +): + """ + This test is like test_backward_compatibility, but it looks back further to examples of our storage format from long ago. + """ + + ARTIFACT_CACHE_DIR = "./artifact_cache" + + import tarfile + from contextlib import closing + + import requests + import zstandard + + artifact_unpack_path = ARTIFACT_CACHE_DIR / Path("unpacked") / Path(dataset.name) + + # Note: we assume that when running across a matrix of PG versions, the matrix includes all the versions needed by + # HISTORIC_DATA_SETS. If we ever remove a PG version from the matrix, then historic datasets built using that version + # will no longer be covered by this test. + if pg_version != dataset.pg_version: + pytest.skip(f"Dataset {dataset} is for different PG version, skipping") + + with closing(requests.get(dataset.url, stream=True)) as r: + unzstd = zstandard.ZstdDecompressor() + with unzstd.stream_reader(r.raw) as stream: + with tarfile.open(mode="r|", fileobj=stream) as tf: + tf.extractall(artifact_unpack_path) + + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.pg_version = dataset.pg_version + env = neon_env_builder.init_configs() + env.start() + assert isinstance(env.pageserver_remote_storage, S3Storage) + + # Link artifact data into test's remote storage. We don't want the whole repo dir, just the remote storage part: we are not testing + # compat of local disk data across releases (test_backward_compat does that), we're testing really long-lived data in S3 like layer files and indices. + # + # The code generating the snapshot uses local_fs, but this test uses S3Storage, so we are copying a tree of files into a bucket. 
We use + # S3Storage so that the scrubber can run (the scrubber doesn't speak local_fs) + artifact_pageserver_path = ( + artifact_unpack_path / Path("repo") / Path("local_fs_remote_storage") / Path("pageserver") + ) + for root, _dirs, files in os.walk(artifact_pageserver_path): + for file in files: + local_path = os.path.join(root, file) + remote_key = ( + env.pageserver_remote_storage.prefix_in_bucket + + str(local_path)[len(str(artifact_pageserver_path)) :] + ) + log.info(f"Uploading {local_path} -> {remote_key}") + env.pageserver_remote_storage.client.upload_file( + local_path, env.pageserver_remote_storage.bucket_name, remote_key + ) + + # Check the scrubber handles this old data correctly (can read it and doesn't consider it corrupt) + # + # Do this _before_ importing to the pageserver, as that import may start writing immediately + healthy, metadata_summary = env.storage_scrubber.scan_metadata() + assert healthy + assert metadata_summary["tenant_count"] >= 1 + assert metadata_summary["timeline_count"] >= 1 + + env.neon_cli.import_tenant(dataset.tenant_id) + + # Discover timelines + timelines = env.pageserver.http_client().timeline_list(dataset.tenant_id) + # All our artifacts should contain at least one timeline + assert len(timelines) > 0 + + # TODO: ensure that the snapshots we're importing contain a sensible variety of content, at the very + # least they should include a mixture of deltas and image layers. Preferably they should also + # contain some "exotic" stuff like aux files from logical replication. + + # Check we can start an endpoint and read the SQL that the artifact is meant to contain + reference_sql_dump = artifact_unpack_path / Path("dump.sql") + ep = env.endpoints.create_start("main", tenant_id=dataset.tenant_id) + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) + pg_bin.run_capture( + ["pg_dumpall", f"--dbname={ep.connstr()}", f"--file={test_output_dir / 'dump.sql'}"] + ) + assert not dump_differs( + reference_sql_dump, + test_output_dir / "dump.sql", + test_output_dir / "dump.filediff", + ) + ep.stop() + + # Check we can also do writes to the database + existing_timeline_id = TimelineId(timelines[0]["timeline_id"]) + workload = Workload(env, dataset.tenant_id, existing_timeline_id) + workload.init() + workload.write_rows(100) + + # Check that compaction works + env.pageserver.http_client().timeline_compact( + dataset.tenant_id, existing_timeline_id, force_image_layer_creation=True + ) diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py new file mode 100644 index 0000000000..dd36190fcd --- /dev/null +++ b/test_runner/regress/test_compute_catalog.py @@ -0,0 +1,34 @@ +import requests +from fixtures.neon_fixtures import NeonEnv + + +def test_compute_catalog(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_config", "empty") + + endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"]) + client = endpoint.http_client() + + objects = client.dbs_and_roles() + + # Assert that 'cloud_admin' role exists in the 'roles' list + assert any( + role["name"] == "cloud_admin" for role in objects["roles"] + ), "The 'cloud_admin' role is missing" + + # Assert that 'postgres' database exists in the 'databases' list + assert any( + db["name"] == "postgres" for db in objects["databases"] + ), "The 'postgres' database is missing" + + ddl = client.database_schema(database="postgres") + + assert "-- PostgreSQL database dump" in ddl + + try: + 
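+        # Requesting the schema of a database that doesn't exist should fail;
+        # the HTTP client is expected to surface this as a 404, which we assert below.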
client.database_schema(database="nonexistentdb") + raise AssertionError("Expected HTTPError was not raised") + except requests.exceptions.HTTPError as e: + assert ( + e.response.status_code == 404 + ), f"Expected 404 status code, but got {e.response.status_code}" diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py index 0ea5784b67..2ef28eb94b 100644 --- a/test_runner/regress/test_config.py +++ b/test_runner/regress/test_config.py @@ -1,7 +1,7 @@ +import os from contextlib import closing -from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder # @@ -13,7 +13,6 @@ def test_config(neon_simple_env: NeonEnv): # change config endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"]) - log.info("postgres is running on test_config branch") with closing(endpoint.connect()) as conn: with conn.cursor() as cur: @@ -30,3 +29,45 @@ def test_config(neon_simple_env: NeonEnv): # check that config change was applied assert cur.fetchone() == ("debug1",) + + +# +# Test that reordering of safekeepers does not restart walproposer +# +def test_safekeepers_reconfigure_reorder( + neon_env_builder: NeonEnvBuilder, +): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_safekeepers_reconfigure_reorder") + + endpoint = env.endpoints.create_start("test_safekeepers_reconfigure_reorder") + + old_sks = "" + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SHOW neon.safekeepers") + res = cur.fetchone() + assert res is not None, "neon.safekeepers GUC is set" + old_sks = res[0] + + # Reorder safekeepers + safekeepers = endpoint.active_safekeepers + safekeepers = safekeepers[1:] + safekeepers[:1] + + endpoint.reconfigure(safekeepers=safekeepers) + + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SHOW neon.safekeepers") + res = cur.fetchone() + assert res is not None, "neon.safekeepers GUC is set" + new_sks = res[0] + + assert new_sks != old_sks, "GUC changes were applied" + + log_path = os.path.join(endpoint.endpoint_path(), "compute.log") + with open(log_path, "r") as log_file: + logs = log_file.read() + # Check that walproposer was not restarted + assert "restarting walproposer" not in logs diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 01ecc2b95f..30f8d81890 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -19,6 +19,12 @@ from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_crafted_wal_end") + env.pageserver.allowed_errors.extend( + [ + # seems like pageserver stop triggers these + ".*initial size calculation failed.*Bad state (not active).*", + ] + ) endpoint = env.endpoints.create("test_crafted_wal_end") wal_craft = WalCraft(env) diff --git a/test_runner/regress/test_createdropdb.py b/test_runner/regress/test_createdropdb.py index 500d19cf31..f741a9fc87 100644 --- a/test_runner/regress/test_createdropdb.py +++ b/test_runner/regress/test_createdropdb.py @@ -20,7 +20,6 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str): env.neon_cli.create_branch("test_createdb", "empty") endpoint = env.endpoints.create_start("test_createdb") - log.info("postgres is running 
on 'test_createdb' branch") with endpoint.cursor() as cur: # Cause a 'relmapper' change in the original branch @@ -65,7 +64,6 @@ def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env env.neon_cli.create_branch("test_dropdb", "empty") endpoint = env.endpoints.create_start("test_dropdb") - log.info("postgres is running on 'test_dropdb' branch") with endpoint.cursor() as cur: cur.execute("CREATE DATABASE foodb") diff --git a/test_runner/regress/test_createuser.py b/test_runner/regress/test_createuser.py index f1bc405287..17d9824f52 100644 --- a/test_runner/regress/test_createuser.py +++ b/test_runner/regress/test_createuser.py @@ -1,4 +1,3 @@ -from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.utils import query_scalar @@ -10,7 +9,6 @@ def test_createuser(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_createuser", "empty") endpoint = env.endpoints.create_start("test_createuser") - log.info("postgres is running on 'test_createuser' branch") with endpoint.cursor() as cur: # Cause a 'relmapper' change in the original branch diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index 7174487e68..50da673d87 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -296,7 +296,6 @@ def test_ddl_forwarding_invalid_db(neon_simple_env: NeonEnv): # Some non-existent url config_lines=["neon.console_url=http://localhost:9999/unknown/api/v0/roles_and_databases"], ) - log.info("postgres is running on 'test_ddl_forwarding_invalid_db' branch") with endpoint.cursor() as cur: cur.execute("SET neon.forward_ddl = false") diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 6a4f0edbea..85616c3fe2 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -5,7 +5,7 @@ from dataclasses import dataclass from typing import Any, Dict, Iterable, Tuple import pytest -import toml +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -17,11 +17,14 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_upload_queue_empty from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId -from fixtures.utils import wait_until +from fixtures.utils import human_bytes, wait_until GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" +# access times in the pageserver are stored at a very low resolution: to generate meaningfully different +# values, tests must inject sleeps +ATIME_RESOLUTION = 2 + @pytest.mark.parametrize("config_level_override", [None, 400]) def test_min_resident_size_override_handling( @@ -45,17 +48,16 @@ def test_min_resident_size_override_handling( ps_http.set_tenant_config(tenant_id, {}) assert_config(tenant_id, None, default_tenant_conf_value) - env.pageserver.stop() if config_level_override is not None: - env.pageserver.start( - overrides=( - "--pageserver-config-override=tenant_config={ min_resident_size_override = " - + str(config_level_override) - + " }", - ) - ) - else: - env.pageserver.start() + + def set_min_resident_size(config): + tenant_config = 
config.get("tenant_config", {})
+            tenant_config["min_resident_size_override"] = config_level_override
+            config["tenant_config"] = tenant_config
+
+        env.pageserver.edit_config_toml(set_min_resident_size)
+    env.pageserver.stop()
+    env.pageserver.start()

     tenant_id, _ = env.neon_cli.create_tenant()
     assert_overrides(tenant_id, config_level_override)
@@ -69,14 +71,11 @@ def test_min_resident_size_override_handling(

 @enum.unique
 class EvictionOrder(str, enum.Enum):
-    ABSOLUTE_ORDER = "absolute"
     RELATIVE_ORDER_EQUAL = "relative_equal"
     RELATIVE_ORDER_SPARE = "relative_spare"

     def config(self) -> Dict[str, Any]:
-        if self == EvictionOrder.ABSOLUTE_ORDER:
-            return {"type": "AbsoluteAccessed"}
-        elif self == EvictionOrder.RELATIVE_ORDER_EQUAL:
+        if self == EvictionOrder.RELATIVE_ORDER_EQUAL:
             return {
                 "type": "RelativeAccessed",
                 "args": {"highest_layer_count_loses_first": False},
@@ -155,37 +154,52 @@ class EvictionEnv:
         mock_behavior,
         eviction_order: EvictionOrder,
     ):
-        disk_usage_config = {
-            "period": period,
-            "max_usage_pct": max_usage_pct,
-            "min_avail_bytes": min_avail_bytes,
-            "mock_statvfs": mock_behavior,
-            "eviction_order": eviction_order.config(),
-        }
+        """
+        Starts the pageserver with a mocked statvfs setup. Startup is
+        problematic because the initial logical size calculations (which need
+        layers to be resident) compete with the disk-usage-based eviction task.

-        enc = toml.TomlEncoder()
+        Returns after the initial logical size calculations are complete, but
+        the phase of the disk usage eviction task is unknown; it might need to
+        run one more iteration before assertions can be made.
+        """

         # these can sometimes happen during startup before any tenants have been
         # loaded, so nothing can be evicted, we just wait for next iteration which
         # is able to evict.
         pageserver.allowed_errors.append(".*WARN.* disk usage still high.*")

-        pageserver.start(
-            overrides=(
-                "--pageserver-config-override=disk_usage_based_eviction="
-                + enc.dump_inline_table(disk_usage_config).replace("\n", " "),
+        pageserver.patch_config_toml_nonrecursive(
+            {
+                "disk_usage_based_eviction": {
+                    "period": period,
+                    "max_usage_pct": max_usage_pct,
+                    "min_avail_bytes": min_avail_bytes,
+                    "mock_statvfs": mock_behavior,
+                    "eviction_order": eviction_order.config(),
+                },
                 # Disk usage based eviction runs as a background task.
                 # But pageserver startup delays launch of background tasks for some time, to prioritize initial logical size calculations during startup.
                 # But, initial logical size calculation may not be triggered if safekeepers don't publish new broker messages.
                 # But, we only have a 10-second-timeout in this test.
                 # So, disable the delay for this test.
-                "--pageserver-config-override=background_task_maximum_delay='0s'",
-            ),
+                "background_task_maximum_delay": "0s",
+            }
         )

-        def statvfs_called():
-            assert pageserver.log_contains(".*running mocked statvfs.*")
+        pageserver.start()

+        # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction
+        for tenant_id, timeline_id in self.timelines:
+            tenant_ps = self.neon_env.get_tenant_pageserver(tenant_id)
+            # Pageserver may be None if we are currently not attached anywhere, e.g.
during secondary eviction test + if tenant_ps is not None: + tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id) + + def statvfs_called(): + pageserver.assert_log_contains(".*running mocked statvfs.*") + + # we most likely have already completed multiple runs wait_until(10, 1, statvfs_called) @@ -205,19 +219,6 @@ def count_layers_per_tenant( return dict(ret) -def human_bytes(amt: float) -> str: - suffixes = ["", "Ki", "Mi", "Gi"] - - last = suffixes[-1] - - for name in suffixes: - if amt < 1024 or name == last: - return f"{int(round(amt))} {name}B" - amt = amt / 1024 - - raise RuntimeError("unreachable") - - def _eviction_env( request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, num_pageservers: int ) -> EvictionEnv: @@ -230,6 +231,9 @@ def _eviction_env( neon_env_builder.num_pageservers = num_pageservers neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + # Disable compression support for EvictionEnv to get larger layer sizes + neon_env_builder.pageserver_config_override = "image_compression='disabled'" + # initial tenant will not be present on this pageserver env = neon_env_builder.init_configs() env.start() @@ -281,7 +285,7 @@ def pgbench_init_tenant( "gc_period": "0s", "compaction_period": "0s", "checkpoint_distance": f"{layer_size}", - "image_creation_threshold": "100", + "image_creation_threshold": "999999", "compaction_target_size": f"{layer_size}", } ) @@ -381,7 +385,7 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv): @pytest.mark.parametrize( "order", - [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], + [EvictionOrder.RELATIVE_ORDER_EQUAL], ) def test_pageserver_evicts_until_pressure_is_relieved( eviction_env: EvictionEnv, order: EvictionOrder @@ -415,7 +419,7 @@ def test_pageserver_evicts_until_pressure_is_relieved( @pytest.mark.parametrize( "order", - [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], + [EvictionOrder.RELATIVE_ORDER_EQUAL], ) def test_pageserver_respects_overridden_resident_size( eviction_env: EvictionEnv, order: EvictionOrder @@ -492,7 +496,7 @@ def test_pageserver_respects_overridden_resident_size( @pytest.mark.parametrize( "order", - [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], + [EvictionOrder.RELATIVE_ORDER_EQUAL], ) def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: EvictionOrder): """ @@ -516,14 +520,13 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E assert actual_change >= target, "eviction must always evict more than target" time.sleep(1) # give log time to flush - assert env.neon_env.pageserver.log_contains(GLOBAL_LRU_LOG_LINE) + env.neon_env.pageserver.assert_log_contains(GLOBAL_LRU_LOG_LINE) env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) @pytest.mark.parametrize( "order", [ - EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE, ], @@ -547,6 +550,7 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): (tenant_id, timeline_id) = warm # make picked tenant more recently used than the other one + time.sleep(ATIME_RESOLUTION) env.warm_up_tenant(tenant_id) # Build up enough pressure to require evictions from both tenants, @@ -569,62 +573,38 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): later_tenant_usage < du_by_timeline[tenant] ), "all tenants should have lost some layers" - warm_size = later_du_by_timeline[warm] - cold_size 
= later_du_by_timeline[cold] + # with relative order what matters is the amount of layers, with a + # fudge factor of whether the eviction bothers tenants with highest + # layer count the most. last accessed times between tenants does not + # matter. + assert order in [EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE] + layers_now = env.count_layers_per_tenant(env.pageserver) - if order == EvictionOrder.ABSOLUTE_ORDER: - # bounds for warmed_size - warm_lower = 0.5 * du_by_timeline[warm] + expected_ratio = later_total_on_disk / total_on_disk + log.info( + f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio" + ) - # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. - # So, check for up to 3 here. - warm_upper = warm_lower + 3 * env.layer_size + for tenant_id, original_count in tenant_layers.items(): + count_now = layers_now[tenant_id] + ratio = count_now / original_count + abs_diff = abs(ratio - expected_ratio) + assert original_count > count_now - cold_upper = 2 * env.layer_size - log.info(f"tenants: warm={warm[0]}, cold={cold[0]}") + expectation = 0.065 log.info( - f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}" + f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}" ) - log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}") - - assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)" - assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)" - - assert ( - cold_size < cold_upper - ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size" - else: - # with relative order what matters is the amount of layers, with a - # fudge factor of whether the eviction bothers tenants with highest - # layer count the most. last accessed times between tenants does not - # matter. - layers_now = env.count_layers_per_tenant(env.pageserver) - - expected_ratio = later_total_on_disk / total_on_disk - log.info( - f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio" - ) - - for tenant_id, original_count in tenant_layers.items(): - count_now = layers_now[tenant_id] - ratio = count_now / original_count - abs_diff = abs(ratio - expected_ratio) - assert original_count > count_now - log.info( - f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < 0.1" - ) - - # in this test case both relative_spare and relative_equal produce - # the same outcomes; this must be a quantization effect of similar - # sizes (-s4 and -s6) and small (5MB) layer size. - # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02 - assert abs_diff < 0.05 + # in this test case both relative_spare and relative_equal produce + # the same outcomes; this must be a quantization effect of similar + # sizes (-s4 and -s6) and small (5MB) layer size. 
+ # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02 + assert abs_diff < expectation @pytest.mark.parametrize( "order", [ - EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE, ], @@ -647,6 +627,10 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or for scale in [1, 1, 1, 4]: timelines.append((pgbench_init_tenant(layer_size, scale, env, pg_bin), scale)) + # Eviction times are stored at a low resolution. We must ensure that the time between + # tenants is long enough for the pageserver to distinguish them. + time.sleep(ATIME_RESOLUTION) + env.neon_cli.safekeeper_stop() for (tenant_id, timeline_id), scale in timelines: @@ -654,11 +638,10 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or finish_tenant_creation(env, tenant_id, timeline_id, min_expected_layers) tenant_layers = count_layers_per_tenant(env.pageserver, map(lambda x: x[0], timelines)) - (total_on_disk, _, _) = poor_mans_du(env, map(lambda x: x[0], timelines), env.pageserver, False) + (total_on_disk, _, _) = poor_mans_du(env, map(lambda x: x[0], timelines), env.pageserver, True) - # cut 10 percent response = env.pageserver.http_client().disk_usage_eviction_run( - {"evict_bytes": total_on_disk // 10, "eviction_order": order.config()} + {"evict_bytes": total_on_disk // 5, "eviction_order": order.config()} ) log.info(f"{response}") @@ -677,14 +660,7 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or ), "rest of the assertions expect 3 + 1 timelines, ratios, scales, all in order" log.info(f"{ratios}") - if order == EvictionOrder.ABSOLUTE_ORDER: - # first tenant loses most - assert ratios[0] <= ratios[1], "first should lose the most" - assert ratios[1] < ratios[2], "second should lose some" - assert ratios[1] < 1.0 - assert ratios[2] <= ratios[3], "third might not lose" - assert ratios[3] == 1.0, "tenant created last does not lose" - elif order == EvictionOrder.RELATIVE_ORDER_EQUAL: + if order == EvictionOrder.RELATIVE_ORDER_EQUAL: assert all([x for x in ratios if x < 1.0]), "all tenants lose layers" elif order == EvictionOrder.RELATIVE_ORDER_SPARE: # with different layer sizes and pg versions, there are different combinations @@ -747,10 +723,10 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv): "type": "Failure", "mocked_error": "EIO", }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, + eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE, ) - assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO") + env.neon_env.pageserver.assert_log_contains(".*statvfs failed.*EIO") env.neon_env.pageserver.allowed_errors.append(".*statvfs failed.*EIO") @@ -781,17 +757,28 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. 
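+                # (layer file names embed "__" between the key range and the LSN
+                # range, which is what this filter matches on)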
"name_filter": ".*__.*", }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, + eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE, ) - def relieved_log_message(): - assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved") + wait_until( + 10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") + ) - wait_until(10, 1, relieved_log_message) + def less_than_max_usage_pct(): + post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) + assert post_eviction_total_size < 0.33 * total_size, "we requested max 33% usage" - post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) + wait_until(2, 2, less_than_max_usage_pct) - assert post_eviction_total_size <= 0.33 * total_size, "we requested max 33% usage" + # Disk usage candidate collection only takes into account active tenants. + # However, the statvfs call takes into account the entire tenants directory, + # which includes tenants which haven't become active yet. + # + # After re-starting the pageserver, disk usage eviction may kick in *before* + # both tenants have become active. Hence, the logic will try to satisfy the + # disk usage requirements by evicting everything belonging to the active tenant, + # and hence violating the tenant minimum resident size. + env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): @@ -823,19 +810,20 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. "name_filter": ".*__.*", }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, + eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE, ) - def relieved_log_message(): - assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved") + wait_until( + 10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") + ) - wait_until(10, 1, relieved_log_message) + def more_than_min_avail_bytes_freed(): + post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) + assert ( + total_size - post_eviction_total_size >= min_avail_bytes + ), f"we requested at least {min_avail_bytes} worth of free space" - post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) - - assert ( - total_size - post_eviction_total_size >= min_avail_bytes - ), "we requested at least min_avail_bytes worth of free space" + wait_until(2, 2, more_than_min_avail_bytes_freed) def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): @@ -845,18 +833,18 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): # Set up a situation where one pageserver _only_ has secondary locations on it, # so that when we release space we are sure it is via secondary locations. - - log.info("Setting up secondary location...") - ps_attached = env.neon_env.pageservers[0] + log.info("Setting up secondary locations...") ps_secondary = env.neon_env.pageservers[1] for tenant_id in tenant_ids: - # Migrate all attached tenants to the same pageserver, so that all the secondaries - # will run on the other pageserver. This is necessary because when we create tenants, - # they are spread over pageservers by default. 
- env.neon_env.attachment_service.tenant_shard_migrate( - TenantShardId(tenant_id, 0, 0), ps_attached.id - ) + # Find where it is attached + pageserver = env.neon_env.get_tenant_pageserver(tenant_id) + pageserver.http_client().tenant_heatmap_upload(tenant_id) + # Detach it + pageserver.tenant_detach(tenant_id) + + # Create a secondary mode location for the tenant, all tenants on one pageserver that will only + # contain secondary locations: this is the one where we will exercise disk usage eviction ps_secondary.tenant_location_configure( tenant_id, { @@ -868,41 +856,18 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): readback_conf = ps_secondary.read_tenant_location_conf(tenant_id) log.info(f"Read back conf: {readback_conf}") - # Request secondary location to download all layers that the attached location has - ps_attached.http_client().tenant_heatmap_upload(tenant_id) + # Request secondary location to download all layers that the attached location indicated + # in its heatmap ps_secondary.http_client().tenant_secondary_download(tenant_id) - # Configure the secondary pageserver to have a phony small disk size - ps_secondary.stop() total_size, _, _ = env.timelines_du(ps_secondary) - blocksize = 512 - total_blocks = (total_size + (blocksize - 1)) // blocksize + evict_bytes = total_size // 3 - min_avail_bytes = total_size // 3 - - env.pageserver_start_with_disk_usage_eviction( - ps_secondary, - period="1s", - max_usage_pct=100, - min_avail_bytes=min_avail_bytes, - mock_behavior={ - "type": "Success", - "blocksize": blocksize, - "total_blocks": total_blocks, - # Only count layer files towards used bytes in the mock_statvfs. - # This avoids accounting for metadata files & tenant conf in the tests. - "name_filter": ".*__.*", - }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, - ) - - def relieved_log_message(): - assert ps_secondary.log_contains(".*disk usage pressure relieved") - - wait_until(10, 1, relieved_log_message) + response = ps_secondary.http_client().disk_usage_eviction_run({"evict_bytes": evict_bytes}) + log.info(f"{response}") post_eviction_total_size, _, _ = env.timelines_du(ps_secondary) assert ( - total_size - post_eviction_total_size >= min_avail_bytes - ), "we requested at least min_avail_bytes worth of free space" + total_size - post_eviction_total_size >= evict_bytes + ), "we requested at least evict_bytes worth of free space" diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py deleted file mode 100644 index 224e6f50c7..0000000000 --- a/test_runner/regress/test_duplicate_layers.py +++ /dev/null @@ -1,147 +0,0 @@ -import time - -import pytest -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn -from fixtures.pageserver.utils import ( - wait_for_last_record_lsn, - wait_for_upload_queue_empty, - wait_until_tenant_active, -) -from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from requests.exceptions import ConnectionError - - -def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): - env = neon_env_builder.init_start() - pageserver_http = env.pageserver.http_client() - - # use a failpoint to return all L0s as L1s - message = ".*duplicated L1 layer layer=.*" - env.pageserver.allowed_errors.append(message) - - # Use aggressive compaction and checkpoint settings - tenant_id, _ = env.neon_cli.create_tenant( - conf={ - "checkpoint_distance": f"{1024 ** 2}", - "compaction_target_size": f"{1024 ** 2}", - "compaction_period": "5 s", - 
"compaction_threshold": "3", - } - ) - - pageserver_http.configure_failpoints(("compact-level0-phase1-return-same", "return")) - - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - connstr = endpoint.connstr(options="-csynchronous_commit=off") - pg_bin.run_capture(["pgbench", "-i", "-s1", connstr]) - - time.sleep(10) # let compaction to be performed - assert env.pageserver.log_contains("compact-level0-phase1-return-same") - - -def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): - """ - Test sets fail point at the end of first compaction phase: after - flushing new L1 layer but before deletion of L0 layers. - - The L1 used to be overwritten, but with crash-consistency via remote - index_part.json, we end up deleting the not yet uploaded L1 layer on - startup. - """ - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - - env = neon_env_builder.init_start( - initial_tenant_conf={ - "checkpoint_distance": f"{1024 ** 2}", - "compaction_target_size": f"{1024 ** 2}", - "compaction_period": "0 s", - "compaction_threshold": "3", - } - ) - pageserver_http = env.pageserver.http_client() - - tenant_id, timeline_id = env.initial_tenant, env.initial_timeline - - pageserver_http.configure_failpoints(("after-timeline-compacted-first-L1", "exit")) - - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - connstr = endpoint.connstr(options="-csynchronous_commit=off") - pg_bin.run_capture(["pgbench", "-i", "-s1", connstr]) - - lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - endpoint.stop() - - # make sure we receive no new wal after this, so that we'll write over the same L1 file. - endpoint.stop() - for sk in env.safekeepers: - sk.stop() - - # hit the exit failpoint - with pytest.raises(ConnectionError, match="Remote end closed connection without response"): - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - env.pageserver.stop() - - # now the duplicate L1 has been created, but is not yet uploaded - assert isinstance(env.pageserver_remote_storage, LocalFsStorage) - - # path = env.remote_storage.timeline_path(tenant_id, timeline_id) - l1_found = None - for path in env.pageserver.timeline_dir(tenant_id, timeline_id).iterdir(): - if path.name == "metadata" or path.name.startswith("ephemeral-"): - continue - - if len(path.suffixes) > 0: - # temp files - continue - - [key_range, lsn_range] = path.name.split("__", maxsplit=1) - - if "-" not in lsn_range: - # image layer - continue - - [key_start, key_end] = key_range.split("-", maxsplit=1) - - if key_start == "0" * 36 and key_end == "F" * 36: - # L0 - continue - - if l1_found is not None: - raise RuntimeError(f"found multiple L1: {l1_found.name} and {path.name}") - l1_found = path - - assert l1_found is not None, "failed to find L1 locally" - - uploaded = env.pageserver_remote_storage.remote_layer_path( - tenant_id, timeline_id, l1_found.name - ) - assert not uploaded.exists(), "to-be-overwritten should not yet be uploaded" - - env.pageserver.start() - wait_until_tenant_active(pageserver_http, tenant_id) - - assert not l1_found.exists(), "partial compaction result should had been removed during startup" - - # wait for us to catch up again - wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) - - pageserver_http.timeline_compact(tenant_id, timeline_id) - - # give time for log flush - time.sleep(1) - - message = f".*duplicated L1 layer layer={l1_found.name}" - found_msg = env.pageserver.log_contains(message) - # resident 
or evicted, it should not be overwritten, however it should had been non-existing at startup
-    assert (
-        found_msg is None
-    ), "layer should had been removed during startup, did it live on as evicted?"
-
-    assert l1_found.exists(), "the L1 reappears"
-
-    wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
-
-    uploaded = env.pageserver_remote_storage.remote_layer_path(
-        tenant_id, timeline_id, l1_found.name
-    )
-    assert uploaded.exists(), "the L1 is uploaded"
diff --git a/test_runner/regress/test_endpoint_crash.py b/test_runner/regress/test_endpoint_crash.py
new file mode 100644
index 0000000000..ae3dded437
--- /dev/null
+++ b/test_runner/regress/test_endpoint_crash.py
@@ -0,0 +1,23 @@
+import pytest
+from fixtures.neon_fixtures import NeonEnvBuilder
+
+
+@pytest.mark.parametrize(
+    "sql_func",
+    [
+        "trigger_panic",
+        "trigger_segfault",
+        "💣",  # calls `trigger_segfault` internally
+    ],
+)
+def test_endpoint_crash(neon_env_builder: NeonEnvBuilder, sql_func: str):
+    """
+    Test that triggering a crash from neon_test_utils crashes the endpoint
+    """
+    env = neon_env_builder.init_start()
+    env.neon_cli.create_branch("test_endpoint_crash")
+    endpoint = env.endpoints.create_start("test_endpoint_crash")
+
+    endpoint.safe_psql("CREATE EXTENSION neon_test_utils;")
+    with pytest.raises(Exception, match="This probably means the server terminated abnormally"):
+        endpoint.safe_psql(f"SELECT {sql_func}();")
diff --git a/test_runner/regress/test_explain_with_lfc_stats.py b/test_runner/regress/test_explain_with_lfc_stats.py
new file mode 100644
index 0000000000..5231dedcda
--- /dev/null
+++ b/test_runner/regress/test_explain_with_lfc_stats.py
@@ -0,0 +1,84 @@
+from pathlib import Path
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv
+
+
+def test_explain_with_lfc_stats(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+
+    cache_dir = Path(env.repo_dir) / "file_cache"
+    cache_dir.mkdir(exist_ok=True)
+
+    branchname = "test_explain_with_lfc_stats"
+    env.neon_cli.create_branch(branchname, "empty")
+    log.info(f"Creating endpoint with 1MB shared_buffers and 64MB LFC for branch {branchname}")
+    endpoint = env.endpoints.create_start(
+        branchname,
+        config_lines=[
+            "shared_buffers='1MB'",
+            f"neon.file_cache_path='{cache_dir}/file.cache'",
+            "neon.max_file_cache_size='128MB'",
+            "neon.file_cache_size_limit='64MB'",
+        ],
+    )
+
+    cur = endpoint.connect().cursor()
+
+    log.info(f"preparing some data in {endpoint.connstr()}")
+
+    ddl = """
+CREATE TABLE pgbench_accounts (
+    aid bigint NOT NULL,
+    bid integer,
+    abalance integer,
+    filler character(84),
+    -- more web-app like columns
+    text_column_plain TEXT DEFAULT repeat('NeonIsCool', 5),
+    jsonb_column_extended JSONB DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb
+)
+WITH (fillfactor='100');
+"""
+
+    cur.execute(ddl)
+    cur.execute(
+        "insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;"
+    )
+
+    log.info(f"warming up caches with sequential scan in {endpoint.connstr()}")
+    cur.execute("SELECT * FROM pgbench_accounts WHERE abalance > 0")
+
+    log.info("running explain analyze without LFC values to verify they do not show up in the plan")
+    cur.execute("EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM pgbench_accounts WHERE abalance > 0")
+    rows = cur.fetchall()
+    plan = "\n".join(r[0] for r in rows)
+    log.debug(plan)
+    assert "Seq Scan on pgbench_accounts" in plan
+    assert "Buffers: shared hit" in plan
+    assert "File cache: hits=" not in plan
+    log.info("running explain analyze WITH LFC values to verify they do now show up")
+    cur.execute(
+        "EXPLAIN (ANALYZE, BUFFERS,FILECACHE) SELECT * FROM pgbench_accounts WHERE abalance > 0"
+    )
+    rows = cur.fetchall()
+    plan = "\n".join(r[0] for r in rows)
+    log.debug(plan)
+    assert "Seq Scan on pgbench_accounts" in plan
+    assert "Buffers: shared hit" in plan
+    assert "File cache: hits=" in plan
+    log.info("running explain analyze WITH LFC values to verify json output")
+    cur.execute(
+        "EXPLAIN (ANALYZE, BUFFERS,FILECACHE, FORMAT JSON) SELECT * FROM pgbench_accounts WHERE abalance > 0"
+    )
+    jsonplan = cur.fetchall()[0][0]
+    log.debug(jsonplan)
+    # Directly access the 'Plan' part of the first element of the JSON array
+    plan_details = jsonplan[0]["Plan"]
+
+    # Extract "File Cache Hits" and "File Cache Misses"
+    file_cache_hits = plan_details.get("File Cache Hits")
+    file_cache_misses = plan_details.get("File Cache Misses")
+
+    # Now you can assert the values
+    assert file_cache_hits >= 5000, f"Expected File Cache Hits to be >= 5000, got {file_cache_hits}"
+    assert file_cache_misses == 0, f"Expected File Cache Misses to be 0, got {file_cache_misses}"
diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py
index a456c06862..e6d51a77a6 100644
--- a/test_runner/regress/test_fullbackup.py
+++ b/test_runner/regress/test_fullbackup.py
@@ -1,6 +1,7 @@
 import os
 from pathlib import Path

+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -8,7 +9,6 @@ from fixtures.neon_fixtures import (
     VanillaPostgres,
 )
 from fixtures.port_distributor import PortDistributor
-from fixtures.types import Lsn, TimelineId
 from fixtures.utils import query_scalar, subprocess_capture

 num_rows = 1000
@@ -19,18 +19,16 @@ def test_fullbackup(
     neon_env_builder: NeonEnvBuilder,
     pg_bin: PgBin,
     port_distributor: PortDistributor,
-    pg_distrib_dir: Path,
     test_output_dir: Path,
 ):
     env = neon_env_builder.init_start()

-    env.neon_cli.create_branch("test_fullbackup")
-    endpoint_main = env.endpoints.create_start("test_fullbackup")
-    log.info("postgres is running on 'test_fullbackup' branch")
+    # The endpoint needs to stay alive until the fullbackup is taken so that we have
+    # a prev_record_lsn for vanilla_pg to start in read-write mode;
+    # for some reason this does not happen if the endpoint is shut down.
+    endpoint_main = env.endpoints.create_start("main")

     with endpoint_main.cursor() as cur:
-        timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
-
         # data loading may take a while, so increase statement timeout
         cur.execute("SET statement_timeout='300s'")
         cur.execute(
@@ -42,17 +40,13 @@ def test_fullbackup(
     lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()"))
     log.info(f"start_backup_lsn = {lsn}")

-    # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
-    # PgBin sets it automatically, but here we need to pipe psql output to the tar command.
- psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} - # Get and unpack fullbackup from pageserver restored_dir_path = env.repo_dir / "restored_datadir" os.mkdir(restored_dir_path, 0o750) - query = f"fullbackup {env.initial_tenant} {timeline} {lsn}" tar_output_file = test_output_dir / "fullbackup.tar" - cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, env.initial_timeline, lsn, tar_output_file + ) subprocess_capture( env.repo_dir, ["tar", "-xf", str(tar_output_file), "-C", str(restored_dir_path)] ) @@ -62,17 +56,11 @@ def test_fullbackup( # use resetwal to overwrite it pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") cmd = [pg_resetwal_path, "-D", str(restored_dir_path)] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.run_capture(cmd) # Restore from the backup and find the data we inserted port = port_distributor.get_port() with VanillaPostgres(restored_dir_path, pg_bin, port, init=False) as vanilla_pg: - # TODO make port an optional argument - vanilla_pg.configure( - [ - f"port={port}", - ] - ) vanilla_pg.start() num_rows_found = vanilla_pg.safe_psql("select count(*) from tbl;", user="cloud_admin")[0][0] assert num_rows == num_rows_found diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index ef68049ee7..44133f2350 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -2,6 +2,7 @@ import asyncio import concurrent.futures import random +from fixtures.common_types import TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -10,7 +11,6 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import TimelineId # Test configuration # @@ -67,11 +67,9 @@ async def update_and_gc(env: NeonEnv, endpoint: Endpoint, timeline: TimelineId): # def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) timeline = env.neon_cli.create_branch("test_gc_aggressive", "main") endpoint = env.endpoints.create_start("test_gc_aggressive") - log.info("postgres is running on test_gc_aggressive branch") with endpoint.cursor() as cur: # Create table, and insert the first 100 rows @@ -95,13 +93,11 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): # def test_gc_index_upload(neon_env_builder: NeonEnvBuilder): - # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" num_index_uploads = 0 neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - - env = neon_env_builder.init_start() + # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) tenant_id = env.initial_tenant timeline_id = env.neon_cli.create_branch("test_gc_index_upload", "main") endpoint = env.endpoints.create_start("test_gc_index_upload") diff --git a/test_runner/regress/test_gc_cutoff.py 
b/test_runner/regress/test_gc_cutoff.py deleted file mode 100644 index 284a8c3563..0000000000 --- a/test_runner/regress/test_gc_cutoff.py +++ /dev/null @@ -1,47 +0,0 @@ -import subprocess - -import pytest -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin - - -# Test gc_cutoff -# -# This test sets fail point at the end of GC, and checks that pageserver -# normally restarts after it. Also, there should be GC ERRORs in the log, -# but the fixture checks the log for any unexpected ERRORs after every -# test anyway, so it doesn't need any special attention here. -@pytest.mark.timeout(600) -def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): - env = neon_env_builder.init_start( - initial_tenant_conf={ - "gc_period": "10 s", - "gc_horizon": f"{1024 ** 2}", - "checkpoint_distance": f"{1024 ** 2}", - "compaction_period": "5 s", - # set PITR interval to be small, so we can do GC - "pitr_interval": "1 s", - "compaction_threshold": "3", - "image_creation_threshold": "2", - } - ) - - pageserver_http = env.pageserver.http_client() - - # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test - tenant_id = env.initial_tenant - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - connstr = endpoint.connstr(options="-csynchronous_commit=off") - pg_bin.run_capture(["pgbench", "-i", "-s10", connstr]) - - pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit")) - - # Because this test does a rapid series of restarts of the same node, it's possible that - # we are restarted again before we can clean up deletion lists form the previous generation, - # resulting in a subsequent startup logging a warning. - env.pageserver.allowed_errors.append(".*Dropping stale deletions for tenant.*") - - for _ in range(5): - with pytest.raises(subprocess.SubprocessError): - pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr]) - env.pageserver.stop() - env.pageserver.start(extra_env_vars={"FAILPOINTS": "after-timeline-gc-removed-layers=exit"}) diff --git a/test_runner/regress/test_gin_redo.py b/test_runner/regress/test_gin_redo.py new file mode 100644 index 0000000000..9205882239 --- /dev/null +++ b/test_runner/regress/test_gin_redo.py @@ -0,0 +1,22 @@ +import time + +from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup + + +# +# Test that redo of XLOG_GIN_VACUUM_PAGE doesn't produce error +# +def test_gin_redo(neon_simple_env: NeonEnv): + env = neon_simple_env + + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + time.sleep(1) + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + con = primary.connect() + cur = con.cursor() + cur.execute("create table gin_test_tbl(id integer, i int4[])") + cur.execute("create index gin_test_idx on gin_test_tbl using gin (i)") + cur.execute("insert into gin_test_tbl select g,array[3, 1, g] from generate_series(1, 10000) g") + cur.execute("delete from gin_test_tbl where id % 2 = 0") + cur.execute("vacuum gin_test_tbl") + wait_replica_caughtup(primary, secondary) diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 7822e29ed9..ae63136abb 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -1,38 +1,20 @@ +import asyncio import os -import re +import threading import time +from functools import partial +import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import Endpoint, 
NeonEnv
-
-
-def wait_caughtup(primary: Endpoint, secondary: Endpoint):
-    primary_lsn = primary.safe_psql_scalar(
-        "SELECT pg_current_wal_insert_lsn()::text", log_query=False
-    )
-    while True:
-        secondary_lsn = secondary.safe_psql_scalar(
-            "SELECT pg_last_wal_replay_lsn()", log_query=False
-        )
-        caught_up = secondary_lsn >= primary_lsn
-        log.info(f"caughtup={caught_up}, primary_lsn={primary_lsn}, secondary_lsn={secondary_lsn}")
-        if caught_up:
-            return
-        time.sleep(1)
-
-
-# Check for corrupted WAL messages which might otherwise go unnoticed if
-# reconnection fixes this.
-def scan_standby_log_for_errors(secondary):
-    log_path = secondary.endpoint_path() / "compute.log"
-    with log_path.open("r") as f:
-        markers = re.compile(
-            r"incorrect resource manager data|record with incorrect|invalid magic number|unexpected pageaddr"
-        )
-        for line in f:
-            if markers.search(line):
-                log.info(f"bad error in standby log: {line}")
-                raise AssertionError()
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    log_replica_lag,
+    tenant_get_shards,
+    wait_replica_caughtup,
+)
+from fixtures.utils import wait_until


 def test_hot_standby(neon_simple_env: NeonEnv):
@@ -79,7 +61,7 @@ def test_hot_standby(neon_simple_env: NeonEnv):
             primary.safe_psql("create table t(key int, value text)")
             primary.safe_psql("insert into t select generate_series(1, 100000), 'payload'")

-            wait_caughtup(primary, secondary)
+            wait_replica_caughtup(primary, secondary)

             with secondary.connect() as s_con:
                 with s_con.cursor() as s_cur:
@@ -94,8 +76,269 @@ def test_hot_standby(neon_simple_env: NeonEnv):
                     assert response is not None
                     assert response == responses[query]

-    scan_standby_log_for_errors(secondary)
+    # Check for corrupted WAL messages which might otherwise go unnoticed if
+    # reconnection fixes this.
+    assert not secondary.log_contains(
+        "incorrect resource manager data|record with incorrect|invalid magic number|unexpected pageaddr"
+    )

     # clean up
     if slow_down_send:
         sk_http.configure_failpoints(("sk-send-wal-replica-sleep", "off"))
+
+
+def test_2_replicas_start(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+
+    with env.endpoints.create_start(
+        branch_name="main",
+        endpoint_id="primary",
+    ) as primary:
+        time.sleep(1)
+        with env.endpoints.new_replica_start(
+            origin=primary, endpoint_id="secondary1"
+        ) as secondary1:
+            with env.endpoints.new_replica_start(
+                origin=primary, endpoint_id="secondary2"
+            ) as secondary2:
+                wait_replica_caughtup(primary, secondary1)
+                wait_replica_caughtup(primary, secondary2)
+
+
+# Test two different scenarios related to gc of data needed by hot standby.
+#
+# When pause_apply is False, the standby is mostly caught up with the primary.
+# However, in compute <-> pageserver protocol version 1 only one LSN was sent
+# to the pageserver in a page request, and to avoid waits in the pageserver it
+# was the last-written LSN cache value. If a page hadn't been updated for a
+# long time, that resulted in an error from the pageserver: "Bad request: tried
+# to request a page version that was garbage collected". For the primary this
+# wasn't a problem because the pageserver always bumped the LSN to the newest
+# one; for the standby that would be incorrect, since we might get a page
+# fresher than the apply LSN. Hence, in protocol version v2 two LSNs were
+# introduced: the main request_lsn (the apply LSN in the case of a standby) and
+# not_modified_since, which can be used as an optimization to avoid waiting.
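+#
+# A rough sketch of the difference (illustrative shapes only, not the actual
+# wire format):
+#
+#   v1: GetPage { lsn }                              -- one LSN serves as both
+#                                                       "read at" and "wait until"
+#   v2: GetPage { request_lsn, not_modified_since }
+#         request_lsn        -- apply LSN on a standby, so reads never see
+#                               pages newer than what the standby has applied
+#         not_modified_since -- last-written LSN hint, letting the pageserver
+#                               answer without waiting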
+# +# https://github.com/neondatabase/neon/issues/6211 +# +# When pause_apply is True we model standby lagging behind primary (e.g. due to +# high max_standby_streaming_delay). To prevent pageserver from removing data +# still needed by the standby apply LSN is propagated in standby -> safekeepers +# -> broker -> pageserver flow so that pageserver could hold off gc for it. +@pytest.mark.parametrize("pause_apply", [False, True]) +def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): + tenant_conf = { + # set PITR interval to be small, so we can do GC + "pitr_interval": "0 s", + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + timeline_id = env.initial_timeline + tenant_id = env.initial_tenant + + with env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) as primary: + with env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + # Protocol version 2 was introduced to fix the issue + # that this test exercises. With protocol version 1 it + # fails. + config_lines=["neon.protocol_version=2"], + ) as secondary: + p_cur = primary.connect().cursor() + p_cur.execute("CREATE EXTENSION neon_test_utils") + p_cur.execute("CREATE TABLE test (id int primary key) WITH (autovacuum_enabled=false)") + p_cur.execute("INSERT INTO test SELECT generate_series(1, 10000) AS g") + + wait_replica_caughtup(primary, secondary) + + s_cur = secondary.connect().cursor() + + s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()") + res = s_cur.fetchone() + assert res is not None + + s_cur.execute("SELECT COUNT(*) FROM test") + res = s_cur.fetchone() + assert res[0] == 10000 + + # Clear the cache in the standby, so that when we + # re-execute the query, it will make GetPage + # requests. This does not clear the last-written LSN cache + # so we still remember the LSNs of the pages. + secondary.clear_shared_buffers(cursor=s_cur) + + if pause_apply: + s_cur.execute("SELECT pg_wal_replay_pause()") + + # Do other stuff on the primary, to advance the WAL + p_cur.execute("CREATE TABLE test2 AS SELECT generate_series(1, 1000000) AS g") + + # Run GC. The PITR interval is very small, so this advances the GC cutoff LSN + # very close to the primary's current insert LSN. + shards = tenant_get_shards(env, tenant_id, None) + for tenant_shard_id, pageserver in shards: + client = pageserver.http_client() + client.timeline_checkpoint(tenant_shard_id, timeline_id) + client.timeline_compact(tenant_shard_id, timeline_id) + client.timeline_gc(tenant_shard_id, timeline_id, 0) + + # Re-execute the query. The GetPage requests that this + # generates use old not_modified_since LSNs, older than + # the GC cutoff, but new request LSNs. (In protocol + # version 1 there was only one LSN, and this failed.) + log_replica_lag(primary, secondary) + s_cur.execute("SELECT COUNT(*) FROM test") + log_replica_lag(primary, secondary) + res = s_cur.fetchone() + assert res[0] == 10000 + + +def run_pgbench(connstr: str, pg_bin: PgBin): + log.info(f"Start a pgbench workload on pg {connstr}") + # s10 is about 150MB of data. In debug mode init takes about 15s on SSD. + pg_bin.run_capture(["pgbench", "-i", "-s10", connstr]) + log.info("pgbench init done") + pg_bin.run_capture(["pgbench", "-T60", connstr]) + + +# assert that pgbench_accounts and its index are created. +def pgbench_accounts_initialized(ep): + ep.safe_psql_scalar("select 'pgbench_accounts_pkey'::regclass") + + +# Test that hot_standby_feedback works in neon (it is forwarded through +# safekeepers). 
That is, ensure queries on the standby don't fail during load on
+# the primary under the following conditions:
+# - pgbench bombards the primary with updates.
+# - On the secondary we run a long select of the updated table.
+# - Set a small max_standby_streaming_delay: hs feedback should prevent conflicts
+#   so apply doesn't need to wait.
+# - Do aggressive vacuum on the primary, which still shouldn't create conflicts.
+#   Actually this appears to be redundant due to the existence of microvacuum.
+#
+# Without hs feedback enabled we'd see 'User query might have needed to see row
+# versions that must be removed.' errors.
+def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    env = neon_env_builder.init_start()
+    aggressive_vacuum_conf = [
+        "log_autovacuum_min_duration = 0",
+        "autovacuum_naptime = 10s",
+        "autovacuum_vacuum_threshold = 25",
+        "autovacuum_vacuum_scale_factor = 0.1",
+        "autovacuum_vacuum_cost_delay = -1",
+    ]
+    with env.endpoints.create_start(
+        branch_name="main", endpoint_id="primary", config_lines=aggressive_vacuum_conf
+    ) as primary:
+        # It would be great to have a stricter max_standby_streaming_delay=0s here, but then it sometimes fails with
+        # 'User was holding shared buffer pin for too long.'.
+        with env.endpoints.new_replica_start(
+            origin=primary,
+            endpoint_id="secondary",
+            config_lines=[
+                "max_standby_streaming_delay=2s",
+                "neon.protocol_version=2",
+                "hot_standby_feedback=true",
+            ],
+        ) as secondary:
+            log.info(
+                f"primary connstr is {primary.connstr()}, secondary connstr {secondary.connstr()}"
+            )
+            t = threading.Thread(target=run_pgbench, args=(primary.connstr(), pg_bin))
+            t.start()
+            # Wait until pgbench_accounts is created + filled on the replica *and*
+            # the index is created. Otherwise index creation would conflict with
+            # read queries and hs feedback won't save us.
+            wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary))
+
+            # The test should fail if hs feedback is disabled anyway, but cross
+            # check that walproposer sets some xmin.
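+            # (the replica's feedback travels standby -> safekeepers -> walproposer and
+            # shows up as the xmin of wal_proposer_slot; it should be set while the
+            # replica is running and reset to NULL once the replica is gone, which the
+            # two checks around this thread rely on)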
+ def xmin_is_not_null(): + slot_xmin = primary.safe_psql_scalar( + "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'", + log_query=False, + ) + log.info(f"xmin is {slot_xmin}") + assert int(slot_xmin) > 0 + + wait_until(10, 1.0, xmin_is_not_null) + for _ in range(1, 5): + # in debug mode takes about 5-7s + balance = secondary.safe_psql_scalar("select sum(abalance) from pgbench_accounts") + log.info(f"balance={balance}") + log_replica_lag(primary, secondary) + t.join() + + # check xmin is reset when standby is gone + def xmin_is_null(): + slot_xmin = primary.safe_psql_scalar( + "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'", + log_query=False, + ) + log.info(f"xmin is {slot_xmin}") + assert slot_xmin is None + + wait_until(10, 1.0, xmin_is_null) + + +# Test race condition between WAL replay and backends performing queries +# https://github.com/neondatabase/neon/issues/7791 +def test_replica_query_race(neon_simple_env: NeonEnv): + env = neon_simple_env + + primary_ep = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) + + with primary_ep.connect() as p_con: + with p_con.cursor() as p_cur: + p_cur.execute("CREATE EXTENSION neon_test_utils") + p_cur.execute("CREATE TABLE test AS SELECT 0 AS counter") + + standby_ep = env.endpoints.new_replica_start(origin=primary_ep, endpoint_id="standby") + wait_replica_caughtup(primary_ep, standby_ep) + + # In primary, run a lot of UPDATEs on a single page + finished = False + writecounter = 1 + + async def primary_workload(): + nonlocal writecounter, finished + conn = await primary_ep.connect_async() + while writecounter < 10000: + writecounter += 1 + await conn.execute(f"UPDATE test SET counter = {writecounter}") + finished = True + + # In standby, at the same time, run queries on it. And repeatedly drop caches + async def standby_workload(): + nonlocal writecounter, finished + conn = await standby_ep.connect_async() + reads = 0 + while not finished: + readcounter = await conn.fetchval("SELECT counter FROM test") + + # Check that the replica is keeping up with the primary. In local + # testing, the lag between primary and standby is much smaller, in + # the ballpark of 2-3 counter values. But be generous in case there's + # some hiccup. + # assert(writecounter - readcounter < 1000) + assert readcounter <= writecounter + if reads % 100 == 0: + log.info(f"read {reads}: counter {readcounter}, last update {writecounter}") + reads += 1 + + # FIXME: what about LFC clearing? 
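+                # clear_buffer_cache() comes from the neon_test_utils extension
+                # created on the primary above: dropping shared buffers forces
+                # the next SELECT to fetch its pages via GetPage requests again.
+                # The LFC is, presumably, a separate cache -- hence the FIXME.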
+ await conn.execute("SELECT clear_buffer_cache()") + + async def both(): + await asyncio.gather( + primary_workload(), + standby_workload(), + ) + + asyncio.run(both()) diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index faedf5d944..4385cfca76 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -7,6 +7,7 @@ from contextlib import closing from pathlib import Path import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -17,11 +18,9 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_for_last_record_lsn, - wait_for_upload, ) from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import subprocess_capture +from fixtures.utils import assert_pageserver_backups_equal, subprocess_capture def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_builder): @@ -76,7 +75,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"] end_lsn = manifest["WAL-Ranges"][0]["End-LSN"] - endpoint_id = "ep-import_from_vanilla" + branch_name = "import_from_vanilla" tenant = TenantId.generate() timeline = TimelineId.generate() @@ -88,23 +87,13 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build env.pageserver.allowed_errors.extend( [ - ".*error importing base backup .*", + ".*Failed to import basebackup.*", + ".*unexpected non-zero bytes after the tar archive.*", ".*Timeline got dropped without initializing, cleaning its files.*", - ".*Removing intermediate uninit mark file.*", ".*InternalServerError.*timeline not found.*", ".*InternalServerError.*Tenant .* not found.*", ".*InternalServerError.*Timeline .* not found.*", ".*InternalServerError.*Cannot delete timeline which has child timelines.*", - ".*ignored .* unexpected bytes after the tar archive.*", - ] - ) - - env.pageserver.allowed_errors.extend( - [ - # FIXME: we should clean up pageserver to not print this - ".*exited with error: unexpected message type: CopyData.*", - # FIXME: Is this expected? - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", ] ) @@ -117,8 +106,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build str(tenant), "--timeline-id", str(timeline), - "--node-name", - endpoint_id, + "--branch-name", + branch_name, "--base-lsn", start_lsn, "--base-tarfile", @@ -142,12 +131,9 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build with pytest.raises(RuntimeError): import_tar(corrupt_base_tar, wal_tar) - # A tar with trailing garbage is currently accepted. It prints a warnings - # to the pageserver log, however. Check that. 
- import_tar(base_plus_garbage_tar, wal_tar) - assert env.pageserver.log_contains( - ".*WARN.*ignored .* unexpected bytes after the tar archive.*" - ) + # Importing a tar with trailing garbage fails + with pytest.raises(RuntimeError): + import_tar(base_plus_garbage_tar, wal_tar) client = env.pageserver.http_client() timeline_delete_wait_completed(client, tenant, timeline) @@ -157,12 +143,14 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build # Wait for data to land in s3 wait_for_last_record_lsn(client, tenant, timeline, Lsn(end_lsn)) - wait_for_upload(client, tenant, timeline, Lsn(end_lsn)) + client.timeline_checkpoint(tenant, timeline, compact=False, wait_until_uploaded=True) # Check it worked - endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant) + endpoint = env.endpoints.create_start(branch_name, tenant_id=tenant) assert endpoint.safe_psql("select count(*) from t") == [(300000,)] + vanilla_pg.stop() + def test_import_from_pageserver_small( pg_bin: PgBin, neon_env_builder: NeonEnvBuilder, test_output_dir: Path @@ -170,17 +158,12 @@ def test_import_from_pageserver_small( neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() - # FIXME: Is this expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - timeline = env.neon_cli.create_branch("test_import_from_pageserver_small") endpoint = env.endpoints.create_start("test_import_from_pageserver_small") num_rows = 3000 lsn = _generate_data(num_rows, endpoint) - _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir) + _import(num_rows, lsn, env, pg_bin, timeline, test_output_dir) @pytest.mark.timeout(1800) @@ -210,9 +193,7 @@ def test_import_from_pageserver_multisegment( log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB") assert logical_size > 1024**3 # = 1GB - tar_output_file = _import( - num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir - ) + tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline, test_output_dir) # Check if the backup data contains multiple segment files cnt_seg_files = 0 @@ -252,7 +233,6 @@ def _import( env: NeonEnv, pg_bin: PgBin, timeline: TimelineId, - pg_distrib_dir: Path, test_output_dir: Path, ) -> Path: """Test importing backup data to the pageserver. @@ -265,15 +245,9 @@ def _import( path to the backup archive file""" log.info(f"start_backup_lsn = {lsn}") - # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. - # PgBin sets it automatically, but here we need to pipe psql output to the tar command. 
-    psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
-
     # Get a fullbackup from pageserver
-    query = f"fullbackup { env.initial_tenant} {timeline} {lsn}"
     tar_output_file = test_output_dir / "fullbackup.tar"
-    cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.take_fullbackup(env.pageserver, env.initial_tenant, timeline, lsn, tar_output_file)
 
     # Stop the first pageserver instance, erase all its data
     env.endpoints.stop_all()
@@ -291,7 +265,7 @@ def _import(
     tenant = TenantId.generate()
 
     # Import to pageserver
-    endpoint_id = "ep-import_from_pageserver"
+    branch_name = "import_from_pageserver"
     client = env.pageserver.http_client()
     env.pageserver.tenant_create(tenant)
     env.neon_cli.raw_cli(
@@ -302,8 +276,8 @@ def _import(
         str(tenant),
         "--timeline-id",
         str(timeline),
-        "--node-name",
-        endpoint_id,
+        "--branch-name",
+        branch_name,
         "--base-lsn",
        str(lsn),
         "--base-tarfile",
@@ -315,29 +289,18 @@ def _import(
 
     # Wait for data to land in s3
     wait_for_last_record_lsn(client, tenant, timeline, lsn)
-    wait_for_upload(client, tenant, timeline, lsn)
+    client.timeline_checkpoint(tenant, timeline, compact=False, wait_until_uploaded=True)
 
     # Check it worked
-    endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant)
+    endpoint = env.endpoints.create_start(branch_name, tenant_id=tenant, lsn=lsn)
     assert endpoint.safe_psql("select count(*) from tbl") == [(expected_num_rows,)]
 
     # Take another fullbackup
-    query = f"fullbackup { tenant} {timeline} {lsn}"
     new_tar_output_file = test_output_dir / "fullbackup-new.tar"
-    cmd = [
-        "psql",
-        "--no-psqlrc",
-        env.pageserver.connstr(),
-        "-c",
-        query,
-        "-o",
-        str(new_tar_output_file),
-    ]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.take_fullbackup(env.pageserver, tenant, timeline, lsn, new_tar_output_file)
 
     # Check it's the same as the first fullbackup
-    # TODO pageserver should be checking checksum
-    assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file)
+    assert_pageserver_backups_equal(tar_output_file, new_tar_output_file, set())
 
     # Check that gc works
     pageserver_http = env.pageserver.http_client()
diff --git a/test_runner/regress/test_ingestion_layer_size.py b/test_runner/regress/test_ingestion_layer_size.py
new file mode 100644
index 0000000000..44c77b3410
--- /dev/null
+++ b/test_runner/regress/test_ingestion_layer_size.py
@@ -0,0 +1,151 @@
+from dataclasses import dataclass
+from typing import Iterable, List, Union
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn
+from fixtures.pageserver.http import HistoricLayerInfo, LayerMapInfo
+from fixtures.utils import human_bytes
+
+
+def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder, build_type: str):
+    """
+    Build a sizeable GIN index, whose build batches up page images in the WAL
+    stream much like pgvector does, to show that we no longer create oversized
+    layers.
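+
+    Layer sizes are checked with the histogram helpers at the end of this
+    file: ingestion is expected to produce L0s of roughly checkpoint_distance
+    in size, and nothing at or above 2x checkpoint_distance.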
+    """
+
+    if build_type == "debug":
+        pytest.skip("debug run is unnecessarily slow")
+
+    minimum_initdb_size = 20 * 1024**2
+    checkpoint_distance = 32 * 1024**2
+    minimum_good_layer_size = checkpoint_distance * 0.9
+    minimum_too_large_layer_size = 2 * checkpoint_distance
+
+    # index size: 99MiB
+    rows = 2_500_000
+
+    # bucket lower limits
+    buckets = [0, minimum_initdb_size, minimum_good_layer_size, minimum_too_large_layer_size]
+
+    assert (
+        minimum_initdb_size < minimum_good_layer_size
+    ), "keep checkpoint_distance higher than the initdb size (find it by experimenting)"
+
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "checkpoint_distance": f"{checkpoint_distance}",
+            "compaction_target_size": f"{checkpoint_distance}",
+            # this test is primarily interested in L0 sizes, but we'll compact after ingestion to ensure sizes are good even then
+            "compaction_period": "0s",
+            "gc_period": "0s",
+            "compaction_threshold": "255",
+            "image_creation_threshold": "99999",
+        }
+    )
+
+    # build a larger than 3*checkpoint_distance sized gin index.
+    # gin index building exhibits the same behaviour as pgvector's two-phase index build
+    with env.endpoints.create_start("main") as ep, ep.cursor() as cur:
+        cur.execute(
+            f"create table int_array_test as select array_agg(g) as int_array from generate_series(1, {rows}) g group by g / 10;"
+        )
+        cur.execute(
+            "create index int_array_test_gin_index on int_array_test using gin (int_array);"
+        )
+        cur.execute("select pg_table_size('int_array_test_gin_index')")
+        size = cur.fetchone()
+        assert size is not None
+        assert isinstance(size[0], int)
+        log.info(f"gin index size: {human_bytes(size[0])}")
+        assert (
+            size[0] > checkpoint_distance * 3
+        ), f"gin index is not large enough: {human_bytes(size[0])}"
+        wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)
+
+    ps_http = env.pageserver.http_client()
+    ps_http.timeline_checkpoint(env.initial_tenant, env.initial_timeline)
+
+    infos = ps_http.layer_map_info(env.initial_tenant, env.initial_timeline)
+    assert len(infos.in_memory_layers) == 0, "should have flushed open layers"
+    post_ingest = histogram_historic_layers(infos, buckets)
+
+    # describe first, assert later for easier debugging
+    log.info("non-cumulative layer size distribution after ingestion:")
+    print_layer_size_histogram(post_ingest)
+
+    # since all we have are L0s, we should be getting nice L1s and images out of them now
+    ps_http.patch_tenant_config_client_side(
+        env.initial_tenant,
+        {
+            "compaction_threshold": 1,
+            "image_creation_threshold": 1,
+        },
+    )
+
+    ps_http.timeline_compact(env.initial_tenant, env.initial_timeline, True, True)
+
+    infos = ps_http.layer_map_info(env.initial_tenant, env.initial_timeline)
+    assert len(infos.in_memory_layers) == 0, "no new inmem layers expected"
+    post_compact = histogram_historic_layers(infos, buckets)
+
+    log.info("non-cumulative layer size distribution after compaction:")
+    print_layer_size_histogram(post_compact)
+
+    assert (
+        post_ingest.counts[3] == 0
+    ), f"there should be no layers larger than 2*checkpoint_distance ({human_bytes(2*checkpoint_distance)})"
+    assert post_ingest.counts[1] == 1, "expect one smaller layer for initdb"
+    assert (
+        post_ingest.counts[0] <= 1
+    ), "expect at most one tiny layer from shutting down the endpoint"
+
+    # just make sure we don't have trouble splitting the layers apart
+    assert post_compact.counts[3] == 0
+
+
+@dataclass
+class Histogram:
+    buckets: List[Union[int, float]]
+    counts: List[int]
+    sums: List[int]
+
+
+def 
histogram_historic_layers( + infos: LayerMapInfo, minimum_sizes: List[Union[int, float]] +) -> Histogram: + def log_layer(layer: HistoricLayerInfo) -> HistoricLayerInfo: + log.info( + f"{layer.layer_file_name} {human_bytes(layer.layer_file_size)} ({layer.layer_file_size} bytes)" + ) + return layer + + layers = map(log_layer, infos.historic_layers) + sizes = (x.layer_file_size for x in layers) + return histogram(sizes, minimum_sizes) + + +def histogram(sizes: Iterable[int], minimum_sizes: List[Union[int, float]]) -> Histogram: + assert all(minimum_sizes[i] < minimum_sizes[i + 1] for i in range(len(minimum_sizes) - 1)) + buckets = list(enumerate(minimum_sizes)) + counts = [0 for _ in buckets] + sums = [0 for _ in buckets] + + for size in sizes: + found = False + for index, min_size in reversed(buckets): + if size >= min_size: + counts[index] += 1 + sums[index] += size + found = True + break + assert found + + return Histogram(minimum_sizes, counts, sums) + + +def print_layer_size_histogram(h: Histogram): + for index, min_size in enumerate(h.buckets): + log.info( + f">= {human_bytes(min_size)}: {h.counts[index]} layers total {human_bytes(h.sums[index])}" + ) diff --git a/test_runner/regress/test_large_schema.py b/test_runner/regress/test_large_schema.py index b6ac1aa41f..c5d5b5fe64 100644 --- a/test_runner/regress/test_large_schema.py +++ b/test_runner/regress/test_large_schema.py @@ -74,8 +74,8 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder): cur.execute("select * from pg_depend order by refclassid, refobjid, refobjsubid") # Check layer file sizes - timeline_path = "{}/tenants/{}/timelines/{}/".format( - env.pageserver.workdir, env.initial_tenant, env.initial_timeline + timeline_path = ( + f"{env.pageserver.workdir}/tenants/{env.initial_tenant}/timelines/{env.initial_timeline}/" ) for filename in os.listdir(timeline_path): if filename.startswith("00000"): diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py new file mode 100644 index 0000000000..b8126395fd --- /dev/null +++ b/test_runner/regress/test_layer_bloating.py @@ -0,0 +1,71 @@ +import os + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + logical_replication_sync, + wait_for_last_flush_lsn, +) +from fixtures.pg_version import PgVersion + + +def test_layer_bloating(neon_env_builder: NeonEnvBuilder, vanilla_pg): + if neon_env_builder.pg_version != PgVersion.V16: + pytest.skip("pg_log_standby_snapshot() function is available only in PG16") + + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "0s", + "compaction_period": "0s", + "compaction_threshold": 99999, + "image_creation_threshold": 99999, + } + ) + + timeline = env.initial_timeline + endpoint = env.endpoints.create_start("main", config_lines=["log_statement=all"]) + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # create table... 
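+    # The logical slot created below holds WAL on the safekeepers while
+    # create_snapshots() (defined further down) emits tens of thousands of
+    # tiny pg_log_standby_snapshot() records; the test then checks that
+    # ingesting them does not bloat any layer file past ~512 MB.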
+    cur.execute("create table t(pk integer primary key)")
+    cur.execute("create publication pub1 for table t")
+    # Create slot to hold WAL
+    cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')")
+
+    # now start subscriber
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("create table t(pk integer primary key)")
+
+    connstr = endpoint.connstr().replace("'", "''")
+    log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
+    vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
+
+    cur.execute(
+        """create or replace function create_snapshots(n integer) returns void as $$
+        declare
+            i integer;
+        begin
+            for i in 1..n loop
+                perform pg_log_standby_snapshot();
+            end loop;
+        end; $$ language plpgsql"""
+    )
+    cur.execute("set statement_timeout=0")
+    cur.execute("select create_snapshots(10000)")
+    # Wait for logical replication to sync
+    logical_replication_sync(vanilla_pg, endpoint)
+    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline)
+    env.pageserver.http_client().timeline_checkpoint(env.initial_tenant, timeline, compact=False)
+
+    # Check layer file sizes
+    timeline_path = f"{env.pageserver.workdir}/tenants/{env.initial_tenant}/timelines/{timeline}/"
+    log.info(f"Check {timeline_path}")
+    for filename in os.listdir(timeline_path):
+        if filename.startswith("00000"):
+            log.info(f"layer {filename} size is {os.path.getsize(timeline_path + filename)}")
+            assert os.path.getsize(timeline_path + filename) < 512_000_000
+
+    env.stop(immediate=True)
diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py
index efba2033fb..193149ea03 100644
--- a/test_runner/regress/test_layer_eviction.py
+++ b/test_runner/regress/test_layer_eviction.py
@@ -4,12 +4,12 @@ import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
+    flush_ep_to_pageserver,
     wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
+from fixtures.pageserver.common_types import parse_layer_file_name
+from fixtures.pageserver.utils import wait_for_upload
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn
-from fixtures.utils import query_scalar
 
 
 # Crates a few layers, ensures that we can evict them (removing locally but keeping track of them anyway)
@@ -46,20 +46,21 @@ def test_basic_eviction(
         FROM generate_series(1, 5000000) g
         """
     )
-    current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
-    wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
+    # stops the endpoint
+    current_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)
+
+    client.timeline_checkpoint(tenant_id, timeline_id)
     wait_for_upload(client, tenant_id, timeline_id, current_lsn)
 
-    # disable compute & sks to avoid on-demand downloads by walreceiver / getpage
-    endpoint.stop()
+    # stop sks to avoid on-demand downloads by walreceiver / getpage; endpoint
+    # has already been stopped by flush_ep_to_pageserver
     for sk in env.safekeepers:
         sk.stop()
 
-    timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
-    initial_local_layers = sorted(
-        list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
+    initial_local_layers = dict(
+        (parse_layer_file_name(path.name), path)
+        for path in env.pageserver.list_layers(tenant_id, timeline_id)
     )
     assert (
         len(initial_local_layers) > 1
@@ -73,6 +74,7 @@ def test_basic_eviction(
     assert 
len(initial_local_layers) == len( initial_layer_map_info.historic_layers ), "Should have the same layers in memory and on disk" + for returned_layer in initial_layer_map_info.historic_layers: assert ( returned_layer.kind == "Delta" @@ -81,27 +83,29 @@ def test_basic_eviction( not returned_layer.remote ), f"All created layers should be present locally, but got {returned_layer}" - local_layers = list( - filter(lambda layer: layer.name == returned_layer.layer_file_name, initial_local_layers) + returned_layer_name = parse_layer_file_name(returned_layer.layer_file_name) + assert ( + returned_layer_name in initial_local_layers + ), f"Did not find returned layer {returned_layer_name} in local layers {list(initial_local_layers.keys())}" + + local_layer_path = ( + env.pageserver.timeline_dir(tenant_id, timeline_id) + / initial_local_layers[returned_layer_name] ) assert ( - len(local_layers) == 1 - ), f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}" - local_layer = local_layers[0] - assert ( - returned_layer.layer_file_size == local_layer.stat().st_size - ), f"Returned layer {returned_layer} has a different file size than local layer {local_layer}" + returned_layer.layer_file_size == local_layer_path.stat().st_size + ), f"Returned layer {returned_layer} has a different file size than local layer {local_layer_path}" # Detach all layers, ensre they are not in the local FS, but are still dumped as part of the layer map - for local_layer in initial_local_layers: + for local_layer_name, local_layer_path in initial_local_layers.items(): client.evict_layer( - tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer.name + tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer_path.name ) - assert not any( - new_local_layer.name == local_layer.name for new_local_layer in timeline_path.glob("*") - ), f"Did not expect to find {local_layer} layer after evicting" + assert not env.pageserver.layer_exists( + tenant_id, timeline_id, local_layer_name + ), f"Did not expect to find {local_layer_name} layer after evicting" - empty_layers = list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + empty_layers = env.pageserver.list_layers(tenant_id, timeline_id) assert not empty_layers, f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}" evicted_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id) @@ -118,15 +122,15 @@ def test_basic_eviction( assert ( returned_layer.remote ), f"All layers should be evicted and not present locally, but got {returned_layer}" - assert any( - local_layer.name == returned_layer.layer_file_name - for local_layer in initial_local_layers + returned_layer_name = parse_layer_file_name(returned_layer.layer_file_name) + assert ( + returned_layer_name in initial_local_layers ), f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}" # redownload all evicted layers and ensure the initial state is restored - for local_layer in initial_local_layers: + for local_layer_name, _local_layer_path in initial_local_layers.items(): client.download_layer( - tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer.name + tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer_name.to_str() ) client.timeline_download_remote_layers( tenant_id, @@ -137,8 +141,9 @@ def test_basic_eviction( at_least_one_download=False, ) - redownloaded_layers = sorted( - list(filter(lambda path: 
path.name != "metadata", timeline_path.glob("*"))) + redownloaded_layers = dict( + (parse_layer_file_name(path.name), path) + for path in env.pageserver.list_layers(tenant_id, timeline_id) ) assert ( redownloaded_layers == initial_local_layers @@ -154,7 +159,9 @@ def test_basic_eviction( def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() + # don't create initial tenant, we'll create it manually with custom config + env = neon_env_builder.init_configs() + env.start() tenant_config = { "pitr_interval": "1s", # set to non-zero, so GC actually does something @@ -165,6 +172,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): "compaction_threshold": "3", # "image_creation_threshold": set at runtime "compaction_target_size": f"{128 * (1024**2)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers + "image_layer_creation_check_threshold": "0", # always check if a new image layer can be created } def tenant_update_config(changes): @@ -264,14 +272,14 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): resident_physical_size_metric == 0 ), "ensure that resident_physical_size metric is zero" assert resident_physical_size_metric == sum( - layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote + layer.layer_file_size for layer in info.historic_layers if not layer.remote ), "ensure that resident_physical_size metric corresponds to layer map dump" remote_physical_size_metric = ps_http.get_timeline_metric( tenant_id, timeline_id, "pageserver_remote_physical_size" ) assert remote_physical_size_metric == sum( - layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote + layer.layer_file_size for layer in info.historic_layers if layer.remote ), "ensure that remote_physical_size metric corresponds to layer map dump" log.info("before runnning GC, ensure that remote_physical size is zero") diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 999e077e45..3b2218dd9b 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -1,10 +1,11 @@ import time +from fixtures.common_types import Lsn from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.pageserver.types import ( - DeltaLayerFileName, - ImageLayerFileName, +from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver +from fixtures.pageserver.common_types import ( + DeltaLayerName, + ImageLayerName, is_future_layer, ) from fixtures.pageserver.utils import ( @@ -13,7 +14,6 @@ from fixtures.pageserver.utils import ( wait_until_tenant_active, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.types import Lsn from fixtures.utils import query_scalar, wait_until @@ -37,10 +37,8 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): """ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() - env.pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) + env = neon_env_builder.init_configs() + env.start() ps_http = env.pageserver.http_client() @@ -53,6 +51,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): "checkpoint_timeout": "24h", # something we won't reach 
"checkpoint_distance": f"{50 * (1024**2)}", # something we won't reach, we checkpoint manually "image_creation_threshold": "100", # we want to control when image is created + "image_layer_creation_check_threshold": "0", "compaction_threshold": f"{l0_l1_threshold}", "compaction_target_size": f"{128 * (1024**3)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers } @@ -80,7 +79,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): current = get_index_part() assert len(set(current.layer_metadata.keys())) == 1 layer_file_name = list(current.layer_metadata.keys())[0] - assert isinstance(layer_file_name, DeltaLayerFileName) + assert isinstance(layer_file_name, DeltaLayerName) assert layer_file_name.is_l0(), f"{layer_file_name}" log.info("force image layer creation in the future by writing some data into in-memory layer") @@ -115,8 +114,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): ) == 0 ) - - endpoint.stop() + last_record_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id) wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) @@ -146,7 +144,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): future_layers = get_future_layers() assert len(future_layers) == 1 future_layer = future_layers[0] - assert isinstance(future_layer, ImageLayerFileName) + assert isinstance(future_layer, ImageLayerName) assert future_layer.lsn == last_record_lsn log.info( f"got layer from the future: lsn={future_layer.lsn} disk_consistent_lsn={ip.disk_consistent_lsn} last_record_lsn={last_record_lsn}" @@ -160,7 +158,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): time.sleep(1.1) # so that we can use change in pre_stat.st_mtime to detect overwrites def get_generation_number(): - attachment = env.attachment_service.inspect(tenant_id) + attachment = env.storage_controller.inspect(tenant_id) assert attachment is not None return attachment[0] @@ -184,10 +182,13 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): # NB: the layer file is unlinked index part now, but, because we made the delete # operation stuck, the layer file itself is still in the remote_storage - def delete_at_pause_point(): - assert env.pageserver.log_contains(f".*{tenant_id}.*at failpoint.*{failpoint_name}") - - wait_until(10, 0.5, delete_at_pause_point) + wait_until( + 10, + 0.5, + lambda: env.pageserver.assert_log_contains( + f".*{tenant_id}.*at failpoint.*{failpoint_name}" + ), + ) future_layer_path = env.pageserver_remote_storage.remote_layer_path( tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach ) diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 5c68a63d06..1b2c7f808f 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -1,3 +1,7 @@ +import os +import random +import re +import subprocess import threading import time @@ -17,18 +21,17 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): "test_lfc_resize", config_lines=[ "neon.file_cache_path='file.cache'", - "neon.max_file_cache_size=1GB", - "neon.file_cache_size_limit=1GB", + "neon.max_file_cache_size=512MB", + "neon.file_cache_size_limit=512MB", ], ) n_resize = 10 - scale = 10 - log.info("postgres is running on 'test_lfc_resize' branch") + scale = 100 def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) - pg_bin.run_capture(["pgbench", "-c4", 
f"-T{n_resize}", "-Mprepared", connstr]) + pg_bin.run_capture(["pgbench", "-c10", f"-T{n_resize}", "-Mprepared", "-S", connstr]) thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) thread.start() @@ -36,9 +39,21 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): conn = endpoint.connect() cur = conn.cursor() - for i in range(n_resize): - cur.execute(f"alter system set neon.file_cache_size_limit='{i*10}MB'") + for _ in range(n_resize): + size = random.randint(1, 512) + cur.execute(f"alter system set neon.file_cache_size_limit='{size}MB'") cur.execute("select pg_reload_conf()") time.sleep(1) + cur.execute("alter system set neon.file_cache_size_limit='100MB'") + cur.execute("select pg_reload_conf()") + thread.join() + + lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache" + lfc_file_size = os.path.getsize(lfc_file_path) + res = subprocess.run(["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True) + lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0] + log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}") + assert lfc_file_size <= 512 * 1024 * 1024 + assert int(lfc_file_blocks) <= 128 * 1024 diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py new file mode 100644 index 0000000000..4c53e4e2fd --- /dev/null +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -0,0 +1,118 @@ +import time +from pathlib import Path + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import query_scalar + + +def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): + env = neon_simple_env + + cache_dir = Path(env.repo_dir) / "file_cache" + cache_dir.mkdir(exist_ok=True) + + branchname = "test_approximate_working_set_size" + env.neon_cli.create_branch(branchname, "empty") + log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}") + endpoint = env.endpoints.create_start( + branchname, + config_lines=[ + "shared_buffers='1MB'", + f"neon.file_cache_path='{cache_dir}/file.cache'", + "neon.max_file_cache_size='128MB'", + "neon.file_cache_size_limit='64MB'", + ], + ) + + cur = endpoint.connect().cursor() + cur.execute("create extension neon") + + log.info(f"preparing some data in {endpoint.connstr()}") + + ddl = """ +CREATE TABLE pgbench_accounts ( + aid bigint NOT NULL, + bid integer, + abalance integer, + filler character(84), + -- more web-app like columns + text_column_plain TEXT DEFAULT repeat('NeonIsCool', 5), + jsonb_column_extended JSONB DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb +) +WITH (fillfactor='100'); +""" + + cur.execute(ddl) + # prepare index access below + cur.execute( + "ALTER TABLE ONLY pgbench_accounts ADD CONSTRAINT pgbench_accounts_pkey PRIMARY KEY (aid)" + ) + cur.execute( + "insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;" + ) + # ensure correct query plans and stats + cur.execute("vacuum ANALYZE pgbench_accounts") + # determine table size - working set should approximate table size after sequential scan + pages = query_scalar(cur, "SELECT relpages FROM pg_class WHERE relname = 'pgbench_accounts'") + log.info(f"pgbench_accounts has {pages} pages, resetting working set to zero") + cur.execute("select approximate_working_set_size(true)") + cur.execute( + 'SELECT count(*) 
FROM pgbench_accounts WHERE abalance > 0 or jsonb_column_extended @> \'{"tell everyone": [{"Neon": "IsCool"}]}\'::jsonb' + ) + # verify working set size after sequential scan matches table size and reset working set for next test + blocks = query_scalar(cur, "select approximate_working_set_size(true)") + log.info(f"working set size after sequential scan on pgbench_accounts {blocks}") + assert pages * 0.8 < blocks < pages * 1.2 + # run a few point queries with index lookup + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 4242") + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 54242") + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 104242") + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 204242") + # verify working set size after some index access of a few select pages only + blocks = query_scalar(cur, "select approximate_working_set_size(true)") + log.info(f"working set size after some index access of a few select pages only {blocks}") + assert blocks < 10 + + +def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): + env = neon_simple_env + + endpoint = env.endpoints.create_start( + branch_name="main", + config_lines=[ + "autovacuum = off", + "shared_buffers=1MB", + "neon.max_file_cache_size=256MB", + "neon.file_cache_size_limit=245MB", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("create extension neon") + cur.execute( + "create table t(pk integer primary key, count integer default 0, payload text default repeat('?', 128))" + ) + cur.execute("insert into t (pk) values (generate_series(1,1000000))") + time.sleep(2) + before_10k = time.monotonic() + cur.execute("select sum(count) from t where pk between 10000 and 20000") + time.sleep(2) + before_1k = time.monotonic() + cur.execute("select sum(count) from t where pk between 1000 and 2000") + after = time.monotonic() + + cur.execute(f"select approximate_working_set_size_seconds({int(after - before_1k + 1)})") + estimation_1k = cur.fetchall()[0][0] + log.info(f"Working set size for selecting 1k records {estimation_1k}") + + cur.execute(f"select approximate_working_set_size_seconds({int(after - before_10k + 1)})") + estimation_10k = cur.fetchall()[0][0] + log.info(f"Working set size for selecting 10k records {estimation_10k}") + + cur.execute("select pg_table_size('t')") + size = cur.fetchall()[0][0] // 8192 + log.info(f"Table size {size} blocks") + + assert estimation_1k >= 20 and estimation_1k <= 40 + assert estimation_10k >= 200 and estimation_10k <= 400 diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 38f2034c18..3c404c3b23 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -1,19 +1,21 @@ import os +import queue import random import threading import time from typing import List -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder from fixtures.utils import query_scalar -def test_local_file_cache_unlink(neon_simple_env: NeonEnv): - env = neon_simple_env +def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() cache_dir = os.path.join(env.repo_dir, "file_cache") os.mkdir(cache_dir) + env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) env.neon_cli.create_branch("test_local_file_cache_unlink", "empty") endpoint = env.endpoints.create_start( @@ -28,11 +30,10 @@ def 
test_local_file_cache_unlink(neon_simple_env: NeonEnv): cur = endpoint.connect().cursor() + stop = threading.Event() n_rows = 100000 n_threads = 20 - n_updates_per_thread = 10000 n_updates_per_connection = 1000 - n_total_updates = n_threads * n_updates_per_thread cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)") cur.execute(f"INSERT INTO lfctest SELECT g, 1 FROM generate_series(1, {n_rows}) g") @@ -43,11 +44,11 @@ def test_local_file_cache_unlink(neon_simple_env: NeonEnv): # performed (plus the initial 1 on each row). # # Furthermore, each thread will reconnect between every 1000 updates. - def run_updates(): + def run_updates(n_updates_performed_q: queue.Queue[int]): n_updates_performed = 0 conn = endpoint.connect() cur = conn.cursor() - for _ in range(n_updates_per_thread): + while not stop.is_set(): id = random.randint(1, n_rows) cur.execute(f"UPDATE lfctest SET n = n + 1 WHERE id = {id}") n_updates_performed += 1 @@ -56,19 +57,28 @@ def test_local_file_cache_unlink(neon_simple_env: NeonEnv): conn.close() conn = endpoint.connect() cur = conn.cursor() + n_updates_performed_q.put(n_updates_performed) + n_updates_performed_q: queue.Queue[int] = queue.Queue() threads: List[threading.Thread] = [] for _i in range(n_threads): - thread = threading.Thread(target=run_updates, args=(), daemon=True) + thread = threading.Thread(target=run_updates, args=(n_updates_performed_q,), daemon=True) thread.start() threads.append(thread) time.sleep(5) + # unlink, this is what we're actually testing new_cache_dir = os.path.join(env.repo_dir, "file_cache_new") os.rename(cache_dir, new_cache_dir) + time.sleep(10) + + stop.set() + + n_updates_performed = 0 for thread in threads: thread.join() + n_updates_performed += n_updates_performed_q.get() - assert query_scalar(cur, "SELECT SUM(n) FROM lfctest") == n_total_updates + n_rows + assert query_scalar(cur, "SELECT SUM(n) FROM lfctest") == n_rows + n_updates_performed diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py index d559be0a8f..bfffad7572 100644 --- a/test_runner/regress/test_logging.py +++ b/test_runner/regress/test_logging.py @@ -3,10 +3,12 @@ import uuid import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.pg_version import run_only_on_default_postgres from fixtures.utils import wait_until @pytest.mark.parametrize("level", ["trace", "debug", "info", "warn", "error"]) +@run_only_on_default_postgres("it does not use any postgres functionality") def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str): # self-test: make sure the event is logged (i.e., our testing endpoint works) log_expected = { @@ -32,7 +34,7 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str): def assert_logged(): if not log_expected: return - assert env.pageserver.log_contains(f".*{msg_id}.*") + env.pageserver.assert_log_contains(f".*{msg_id}.*") wait_until(10, 0.5, assert_logged) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 51e358e60d..f83a833dda 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -1,16 +1,37 @@ import time +from functools import partial +from random import choice +from string import ascii_lowercase import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( + AuxFileStore, NeonEnv, + 
NeonEnvBuilder, + PgProtocol, logical_replication_sync, wait_for_last_flush_lsn, ) -from fixtures.types import Lsn -from fixtures.utils import query_scalar +from fixtures.utils import wait_until +def random_string(n: int): + return "".join([choice(ascii_lowercase) for _ in range(n)]) + + +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V2, AuxFileStore.CrossValidation] +) +def test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy: AuxFileStore): + env = neon_simple_env + with env.pageserver.http_client() as client: + tenant_config = client.tenant_config(env.initial_tenant).effective_config + assert pageserver_aux_file_policy == tenant_config["switch_aux_file_policy"] + + +@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -20,7 +41,6 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): "test_logical_replication", config_lines=["log_statement=all"] ) - log.info("postgres is running on 'test_logical_replication' branch") pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -152,9 +172,187 @@ COMMIT; assert endpoint.safe_psql("select count(*) from pg_replication_slots")[0][0] == 1 -# Test compute start at LSN page of which starts with contrecord -# https://github.com/neondatabase/neon/issues/5749 -def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): +# Test that neon.logical_replication_max_snap_files works +@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) +def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): + def slot_removed(ep): + assert ( + endpoint.safe_psql( + "select count(*) from pg_replication_slots where slot_name = 'stale_slot'" + )[0][0] + == 0 + ) + + env = neon_simple_env + + env.neon_cli.create_branch("test_logical_replication", "empty") + # set low neon.logical_replication_max_snap_files + endpoint = env.endpoints.create_start( + "test_logical_replication", + config_lines=["log_statement=all", "neon.logical_replication_max_snap_files=1"], + ) + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # create obsolete slot + cur.execute("select pg_create_logical_replication_slot('stale_slot', 'pgoutput');") + assert ( + endpoint.safe_psql( + "select count(*) from pg_replication_slots where slot_name = 'stale_slot'" + )[0][0] + == 1 + ) + + # now insert some data and create and start live subscriber to create more .snap files + # (in most cases this is not needed as stale_slot snap will have higher LSN than restart_lsn anyway) + cur.execute("create table t(pk integer primary key, payload integer)") + cur.execute("create publication pub1 for table t") + + vanilla_pg.start() + vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)") + connstr = endpoint.connstr().replace("'", "''") + log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") + + wait_until(number_of_iterations=10, interval=2, func=partial(slot_removed, endpoint)) + + +def test_ondemand_wal_download_in_replication_slot_funcs(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("init") + endpoint = env.endpoints.create_start("init") + + with endpoint.connect().cursor() as cur: + cur.execute("create table wal_generator 
(id serial primary key, data text)") + cur.execute( + "SELECT * FROM pg_create_logical_replication_slot('slotty_mcslotface', 'test_decoding')" + ) + cur.execute( + """ +INSERT INTO wal_generator (data) +SELECT repeat('A', 1024) -- Generates a kilobyte of data per row +FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of data +""" + ) + + endpoint.stop_and_destroy() + endpoint = env.endpoints.create_start("init") + + with endpoint.connect().cursor() as cur: + cur.execute( + "SELECT * FROM pg_logical_slot_peek_binary_changes('slotty_mcslotface', NULL, NULL, 'include-xids', '0')" + ) + # do the peek second time: we've had a bug using wrong memory context + # for NeonWALReader leading to the crash in this case. + log.info("peek_changes again") + cur.execute( + "SELECT * FROM pg_logical_slot_peek_binary_changes('slotty_mcslotface', NULL, NULL, 'include-xids', '0')" + ) + cur.execute( + """ +INSERT INTO wal_generator (data) +SELECT repeat('A', 1024) -- Generates a kilobyte of data per row +FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of data +""" + ) + + endpoint.stop_and_destroy() + endpoint = env.endpoints.create_start("init") + with endpoint.connect().cursor() as cur: + log.info("advance slot") + cur.execute( + "SELECT * from pg_replication_slot_advance('slotty_mcslotface', pg_current_wal_lsn())" + ) + + +# Tests that walsender correctly blocks until WAL is downloaded from safekeepers +def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("init") + endpoint = env.endpoints.create_start("init") + + with endpoint.connect().cursor() as cur: + cur.execute("create table wal_generator (id serial primary key, data text)") + cur.execute( + """ +INSERT INTO wal_generator (data) +SELECT repeat('A', 1024) -- Generates a kilobyte of data per row +FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of data +""" + ) + cur.execute("create table t(a int)") + cur.execute("create publication pub for table t") + cur.execute("insert into t values (1)") + + vanilla_pg.start() + vanilla_pg.safe_psql("create table t(a int)") + connstr = endpoint.connstr().replace("'", "''") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub") + logical_replication_sync(vanilla_pg, endpoint) + + vanilla_pg.stop() + + # Pause the safekeepers so that they can't send WAL (except to pageserver) + for sk in env.safekeepers: + sk_http = sk.http_client() + sk_http.configure_failpoints([("sk-pause-send", "return")]) + + # Insert a 2 + with endpoint.connect().cursor() as cur: + cur.execute("insert into t values (2)") + + endpoint.stop_and_destroy() + + # This new endpoint should contain [1, 2], but it can't access WAL from safekeeper + endpoint = env.endpoints.create_start("init") + with endpoint.connect().cursor() as cur: + cur.execute("select * from t") + res = [r[0] for r in cur.fetchall()] + assert res == [1, 2] + + # Reconnect subscriber + vanilla_pg.start() + connstr = endpoint.connstr().replace("'", "''") + vanilla_pg.safe_psql(f"alter subscription sub1 connection '{connstr}'") + + time.sleep(5) + # Make sure the 2 isn't replicated + assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1] + + # Re-enable WAL download + for sk in env.safekeepers: + sk_http = sk.http_client() + sk_http.configure_failpoints([("sk-pause-send", "off")]) + + 
logical_replication_sync(vanilla_pg, endpoint) + assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2] + + # Check that local reads also work + with endpoint.connect().cursor() as cur: + cur.execute("insert into t values (3)") + logical_replication_sync(vanilla_pg, endpoint) + assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2, 3] + + log_path = vanilla_pg.pgdatadir / "pg.log" + with open(log_path, "r") as log_file: + logs = log_file.read() + assert "could not receive data from WAL stream" not in logs + + +# Test replication of WAL record spanning page boundary (with contrecord) after +# compute restart and WAL write of the page. +# +# See https://github.com/neondatabase/neon/issues/5749 +# +# Most pages start with a contrecord, so we don't do anything special +# to ensure that. +@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) +def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env env.neon_cli.create_branch("init") @@ -179,52 +377,6 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): logical_replication_sync(vanilla_pg, endpoint) vanilla_pg.stop() - with endpoint.cursor() as cur: - # measure how much space logical message takes. Sometimes first attempt - # creates huge message and then it stabilizes, have no idea why. - for _ in range(3): - lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"current_lsn={lsn_before}") - # Non-transactional logical message doesn't write WAL, only XLogInsert's - # it, so use transactional. Which is a bit problematic as transactional - # necessitates commit record. Alternatively we can do smth like - # select neon_xlogflush(pg_current_wal_insert_lsn()); - # but isn't much better + that particular call complains on 'xlog flush - # request 0/282C018 is not satisfied' as pg_current_wal_insert_lsn skips - # page headers. - payload = "blahblah" - cur.execute(f"select pg_logical_emit_message(true, 'pref', '{payload}')") - lsn_after_by_curr_wal_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - lsn_diff = lsn_after_by_curr_wal_lsn - lsn_before - logical_message_base = lsn_after_by_curr_wal_lsn - lsn_before - len(payload) - log.info( - f"before {lsn_before}, after {lsn_after_by_curr_wal_lsn}, lsn diff is {lsn_diff}, base {logical_message_base}" - ) - - # and write logical message spanning exactly as we want - lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"current_lsn={lsn_before}") - curr_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - offs = int(curr_lsn) % 8192 - till_page = 8192 - offs - payload_len = ( - till_page - logical_message_base - 8 - ) # not sure why 8 is here, it is deduced from experiments - log.info(f"current_lsn={curr_lsn}, offs {offs}, till_page {till_page}") - - # payload_len above would go exactly till the page boundary; but we want contrecord, so make it slightly longer - payload_len += 8 - - cur.execute(f"select pg_logical_emit_message(true, 'pref', 'f{'a' * payload_len}')") - supposedly_contrecord_end = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"supposedly_page_boundary={supposedly_contrecord_end}") - # The calculations to hit the page boundary are very fuzzy, so just - # ignore test if we fail to reach it. 
-        if not (int(supposedly_contrecord_end) % 8192 == 32):
-            pytest.skip("missed page boundary, bad luck")
-
-        cur.execute("insert into replication_example values (2, 3)")
-
     wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
     endpoint.stop().start()
@@ -238,6 +390,58 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg):
     ) == endpoint.safe_psql("select sum(somedata) from replication_example")
 
 
+# Test that WAL redo works for fairly large records.
+#
+# See https://github.com/neondatabase/neon/pull/6534. That wasn't a
+# logical replication bug as such, but without logical replication,
+# records passed to the WAL redo process are never large enough to hit
+# the bug.
+@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
+def test_large_records(neon_simple_env: NeonEnv, vanilla_pg):
+    env = neon_simple_env
+
+    env.neon_cli.create_branch("init")
+    endpoint = env.endpoints.create_start("init")
+
+    cur = endpoint.connect().cursor()
+    cur.execute("CREATE TABLE reptbl(id int, largeval text);")
+    cur.execute("alter table reptbl replica identity full")
+    cur.execute("create publication pub1 for table reptbl")
+
+    # now start subscriber
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("CREATE TABLE reptbl(id int, largeval text);")
+
+    log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
+    connstr = endpoint.connstr().replace("'", "''")
+    vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
+
+    # Test simple insert, update, delete. But with very large values
+    value = random_string(10_000_000)
+    cur.execute(f"INSERT INTO reptbl VALUES (1, '{value}')")
+    logical_replication_sync(vanilla_pg, endpoint)
+    assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(1, value)]
+
+    # Test delete, and reinsert another value
+    cur.execute("DELETE FROM reptbl WHERE id = 1")
+    cur.execute(f"INSERT INTO reptbl VALUES (2, '{value}')")
+    logical_replication_sync(vanilla_pg, endpoint)
+    assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)]
+
+    value = random_string(10_000_000)
+    cur.execute(f"UPDATE reptbl SET largeval='{value}'")
+    logical_replication_sync(vanilla_pg, endpoint)
+    assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)]
+
+    endpoint.stop()
+    endpoint.start()
+    cur = endpoint.connect().cursor()
+    value = random_string(10_000_000)
+    cur.execute(f"UPDATE reptbl SET largeval='{value}'")
+    logical_replication_sync(vanilla_pg, endpoint)
+    assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)]
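+
+
+# (For reference: a single oversized WAL record can also be produced without
+# any logical replication setup. A sketch using the stock
+# pg_logical_emit_message() function, not what this test does:
+#
+#   cur.execute("select pg_logical_emit_message(false, 'test', repeat('x', 20 * 1024 * 1024))")
+# )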
 #
 # Check that slots are not inherited in brnach
 #
@@ -258,8 +462,159 @@ def test_slots_and_branching(neon_simple_env: NeonEnv):
     # Create branch ws.
     env.neon_cli.create_branch("ws", "main", tenant_id=tenant)
     ws_branch = env.endpoints.create_start("ws", tenant_id=tenant)
-    log.info("postgres is running on 'ws' branch")
 
     # Check that we can create slot with the same name
     ws_cur = ws_branch.connect().cursor()
     ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')")
+
+
+@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
+def test_replication_shutdown(neon_simple_env: NeonEnv):
+    # Ensure Postgres can exit without getting stuck when a replication job is
+    # active + the neon extension is installed
+    env = neon_simple_env
+    env.neon_cli.create_branch("test_replication_shutdown_publisher", "empty")
+    pub = env.endpoints.create("test_replication_shutdown_publisher")
+
+    env.neon_cli.create_branch("test_replication_shutdown_subscriber")
+    sub = env.endpoints.create("test_replication_shutdown_subscriber")
+
+    pub.respec(skip_pg_catalog_updates=False)
+    pub.start()
+
+    sub.respec(skip_pg_catalog_updates=False)
+    sub.start()
+
+    pub.wait_for_migrations()
+    sub.wait_for_migrations()
+
+    with pub.cursor() as cur:
+        cur.execute(
+            "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser"
+        )
+        cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers")
+        cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser")
+
+    # If we don't do this, creating the subscription will fail later on PG16
+    pub.edit_hba(["host all mr_whiskers 0.0.0.0/0 md5"])
+
+    with sub.cursor() as cur:
+        cur.execute(
+            "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser"
+        )
+        cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers")
+        cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser")
+
+    with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur:
+        cur.execute("CREATE PUBLICATION pub FOR ALL TABLES")
+        cur.execute("CREATE TABLE t (a int)")
+        cur.execute("INSERT INTO t VALUES (10), (20)")
+        cur.execute("SELECT * from t")
+        res = cur.fetchall()
+        assert [r[0] for r in res] == [10, 20]
+
+    with sub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur:
+        cur.execute("CREATE TABLE t (a int)")
+
+        pub_conn = f"host=localhost port={pub.pg_port} dbname=neondb user=mr_whiskers password=cat"
+        query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub"
+        log.info(f"Creating subscription: {query}")
+        cur.execute(query)
+
+        with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as pcur:
+            pcur.execute("INSERT INTO t VALUES (30), (40)")
+
+        def check_that_changes_propagated():
+            cur.execute("SELECT * FROM t")
+            res = cur.fetchall()
+            log.info(res)
+            assert len(res) == 4
+            assert [r[0] for r in res] == [10, 20, 30, 40]
+
+        wait_until(10, 0.5, check_that_changes_propagated)
+
+
+def logical_replication_wait_flush_lsn_sync(publisher: PgProtocol) -> Lsn:
+    """
+    Wait for the flush_lsn reported by the logical replication subscriber to
+    reach pg_current_wal_flush_lsn on the publisher. Note that this is somewhat
+    unreliable, because for some WAL records (like vacuum) the subscriber won't
+    get any data at all.
+ """ + publisher_flush_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + + def check_caughtup(): + res = publisher.safe_psql( + """ +select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication sr, pg_replication_slots s + where s.active_pid = sr.pid and s.slot_type = 'logical'; + """ + )[0] + sent_lsn, flush_lsn, curr_publisher_flush_lsn = Lsn(res[0]), Lsn(res[1]), Lsn(res[2]) + log.info( + f"sent_lsn={sent_lsn}, flush_lsn={flush_lsn}, publisher_flush_lsn={curr_publisher_flush_lsn}, waiting flush_lsn to reach {publisher_flush_lsn}" + ) + assert flush_lsn >= publisher_flush_lsn + + wait_until(30, 0.5, check_caughtup) + return publisher_flush_lsn + + +# Test that subscriber takes into account quorum committed flush_lsn in +# flush_lsn reporting to publisher. Without this, it may ack too far, losing +# data on restart because publisher advances START_REPLICATION position to the +# confirmed_flush_lsn of the slot. +def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): + env = neon_simple_env + # use vanilla as publisher to allow writes on it when safekeeper is down + vanilla_pg.configure( + [ + "wal_level = 'logical'", + # neon fork uses custom WAL records which won't work without extension installed with obscure + # ERROR: resource manager with ID 134 not registered + # error. + "shared_preload_libraries = 'neon'", + ] + ) + vanilla_pg.start() + vanilla_pg.safe_psql("create extension neon;") + + env.neon_cli.create_branch("subscriber") + sub = env.endpoints.create("subscriber") + sub.start() + + with vanilla_pg.cursor() as pcur: + with sub.cursor() as scur: + pcur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + pcur.execute("CREATE PUBLICATION pub FOR TABLE t") + scur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + + pub_connstr = vanilla_pg.connstr().replace("'", "''") + log.info(f"pub connstr is {pub_connstr}, subscriber connstr {sub.connstr()}") + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_connstr}' PUBLICATION pub with (synchronous_commit=off)" + scur.execute(query) + time.sleep(2) # let initial table sync complete + + # stop safekeeper so it won't get any data + for sk in env.safekeepers: + sk.stop() + # and insert to publisher + with vanilla_pg.cursor() as pcur: + for i in range(0, 1000): + pcur.execute("INSERT into t values (%s, random()*100000)", (i,)) + # wait until sub receives all data + logical_replication_sync(sub, vanilla_pg) + # Update confirmed_flush_lsn of the slot. If subscriber ack'ed recevied data + # as flushed we'll now lose it if subscriber restars. That's why + # logical_replication_wait_flush_lsn_sync is expected to hang while + # safekeeper is down. 
+ vanilla_pg.safe_psql("checkpoint;") + assert sub.safe_psql_scalar("SELECT count(*) FROM t") == 1000 + + # restart subscriber and ensure it can catch up lost tail again + sub.stop(mode="immediate") + for sk in env.safekeepers: + sk.start() + sub.start() + log.info("waiting for sync after restart") + logical_replication_wait_flush_lsn_sync(vanilla_pg) + assert sub.safe_psql_scalar("SELECT count(*) FROM t") == 1000 diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 65d6d7a9fd..67e82f8d30 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -1,17 +1,35 @@ +import re import time +from concurrent.futures import ThreadPoolExecutor from datetime import datetime, timedelta, timezone +import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.pageserver.http import PageserverApiException -from fixtures.types import Lsn -from fixtures.utils import query_scalar +from fixtures.utils import query_scalar, wait_until +from requests.exceptions import ReadTimeout -# -# Test pageserver get_lsn_by_timestamp API -# -def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): +def assert_lsn_lease_granted(result, with_lease: bool): + """ + Asserts an LSN lease is granted when `with_lease` flag is turned on. + Always asserts no LSN lease is granted when `with_lease` flag is off. + """ + if with_lease: + assert result.get("valid_until") + else: + assert result.get("valid_until") is None + + +@pytest.mark.parametrize("with_lease", [True, False]) +def test_lsn_mapping(neon_env_builder: NeonEnvBuilder, with_lease: bool): + """ + Test pageserver get_lsn_by_timestamp API. + + :param with_lease: Whether to get a lease associated with returned LSN. 
+ """ env = neon_env_builder.init_start() tenant_id, _ = env.neon_cli.create_tenant( @@ -28,7 +46,6 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): timeline_id = env.neon_cli.create_branch("test_lsn_mapping", tenant_id=tenant_id) endpoint_main = env.endpoints.create_start("test_lsn_mapping", tenant_id=tenant_id) timeline_id = endpoint_main.safe_psql("show neon.timeline_id")[0][0] - log.info("postgres is running on 'main' branch") cur = endpoint_main.connect().cursor() @@ -65,18 +82,21 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Timestamp is in the future probe_timestamp = tbl[-1][1] + timedelta(hours=1) result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" + tenant_id, timeline_id, probe_timestamp, with_lease=with_lease ) assert result["kind"] == "future" + assert_lsn_lease_granted(result, with_lease) # make sure that we return a well advanced lsn here assert Lsn(result["lsn"]) > start_lsn # Timestamp is in the unreachable past probe_timestamp = tbl[0][1] - timedelta(hours=10) result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" + tenant_id, timeline_id, probe_timestamp, with_lease=with_lease ) assert result["kind"] == "past" + assert_lsn_lease_granted(result, with_lease) + # make sure that we return the minimum lsn here at the start of the range assert Lsn(result["lsn"]) < start_lsn @@ -84,9 +104,10 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): for i in range(1, len(tbl), 100): probe_timestamp = tbl[i][1] result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" + tenant_id, timeline_id, probe_timestamp, with_lease=with_lease ) assert result["kind"] not in ["past", "nodata"] + assert_lsn_lease_granted(result, with_lease) lsn = result["lsn"] # Call get_lsn_by_timestamp to get the LSN # Launch a new read-only node at that LSN, and check that only the rows @@ -109,28 +130,76 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Timestamp is in the unreachable past probe_timestamp = tbl[0][1] - timedelta(hours=10) result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id_child, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id_child, probe_timestamp, with_lease=with_lease ) assert result["kind"] == "past" + assert_lsn_lease_granted(result, with_lease) # make sure that we return the minimum lsn here at the start of the range assert Lsn(result["lsn"]) >= last_flush_lsn +def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder): + """ + Test if cancelled pageserver get_lsn_by_timestamp request is correctly handled. + Added as an effort to improve error handling and avoid full anyhow backtrace. 
+ """ + + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend( + [ + ".*request was dropped before completing.*", + ".*Cancelled request finished with an error: Cancelled", + ] + ) + + client = env.pageserver.http_client() + failpoint = "find-lsn-for-timestamp-pausable" + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + # Request get_lsn_by_timestamp, hit the pausable failpoint + failing = exec.submit( + client.timeline_get_lsn_by_timestamp, + env.initial_tenant, + env.initial_timeline, + datetime.now(), + timeout=2, + ) + + _, offset = wait_until( + 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + ) + + with pytest.raises(ReadTimeout): + failing.result() + + client.configure_failpoints((failpoint, "off")) + + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains( + "Cancelled request finished with an error: Cancelled$", offset + ), + ) + + # Test pageserver get_timestamp_of_lsn API def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): + key_not_found_error = r".*could not find data for key.*" + env = neon_env_builder.init_start() new_timeline_id = env.neon_cli.create_branch("test_ts_of_lsn_api") endpoint_main = env.endpoints.create_start("test_ts_of_lsn_api") - log.info("postgres is running on 'test_ts_of_lsn_api' branch") cur = endpoint_main.connect().cursor() # Create table, and insert rows, each in a separate transaction - # Disable synchronous_commit to make this initialization go faster. + # Enable synchronous commit as we are timing sensitive # # Each row contains current insert LSN and the current timestamp, when # the row was inserted. - cur.execute("SET synchronous_commit=off") + cur.execute("SET synchronous_commit=on") cur.execute("CREATE TABLE foo (x integer)") tbl = [] for i in range(1000): @@ -139,7 +208,7 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=timezone.utc) after_lsn = query_scalar(cur, "SELECT pg_current_wal_lsn()") tbl.append([i, after_timestamp, after_lsn]) - time.sleep(0.005) + time.sleep(0.02) # Execute one more transaction with synchronous_commit enabled, to flush # all the previous transactions @@ -187,8 +256,8 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): raise RuntimeError("there should have been an 'could not find data for key' error") except PageserverApiException as error: assert error.status_code == 500 - assert str(error).startswith("could not find data for key") - env.pageserver.allowed_errors.append(".*could not find data for key.*") + assert re.match(key_not_found_error, str(error)) + env.pageserver.allowed_errors.append(key_not_found_error) # Probe a bunch of timestamps in the valid range step_size = 100 diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 121fa91f66..bdc5ca907e 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -1,6 +1,10 @@ -import time +from __future__ import annotations -from fixtures.neon_fixtures import NeonEnv +import time +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv def test_migrations(neon_simple_env: NeonEnv): @@ -8,30 +12,23 @@ def test_migrations(neon_simple_env: NeonEnv): env.neon_cli.create_branch("test_migrations", "empty") endpoint = env.endpoints.create("test_migrations") - log_path = endpoint.endpoint_path() / 
"compute.log" - - endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"]) + endpoint.respec(skip_pg_catalog_updates=False) endpoint.start() - time.sleep(1) # Sleep to let migrations run + num_migrations = 10 + endpoint.wait_for_migrations(num_migrations=num_migrations) with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") migration_id = cur.fetchall() - assert migration_id[0][0] == 2 - - with open(log_path, "r") as log_file: - logs = log_file.read() - assert "INFO handle_migrations: Ran 2 migrations" in logs + assert migration_id[0][0] == num_migrations endpoint.stop() endpoint.start() - time.sleep(1) # Sleep to let migrations run + # We don't have a good way of knowing that the migrations code path finished executing + # in compute_ctl in the case that no migrations are being run + time.sleep(1) with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") migration_id = cur.fetchall() - assert migration_id[0][0] == 2 - - with open(log_path, "r") as log_file: - logs = log_file.read() - assert "INFO handle_migrations: Ran 0 migrations" in logs + assert migration_id[0][0] == num_migrations diff --git a/test_runner/regress/test_multixact.py b/test_runner/regress/test_multixact.py index 9db463dc4a..88f7a5db59 100644 --- a/test_runner/regress/test_multixact.py +++ b/test_runner/regress/test_multixact.py @@ -1,4 +1,3 @@ -from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.utils import query_scalar @@ -18,7 +17,6 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): env.neon_cli.create_branch("test_multixact", "empty") endpoint = env.endpoints.create_start("test_multixact") - log.info("postgres is running on 'test_multixact' branch") cur = endpoint.connect().cursor() cur.execute( """ @@ -78,7 +76,6 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): env.neon_cli.create_branch("test_multixact_new", "test_multixact", ancestor_start_lsn=lsn) endpoint_new = env.endpoints.create_start("test_multixact_new") - log.info("postgres is running on 'test_multixact_new' branch") next_multixact_id_new = endpoint_new.safe_psql( "SELECT next_multixact_id FROM pg_control_checkpoint()" )[0][0] diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index 16d120e24a..ba170cfb4c 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -5,6 +5,7 @@ from typing import cast import pytest import requests +from fixtures.common_types import TenantId, TimelineId from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, @@ -13,7 +14,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pg_version import PgVersion, skip_on_postgres -from fixtures.types import TenantId, TimelineId def helper_compare_timeline_list( @@ -133,7 +133,7 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder): # Stop default ps/sk env.neon_cli.pageserver_stop(env.pageserver.id) env.neon_cli.safekeeper_stop() - env.neon_cli.attachment_service_stop(False) + env.neon_cli.storage_controller_stop(False) # Keep NeonEnv state up to date, it usually owns starting/stopping services env.pageserver.running = False @@ -175,7 +175,7 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder): env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 2) # Stop this to get out of the way of the following `start` - 
env.neon_cli.attachment_service_stop(False) + env.neon_cli.storage_controller_stop(False) # Default start res = env.neon_cli.raw_cli(["start"]) diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 998f84f968..bb844244e3 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -1,3 +1,4 @@ +import time from contextlib import closing from fixtures.log_helper import log @@ -14,8 +15,6 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): endpoint_main.respec(skip_pg_catalog_updates=False) endpoint_main.start() - log.info("postgres is running on 'test_create_extension_neon' branch") - with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: cur.execute("SELECT extversion from pg_extension where extname='neon'") @@ -25,4 +24,73 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.1",) + assert cur.fetchone() == ("1.4",) + cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") + res = cur.fetchall() + log.info(res) + assert len(res) == 1 + assert len(res[0]) == 5 + + +# Verify that the neon extension can be upgraded/downgraded. +def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_neon_extension_compatibility") + + endpoint_main = env.endpoints.create("test_neon_extension_compatibility") + # don't skip pg_catalog updates - it runs CREATE EXTENSION neon + endpoint_main.respec(skip_pg_catalog_updates=False) + endpoint_main.start() + + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SELECT extversion from pg_extension where extname='neon'") + # IMPORTANT: + # If the version has changed, the test should be updated. + # Ensure that the default version is also updated in the neon.control file + assert cur.fetchone() == ("1.4",) + cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") + all_versions = ["1.4", "1.3", "1.2", "1.1", "1.0"] + current_version = "1.4" + for idx, begin_version in enumerate(all_versions): + for target_version in all_versions[idx + 1 :]: + if current_version != begin_version: + cur.execute( + f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {current_version}->{begin_version}" + ) + current_version = begin_version + # downgrade + cur.execute( + f"ALTER EXTENSION neon UPDATE TO '{target_version}'; -- {begin_version}->{target_version}" + ) + # upgrade + cur.execute( + f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {target_version}->{begin_version}" + ) + + +# Verify that the neon extension can be auto-upgraded to the latest version. 
+def test_neon_extension_auto_upgrade(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_neon_extension_auto_upgrade") + + endpoint_main = env.endpoints.create("test_neon_extension_auto_upgrade") + # don't skip pg_catalog updates - it runs CREATE EXTENSION neon + endpoint_main.respec(skip_pg_catalog_updates=False) + endpoint_main.start() + + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + cur.execute("ALTER EXTENSION neon UPDATE TO '1.0';") + cur.execute("SELECT extversion from pg_extension where extname='neon'") + assert cur.fetchone() == ("1.0",) # Ensure the extension gets downgraded + + endpoint_main.stop() + time.sleep(1) + endpoint_main.start() + time.sleep(1) + + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SELECT extversion from pg_extension where extname='neon'") + assert cur.fetchone() != ("1.0",) # Ensure the extension gets upgraded diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index 46b72fbca5..8edba49b8a 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -59,3 +59,5 @@ def test_neon_two_primary_endpoints_fail( env.neon_cli.endpoint_stop("ep1") # ep1 is stopped so create ep2 will succeed env.neon_cli.endpoint_start("ep2") + # cleanup + env.neon_cli.endpoint_stop("ep2") diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index 6be7c114cb..fd31df84da 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -1,26 +1,44 @@ -import time - +from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.pg_version import PgVersion +from fixtures.utils import wait_until def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): env = neon_simple_env - env.neon_cli.create_branch("test_neon_superuser", "empty") - endpoint = env.endpoints.create("test_neon_superuser") - endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"]) - endpoint.start() + env.neon_cli.create_branch("test_neon_superuser_publisher", "empty") + pub = env.endpoints.create("test_neon_superuser_publisher") - time.sleep(1) # Sleep to let migrations run + env.neon_cli.create_branch("test_neon_superuser_subscriber") + sub = env.endpoints.create("test_neon_superuser_subscriber") - with endpoint.cursor() as cur: + pub.respec(skip_pg_catalog_updates=False) + pub.start() + + sub.respec(skip_pg_catalog_updates=False) + sub.start() + + pub.wait_for_migrations() + sub.wait_for_migrations() + + with pub.cursor() as cur: cur.execute( "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser" ) cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers") cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser") - with endpoint.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: + # If we don't do this, creating the subscription will fail later on PG16 + pub.edit_hba(["host all mr_whiskers 0.0.0.0/0 md5"]) + + with sub.cursor() as cur: + cur.execute( + "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser" + ) + cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers") + cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser") + + with 
pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'member')") assert cur.fetchall()[0][0] cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'usage')") @@ -32,3 +50,50 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): cur.execute("CREATE PUBLICATION pub FOR ALL TABLES") cur.execute("CREATE ROLE definitely_not_a_superuser WITH PASSWORD 'nope'") + cur.execute("CREATE DATABASE definitely_a_database") + cur.execute("CREATE TABLE t (a int)") + cur.execute("INSERT INTO t VALUES (10), (20)") + cur.execute("SELECT * from t") + res = cur.fetchall() + assert [r[0] for r in res] == [10, 20] + + with sub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: + cur.execute("CREATE TABLE t (a int)") + + pub_conn = f"host=localhost port={pub.pg_port} dbname=neondb user=mr_whiskers password=cat" + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub" + log.info(f"Creating subscription: {query}") + cur.execute(query) + + with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as pcur: + pcur.execute("INSERT INTO t VALUES (30), (40)") + + def check_that_changes_propagated(): + cur.execute("SELECT * FROM t") + res = cur.fetchall() + log.info(res) + assert len(res) == 4 + assert [r[0] for r in res] == [10, 20, 30, 40] + + wait_until(10, 0.5, check_that_changes_propagated) + + # Test that pg_monitor is working for neon_superuser role + cur.execute("SELECT query from pg_stat_activity LIMIT 1") + assert cur.fetchall()[0][0] != "" + # Test that pg_monitor is not working for non neon_superuser role without grant + cur.execute("CREATE ROLE not_a_superuser LOGIN PASSWORD 'Password42!'") + cur.execute("GRANT not_a_superuser TO neon_superuser WITH ADMIN OPTION") + cur.execute("SET ROLE not_a_superuser") + cur.execute("SELECT query from pg_stat_activity LIMIT 1") + assert cur.fetchall()[0][0] == "" + cur.execute("RESET ROLE") + # Test that pg_monitor is working for non neon_superuser role with grant + cur.execute("GRANT pg_monitor TO not_a_superuser") + cur.execute("SET ROLE not_a_superuser") + cur.execute("SELECT query from pg_stat_activity LIMIT 1") + assert cur.fetchall()[0][0] != "" + cur.execute("RESET ROLE") + cur.execute("DROP ROLE not_a_superuser") + query = "DROP SUBSCRIPTION sub CASCADE" + log.info(f"Dropping subscription: {query}") + cur.execute(query) diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index da2580dbf9..51e847135e 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -1,15 +1,17 @@ -import json import os import time from pathlib import Path +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_wal_insert_lsn -from fixtures.pageserver.utils import ( - wait_for_last_record_lsn, +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + VanillaPostgres, + import_timeline_from_vanilla_postgres, + wait_for_wal_insert_lsn, ) from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar @@ -71,22 +73,16 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder): def test_import_at_2bil( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - pg_distrib_dir: Path, - pg_bin, + pg_bin: PgBin, vanilla_pg, ): 
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() - ps_http = env.pageserver.http_client() - - # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. - # PgBin sets it automatically, but here we need to pipe psql output to the tar command. - psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} # Reset the vanilla Postgres instance to somewhat before 2 billion transactions. pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") cmd = [pg_resetwal_path, "--next-transaction-id=2129920000", "-D", str(vanilla_pg.pgdatadir)] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.run_capture(cmd) vanilla_pg.start() vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") @@ -97,68 +93,28 @@ def test_import_at_2bil( assert vanilla_pg.safe_psql("select count(*) from tt") == [(300000,)] vanilla_pg.safe_psql("CREATE TABLE t (t text);") vanilla_pg.safe_psql("INSERT INTO t VALUES ('inserted in vanilla')") - - endpoint_id = "ep-import_from_vanilla" - tenant = TenantId.generate() - timeline = TimelineId.generate() - - env.pageserver.tenant_create(tenant) - - # Take basebackup - basebackup_dir = os.path.join(test_output_dir, "basebackup") - base_tar = os.path.join(basebackup_dir, "base.tar") - wal_tar = os.path.join(basebackup_dir, "pg_wal.tar") - os.mkdir(basebackup_dir) vanilla_pg.safe_psql("CHECKPOINT") - pg_bin.run( - [ - "pg_basebackup", - "-F", - "tar", - "-d", - vanilla_pg.connstr(), - "-D", - basebackup_dir, - ] + + tenant_id = TenantId.generate() + env.pageserver.tenant_create(tenant_id) + timeline_id = TimelineId.generate() + + # Import the cluster to Neon + import_timeline_from_vanilla_postgres( + test_output_dir, + env, + pg_bin, + tenant_id, + timeline_id, + "imported_2bil_xids", + vanilla_pg.connstr(), ) + vanilla_pg.stop() # don't need the original server anymore - # Get start_lsn and end_lsn - with open(os.path.join(basebackup_dir, "backup_manifest")) as f: - manifest = json.load(f) - start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"] - end_lsn = manifest["WAL-Ranges"][0]["End-LSN"] - - def import_tar(base, wal): - env.neon_cli.raw_cli( - [ - "timeline", - "import", - "--tenant-id", - str(tenant), - "--timeline-id", - str(timeline), - "--node-name", - endpoint_id, - "--base-lsn", - start_lsn, - "--base-tarfile", - base, - "--end-lsn", - end_lsn, - "--wal-tarfile", - wal, - "--pg-version", - env.pg_version, - ] - ) - - # Importing correct backup works - import_tar(base_tar, wal_tar) - wait_for_last_record_lsn(ps_http, tenant, timeline, Lsn(end_lsn)) - + # Check that it works endpoint = env.endpoints.create_start( - endpoint_id, - tenant_id=tenant, + "imported_2bil_xids", + tenant_id=tenant_id, config_lines=[ "log_autovacuum_min_duration = 0", "autovacuum_naptime='5 s'", @@ -166,7 +122,6 @@ def test_import_at_2bil( ) assert endpoint.safe_psql("select count(*) from t") == [(1,)] - # Ok, consume conn = endpoint.connect() cur = conn.cursor() @@ -203,12 +158,22 @@ def test_import_at_2bil( $$; """ ) + + # Also create a multi-XID with members past the 2 billion mark + conn2 = endpoint.connect() + cur2 = conn2.cursor() + cur.execute("INSERT INTO t VALUES ('x')") + cur.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;") + cur2.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;") + cur.execute("COMMIT") + cur2.execute("COMMIT") + # A checkpoint writes a WAL record with xl_xid=0. Many other WAL # records would have the same effect. 
cur.execute("checkpoint") # wait until pageserver receives that data - wait_for_wal_insert_lsn(env, endpoint, tenant, timeline) + wait_for_wal_insert_lsn(env, endpoint, tenant_id, timeline_id) # Restart endpoint endpoint.stop() @@ -217,4 +182,276 @@ def test_import_at_2bil( conn = endpoint.connect() cur = conn.cursor() cur.execute("SELECT count(*) from t") - assert cur.fetchone() == (10000 + 1,) + assert cur.fetchone() == (10000 + 1 + 1,) + + +# Constants and macros copied from PostgreSQL multixact.c and headers. These are needed to +# calculate the SLRU segments that a particular multixid or multixid-offsets falls into. +BLCKSZ = 8192 +MULTIXACT_OFFSETS_PER_PAGE = int(BLCKSZ / 4) +SLRU_PAGES_PER_SEGMENT = int(32) +MXACT_MEMBER_BITS_PER_XACT = 8 +MXACT_MEMBER_FLAGS_PER_BYTE = 1 +MULTIXACT_FLAGBYTES_PER_GROUP = 4 +MULTIXACT_MEMBERS_PER_MEMBERGROUP = MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE +MULTIXACT_MEMBERGROUP_SIZE = 4 * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP +MULTIXACT_MEMBERGROUPS_PER_PAGE = int(BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +MULTIXACT_MEMBERS_PER_PAGE = MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP + + +def MultiXactIdToOffsetSegment(xid: int): + return int(xid / (SLRU_PAGES_PER_SEGMENT * MULTIXACT_OFFSETS_PER_PAGE)) + + +def MXOffsetToMemberSegment(off: int): + return int(off / (SLRU_PAGES_PER_SEGMENT * MULTIXACT_MEMBERS_PER_PAGE)) + + +def advance_multixid_to( + pg_bin: PgBin, vanilla_pg: VanillaPostgres, next_multi_xid: int, next_multi_offset: int +): + """ + Use pg_resetwal to advance the nextMulti and nextMultiOffset values in a stand-alone + Postgres cluster. This is useful to get close to wraparound or some other interesting + value, without having to burn a lot of time consuming the (multi-)XIDs one by one. + + The new values should be higher than the old ones, in a wraparound-aware sense. + + On entry, the server should be running. It will be shut down and restarted. + """ + + # Read old values from the last checkpoint. We will pass the old oldestMultiXid value + # back to pg_resetwal, there's no option to leave it alone. + with vanilla_pg.connect() as conn: + with conn.cursor() as cur: + # Make sure the oldest-multi-xid value in the control file is up-to-date + cur.execute("checkpoint") + cur.execute("select oldest_multi_xid, next_multixact_id from pg_control_checkpoint()") + rec = cur.fetchone() + assert rec is not None + (ckpt_oldest_multi_xid, ckpt_next_multi_xid) = rec + log.info(f"oldestMultiXid was {ckpt_oldest_multi_xid}, nextMultiXid was {ckpt_next_multi_xid}") + log.info(f"Resetting to {next_multi_xid}") + + # Use pg_resetwal to reset the next multiXid and multiOffset to given values. + vanilla_pg.stop() + pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") + cmd = [ + pg_resetwal_path, + f"--multixact-ids={next_multi_xid},{ckpt_oldest_multi_xid}", + f"--multixact-offset={next_multi_offset}", + "-D", + str(vanilla_pg.pgdatadir), + ] + pg_bin.run_capture(cmd) + + # Because we skip over a lot of values, Postgres hasn't created the SLRU segments for + # the new values yet. Create them manually, to allow Postgres to start up. + # + # This leaves "gaps" in the SLRU where segments between old value and new value are + # missing. That's OK for our purposes. Autovacuum will print some warnings about the + # missing segments, but will clean it up by truncating the SLRUs up to the new value, + # closing the gap. 
+ segname = "%04X" % MultiXactIdToOffsetSegment(next_multi_xid) + log.info(f"Creating dummy segment pg_multixact/offsets/{segname}") + with open(vanilla_pg.pgdatadir / "pg_multixact" / "offsets" / segname, "w") as of: + of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ) + of.flush() + + segname = "%04X" % MXOffsetToMemberSegment(next_multi_offset) + log.info(f"Creating dummy segment pg_multixact/members/{segname}") + with open(vanilla_pg.pgdatadir / "pg_multixact" / "members" / segname, "w") as of: + of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ) + of.flush() + + # Start Postgres again and wait until autovacuum has processed all the databases + # + # This allows truncating the SLRUs, fixing the gaps with missing segments. + vanilla_pg.start() + with vanilla_pg.connect().cursor() as cur: + for _ in range(1000): + datminmxid = int( + query_scalar(cur, "select min(datminmxid::text::int8) from pg_database") + ) + log.info(f"datminmxid {datminmxid}") + if next_multi_xid - datminmxid < 1_000_000: # not wraparound-aware! + break + time.sleep(0.5) + + +def test_multixid_wraparound_import( + neon_env_builder: NeonEnvBuilder, + test_output_dir: Path, + pg_bin: PgBin, + vanilla_pg, +): + """ + Test that the wraparound of the "next-multi-xid" counter is handled correctly in + pageserver, And multi-offsets as well + """ + env = neon_env_builder.init_start() + + # In order to to test multixid wraparound, we need to first advance the counter to + # within spitting distance of the wraparound, that is 2^32 multi-XIDs. We could simply + # run a workload that consumes a lot of multi-XIDs until we approach that, but that + # takes a very long time. So we cheat. + # + # Our strategy is to create a vanilla Postgres cluster, and use pg_resetwal to + # directly set the multi-xid counter a higher value. However, we cannot directly set + # it to just before 2^32 (~ 4 billion), because that would make the exisitng + # 'relminmxid' values to look like they're in the future. It's not clear how the + # system would behave in that situation. So instead, we bump it up ~ 1 billion + # multi-XIDs at a time, and let autovacuum to process all the relations and update + # 'relminmxid' between each run. + # + # XXX: For the multi-offsets, most of the bump is done in the last call. This is + # because advancing it ~ 1 billion at a time hit a pathological case in the + # MultiXactMemberFreezeThreshold() function, causing autovacuum not trigger multixid + # freezing. See + # https://www.postgresql.org/message-id/85fb354c-f89f-4d47-b3a2-3cbd461c90a3%40iki.fi + # Multi-offsets don't have the same wraparound problems at 2 billion mark as + # multi-xids do, so one big jump is fine. 
+ vanilla_pg.configure( + [ + "log_autovacuum_min_duration = 0", + # Perform anti-wraparound vacuuming aggressively + "autovacuum_naptime='1 s'", + "autovacuum_freeze_max_age = 1000000", + "autovacuum_multixact_freeze_max_age = 1000000", + ], + ) + vanilla_pg.start() + advance_multixid_to(pg_bin, vanilla_pg, 0x40000000, 0x10000000) + advance_multixid_to(pg_bin, vanilla_pg, 0x80000000, 0x20000000) + advance_multixid_to(pg_bin, vanilla_pg, 0xC0000000, 0x30000000) + advance_multixid_to(pg_bin, vanilla_pg, 0xFFFFFF00, 0xFFFFFF00) + + vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") + vanilla_pg.safe_psql("create table tt as select g as id from generate_series(1, 10) g") + vanilla_pg.safe_psql("CHECKPOINT") + + # Import the cluster to the pageserver + tenant_id = TenantId.generate() + env.pageserver.tenant_create(tenant_id) + timeline_id = TimelineId.generate() + import_timeline_from_vanilla_postgres( + test_output_dir, + env, + pg_bin, + tenant_id, + timeline_id, + "imported_multixid_wraparound_test", + vanilla_pg.connstr(), + ) + vanilla_pg.stop() + + endpoint = env.endpoints.create_start( + "imported_multixid_wraparound_test", + tenant_id=tenant_id, + config_lines=[ + "log_autovacuum_min_duration = 0", + "autovacuum_naptime='5 s'", + "autovacuum=off", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + assert query_scalar(cur, "select count(*) from tt") == 10 # sanity check + + # Install the extension containing the function needed for this test + cur.execute("CREATE EXTENSION neon_test_utils") + + # Consume a lot of XIDs, just to advance the XIDs to a different range than the + # multi-xids. That avoids confusion while debugging. + cur.execute("select test_consume_xids(100000)") + cur.execute("select pg_switch_wal()") + cur.execute("checkpoint") + + # Use subtransactions so that each row in 'tt' is stamped with a different XID. Leave + # the transaction open. + cur.execute("BEGIN") + cur.execute( + """ +do $$ +declare + idvar int; +begin + for idvar in select id from tt loop + begin + update tt set id = idvar where id = idvar; + exception when others then + raise 'didn''t expect an error: %', sqlerrm; + end; + end loop; +end; +$$; +""" + ) + + # In a different transaction, acquire a FOR KEY SHARE lock on each row. This generates + # a new multixid for each row, with the previous xmax and this transaction's XID as the + # members. + # + # Repeat this until the multi-xid counter wraps around. + conn3 = endpoint.connect() + cur3 = conn3.cursor() + next_multixact_id_before_restart = 0 + observed_before_wraparound = False + while True: + cur3.execute("BEGIN") + cur3.execute("SELECT * FROM tt FOR KEY SHARE") + + # Get the xmax of one of the rows we locked. It should be a multi-xid. It might + # not be the latest one, but close enough. + row_xmax = int(query_scalar(cur3, "SELECT xmax FROM tt LIMIT 1")) + cur3.execute("COMMIT") + log.info(f"observed a row with xmax {row_xmax}") + + # A high value means we have not wrapped around yet + if row_xmax >= 0xFFFFFF00: + observed_before_wraparound = True + continue + + # xmax should not be a regular XID. (We bumped up the regular XID range earlier + # to around 100000 and above.) + assert row_xmax < 100 + + # xmax values < FirstNormalTransactionId (== 3) could be special XID values, or + # multixid values after wraparound.
We don't know for sure which, so keep going to + # be sure we see a value that's unambiguously a wrapped-around multixid. + if row_xmax < 3: + continue + + next_multixact_id_before_restart = row_xmax + log.info( + f"next_multixact_id is now at {next_multixact_id_before_restart} or a little higher" + ) + break + + # We should have observed the state before wraparound + assert observed_before_wraparound + + cur.execute("COMMIT") + + # Wait until the pageserver has received all the data, and restart the endpoint + wait_for_wal_insert_lsn(env, endpoint, tenant_id, timeline_id) + endpoint.stop(mode="immediate")  # 'immediate' to avoid writing shutdown checkpoint + endpoint.start() + + # Check that the next-multixid value wrapped around correctly + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("select next_multixact_id from pg_control_checkpoint()") + next_multixact_id_after_restart = int( + query_scalar(cur, "select next_multixact_id from pg_control_checkpoint()") + ) + log.info(f"next_multixact_id after restart: {next_multixact_id_after_restart}") + assert next_multixact_id_after_restart >= next_multixact_id_before_restart + + # The multi-offset should wrap around as well + cur.execute("select next_multi_offset from pg_control_checkpoint()") + next_multi_offset_after_restart = int( + query_scalar(cur, "select next_multi_offset from pg_control_checkpoint()") + ) + log.info(f"next_multi_offset after restart: {next_multi_offset_after_restart}") + assert next_multi_offset_after_restart < 100000 diff --git a/test_runner/regress/test_oid_overflow.py b/test_runner/regress/test_oid_overflow.py new file mode 100644 index 0000000000..e8eefc2414 --- /dev/null +++ b/test_runner/regress/test_oid_overflow.py @@ -0,0 +1,45 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder + + +def test_oid_overflow(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + endpoint = env.endpoints.create_start("main") + + conn = endpoint.connect() + cur = conn.cursor() + + cur.execute("CREATE EXTENSION neon_test_utils") + + cur.execute("CREATE TABLE t1(x integer)") + cur.execute("INSERT INTO t1 values (1)") + cur.execute("CREATE TABLE t2(x integer)") + cur.execute("INSERT INTO t2 values (2)") + + cur.execute("SELECT x from t1") + assert cur.fetchone() == (1,) + cur.execute("SELECT x from t2") + assert cur.fetchone() == (2,) + + cur.execute("VACUUM FULL t1") + cur.execute("VACUUM FULL t1") + cur.execute("vacuum pg_class") + cur.execute("SELECT relfilenode FROM pg_class where relname='t1'") + oid = cur.fetchall()[0][0] + log.info(f"t1.relfilenode={oid}") + + cur.execute("set statement_timeout=0") + cur.execute(f"select test_consume_oids({oid-1})") + cur.execute("VACUUM FULL t2") + + cur.execute("SELECT relfilenode FROM pg_class where relname='t2'") + oid = cur.fetchall()[0][0] + log.info(f"t2.relfilenode={oid}") + + endpoint.clear_shared_buffers(cursor=cur) + + cur.execute("SELECT x from t1") + assert cur.fetchone() == (1,) + cur.execute("SELECT x from t2") + assert cur.fetchone() == (2,) diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 9b0bab5125..f1dd3fb67d 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -1,6 +1,6 @@ +from fixtures.common_types import TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import TimelineId from fixtures.utils import print_gc_result,
query_scalar @@ -16,11 +16,9 @@ from fixtures.utils import print_gc_result, query_scalar # def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) env.neon_cli.create_branch("test_old_request_lsn", "main") endpoint = env.endpoints.create_start("test_old_request_lsn") - log.info("postgres is running on test_old_request_lsn branch") pg_conn = endpoint.connect() cur = pg_conn.cursor() diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index af2d7aae88..c8249bb2ce 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -3,23 +3,27 @@ import time from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor from typing import Any, DefaultDict, Dict, Tuple +import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, + flush_ep_to_pageserver, last_flush_lsn_upload, wait_for_last_flush_lsn, ) -from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( assert_tenant_state, wait_for_last_record_lsn, wait_for_upload, wait_for_upload_queue_empty, + wait_until_tenant_active, ) -from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn +from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import query_scalar, wait_until @@ -165,6 +169,10 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.initial_timeline + #### + # Produce layers + #### + lsns = [] table_len = 10000 @@ -194,11 +202,28 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): # run checkpoint manually to be sure that data landed in remote storage client.timeline_checkpoint(tenant_id, timeline_id) - ##### Stop the first pageserver instance, erase all its data + # prevent new WAL from being produced, and wait for layers to reach remote storage env.endpoints.stop_all() - - # wait until pageserver has successfully uploaded all the data to remote storage + for sk in env.safekeepers: + sk.stop() + # NB: the wait_for_upload returns as soon as remote_consistent_lsn == current_lsn. + # But the checkpoint also triggers a compaction + # => image layer generation + # => which doesn't advance the LSN + # => but we want the remote state to be deterministic, so additionally, wait for the upload queue to drain wait_for_upload(client, tenant_id, timeline_id, current_lsn) + wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id) + client.deletion_queue_flush(execute=True) + env.pageserver.stop() + env.pageserver.start() + # We've shut down the SKs, then restarted the PSes to sever all walreceiver connections; + # this means the pageserver's remote_consistent_lsn is now frozen to whatever it was after the pageserver.stop() call. + wait_until_tenant_active(client, tenant_id) + + ### + # Produce layers complete; + # Start the actual testing.
+ ### def get_api_current_physical_size(): d = client.timeline_detail(tenant_id, timeline_id) @@ -215,9 +240,7 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): log.info(filled_size) assert filled_current_physical == filled_size, "we don't yet do layer eviction" - # Wait until generated image layers are uploaded to S3 - wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id) - + # Stop the first pageserver instance, erase all its data env.pageserver.stop() # remove all the layer files @@ -312,6 +335,17 @@ def test_download_remote_layers_api( } ) + # This test triggers layer download failures on demand. It is possible to modify the failpoint + # during a `Timeline::get_vectored` right between the vectored read and its validation read. + # This means that one of the reads can fail while the other one succeeds and vice versa. + # TODO(vlad): Remove this block once the vectored read path validation goes away. + env.pageserver.allowed_errors.extend( + [ + ".*initial_size_calculation.*Vectored get failed with downloading evicted layer file failed, but sequential get did not.*", + ".*initial_size_calculation.*Sequential get failed with downloading evicted layer file failed, but vectored get did not.*", + ] + ) + endpoint = env.endpoints.create_start("main") client = env.pageserver.http_client() @@ -370,7 +404,7 @@ def test_download_remote_layers_api( env.pageserver.allowed_errors.extend( [ ".*download failed: downloading evicted layer file failed.*", - f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed: downloading evicted layer file failed", + f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed.*downloading evicted layer file failed", ] ) @@ -497,7 +531,7 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder: with endpoint.cursor() as cur: cur.execute("update a set id = -id") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id) pageserver_http.timeline_checkpoint(tenant_id, timeline_id) layers = pageserver_http.layer_map_info(tenant_id, timeline_id) @@ -508,7 +542,6 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder: for layer in layers.historic_layers: log.info(f"pre-compact: {layer}") - assert layer.layer_file_size is not None, "we must know layer file sizes" layer_sizes += layer.layer_file_size pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name) @@ -547,6 +580,8 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne "image_creation_threshold": 100, # repartitioning parameter, unused "compaction_target_size": 128 * 1024**2, + # Always check if a new image layer can be created + "image_layer_creation_check_threshold": 0, # pitr_interval and gc_horizon are not interesting because we dont run gc } @@ -611,7 +646,8 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne # threshold to expose image creation to downloading all of the needed # layers -- threshold of 2 would sound more reasonable, but keeping it as 1 # to be less flaky - env.neon_cli.config_tenant(tenant_id, {"image_creation_threshold": "1"}) + conf["image_creation_threshold"] = "1" + env.neon_cli.config_tenant(tenant_id, {k: str(v) for k, v in conf.items()}) pageserver_http.timeline_compact(tenant_id, timeline_id) layers = pageserver_http.layer_map_info(tenant_id, timeline_id) @@ -622,5 +658,202 @@
def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne assert dict(kinds_after) == {"Delta": 4, "Image": 1} +def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBuilder): + """ + Demonstrates that tenant shutdown will cancel on-demand download and secondary doing warmup. + """ + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + + # turn off background tasks so that they don't interfere with the downloads + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "0s", + "compaction_period": "0s", + } + ) + client = env.pageserver.http_client() + failpoint = "before-downloading-layer-stream-pausable" + client.configure_failpoints((failpoint, "pause")) + + env.pageserver.allowed_errors.extend( + [ + ".*downloading failed, possibly for shutdown.*", + ] + ) + + info = client.layer_map_info(env.initial_tenant, env.initial_timeline) + assert len(info.delta_layers()) == 1 + + layer = info.delta_layers()[0] + + client.tenant_heatmap_upload(env.initial_tenant) + + # evict the initdb layer so we can download it + client.evict_layer(env.initial_tenant, env.initial_timeline, layer.layer_file_name) + + with ThreadPoolExecutor(max_workers=2) as exec: + download = exec.submit( + client.download_layer, + env.initial_tenant, + env.initial_timeline, + layer.layer_file_name, + ) + + _, offset = wait_until( + 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + ) + + location_conf = {"mode": "Detached", "tenant_conf": {}} + # assume detach removes the layers + detach = exec.submit(client.tenant_location_conf, env.initial_tenant, location_conf) + + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains( + "closing is taking longer than expected", offset + ), + ) + + client.configure_failpoints((failpoint, "off")) + + with pytest.raises( + PageserverApiException, match="downloading failed, possibly for shutdown" + ): + download.result() + + env.pageserver.assert_log_contains(".*downloading failed, possibly for shutdown.*") + + detach.result() + + client.configure_failpoints((failpoint, "pause")) + + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset), + ) + + location_conf = { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + } + + client.tenant_location_conf(env.initial_tenant, location_conf) + + warmup = exec.submit(client.tenant_secondary_download, env.initial_tenant, wait_ms=30000) + + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}", offset), + ) + + client.configure_failpoints((failpoint, "off")) + location_conf = {"mode": "Detached", "tenant_conf": {}} + client.tenant_location_conf(env.initial_tenant, location_conf) + + client.configure_failpoints((failpoint, "off")) + + # here we have nothing in the log, but we see that the warmup and conf location update worked + warmup.result() + + +def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder): + """ + Pause using a pausable_failpoint longer than the client timeout to simulate the timeout happening. 
+ """ + # running this test is not reliable against REAL_S3, because operations can + # take longer than 1s we want to use as a timeout + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + assert isinstance(neon_env_builder.pageserver_remote_storage, S3Storage) + neon_env_builder.pageserver_remote_storage.custom_timeout = "1s" + + # turn off background tasks so that they don't interfere with the downloads + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "0s", + "compaction_period": "0s", + } + ) + client = env.pageserver.http_client() + failpoint = "before-downloading-layer-stream-pausable" + client.configure_failpoints((failpoint, "pause")) + + info = client.layer_map_info(env.initial_tenant, env.initial_timeline) + assert len(info.delta_layers()) == 1 + + layer = info.delta_layers()[0] + + client.tenant_heatmap_upload(env.initial_tenant) + + # evict so we can download it + client.evict_layer(env.initial_tenant, env.initial_timeline, layer.layer_file_name) + + with ThreadPoolExecutor(max_workers=2) as exec: + download = exec.submit( + client.download_layer, + env.initial_tenant, + env.initial_timeline, + layer.layer_file_name, + ) + + _, offset = wait_until( + 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + ) + # ensure enough time while paused to trip the timeout + time.sleep(2) + + client.configure_failpoints((failpoint, "off")) + download.result() + + _, offset = env.pageserver.assert_log_contains( + ".*failed, will retry \\(attempt 0\\): timeout.*" + ) + _, offset = env.pageserver.assert_log_contains(".*succeeded after [0-9]+ retries.*", offset) + + client.evict_layer(env.initial_tenant, env.initial_timeline, layer.layer_file_name) + + client.configure_failpoints((failpoint, "pause")) + + # capture the next offset for a new synchronization with the failpoint + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset), + ) + + location_conf = { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + } + + client.tenant_location_conf( + env.initial_tenant, + location_conf, + ) + + started = time.time() + + warmup = exec.submit(client.tenant_secondary_download, env.initial_tenant, wait_ms=30000) + # ensure enough time while paused to trip the timeout + time.sleep(2) + + client.configure_failpoints((failpoint, "off")) + + warmup.result() + + elapsed = time.time() - started + + _, offset = env.pageserver.assert_log_contains( + ".*failed, will retry \\(attempt 0\\): timeout.*", offset + ) + _, offset = env.pageserver.assert_log_contains(".*succeeded after [0-9]+ retries.*", offset) + + assert elapsed < 30, "too long passed: {elapsed=}" + + def stringify(conf: Dict[str, Any]) -> Dict[str, str]: return dict(map(lambda x: (x[0], str(x[1])), conf.items())) diff --git a/test_runner/regress/test_ondemand_slru_download.py b/test_runner/regress/test_ondemand_slru_download.py new file mode 100644 index 0000000000..d6babe4393 --- /dev/null +++ b/test_runner/regress/test_ondemand_slru_download.py @@ -0,0 +1,161 @@ +from typing import Optional + +import pytest +from fixtures.common_types import Lsn +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, tenant_get_shards +from fixtures.utils import query_scalar + + +# +# Test on-demand download of the pg_xact SLRUs +# +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, 
shard_count: Optional[int]): + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + + tenant_conf = { + "lazy_slru_download": "true", + # set PITR interval to be small, so we can do GC + "pitr_interval": "0 s", + } + env = neon_env_builder.init_start( + initial_tenant_conf=tenant_conf, initial_tenant_shard_count=shard_count + ) + + timeline_id = env.initial_timeline + tenant_id = env.initial_tenant + endpoint = env.endpoints.create_start("main") + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + cur.execute("CREATE EXTENSION neon_test_utils") + + # Create a test table + cur.execute("CREATE TABLE clogtest (id integer)") + cur.execute("INSERT INTO clogtest VALUES (1)") + + # Consume a lot of XIDs, to create more pg_xact segments + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (2)") + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (2)") + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (3)") + + # Restart postgres. After restart, the new instance will download the + # pg_xact segments lazily. + endpoint.stop() + endpoint.start() + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Consume more WAL, so that the pageserver can compact and GC older data, + # including the LSN that we started the new endpoint at. + cur.execute("CREATE TABLE anothertable (i int, t text)") + cur.execute( + "INSERT INTO anothertable SELECT g, 'long string to consume some space' || g FROM generate_series(1, 10000) g" + ) + + # Run GC + shards = tenant_get_shards(env, tenant_id, None) + for tenant_shard_id, pageserver in shards: + client = pageserver.http_client() + client.timeline_checkpoint(tenant_shard_id, timeline_id) + client.timeline_compact(tenant_shard_id, timeline_id) + client.timeline_gc(tenant_shard_id, timeline_id, 0) + + # Test that this can still on-demand download the old pg_xact segments + cur.execute("select xmin, xmax, * from clogtest") + tup = cur.fetchall() + log.info(f"tuples = {tup}") + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + + tenant_conf = { + "lazy_slru_download": "true", + } + env = neon_env_builder.init_start( + initial_tenant_conf=tenant_conf, initial_tenant_shard_count=shard_count + ) + + endpoint = env.endpoints.create_start("main") + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + cur.execute("CREATE EXTENSION neon_test_utils") + + # Create a test table + cur.execute("CREATE TABLE clogtest (id integer)") + cur.execute("INSERT INTO clogtest VALUES (1)") + + # Consume a lot of XIDs, to create more pg_xact segments + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + + # Open a new connection and insert another row, but leave + # the transaction open + pg_conn2 = endpoint.connect() + cur2 = pg_conn2.cursor() + cur2.execute("BEGIN") + cur2.execute("INSERT INTO clogtest VALUES (2)") + + # Another insert on the first connection, which is committed.
+ for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (3)") + + # Start standby at this point in time + lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) + endpoint_at_lsn = env.endpoints.create_start( + branch_name="main", endpoint_id="ep-at-lsn", lsn=lsn + ) + + # Commit transaction 2, after the standby was launched. + cur2.execute("COMMIT") + + # The replica should not see transaction 2 as committed. + conn_replica = endpoint_at_lsn.connect() + cur_replica = conn_replica.cursor() + cur_replica.execute("SELECT * FROM clogtest") + assert cur_replica.fetchall() == [(1,), (3,)] + + +def test_ondemand_download_after_wal_switch(neon_env_builder: NeonEnvBuilder): + """ + Test on-demand SLRU download on standby, when starting right after + WAL segment switch. + + This is a repro for a bug in how the LSN at WAL page/segment + boundary was handled (https://github.com/neondatabase/neon/issues/8030) + """ + + tenant_conf = { + "lazy_slru_download": "true", + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + endpoint = env.endpoints.create_start("main") + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Create a test table + cur.execute("CREATE TABLE clogtest (id integer)") + cur.execute("INSERT INTO clogtest VALUES (1)") + + # Start standby at WAL segment boundary + cur.execute("SELECT pg_switch_wal()") + lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) + _endpoint_at_lsn = env.endpoints.create_start( + branch_name="main", endpoint_id="ep-at-lsn", lsn=lsn + ) diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index e29db1e252..28dbf40bed 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -1,66 +1,15 @@ -import subprocess -from pathlib import Path from typing import Optional +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until -# test that we cannot override node id after init -def test_pageserver_init_node_id( - neon_simple_env: NeonEnv, neon_binpath: Path, pg_distrib_dir: Path -): - workdir = neon_simple_env.pageserver.workdir - pageserver_config = workdir / "pageserver.toml" - pageserver_bin = neon_binpath / "pageserver" - - def run_pageserver(args): - return subprocess.run( - [str(pageserver_bin), "-D", str(workdir), *args], - check=False, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - - # remove initial config and stop existing pageserver - pageserver_config.unlink() - neon_simple_env.pageserver.stop() - - bad_init = run_pageserver(["--init", "-c", f'pg_distrib_dir="{pg_distrib_dir}"']) - assert ( - bad_init.returncode == 1 - ), "pageserver should not be able to init new config without the node id" - assert "missing id" in bad_init.stderr - assert not pageserver_config.exists(), "config file should not be created after init error" - - completed_init = run_pageserver( - ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] - ) - assert ( - completed_init.returncode == 0 - ), "pageserver should be able to create a new config with the node id given" - assert pageserver_config.exists(), "config file should be created successfully" - - bad_reinit = run_pageserver( - 
["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] - ) - assert ( - bad_reinit.returncode == 1 - ), "pageserver should not be able to init new config without the node id" - assert "already exists, cannot init it" in bad_reinit.stderr - - bad_update = run_pageserver(["--update-config", "-c", "id = 3"]) - assert bad_update.returncode == 1, "pageserver should not allow updating node id" - assert "has node id already, it cannot be overridden" in bad_update.stderr - - def check_client(env: NeonEnv, client: PageserverHttpClient): pg_version = env.pg_version initial_tenant = env.initial_tenant @@ -72,8 +21,10 @@ def check_client(env: NeonEnv, client: PageserverHttpClient): # create new tenant and check it is also there tenant_id = TenantId.generate() - client.tenant_create( - tenant_id, generation=env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id) + env.pageserver.tenant_create( + tenant_id, + generation=env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id), + auth_token=client.auth_token, ) assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()} diff --git a/test_runner/regress/test_pageserver_crash_consistency.py b/test_runner/regress/test_pageserver_crash_consistency.py new file mode 100644 index 0000000000..2d6b50490e --- /dev/null +++ b/test_runner/regress/test_pageserver_crash_consistency.py @@ -0,0 +1,108 @@ +import pytest +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.pageserver.common_types import ImageLayerName, parse_layer_file_name +from fixtures.pageserver.utils import ( + wait_for_last_record_lsn, + wait_until_tenant_active, +) +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind +from requests.exceptions import ConnectionError + + +def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + """ + Test case for docs/rfcs/027-crash-consistent-layer-map-through-index-part.md. + + Simulate crash after compaction has written layers to disk + but before they have been uploaded/linked into remote index_part.json. + + Startup handles this situation by deleting the not yet uploaded L1 layer files. + """ + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_distance": f"{10 * 1024**2}", + "compaction_period": "0 s", + "compaction_threshold": "999999", + } + ) + pageserver_http = env.pageserver.http_client() + + tenant_id, timeline_id = env.initial_tenant, env.initial_timeline + + pageserver_http.configure_failpoints(("after-timeline-compacted-first-L1", "exit")) + + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + connstr = endpoint.connstr(options="-csynchronous_commit=off") + pg_bin.run_capture(["pgbench", "-i", "-s1", connstr]) + + lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + + # make sure we receive no new wal after this, so that we'll write over the same L1 file. 
+    endpoint.stop()
+    for sk in env.safekeepers:
+        sk.stop()
+
+    pageserver_http.patch_tenant_config_client_side(tenant_id, {"compaction_threshold": 3})
+    # hit the exit failpoint
+    with pytest.raises(ConnectionError, match="Remote end closed connection without response"):
+        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+    env.pageserver.stop()
+
+    # now the duplicate L1 has been created, but is not yet uploaded
+    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
+
+    # path = env.remote_storage.timeline_path(tenant_id, timeline_id)
+    l1_found = None
+    for path in env.pageserver.list_layers(tenant_id, timeline_id):
+        [key_range, lsn_range] = path.name.split("__", maxsplit=1)
+
+        if "-" not in lsn_range:
+            # image layer
+            continue
+
+        [key_start, key_end] = key_range.split("-", maxsplit=1)
+
+        if key_start == "0" * 36 and key_end == "F" * 36:
+            # L0
+            continue
+
+        candidate = parse_layer_file_name(path.name)
+
+        if isinstance(candidate, ImageLayerName):
+            continue
+
+        if l1_found is not None:
+            raise RuntimeError(f"found multiple L1: {l1_found.to_str()} and {path.name}")
+
+        l1_found = candidate
+
+    assert l1_found is not None, "failed to find L1 locally"
+
+    uploaded = env.pageserver_remote_storage.remote_layer_path(
+        tenant_id, timeline_id, l1_found.to_str()
+    )
+    assert not uploaded.exists(), "to-be-overwritten should not yet be uploaded"
+
+    env.pageserver.start()
+    wait_until_tenant_active(pageserver_http, tenant_id)
+
+    assert not env.pageserver.layer_exists(
+        tenant_id, timeline_id, l1_found
+    ), "partial compaction result should have been removed during startup"
+
+    # wait for us to catch up again
+    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn)
+
+    pageserver_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True)
+
+    assert env.pageserver.layer_exists(tenant_id, timeline_id, l1_found), "the L1 reappears"
+
+    uploaded = env.pageserver_remote_storage.remote_layer_path(
+        tenant_id, timeline_id, l1_found.to_str()
+    )
+    assert uploaded.exists(), "the L1 is uploaded"
+
+
+# TODO: same test for L0s produced by ingest.
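+
+
+# A minimal illustrative sketch (not exercised by the test above) of the naming rules the
+# search loop relies on: delta layers carry an LSN *range* in their file name, image layers
+# a single LSN, and L0 deltas span the whole key space. The helper name is hypothetical;
+# the string-level checks simply mirror the loop above.
+def _looks_like_l1_delta(layer_file_name: str) -> bool:
+    key_range, lsn_range = layer_file_name.split("__", maxsplit=1)
+    if "-" not in lsn_range:
+        # a single LSN in the suffix means an image layer
+        return False
+    key_start, key_end = key_range.split("-", maxsplit=1)
+    if key_start == "0" * 36 and key_end == "F" * 36:
+        # the full key range means an L0 delta layer
+        return False
+    return True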
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 63f6130af5..ebf58d2bd1 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -9,21 +9,22 @@ of the pageserver are: - Updates to remote_consistent_lsn may only be made visible after validating generation """ - import enum +import os import re import time from typing import Optional import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, PgBin, - S3Scrubber, - last_flush_lsn_upload, + generate_uploads_and_deletions, ) +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( assert_tenant_state, @@ -34,8 +35,7 @@ from fixtures.pageserver.utils import ( from fixtures.remote_storage import ( RemoteStorageKind, ) -from fixtures.types import TenantId, TimelineId -from fixtures.utils import print_gc_result, wait_until +from fixtures.utils import wait_until from fixtures.workload import Workload # A tenant configuration that is convenient for generating uploads and deletions @@ -52,71 +52,10 @@ TENANT_CONF = { "compaction_period": "0s", # create image layers eagerly, so that GC can remove some layers "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", } -def generate_uploads_and_deletions( - env: NeonEnv, - *, - init: bool = True, - tenant_id: Optional[TenantId] = None, - timeline_id: Optional[TimelineId] = None, - data: Optional[str] = None, - pageserver_id: Optional[int] = None, -): - """ - Using the environment's default tenant + timeline, generate a load pattern - that results in some uploads and some deletions to remote storage. - """ - - if tenant_id is None: - tenant_id = env.initial_tenant - assert tenant_id is not None - - if timeline_id is None: - timeline_id = env.initial_timeline - assert timeline_id is not None - - ps_http = env.pageserver.http_client() - - with env.endpoints.create_start( - "main", tenant_id=tenant_id, pageserver_id=pageserver_id - ) as endpoint: - if init: - endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") - last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) - - def churn(data): - endpoint.safe_psql_many( - [ - f""" - INSERT INTO foo (id, val) - SELECT g, '{data}' - FROM generate_series(1, 200) g - ON CONFLICT (id) DO UPDATE - SET val = EXCLUDED.val - """, - # to ensure that GC can actually remove some layers - "VACUUM foo", - ] - ) - assert tenant_id is not None - assert timeline_id is not None - # We are waiting for uploads as well as local flush, in order to avoid leaving the system - # in a state where there are "future layers" in remote storage that will generate deletions - # after a restart. 
- last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) - ps_http.timeline_checkpoint(tenant_id, timeline_id) - - # Compaction should generate some GC-elegible layers - for i in range(0, 2): - churn(f"{i if data is None else data}") - - gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0) - print_gc_result(gc_result) - assert gc_result["layers_removed"] > 0 - - def read_all( env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None ): @@ -198,14 +137,23 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): env.broker.try_start() for sk in env.safekeepers: sk.start() - env.attachment_service.start() + env.storage_controller.start() - env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',)) + # We will start a pageserver with no control_plane_api set, so it won't be able to self-register + env.storage_controller.node_register(env.pageserver) + + def remove_control_plane_api_field(config): + return config.pop("control_plane_api") + + control_plane_api = env.pageserver.edit_config_toml(remove_control_plane_api_field) + env.pageserver.start() + env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"}) env.neon_cli.create_tenant( tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline ) - generate_uploads_and_deletions(env, pageserver_id=env.pageserver.id) + + generate_uploads_and_deletions(env, pageserver=env.pageserver) def parse_generation_suffix(key): m = re.match(".+-([0-9a-zA-Z]{8})$", key) @@ -229,11 +177,15 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): assert parse_generation_suffix(key) is None env.pageserver.stop() - # Starting without the override that disabled control_plane_api + env.pageserver.patch_config_toml_nonrecursive( + { + "control_plane_api": control_plane_api, + } + ) env.pageserver.start() - generate_uploads_and_deletions(env, pageserver_id=env.pageserver.id, init=False) + generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False) legacy_objects: list[str] = [] suffixed_objects = [] @@ -265,27 +217,27 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # Having written a mixture of generation-aware and legacy index_part.json, # ensure the scrubber handles the situation as expected. 
- metadata_summary = S3Scrubber( - neon_env_builder.test_output_dir, neon_env_builder - ).scan_metadata() + healthy, metadata_summary = env.storage_scrubber.scan_metadata() assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline assert metadata_summary["timeline_count"] == 1 assert metadata_summary["timeline_shard_count"] == 1 - assert not metadata_summary["with_errors"] - assert not metadata_summary["with_warnings"] + assert healthy def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) + neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - some_other_pageserver = 1234 + attached_to_id = env.storage_controller.locate(env.initial_tenant)[0]["node_id"] + main_pageserver = env.get_pageserver(attached_to_id) + other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0] - ps_http = env.pageserver.http_client() + ps_http = main_pageserver.http_client() - generate_uploads_and_deletions(env) + generate_uploads_and_deletions(env, pageserver=main_pageserver) # Flush: pending deletions should all complete assert_deletion_queue(ps_http, lambda n: n > 0) @@ -298,14 +250,10 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"] assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 - env.pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - # Now advance the generation in the control plane: subsequent validations # from the running pageserver will fail. No more deletions should happen. - env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver) - generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id) + env.storage_controller.attach_hook_issue(env.initial_tenant, other_pageserver.id) + generate_uploads_and_deletions(env, init=False, pageserver=main_pageserver) assert_deletion_queue(ps_http, lambda n: n > 0) queue_depth_before = get_deletion_queue_depth(ps_http) @@ -357,9 +305,14 @@ def test_deletion_queue_recovery( neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) + neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - ps_http = env.pageserver.http_client() + attached_to_id = env.storage_controller.locate(env.initial_tenant)[0]["node_id"] + main_pageserver = env.get_pageserver(attached_to_id) + other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0] + + ps_http = main_pageserver.http_client() failpoints = [ # Prevent deletion lists from being executed, to build up some backlog of deletions @@ -369,14 +322,13 @@ def test_deletion_queue_recovery( if validate_before == ValidateBefore.NO_VALIDATE: failpoints.append( # Prevent deletion lists from being validated, we will test that they are - # dropped properly during recovery. 'pause' is okay here because we kill - # the pageserver with immediate=true - ("control-plane-client-validate", "pause") + # dropped properly during recovery. 
This is such a long sleep as to be equivalent to "never" + ("control-plane-client-validate", "return(3600000)") ) ps_http.configure_failpoints(failpoints) - generate_uploads_and_deletions(env) + generate_uploads_and_deletions(env, pageserver=main_pageserver) # There should be entries in the deletion queue assert_deletion_queue(ps_http, lambda n: n > 0) @@ -403,7 +355,7 @@ def test_deletion_queue_recovery( # also wait to see the header hit the disk: this seems paranoid but the race # can really happen on a heavily overloaded test machine. def assert_header_written(): - assert (env.pageserver.workdir / "deletion" / "header-01").exists() + assert (main_pageserver.workdir / "deletion" / "header-01").exists() wait_until(20, 1, assert_header_written) @@ -413,15 +365,15 @@ def test_deletion_queue_recovery( before_restart_depth = get_deletion_queue_validated(ps_http) log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued") - env.pageserver.stop(immediate=True) + main_pageserver.stop(immediate=True) if keep_attachment == KeepAttachment.LOSE: - some_other_pageserver = 101010 - env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver) + some_other_pageserver = other_pageserver.id + env.storage_controller.attach_hook_issue(env.initial_tenant, some_other_pageserver) - env.pageserver.start() + main_pageserver.start() - def assert_deletions_submitted(n: int): + def assert_deletions_submitted(n: int) -> None: assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n # After restart, issue a flush to kick the deletion frontend to do recovery. @@ -442,8 +394,6 @@ def test_deletion_queue_recovery( # validated before restart. assert get_deletion_queue_executed(ps_http) == before_restart_depth else: - env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"]) - # If we lost the attachment, we should have dropped our pre-restart deletions. assert get_deletion_queue_dropped(ps_http) == before_restart_depth @@ -451,8 +401,8 @@ def test_deletion_queue_recovery( assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 # Restart again - env.pageserver.stop(immediate=True) - env.pageserver.start() + main_pageserver.stop(immediate=True) + main_pageserver.start() # No deletion lists should be recovered: this demonstrates that deletion lists # were cleaned up after being executed or dropped in the previous process lifetime. @@ -471,7 +421,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ps_http = env.pageserver.http_client() - generate_uploads_and_deletions(env, pageserver_id=env.pageserver.id) + generate_uploads_and_deletions(env, pageserver=env.pageserver) env.pageserver.allowed_errors.extend( [ @@ -483,12 +433,12 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ) # Simulate a major incident: the control plane goes offline - env.attachment_service.stop() + env.storage_controller.stop() # Remember how many validations had happened before the control plane went offline validated = get_deletion_queue_validated(ps_http) - generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id) + generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) # The running pageserver should stop progressing deletions time.sleep(10) @@ -498,12 +448,15 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # incident, but it might be unavoidable: if so, we want to be able to start up # and serve clients. 
env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP - env.pageserver.start( - overrides=("--pageserver-config-override=control_plane_emergency_mode=true",) + replaced = env.pageserver.patch_config_toml_nonrecursive( + { + "control_plane_emergency_mode": True, + } ) + env.pageserver.start() # The pageserver should provide service to clients - generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id) + generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) # The pageserver should neither validate nor execute any deletions, it should have # loaded the DeletionLists from before though @@ -513,7 +466,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): assert get_deletion_queue_executed(ps_http) == 0 # When the control plane comes back up, normal service should resume - env.attachment_service.start() + env.storage_controller.start() ps_http.deletion_queue_flush(execute=True) assert get_deletion_queue_depth(ps_http) == 0 @@ -522,9 +475,10 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # The pageserver should work fine when subsequently restarted in non-emergency mode env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP + env.pageserver.patch_config_toml_nonrecursive(replaced) env.pageserver.start() - generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id) + generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) ps_http.deletion_queue_flush(execute=True) assert get_deletion_queue_depth(ps_http) == 0 assert get_deletion_queue_validated(ps_http) > 0 @@ -562,7 +516,7 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.initial_timeline - generate_uploads_and_deletions(env) + generate_uploads_and_deletions(env, pageserver=env.pageserver) read_all(env, tenant_id, timeline_id) evict_all_layers(env, tenant_id, timeline_id) @@ -594,13 +548,6 @@ def test_multi_attach( tenant_id = env.initial_tenant timeline_id = env.initial_timeline - # We will intentionally create situations where stale deletions happen from non-latest-generation - # nodes when the tenant is multiply-attached - for ps in env.pageservers: - ps.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - # Initially, the tenant will be attached to the first pageserver (first is default in our test harness) wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) @@ -651,16 +598,116 @@ def test_multi_attach( for ps in pageservers: ps.stop() - # Returning to a normal healthy state: all pageservers will start, but only the one most - # recently attached via the control plane will re-attach on startup + # Returning to a normal healthy state: all pageservers will start for ps in pageservers: ps.start() - with pytest.raises(PageserverApiException): - _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) - with pytest.raises(PageserverApiException): - _detail = http_clients[1].timeline_detail(tenant_id, timeline_id) - _detail = http_clients[2].timeline_detail(tenant_id, timeline_id) + # Pageservers are marked offline by the storage controller during the rolling restart + # above. 
This may trigger a rescheduling, so there's no guarantee that the tenant
+    # shard ends up attached to the most recent ps.
+    raised = 0
+    serving_ps_idx = None
+    for idx, http_client in enumerate(http_clients):
+        try:
+            _detail = http_client.timeline_detail(tenant_id, timeline_id)
+            serving_ps_idx = idx
+        except PageserverApiException:
+            raised += 1
+
+    assert raised == 2 and serving_ps_idx is not None
 
     # All data we wrote while multi-attached remains readable
-    workload.validate(pageservers[2].id)
+    workload.validate(pageservers[serving_ps_idx].id)
+
+
+def test_upgrade_generationless_local_file_paths(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Test pageserver behavior when starting up with local layer paths without
+    generation numbers: it should accept these layer files, and avoid doing
+    a delete/download cycle on them.
+    """
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+    env.neon_cli.create_tenant(
+        tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}'
+    )
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init()
+    workload.write_rows(1000)
+
+    attached_pageserver = env.get_tenant_pageserver(tenant_id)
+    secondary_pageserver = list([ps for ps in env.pageservers if ps.id != attached_pageserver.id])[
+        0
+    ]
+
+    attached_pageserver.http_client().tenant_heatmap_upload(tenant_id)
+    secondary_pageserver.http_client().tenant_secondary_download(tenant_id)
+
+    # Rename the local paths to legacy format, to simulate what
+    # we would see when upgrading. Do this on both attached and secondary locations, as we will
+    # test the behavior of both.
+    for pageserver in env.pageservers:
+        pageserver.stop()
+        timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id)
+        files_renamed = 0
+        for filename in os.listdir(timeline_dir):
+            path = os.path.join(timeline_dir, filename)
+            log.info(f"Found file {path}")
+            if path.endswith("-v1-00000001"):
+                new_path = path[:-12]
+                os.rename(path, new_path)
+                log.info(f"Renamed {path} -> {new_path}")
+                files_renamed += 1
+
+        assert files_renamed > 0
+
+        pageserver.start()
+
+    workload.validate()
+
+    # Assert that there were no on-demand downloads
+    assert (
+        attached_pageserver.http_client().get_metric_value(
+            "pageserver_remote_ondemand_downloaded_layers_total"
+        )
+        == 0
+    )
+
+    # Do a secondary download and ensure there were no layer downloads
+    secondary_pageserver.http_client().tenant_secondary_download(tenant_id)
+    assert (
+        secondary_pageserver.http_client().get_metric_value(
+            "pageserver_secondary_download_layer_total"
+        )
+        == 0
+    )
+
+    # Check that when we evict and promote one of the legacy-named layers, everything works as
+    # expected
+    local_layers = list(
+        (
+            parse_layer_file_name(path.name),
+            os.path.join(attached_pageserver.timeline_dir(tenant_id, timeline_id), path),
+        )
+        for path in attached_pageserver.list_layers(tenant_id, timeline_id)
+    )
+    (victim_layer_name, victim_path) = local_layers[0]
+    assert os.path.exists(victim_path)
+
+    attached_pageserver.http_client().evict_layer(
+        tenant_id, timeline_id, victim_layer_name.to_str()
+    )
+    assert not os.path.exists(victim_path)
+
+    attached_pageserver.http_client().download_layer(
+        tenant_id, timeline_id, victim_layer_name.to_str()
+    )
+    # We should download into the same local path we started with
+    assert os.path.exists(victim_path)
diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py
new file mode 100644
index 0000000000..4c9eac5cd7
--- /dev/null
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -0,0 +1,174 @@
+import copy
+import json
+import uuid
+
+from anyio import Path
+from fixtures.common_types import TenantId, TimelineId
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
+from fixtures.pg_version import PgVersion
+from fixtures.utils import wait_until
+
+
+def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    env = neon_env_builder.init_start()
+
+    env.pageserver.tenant_detach(env.initial_tenant)
+
+    env.pageserver.allowed_errors.append(
+        # https://github.com/neondatabase/neon/issues/6925
+        r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*"
+    )
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+
+    rate_limit_rps = 100
+    compaction_period = 5
+    env.pageserver.tenant_create(
+        tenant_id,
+        conf={
+            "compaction_period": f"{compaction_period}s",
+            "timeline_get_throttle": {
+                "task_kinds": ["PageRequestHandler"],
+                "initial": 0,
+                "refill_interval": "100ms",
+                "refill_amount": int(rate_limit_rps / 10),
+                "max": int(rate_limit_rps / 10),
+                "fair": True,
+            },
+        },
+    )
+
+    ps_http = env.pageserver.http_client()
+
+    ps_http.timeline_create(PgVersion.V16, tenant_id, timeline_id)
+
+    def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: int):
+        cmd = [
+            str(env.neon_binpath / "pagebench"),
+            "get-page-latest-lsn",
+            "--mgmt-api-endpoint",
+            ps_http.base_url,
+            "--page-service-connstring",
+            env.pageserver.connstr(password=None),
+            "--runtime",
+            f"{duration_secs}s",
+            f"{tenant_id}/{timeline_id}",
+        ]
+
+        basepath = pg_bin.run_capture(cmd, with_command_header=False)
+        results_path = Path(basepath + ".stdout")
+        log.info(f"Benchmark results at: {results_path}")
+
+        with open(results_path, "r") as f:
+            results = json.load(f)
+        log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
+        return int(results["total"]["request_count"])
+
+    log.info("warmup / make sure metrics are present")
+    run_pagebench_at_max_speed_and_get_total_requests_completed(2)
+    metrics_query = {
+        "tenant_id": str(tenant_id),
+        "timeline_id": str(timeline_id),
+        "smgr_query_type": "get_page_at_lsn",
+    }
+    metric_name = "pageserver_smgr_query_seconds_sum"
+    smgr_query_seconds_pre = ps_http.get_metric_value(metric_name, metrics_query)
+    assert smgr_query_seconds_pre is not None
+
+    marker = uuid.uuid4().hex
+    ps_http.post_tracing_event("info", marker)
+    _, marker_offset = wait_until(
+        10, 0.5, lambda: env.pageserver.assert_log_contains(marker, offset=None)
+    )
+
+    log.info("run pagebench")
+    duration_secs = 10
+    actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs)
+
+    log.info("validate the client is capped at the configured rps limit")
+    expect_ncompleted = duration_secs * rate_limit_rps
+    delta_abs = abs(expect_ncompleted - actual_ncompleted)
+    threshold = 0.05 * expect_ncompleted
+    assert (
+        threshold / rate_limit_rps < 0.1 * duration_secs
+    ), "test self-test: unrealistic expectations regarding precision in this test"
+    assert (
+        delta_abs < 0.05 * expect_ncompleted
+    ), "the throttling deviates more than 5 percent from the expectation"
+
+    log.info("validate that we logged the throttling")
+
+    wait_until(
+        10,
+        compaction_period / 10,
+        lambda: env.pageserver.assert_log_contains(
+            f".*{tenant_id}.*shard was throttled in the last n_seconds.*",
+            offset=marker_offset,
+        ),
+    )
+
+    log.info("validate that the metric doesn't include throttle wait time")
+    smgr_query_seconds_post = ps_http.get_metric_value(metric_name, metrics_query)
+    assert smgr_query_seconds_post is not None
+    actual_smgr_query_seconds = smgr_query_seconds_post - smgr_query_seconds_pre
+
+    assert (
+        duration_secs >= 10 * actual_smgr_query_seconds
+    ), "smgr metrics should not include throttle wait time"
+
+
+throttle_config_with_field_fair_set = {
+    "task_kinds": ["PageRequestHandler"],
+    "fair": True,
+    "initial": 27,
+    "refill_interval": "43s",
+    "refill_amount": 23,
+    "max": 42,
+}
+
+
+def assert_throttle_config_with_field_fair_set(conf):
+    """
+    Field `fair` is ignored, so responses don't contain it
+    """
+    without_fair = copy.deepcopy(throttle_config_with_field_fair_set)
+    without_fair.pop("fair")
+
+    assert conf == without_fair
+
+
+def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_builder: NeonEnvBuilder):
+    """
+    To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
+    """
+    env = neon_env_builder.init_start()
+    ps_http = env.pageserver.http_client()
+    # with_fair config should still be settable
+    ps_http.set_tenant_config(
+        env.initial_tenant,
+        {"timeline_get_throttle": throttle_config_with_field_fair_set},
+    )
+    conf = ps_http.tenant_config(env.initial_tenant)
+    assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
+    assert_throttle_config_with_field_fair_set(
+        conf.tenant_specific_overrides["timeline_get_throttle"]
+    )
+
+
+def test_throttle_fair_config_is_settable_but_ignored_in_config_toml(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
+ """ + + def set_tenant_config(ps_cfg): + ps_cfg["tenant_config"] = {"timeline_get_throttle": throttle_config_with_field_fair_set} + + neon_env_builder.pageserver_config_override = set_tenant_config + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + conf = ps_http.tenant_config(env.initial_tenant) + assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"]) diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py new file mode 100644 index 0000000000..f6404d68ac --- /dev/null +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -0,0 +1,318 @@ +import asyncio +import os +import time +from typing import Optional, Tuple + +import psutil +import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + tenant_get_shards, +) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.utils import wait_until + +TIMELINE_COUNT = 10 +ENTRIES_PER_TIMELINE = 10_000 +CHECKPOINT_TIMEOUT_SECONDS = 60 + + +async def run_worker_for_tenant( + env: NeonEnv, entries: int, tenant: TenantId, offset: Optional[int] = None +) -> Lsn: + if offset is None: + offset = 0 + + with env.endpoints.create_start("main", tenant_id=tenant) as ep: + conn = await ep.connect_async() + try: + await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)") + await conn.execute( + f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series({offset},{entries}) as i" + ) + finally: + await conn.close(timeout=10) + + last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + return last_flush_lsn + + +async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: + tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf) + last_flush_lsn = await run_worker_for_tenant(env, entries, tenant) + return tenant, timeline, last_flush_lsn + + +async def workload( + env: NeonEnv, tenant_conf, timelines: int, entries: int +) -> list[Tuple[TenantId, TimelineId, Lsn]]: + workers = [asyncio.create_task(run_worker(env, tenant_conf, entries)) for _ in range(timelines)] + return await asyncio.gather(*workers) + + +def wait_until_pageserver_is_caught_up( + env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]] +): + for tenant, timeline, last_flush_lsn in last_flush_lsns: + shards = tenant_get_shards(env, tenant) + for tenant_shard_id, pageserver in shards: + waited = wait_for_last_record_lsn( + pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn + ) + assert waited >= last_flush_lsn + + +def wait_until_pageserver_has_uploaded( + env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]] +): + for tenant, timeline, last_flush_lsn in last_flush_lsns: + shards = tenant_get_shards(env, tenant) + for tenant_shard_id, pageserver in shards: + wait_for_upload(pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn) + + +def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float: + def query(): + value = pageserver_http.get_metric_value("pageserver_wal_ingest_records_received_total") + assert value is not None + return value + + # The metric gets initialised on the first update. + # Retry a few times, but return 0 if it's stable. 
+ try: + return float(wait_until(3, 0.5, query)) + except Exception: + return 0 + + +def get_dirty_bytes(env): + v = env.pageserver.http_client().get_metric_value("pageserver_timeline_ephemeral_bytes") or 0 + log.info(f"dirty_bytes: {v}") + return v + + +def assert_dirty_bytes(env, v): + assert get_dirty_bytes(env) == v + + +def assert_dirty_bytes_nonzero(env): + dirty_bytes = get_dirty_bytes(env) + assert dirty_bytes > 0 + return dirty_bytes + + +@pytest.mark.parametrize("immediate_shutdown", [True, False]) +def test_pageserver_small_inmemory_layers( + neon_env_builder: NeonEnvBuilder, immediate_shutdown: bool +): + """ + Test that open layers get flushed after the `checkpoint_timeout` config + and do not require WAL reingest upon restart. + + The workload creates a number of timelines and writes some data to each, + but not enough to trigger flushes via the `checkpoint_distance` config. + """ + tenant_conf = { + # Large `checkpoint_distance` effectively disables size + # based checkpointing. + "checkpoint_distance": f"{2 * 1024 ** 3}", + "checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s", + "compaction_period": "1s", + } + + env = neon_env_builder.init_configs() + env.start() + + last_flush_lsns = asyncio.run(workload(env, tenant_conf, TIMELINE_COUNT, ENTRIES_PER_TIMELINE)) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. + wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + + ps_http_client = env.pageserver.http_client() + total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client) + + # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, + # such that there are zero bytes of ephemeral layer left on the pageserver + log.info("Waiting for background checkpoints...") + wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + + # Zero ephemeral layer bytes does not imply that all the frozen layers were uploaded: they + # must be uploaded to remain visible to the pageserver after restart. + wait_until_pageserver_has_uploaded(env, last_flush_lsns) + + env.pageserver.restart(immediate=immediate_shutdown) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + # Catching up with WAL ingest should have resulted in zero bytes of ephemeral layers, since + # we froze, flushed and uploaded everything before restarting. There can be no more WAL writes + # because we shut down compute endpoints before flushing. + assert get_dirty_bytes(env) == 0 + + total_wal_ingested_after_restart = wait_for_wal_ingest_metric(ps_http_client) + + log.info(f"WAL ingested before restart: {total_wal_ingested_before_restart}") + log.info(f"WAL ingested after restart: {total_wal_ingested_after_restart}") + + assert total_wal_ingested_after_restart == 0 + + +def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): + """ + Test that `checkpoint_timeout` is enforced even if there is no safekeeper input. + """ + tenant_conf = { + # Large `checkpoint_distance` effectively disables size + # based checkpointing. 
+ "checkpoint_distance": f"{2 * 1024 ** 3}", + "checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s", + "compaction_period": "1s", + } + + env = neon_env_builder.init_configs() + env.start() + + last_flush_lsns = asyncio.run(workload(env, tenant_conf, TIMELINE_COUNT, ENTRIES_PER_TIMELINE)) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. + wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + + # Stop the safekeepers, so that we cannot have any more WAL receiver connections + for sk in env.safekeepers: + sk.stop() + + # We should have got here fast enough that we didn't hit the background interval yet, + # and the teardown of SK connections shouldn't prompt any layer freezing. + assert get_dirty_bytes(env) > 0 + + # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, + # such that there are zero bytes of ephemeral layer left on the pageserver + log.info("Waiting for background checkpoints...") + wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + + # The code below verifies that we do not flush on the first write + # after an idle period longer than the checkpoint timeout. + + # Sit quietly for longer than the checkpoint timeout + time.sleep(CHECKPOINT_TIMEOUT_SECONDS + CHECKPOINT_TIMEOUT_SECONDS / 2) + + # Restart the safekeepers and write a bit of extra data into one tenant + for sk in env.safekeepers: + sk.start() + + tenant_with_extra_writes = last_flush_lsns[0][0] + asyncio.run( + run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE) + ) + + dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + + # We shouldn't flush since we've just opened a new layer + waited_for = 0 + while waited_for < CHECKPOINT_TIMEOUT_SECONDS // 4: + time.sleep(5) + waited_for += 5 + + assert get_dirty_bytes(env) >= dirty_after_write + + +@pytest.mark.skipif( + # We have to use at least ~100MB of data to hit the lowest limit we can configure, which is + # prohibitively slow in debug mode + os.getenv("BUILD_TYPE") == "debug", + reason="Avoid running bulkier ingest tests in debug mode", +) +def test_total_size_limit(neon_env_builder: NeonEnvBuilder): + """ + Test that checkpoints are done based on total ephemeral layer size, even if no one timeline is + individually exceeding checkpoint thresholds. + """ + + system_memory = psutil.virtual_memory().total + + # The smallest total size limit we can configure is 1/1024th of the system memory (e.g. 128MB on + # a system with 128GB of RAM). We will then write enough data to violate this limit. 
+ max_dirty_data = 128 * 1024 * 1024 + ephemeral_bytes_per_memory_kb = (max_dirty_data * 1024) // system_memory + assert ephemeral_bytes_per_memory_kb > 0 + + neon_env_builder.pageserver_config_override = f""" + ephemeral_bytes_per_memory_kb={ephemeral_bytes_per_memory_kb} + """ + + compaction_period_s = 10 + + checkpoint_distance = 1024**3 + tenant_conf = { + # Large space + time thresholds: effectively disable these limits + "checkpoint_distance": f"{checkpoint_distance}", + "checkpoint_timeout": "3600s", + "compaction_period": f"{compaction_period_s}s", + } + + env = neon_env_builder.init_configs() + env.start() + + timeline_count = 10 + + # This is about 2MiB of data per timeline + entries_per_timeline = 100_000 + + last_flush_lsns = asyncio.run(workload(env, tenant_conf, timeline_count, entries_per_timeline)) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + total_bytes_ingested = 0 + for tenant, timeline, last_flush_lsn in last_flush_lsns: + http_client = env.pageserver.http_client() + initdb_lsn = Lsn(http_client.timeline_detail(tenant, timeline)["initdb_lsn"]) + this_timeline_ingested = last_flush_lsn - initdb_lsn + assert ( + this_timeline_ingested < checkpoint_distance * 0.8 + ), "this test is supposed to fill InMemoryLayer" + total_bytes_ingested += this_timeline_ingested + + log.info(f"Ingested {total_bytes_ingested} bytes since initdb (vs max dirty {max_dirty_data})") + assert total_bytes_ingested > max_dirty_data + + # Expected end state: the total physical size of all the tenants is in excess of the max dirty + # data, but the total amount of dirty data is less than the limit: this demonstrates that we + # have exceeded the threshold but then rolled layers in response + def get_total_historic_layers(): + total_ephemeral_layers = 0 + total_historic_bytes = 0 + for tenant, timeline, _last_flush_lsn in last_flush_lsns: + http_client = env.pageserver.http_client() + initdb_lsn = Lsn(http_client.timeline_detail(tenant, timeline)["initdb_lsn"]) + layer_map = http_client.layer_map_info(tenant, timeline) + total_historic_bytes += sum( + layer.layer_file_size + for layer in layer_map.historic_layers + if Lsn(layer.lsn_start) > initdb_lsn + ) + total_ephemeral_layers += len(layer_map.in_memory_layers) + + log.info( + f"Total historic layer bytes: {total_historic_bytes} ({total_ephemeral_layers} ephemeral layers)" + ) + + return total_historic_bytes + + def assert_bytes_rolled(): + assert total_bytes_ingested - get_total_historic_layers() <= max_dirty_data + + # Wait until enough layers have rolled that the amount of dirty data is under the threshold. + # We do this indirectly via layer maps, rather than the dirty bytes metric, to avoid false-passing + # if that metric isn't updated quickly enough to reflect the dirty bytes exceeding the limit. 
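+    # (Assumption: layer rolling is driven by the periodic compaction loop configured
+    # above, so two compaction periods should be ample time for it to react.)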
+ wait_until(compaction_period_s * 2, 1, assert_bytes_rolled) + + # The end state should also have the reported metric under the limit + def assert_dirty_data_limited(): + dirty_bytes = get_dirty_bytes(env) + assert dirty_bytes < max_dirty_data + + wait_until(compaction_period_s * 2, 1, lambda: assert_dirty_data_limited()) # type: ignore diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index 042961baa5..24a37b04ec 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -1,17 +1,23 @@ +import gzip import json +import os import time from dataclasses import dataclass from pathlib import Path from queue import SimpleQueue from typing import Any, Dict, Set +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, wait_for_last_flush_lsn, ) -from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import TenantId, TimelineId +from fixtures.remote_storage import ( + LocalFsStorage, + RemoteStorageKind, + remote_storage_to_toml_inline_table, +) from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -40,6 +46,9 @@ def test_metric_collection( uploads.put((events, is_last == "true")) return Response(status=200) + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + assert neon_env_builder.pageserver_remote_storage is not None + # Require collecting metrics frequently, since we change # the timeline and want something to be logged about it. # @@ -48,12 +57,10 @@ def test_metric_collection( neon_env_builder.pageserver_config_override = f""" metric_collection_interval="1s" metric_collection_endpoint="{metric_collection_endpoint}" - cached_metric_collection_interval="0s" + metric_collection_bucket={remote_storage_to_toml_inline_table(neon_env_builder.pageserver_remote_storage)} synthetic_size_calculation_interval="3s" """ - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}") # mock http server that returns OK for the metrics @@ -67,9 +74,7 @@ def test_metric_collection( env.pageserver.allowed_errors.extend( [ ".*metrics endpoint refused the sent metrics*", - # we have a fast rate of calculation, these can happen at shutdown - ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*", - ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes", + ".*metrics_collection: failed to upload to S3: Failed to upload data of length .* to storage path.*", ] ) @@ -166,6 +171,20 @@ def test_metric_collection( httpserver.check() + # Check that at least one bucket output object is present, and that all + # can be decompressed and decoded. 
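+    # (Each .gz object is expected to be a gzip-compressed JSON document carrying a
+    # top-level "events" list, which is exactly what the assertions below check.)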
+ bucket_dumps = {} + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + for dirpath, _dirs, files in os.walk(env.pageserver_remote_storage.root): + for file in files: + file_path = os.path.join(dirpath, file) + log.info(file_path) + if file.endswith(".gz"): + bucket_dumps[file_path] = json.load(gzip.open(file_path)) + + assert len(bucket_dumps) >= 1 + assert all("events" in data for data in bucket_dumps.values()) + def test_metric_collection_cleans_up_tempfile( httpserver: HTTPServer, @@ -196,7 +215,6 @@ def test_metric_collection_cleans_up_tempfile( neon_env_builder.pageserver_config_override = f""" metric_collection_interval="1s" metric_collection_endpoint="{metric_collection_endpoint}" - cached_metric_collection_interval="0s" synthetic_size_calculation_interval="3s" """ @@ -215,9 +233,6 @@ def test_metric_collection_cleans_up_tempfile( env.pageserver.allowed_errors.extend( [ ".*metrics endpoint refused the sent metrics*", - # we have a fast rate of calculation, these can happen at shutdown - ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*", - ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes", ] ) diff --git a/test_runner/regress/test_pageserver_reconnect.py b/test_runner/regress/test_pageserver_reconnect.py index aecfcdd262..37ff923632 100644 --- a/test_runner/regress/test_pageserver_reconnect.py +++ b/test_runner/regress/test_pageserver_reconnect.py @@ -2,6 +2,7 @@ import threading import time from contextlib import closing +import psycopg2.errors from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, PgBin @@ -40,3 +41,26 @@ def test_pageserver_reconnect(neon_simple_env: NeonEnv, pg_bin: PgBin): c.execute("select pg_reload_conf()") thread.join() + + +# Test handling errors during page server reconnect +def test_pageserver_reconnect_failure(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_pageserver_reconnect") + endpoint = env.endpoints.create_start("test_pageserver_reconnect") + + con = endpoint.connect() + cur = con.cursor() + + cur.execute("set statement_timeout='2s'") + cur.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'") + connstring = cur.fetchall()[0][0] + cur.execute( + f"alter system set neon.pageserver_connstring='{connstring}?some_invalid_param=xyz'" + ) + cur.execute("select pg_reload_conf()") + try: + cur.execute("select count(*) from pg_class") + except psycopg2.errors.QueryCanceled: + log.info("Connection to PS failed") + assert not endpoint.log_contains("ERROR: cannot wait on socket event without a socket.*") diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index c4499196b5..bbf82fea4c 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -1,4 +1,6 @@ +import random from contextlib import closing +from typing import Optional import pytest from fixtures.log_helper import log @@ -11,14 +13,20 @@ from fixtures.utils import wait_until # running. def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() + + # We inject a delay of 15 seconds for tenant activation below. + # Hence, bump the max delay here to not skip over the activation. 
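+    # (The 20s ceiling is deliberately larger than the 15s activation delay injected via
+    # the failpoint below, so the randomized background-task start does not skip past the
+    # activation window.)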
+ neon_env_builder.pageserver_config_override = 'background_task_maximum_delay="20s"' env = neon_env_builder.init_start() endpoint = env.endpoints.create_start("main") pageserver_http = env.pageserver.http_client() - assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ( + pageserver_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) + == 1 + ) pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -53,7 +61,10 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): env.pageserver.start() # We reloaded our tenant - assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ( + pageserver_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) + == 1 + ) cur.execute("SELECT count(*) FROM foo") assert cur.fetchone() == (100000,) @@ -62,7 +73,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # pageserver does if a compute node connects and sends a request for the tenant # while it's still in Loading state. (It waits for the loading to finish, and then # processes the request.) - tenant_load_delay_ms = 5000 + tenant_load_delay_ms = 15000 env.pageserver.stop() env.pageserver.start( extra_env_vars={"FAILPOINTS": f"before-attaching-tenant=return({tenant_load_delay_ms})"} @@ -141,18 +152,20 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # Test that repeatedly kills and restarts the page server, while the # safekeeper and compute node keep running. @pytest.mark.timeout(540) -def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str): +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_pageserver_chaos( + neon_env_builder: NeonEnvBuilder, build_type: str, shard_count: Optional[int] +): if build_type == "debug": pytest.skip("times out in debug builds") + # same rationale as with the immediate stop; we might leave orphan layers behind. + neon_env_builder.disable_scrub_on_exit() neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.enable_scrub_on_exit() + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count - env = neon_env_builder.init_start() - - # these can happen, if we shutdown at a good time. to be fixed as part of #5172. - message = ".*duplicated L1 layer layer=.*" - env.pageserver.allowed_errors.append(message) + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) # Use a tiny checkpoint distance, to create a lot of layers quickly. # That allows us to stress the compaction and layer flushing logic more. @@ -192,14 +205,28 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str): log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) + # We run "random" kills using a fixed seed, to improve reproducibility if a test + # failure is related to a particular order of operations. 
+ seed = 0xDEADBEEF + rng = random.Random(seed) + # Update the whole table, then immediately kill and restart the pageserver for i in range(1, 15): endpoint.safe_psql("UPDATE foo set updates = updates + 1") # This kills the pageserver immediately, to simulate a crash - env.pageserver.stop(immediate=True) - env.pageserver.start() + to_kill = rng.choice(env.pageservers) + to_kill.stop(immediate=True) + to_kill.start() # Check that all the updates are visible num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0] assert num_updates == i * 100000 + + # currently pageserver cannot tolerate the fact that "s3" goes away, and if + # we succeeded in a compaction before shutdown, there might be a lot of + # uploads pending, certainly more than what we can ingest with MOCK_S3 + # + # so instead, do a fast shutdown for this one test. + # See https://github.com/neondatabase/neon/issues/8709 + env.stop(immediate=True) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 293152dd62..8746b88a75 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -1,18 +1,24 @@ +import json +import os import random +import time from pathlib import Path -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Union import pytest +from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber +from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, - tenant_delete_wait_completed, + wait_for_upload_queue_empty, ) -from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.types import TenantId, TimelineId +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import wait_until from fixtures.workload import Workload +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response # A tenant configuration that is convenient for generating uploads and deletions # without a large amount of postgres traffic. @@ -46,13 +52,17 @@ def evict_random_layers( if "ephemeral" in layer.name or "temp_download" in layer.name: continue + layer_name = parse_layer_file_name(layer.name) + if rng.choice([True, False]): - log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}") - client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name) + log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer_name.to_str()}") + client.evict_layer( + tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer_name.to_str() + ) @pytest.mark.parametrize("seed", [1, 2, 3]) -def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): +def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, seed: int): """ Issue many location configuration changes, ensure that tenants remain readable & we don't get any unexpected errors. 
We should @@ -64,8 +74,22 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): """ neon_env_builder.num_pageservers = 3 neon_env_builder.enable_pageserver_remote_storage( - remote_storage_kind=RemoteStorageKind.MOCK_S3, + remote_storage_kind=s3_storage(), ) + neon_env_builder.control_plane_compute_hook_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + ) + + def ignore_notify(request: Request): + # This test does all its own compute configuration (by passing explicit pageserver ID to Workload functions), + # so we send controller notifications to /dev/null to prevent it fighting the test for control of the compute. + log.info(f"Ignoring storage controller compute notification: {request.json}") + return Response(status=200) + + make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler( + ignore_notify + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) pageservers = env.pageservers @@ -73,27 +97,39 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): tenant_id = env.initial_tenant timeline_id = env.initial_timeline - # We will make no effort to avoid stale attachments for ps in env.pageservers: ps.allowed_errors.extend( [ - ".*Dropped remote consistent LSN updates.*", - ".*Dropping stale deletions.*", # page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 3b19aec5038c796f64b430b30a555121 not found ".*query handler.*Tenant.*not found.*", # page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active ".*query handler.*Tenant.*not active.*", + # this shutdown case is logged at WARN severity by the time it bubbles up to logical size calculation code + # WARN ...: initial size calculation failed: downloading failed, possibly for shutdown + ".*downloading failed, possibly for shutdown", + # {tenant_id=... timeline_id=...}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1664/0/1260 blkno=0 req_lsn=0/149F0D8}: error reading relation or page version: Not found: will not become active. Current state: Stopping\n' + ".*page_service.*will not become active.*", ] ) - # these can happen, if we shutdown at a good time. to be fixed as part of #5172. - message = ".*duplicated L1 layer layer=.*" - ps.allowed_errors.append(message) - workload = Workload(env, tenant_id, timeline_id) workload.init(env.pageservers[0].id) workload.write_rows(256, env.pageservers[0].id) + # Discourage the storage controller from interfering with the changes we will make directly on the pageserver + env.storage_controller.tenant_policy_update( + tenant_id, + { + "scheduling": "Stop", + }, + ) + env.storage_controller.allowed_errors.extend( + [ + ".*Scheduling is disabled by policy Stop.*", + ".*Skipping reconcile for policy Stop.*", + ] + ) + # We use a fixed seed to make the test reproducible: we want a randomly # chosen order, but not to change the order every time we run the test. 
rng = random.Random(seed) @@ -152,7 +188,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): workload.churn_rows(rng.randint(128, 256), pageserver.id) workload.validate(pageserver.id) elif last_state_ps[0].startswith("Attached"): - # The `attachment_service` will only re-attach on startup when a pageserver was the + # The `storage_controller` will only re-attach on startup when a pageserver was the # holder of the latest generation: otherwise the pageserver will revert to detached # state if it was running attached with a stale generation last_state[pageserver.id] = ("Detached", None) @@ -177,12 +213,12 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): generation = last_state_ps[1] else: # Switch generations, while also jumping between attached states - generation = env.attachment_service.attach_hook_issue( + generation = env.storage_controller.attach_hook_issue( tenant_id, pageserver.id ) latest_attached = pageserver.id else: - generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver.id) + generation = env.storage_controller.attach_hook_issue(tenant_id, pageserver.id) latest_attached = pageserver.id else: generation = None @@ -201,6 +237,13 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): ) workload.validate(pageserver.id) + # Having done a bunch of attach/detach cycles, we will have generated some index garbage: check + # that the scrubber sees it and cleans it up. We do this before the final attach+validate pass, + # to also validate that the scrubber isn't breaking anything. + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1) + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] > 0 + # Attach all pageservers for ps in env.pageservers: location_conf = {"mode": "AttachedMulti", "secondary_conf": None, "tenant_conf": {}} @@ -213,10 +256,11 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): # Detach all pageservers for ps in env.pageservers: location_conf = {"mode": "Detached", "secondary_conf": None, "tenant_conf": {}} + assert ps.list_layers(tenant_id, timeline_id) != [] ps.tenant_location_configure(tenant_id, location_conf) - # Confirm that all local disk state was removed on detach - # TODO + # Confirm that all local disk state was removed on detach + assert ps.list_layers(tenant_id, timeline_id) == [] def test_live_migration(neon_env_builder: NeonEnvBuilder): @@ -224,9 +268,8 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): Test the sequence of location states that are used in a live migration. 
""" neon_env_builder.num_pageservers = 2 - neon_env_builder.enable_pageserver_remote_storage( - remote_storage_kind=RemoteStorageKind.MOCK_S3, - ) + remote_storage_kind = RemoteStorageKind.MOCK_S3 + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind=remote_storage_kind) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) tenant_id = env.initial_tenant @@ -269,7 +312,7 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): # Encourage the new location to download while still in secondary mode pageserver_b.http_client().tenant_secondary_download(tenant_id) - migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id) + migrated_generation = env.storage_controller.attach_hook_issue(tenant_id, pageserver_b.id) log.info(f"Acquired generation {migrated_generation} for destination pageserver") assert migrated_generation == initial_generation + 1 @@ -342,6 +385,14 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): workload.churn_rows(64, pageserver_b.id) workload.validate(pageserver_b.id) + del workload + + # Check that deletion works properly on a tenant that was live-migrated + # (reproduce https://github.com/neondatabase/neon/issues/6802) + pageserver_b.http_client().tenant_delete(tenant_id) + + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): @@ -387,31 +438,34 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): validate_heatmap(heatmap_second) -def list_layers(pageserver, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]: +def list_elegible_layers( + pageserver, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId +) -> list[Path]: """ - Inspect local storage on a pageserver to discover which layer files are present. - - :return: list of relative paths to layers, from the timeline root. + The subset of layer filenames that are elegible for secondary download: at time of writing this + is all resident layers which are also visible. """ - timeline_path = pageserver.timeline_dir(tenant_id, timeline_id) + candidates = pageserver.list_layers(tenant_id, timeline_id) - def relative(p: Path) -> Path: - return p.relative_to(timeline_path) + layer_map = pageserver.http_client().layer_map_info(tenant_id, timeline_id) - return sorted( - list( - map( - relative, - filter( - lambda path: path.name != "metadata" - and "ephemeral" not in path.name - and "temp" not in path.name, - timeline_path.glob("*"), - ), - ) - ) + # Map of layer filenames to their visibility the "layer name" is not the same as the filename: add suffix to resolve one to the other + visible_map = dict( + (f"{layer.layer_file_name}-v1-00000001", layer.visible) + for layer in layer_map.historic_layers ) + def is_visible(layer_file_name): + try: + return visible_map[str(layer_file_name)] + except KeyError: + # Unexpected: tests should call this when pageservers are in a quiet state such that the layer map + # matches what's on disk. + log.warn(f"Lookup {layer_file_name} from {list(visible_map.keys())}") + raise + + return list(c for c in candidates if is_visible(c)) + def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): """ @@ -421,12 +475,17 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): - Eviction of layers on the attached location results in deletion on the secondary location as well. 
""" + + # For debug of https://github.com/neondatabase/neon/issues/6966 + neon_env_builder.rust_log_override = "DEBUG" + neon_env_builder.num_pageservers = 2 neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3, ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - assert env.attachment_service is not None + assert env.storage_controller is not None + assert isinstance(env.pageserver_remote_storage, S3Storage) # Satisfy linter tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -456,10 +515,14 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): log.info("Synchronizing after initial write...") ps_attached.http_client().tenant_heatmap_upload(tenant_id) + # Ensure that everything which appears in the heatmap is also present in S3: heatmap writers + # are allowed to upload heatmaps that reference layers which are only enqueued for upload + wait_for_upload_queue_empty(ps_attached.http_client(), tenant_id, timeline_id) + ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id + assert list_elegible_layers(ps_attached, tenant_id, timeline_id) == ps_secondary.list_layers( + tenant_id, timeline_id ) # Make changes on attached pageserver, check secondary downloads them @@ -468,11 +531,26 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): workload.churn_rows(128, ps_attached.id) ps_attached.http_client().tenant_heatmap_upload(tenant_id) + + # Ensure that everything which appears in the heatmap is also present in S3: heatmap writers + # are allowed to upload heatmaps that reference layers which are only enqueued for upload + wait_for_upload_queue_empty(ps_attached.http_client(), tenant_id, timeline_id) + ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id - ) + try: + assert list_elegible_layers( + ps_attached, tenant_id, timeline_id + ) == ps_secondary.list_layers(tenant_id, timeline_id) + except: + # Do a full listing of the secondary location on errors, to help debug of + # https://github.com/neondatabase/neon/issues/6966 + timeline_path = ps_secondary.timeline_dir(tenant_id, timeline_id) + for path, _dirs, files in os.walk(timeline_path): + for f in files: + log.info(f"Secondary file: {os.path.join(path, f)}") + + raise # FIXME: this sleep is needed to avoid on-demand promotion of the layers we evict, while # walreceiver is still doing something. 
@@ -482,23 +560,41 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # Do evictions on attached pageserver, check secondary follows along # ================================================================== - log.info("Evicting a layer...") - layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0] - ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name) + try: + log.info("Evicting a layer...") + layer_to_evict = list_elegible_layers(ps_attached, tenant_id, timeline_id)[0] + some_other_layer = list_elegible_layers(ps_attached, tenant_id, timeline_id)[1] + log.info(f"Victim layer: {layer_to_evict.name}") + ps_attached.http_client().evict_layer( + tenant_id, timeline_id, layer_name=layer_to_evict.name + ) - log.info("Synchronizing after eviction...") - ps_attached.http_client().tenant_heatmap_upload(tenant_id) - ps_secondary.http_client().tenant_secondary_download(tenant_id) + log.info("Synchronizing after eviction...") + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + heatmap_after_eviction = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_layers = set( + layer["name"] for layer in heatmap_after_eviction["timelines"][0]["layers"] + ) + assert layer_to_evict.name not in heatmap_layers + assert parse_layer_file_name(some_other_layer.name).to_str() in heatmap_layers - assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id - ) + ps_secondary.http_client().tenant_secondary_download(tenant_id) + + assert layer_to_evict not in ps_attached.list_layers(tenant_id, timeline_id) + assert list_elegible_layers( + ps_attached, tenant_id, timeline_id + ) == ps_secondary.list_layers(tenant_id, timeline_id) + except: + # On assertion failures, log some details to help with debugging + heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) + log.warn(f"heatmap contents: {json.dumps(heatmap,indent=2)}") + raise # Scrub the remote storage # ======================== # This confirms that the scrubber isn't upset by the presence of the heatmap - S3Scrubber(neon_env_builder.test_output_dir, neon_env_builder).scan_metadata() + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy # Detach secondary and delete tenant # =================================== @@ -514,7 +610,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) log.info("Deleting tenant...") - tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10) + ps_attached.http_client().tenant_delete(tenant_id) assert_prefix_empty( neon_env_builder.pageserver_remote_storage, @@ -525,3 +621,239 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) ), ) + workload.stop() + + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + + +def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): + """ + Slow test that runs in realtime, checks that the background scheduling of secondary + downloads happens as expected. 
+ """ + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + # Create this many tenants, each with two timelines + tenant_count = 4 + tenant_timelines = {} + + # This mirrors a constant in `downloader.rs` + default_download_period_secs = 60 + + # The upload period, which will also be the download once the secondary has seen its first heatmap + upload_period_secs = 30 + + for _i in range(0, tenant_count): + tenant_id = TenantId.generate() + timeline_a = TimelineId.generate() + timeline_b = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_a, + placement_policy='{"Attached":1}', + # Run with a low heatmap period so that we can avoid having to do synthetic API calls + # to trigger the upload promptly. + conf={"heatmap_period": f"{upload_period_secs}s"}, + ) + env.neon_cli.create_timeline("main2", tenant_id, timeline_b) + + tenant_timelines[tenant_id] = [timeline_a, timeline_b] + + def await_log(pageserver, deadline, expression): + """ + Wrapper around assert_log_contains that waits with a deadline rather than timeout + """ + now = time.time() + if now > deadline: + raise RuntimeError(f"Timed out waiting for {expression}") + else: + timeout = int(deadline - now) + 1 + try: + wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression)) # type: ignore + except: + log.error(f"Timed out waiting for '{expression}'") + raise + + t_start = time.time() + + # Wait long enough that the background downloads should happen; we expect all the inital layers + # of all the initial timelines to show up on the secondary location of each tenant. + initial_download_deadline = time.time() + default_download_period_secs * 3 + + for tenant_id, timelines in tenant_timelines.items(): + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + # We only have two: the other one must be secondary + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + now = time.time() + if now > initial_download_deadline: + raise RuntimeError("Timed out waiting for initial secondary download") + else: + for timeline_id in timelines: + log.info( + f"Waiting for downloads of timeline {timeline_id} on secondary pageserver {ps_secondary.id}" + ) + await_log( + ps_secondary, + initial_download_deadline, + f".*{timeline_id}.*Wrote timeline_detail.*", + ) + + for timeline_id in timelines: + log.info( + f"Checking for secondary timeline downloads {timeline_id} on node {ps_secondary.id}" + ) + # One or more layers should be present for all timelines + assert ps_secondary.list_layers(tenant_id, timeline_id) + + # Delete the second timeline: this should be reflected later on the secondary + env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1]) + + # Wait long enough for the secondary locations to see the deletion: 2x period plus a grace factor + deletion_deadline = time.time() + upload_period_secs * 3 + + for tenant_id, timelines in tenant_timelines.items(): + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + # We only have two: the other one must be secondary + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + expect_del_timeline = timelines[1] + log.info( + f"Waiting for deletion of timeline {expect_del_timeline} on secondary pageserver {ps_secondary.id}" + ) + await_log( + ps_secondary, + deletion_deadline, + f".*Timeline no longer in 
heatmap.*{expect_del_timeline}.*", + ) + + # This one was not deleted + assert ps_secondary.list_layers(tenant_id, timelines[0]) + + # This one was deleted + log.info( + f"Checking for secondary timeline deletion {tenant_id}/{timeline_id} on node {ps_secondary.id}" + ) + assert not ps_secondary.list_layers(tenant_id, expect_del_timeline) + + t_end = time.time() + + # Measure how many heatmap downloads we did in total: this checks that we succeeded with + # proper scheduling, and not some bug that just runs downloads in a loop. + total_heatmap_downloads = 0 + for ps in env.pageservers: + v = ps.http_client().get_metric_value("pageserver_secondary_download_heatmap_total") + assert v is not None + total_heatmap_downloads += int(v) + + download_rate = (total_heatmap_downloads / tenant_count) / (t_end - t_start) + + expect_download_rate = 1.0 / upload_period_secs + log.info(f"Download rate: {download_rate * 60}/min vs expected {expect_download_rate * 60}/min") + + assert download_rate < expect_download_rate * 2 + + +@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +@pytest.mark.parametrize("via_controller", [True, False]) +def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool): + """ + Test use of secondary download API for slow downloads, where slow means either a healthy + system with a large capacity shard, or some unhealthy remote storage. + + The download API is meant to respect a client-supplied time limit, and return 200 or 202 + selectively based on whether the download completed. + """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + + env.neon_cli.create_tenant( + tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}' + ) + + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + # Generate a bunch of small layers (we will apply a slowdown failpoint that works on a per-layer basis) + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(128) + ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id) + workload.write_rows(128) + ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id) + workload.write_rows(128) + ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id) + workload.write_rows(128) + ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id) + + # Expect lots of layers + assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10 + + # Simulate large data by making layer downloads artifically slow + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")]) + + # Upload a heatmap, so that secondaries have something to download + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + + if via_controller: + http_client = env.storage_controller.pageserver_api() + http_client.tenant_location_conf( + tenant_id, + { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + "generation": None, + }, + ) + else: + http_client = ps_secondary.http_client() + + # This has no chance to succeed: we have lots of layers and each 
one takes at least 1000ms + (status, progress_1) = http_client.tenant_secondary_download(tenant_id, wait_ms=4000) + assert status == 202 + assert progress_1["heatmap_mtime"] is not None + assert progress_1["layers_downloaded"] > 0 + assert progress_1["bytes_downloaded"] > 0 + assert progress_1["layers_total"] > progress_1["layers_downloaded"] + assert progress_1["bytes_total"] > progress_1["bytes_downloaded"] + + # Multiple polls should work: use a shorter wait period this time + (status, progress_2) = http_client.tenant_secondary_download(tenant_id, wait_ms=1000) + assert status == 202 + assert progress_2["heatmap_mtime"] is not None + assert progress_2["layers_downloaded"] > 0 + assert progress_2["bytes_downloaded"] > 0 + assert progress_2["layers_total"] > progress_2["layers_downloaded"] + assert progress_2["bytes_total"] > progress_2["bytes_downloaded"] + + # Progress should be >= the first poll: this can only go backward if we see a new heatmap, + # and the heatmap period on the attached node is much longer than the runtime of this test, so no + # new heatmap should have been uploaded. + assert progress_2["layers_downloaded"] >= progress_1["layers_downloaded"] + assert progress_2["bytes_downloaded"] >= progress_1["bytes_downloaded"] + assert progress_2["layers_total"] == progress_1["layers_total"] + assert progress_2["bytes_total"] == progress_1["bytes_total"] + + # Make downloads fast again: when the download completes within this last request, we + # get a 200 instead of a 202 + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "off")]) + (status, progress_3) = http_client.tenant_secondary_download(tenant_id, wait_ms=20000) + assert status == 200 + assert progress_3["heatmap_mtime"] is not None + assert progress_3["layers_total"] == progress_3["layers_downloaded"] + assert progress_3["bytes_total"] == progress_3["bytes_downloaded"] diff --git a/test_runner/regress/test_parallel_copy.py b/test_runner/regress/test_parallel_copy.py index 6f74d50b92..b33e387a66 100644 --- a/test_runner/regress/test_parallel_copy.py +++ b/test_runner/regress/test_parallel_copy.py @@ -1,7 +1,6 @@ import asyncio from io import BytesIO -from fixtures.log_helper import log from fixtures.neon_fixtures import Endpoint, NeonEnv @@ -44,7 +43,6 @@ def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5): env = neon_simple_env env.neon_cli.create_branch("test_parallel_copy", "empty") endpoint = env.endpoints.create_start("test_parallel_copy") - log.info("postgres is running on 'test_parallel_copy' branch") # Create test table conn = endpoint.connect() diff --git a/test_runner/regress/test_pg_query_cancellation.py b/test_runner/regress/test_pg_query_cancellation.py new file mode 100644 index 0000000000..bad2e5865e --- /dev/null +++ b/test_runner/regress/test_pg_query_cancellation.py @@ -0,0 +1,282 @@ +from contextlib import closing +from typing import Set + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonPageserver +from fixtures.pageserver.http import PageserverHttpClient +from psycopg2.errors import QueryCanceled + +CRITICAL_PG_PS_WAIT_FAILPOINTS: Set[str] = { + "ps::connection-start::pre-login", + "ps::connection-start::startup-packet", + "ps::connection-start::process-query", + "ps::handle-pagerequest-message::exists", + "ps::handle-pagerequest-message::nblocks", + "ps::handle-pagerequest-message::getpage", + "ps::handle-pagerequest-message::dbsize", + # We don't yet have a good way to 
on-demand guarantee the download of an + # SLRU segment, so that's disabled for now. + # "ps::handle-pagerequest-message::slrusegment", +} + +PG_PS_START_FAILPOINTS = { + "ps::connection-start::pre-login", + "ps::connection-start::startup-packet", + "ps::connection-start::process-query", +} +SMGR_EXISTS = "ps::handle-pagerequest-message::exists" +SMGR_NBLOCKS = "ps::handle-pagerequest-message::nblocks" +SMGR_GETPAGE = "ps::handle-pagerequest-message::getpage" +SMGR_DBSIZE = "ps::handle-pagerequest-message::dbsize" + +""" +Test that we can handle connection delays and cancellations at various +unfortunate connection startup and request states. +""" + + +def test_cancellations(neon_simple_env: NeonEnv): + env = neon_simple_env + ps = env.pageserver + ps_http = ps.http_client() + ps_http.is_testing_enabled_or_skip() + + env.neon_cli.create_branch("test_config", "empty") + + # We don't want to have any racy behaviour with autovacuum IOs + ep = env.endpoints.create_start( + "test_config", + config_lines=[ + "autovacuum = off", + "shared_buffers = 128MB", + ], + ) + + with closing(ep.connect()) as conn: + with conn.cursor() as cur: + cur.execute( + """ + CREATE TABLE test1 AS + SELECT id, sha256(id::text::bytea) payload + FROM generate_series(1, 1024::bigint) p(id); + """ + ) + cur.execute( + """ + CREATE TABLE test2 AS + SELECT id, sha256(id::text::bytea) payload + FROM generate_series(1025, 2048::bigint) p(id); + """ + ) + cur.execute( + """ + VACUUM (ANALYZE, FREEZE) test1, test2; + """ + ) + cur.execute( + """ + CREATE EXTENSION pg_buffercache; + """ + ) + cur.execute( + """ + CREATE EXTENSION pg_prewarm; + """ + ) + + # data preparation is now complete, with 2 disjoint tables that aren't + # preloaded into any caches. + + ep.stop() + + for failpoint in CRITICAL_PG_PS_WAIT_FAILPOINTS: + connect_works_correctly(failpoint, ep, ps, ps_http) + + +ENABLED_FAILPOINTS: Set[str] = set() + + +def connect_works_correctly( + failpoint: str, ep: Endpoint, ps: NeonPageserver, ps_http: PageserverHttpClient +): + log.debug("Starting work on %s", failpoint) + # All queries we use should finish (incl. IO) within 500ms, + # including all their IO. + # This allows us to use `SET statement_timeout` to let the query + # timeout system cancel queries, rather than us having to go + # through the most annoying effort of manual query cancellation + # in psycopg2. 
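+    # effective_io_concurrency=1 presumably keeps smgr prefetching to a minimum,
+    # so each query issues its pageserver requests one at a time.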
+ options = "-cstatement_timeout=500ms -ceffective_io_concurrency=1" + + ep.start() + + def fp_enable(): + global ENABLED_FAILPOINTS + ps_http.configure_failpoints( + [ + (failpoint, "pause"), + ] + ) + ENABLED_FAILPOINTS = ENABLED_FAILPOINTS | {failpoint} + log.info( + 'Enabled failpoint "%s", current_active=%s', failpoint, ENABLED_FAILPOINTS, stacklevel=2 + ) + + def fp_disable(): + global ENABLED_FAILPOINTS + ps_http.configure_failpoints( + [ + (failpoint, "off"), + ] + ) + ENABLED_FAILPOINTS = ENABLED_FAILPOINTS - {failpoint} + log.info( + 'Disabled failpoint "%s", current_active=%s', + failpoint, + ENABLED_FAILPOINTS, + stacklevel=2, + ) + + def check_buffers(cur): + cur.execute( + """ + SELECT n.nspname AS nspname + , c.relname AS relname + , count(*) AS count + FROM pg_buffercache b + JOIN pg_class c + ON b.relfilenode = pg_relation_filenode(c.oid) AND + b.reldatabase = (SELECT oid FROM pg_database WHERE datname = current_database()) + JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE c.oid IN ('test1'::regclass::oid, 'test2'::regclass::oid) + GROUP BY n.nspname, c.relname + ORDER BY 3 DESC + LIMIT 10 + """ + ) + return cur.fetchone() + + def exec_may_cancel(query, cursor, result, cancels): + if cancels: + with pytest.raises(QueryCanceled): + cursor.execute(query) + assert cursor.fetchone() == result + else: + cursor.execute(query) + assert cursor.fetchone() == result + + fp_disable() + + # Warm caches required for new connections, so that they can run without + # requiring catalog reads. + with closing(ep.connect()) as conn: + with conn.cursor() as cur: + cur.execute( + """ + SELECT 1; + """ + ) + assert cur.fetchone() == (1,) + + assert check_buffers(cur) is None + # Ensure all caches required for connection start are correctly + # filled, so that we don't have any "accidents" in this test run + # caused by changes in connection startup plans that require + # requests to the PageServer. + cur.execute( + """ + select array_agg(distinct (pg_prewarm(c.oid::regclass, 'buffer') >= 0)) + from pg_class c + where c.oid < 16384 AND c.relkind IN ('i', 'r'); + """ + ) + assert cur.fetchone() == ([True],) + + # Enable failpoint + fp_enable() + + with closing(ep.connect(options=options, autocommit=True)) as conn: + with conn.cursor() as cur: + cur.execute("SHOW statement_timeout;") + assert cur.fetchone() == ("500ms",) + assert check_buffers(cur) is None + exec_may_cancel( + """ + SELECT min(id) FROM test1; + """, + cur, + (1,), + failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}), + ) + + fp_disable() + + with closing(ep.connect(options=options, autocommit=True)) as conn: + with conn.cursor() as cur: + # Do a select on the data, putting some buffers into the prefetch + # queue. 
+ cur.execute( + """ + SELECT count(id) FROM (select * from test1 LIMIT 256) a; + """ + ) + assert cur.fetchone() == (256,) + + ps.stop() + ps.start() + fp_enable() + + exec_may_cancel( + """ + SELECT COUNT(id) FROM test1; + """, + cur, + (1024,), + failpoint + in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_NBLOCKS, SMGR_DBSIZE}), + ) + + with closing(ep.connect(options=options, autocommit=True)) as conn: + with conn.cursor() as cur: + exec_may_cancel( + """ + SELECT COUNT(id) FROM test2; + """, + cur, + (1024,), + failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}), + ) + + fp_disable() + fp_enable() + + exec_may_cancel( + """ + SELECT 0 < pg_database_size(CURRENT_DATABASE()); + """, + cur, + (True,), + failpoint + in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_GETPAGE, SMGR_NBLOCKS}), + ) + + fp_disable() + + cur.execute( + """ + SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test2; + """ + ) + + assert cur.fetchone() == (1024, 1024, 1025, 2048, 1573376) + + cur.execute( + """ + SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test1; + """ + ) + + assert cur.fetchone() == (1024, 1024, 1, 1024, 524800) + + ep.stop() diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index f26d04e2f3..45ce5b1c5b 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -1,27 +1,157 @@ # # This file runs pg_regress-based tests. # -from pathlib import Path +from __future__ import annotations -from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content +from pathlib import Path +from typing import TYPE_CHECKING, cast + +import pytest +from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, + NeonEnvBuilder, + check_restored_datadir_content, + tenant_get_shards, +) +from fixtures.pg_version import PgVersion +from fixtures.remote_storage import s3_storage + +if TYPE_CHECKING: + from typing import Optional + + from fixtures.neon_fixtures import PgBin + from pytest import CaptureFixture + + +TENANT_CONF = { + # Scaled down thresholds so that we are exercising the pageserver beyond just writing + # ephemeral/L0 layers, and because debug-mode code is slow to read from full sized ephemeral layer files. + "pitr_interval": "60s", + "checkpoint_distance": f"{8 * 1024 * 1024}", + "compaction_target_size": f"{8 * 1024 * 1024}", +} + +# # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create. +# # There should have been compactions mid-test as well, this final check is in addition those. +# for (shard, pageserver) in tenant_get_shards(env, env.initial_tenant): +# pageserver.http_client().timeline_checkpoint(env.initial_tenant, env.initial_timeline, force_repartition=True, force_image_layer_creation=True) + + +def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: Endpoint): + """ + After running some opaque tests that create interesting content in a timeline, run + some generic integrity checks that the storage stack is able to reproduce the written + data properly. + """ + + ignored_files: Optional[list[str]] = None + + # Neon handles unlogged relations in a special manner. During a + # basebackup, we ship the init fork as the main fork. This presents a + # problem in that the endpoint's data directory and the basebackup will + # have differences and will fail the eventual file comparison. + # + # Unlogged tables were introduced in version 9.1. 
ALTER TABLE grew + # support for setting the persistence of a table in 9.5. The reason that + # this doesn't affect versions < 15 (but probably would between 9.1 and + # 9.5) is that all the regression tests that deal with unlogged tables + # up until that point dropped the unlogged tables or set them to logged + # at some point during the test. + # + # In version 15, Postgres grew support for unlogged sequences, and with + # that came a few more regression tests. These tests did not all drop + # the unlogged tables/sequences prior to finishing. + # + # But unlogged sequences came with a bug in that, sequences didn't + # inherit the persistence of their "parent" tables if they had one. This + # was fixed and backported to 15, thus exacerbating our problem a bit. + # + # So what we can do is just ignore file differences between the data + # directory and basebackup for unlogged relations. + results = cast( + "list[tuple[str, str]]", + endpoint.safe_psql( + """ + SELECT + relkind, + pg_relation_filepath( + pg_filenode_relation(reltablespace, relfilenode) + ) AS unlogged_relation_paths + FROM pg_class + WHERE relpersistence = 'u' + """, + dbname=db_name, + ), + ) + + unlogged_relation_files: list[str] = [] + for r in results: + unlogged_relation_files.append(r[1]) + # This is related to the following Postgres commit: + # + # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b + # Author: Heikki Linnakangas + # Date: 2023-08-23 09:21:31 -0500 + # + # Use the buffer cache when initializing an unlogged index. + # + # This patch was backpatched to 16. Without it, the LSN in the + # page header would be 0/0 in the data directory, which wouldn't + # match the LSN generated during the basebackup, thus creating + # a difference. + if env.pg_version <= PgVersion.V15 and r[0] == "i": + unlogged_relation_files.append(f"{r[1]}_init") + + ignored_files = unlogged_relation_files + + check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files) + + # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create. + # There should have been compactions mid-test as well, this final check is in addition those. + for shard, pageserver in tenant_get_shards(env, env.initial_tenant): + pageserver.http_client().timeline_checkpoint( + shard, env.initial_timeline, force_repartition=True, force_image_layer_creation=True + ) # Run the main PostgreSQL regression tests, in src/test/regress. # +@pytest.mark.timeout(900) # Contains many sub-tests, is slow in debug builds +@pytest.mark.parametrize("shard_count", [None, 4]) def test_pg_regress( - neon_simple_env: NeonEnv, + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - pg_bin, - capsys, + pg_bin: PgBin, + capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, + shard_count: Optional[int], ): - env = neon_simple_env + DBNAME = "regression" + + """ + :param shard_count: if None, create an unsharded tenant. Otherwise create a tenant with this + many shards. + """ + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + ) - env.neon_cli.create_branch("test_pg_regress", "empty") # Connect to postgres and create a database called "regression". 
- endpoint = env.endpoints.create_start("test_pg_regress") - endpoint.safe_psql("CREATE DATABASE regression") + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + # Enable the test mode, so that we don't need to patch the test cases. + "neon.regress_test_mode = true", + ], + ) + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. runpath = test_output_dir / "regress" @@ -56,28 +186,42 @@ def test_pg_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - check_restored_datadir_content(test_output_dir, env, endpoint) + post_checks(env, test_output_dir, DBNAME, endpoint) # Run the PostgreSQL "isolation" tests, in src/test/isolation. # +@pytest.mark.timeout(600) # Contains many sub-tests, is slow in debug builds +@pytest.mark.parametrize("shard_count", [None, 4]) def test_isolation( - neon_simple_env: NeonEnv, + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - pg_bin, - capsys, + pg_bin: PgBin, + capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, + shard_count: Optional[int], ): - env = neon_simple_env + DBNAME = "isolation_regression" + + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + ) - env.neon_cli.create_branch("test_isolation", "empty") # Connect to postgres and create a database called "regression". # isolation tests use prepared transactions, so enable them endpoint = env.endpoints.create_start( - "test_isolation", config_lines=["max_prepared_transactions=100"] + "main", + config_lines=[ + "max_prepared_transactions=100", + # Enable the test mode, so that we don't need to patch the test cases. + "neon.regress_test_mode = true", + ], ) - endpoint.safe_psql("CREATE DATABASE isolation_regression") + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_isolation_regress to run in. runpath = test_output_dir / "regress" @@ -111,23 +255,40 @@ def test_isolation( with capsys.disabled(): pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath) + # This fails with a mismatch on `pg_multixact/offsets/0000` + # post_checks(env, test_output_dir, DBNAME, endpoint) + # Run extra Neon-specific pg_regress-based tests. The tests and their # schedule file are in the sql_regress/ directory. +@pytest.mark.parametrize("shard_count", [None, 4]) def test_sql_regress( - neon_simple_env: NeonEnv, + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - pg_bin, - capsys, + pg_bin: PgBin, + capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, + shard_count: Optional[int], ): - env = neon_simple_env + DBNAME = "regression" + + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + ) - env.neon_cli.create_branch("test_sql_regress", "empty") # Connect to postgres and create a database called "regression". - endpoint = env.endpoints.create_start("test_sql_regress") - endpoint.safe_psql("CREATE DATABASE regression") + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + # Enable the test mode, so that we don't need to patch the test cases. 
+ "neon.regress_test_mode = true", + ], + ) + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. runpath = test_output_dir / "regress" @@ -162,4 +323,4 @@ def test_sql_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - check_restored_datadir_content(test_output_dir, env, endpoint) + post_checks(env, test_output_dir, DBNAME, endpoint) diff --git a/test_runner/regress/test_pg_waldump.py b/test_runner/regress/test_pg_waldump.py new file mode 100644 index 0000000000..8e80efd9ba --- /dev/null +++ b/test_runner/regress/test_pg_waldump.py @@ -0,0 +1,61 @@ +import os +import shutil + +from fixtures.neon_fixtures import NeonEnv, PgBin +from fixtures.utils import subprocess_capture + + +def check_wal_segment(pg_waldump_path: str, segment_path: str, test_output_dir): + # use special --ignore option to ignore the validation checks in pg_waldump + # this is necessary, because neon WAL files contain gap at the beginning + output_path, _, _ = subprocess_capture( + test_output_dir, [pg_waldump_path, "--ignore", segment_path] + ) + + with open(f"{output_path}.stdout", "r") as f: + stdout = f.read() + assert "ABORT" in stdout + assert "COMMIT" in stdout + + +# Simple test to check that pg_waldump works with neon WAL files +def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin): + env = neon_simple_env + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_pg_waldump", "empty") + endpoint = env.endpoints.create_start("test_pg_waldump") + + cur = endpoint.connect().cursor() + cur.execute( + """ + BEGIN; + CREATE TABLE t1(i int primary key, n_updated int); + INSERT INTO t1 select g, 0 from generate_series(1, 50) g; + ROLLBACK; + """ + ) + + cur.execute( + """ + BEGIN; + CREATE TABLE t1(i int primary key, n_updated int); + INSERT INTO t1 select g, 0 from generate_series(1, 50) g; + COMMIT; + """ + ) + + # stop the endpoint to make sure that WAL files are flushed and won't change + endpoint.stop() + + assert endpoint.pgdata_dir + wal_path = os.path.join(endpoint.pgdata_dir, "pg_wal/000000010000000000000001") + pg_waldump_path = os.path.join(pg_bin.pg_bin_path, "pg_waldump") + # check segment on compute + check_wal_segment(pg_waldump_path, wal_path, test_output_dir) + + # Check file on safekeepers as well. pg_waldump is strict about file naming, so remove .partial suffix. 
+ sk = env.safekeepers[0] + sk_tli_dir = sk.timeline_dir(tenant_id, timeline_id) + non_partial_path = os.path.join(sk_tli_dir, "000000010000000000000001") + shutil.copyfile(os.path.join(sk_tli_dir, "000000010000000000000001.partial"), non_partial_path) + check_wal_segment(pg_waldump_path, non_partial_path, test_output_dir) diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py index 034f2b669d..043aff686b 100644 --- a/test_runner/regress/test_physical_replication.py +++ b/test_runner/regress/test_physical_replication.py @@ -1,12 +1,15 @@ +from __future__ import annotations + import random import time +from typing import TYPE_CHECKING -from fixtures.neon_fixtures import NeonEnv +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv def test_physical_replication(neon_simple_env: NeonEnv): env = neon_simple_env - n_records = 100000 with env.endpoints.create_start( branch_name="main", endpoint_id="primary", @@ -22,8 +25,20 @@ def test_physical_replication(neon_simple_env: NeonEnv): with p_con.cursor() as p_cur: with secondary.connect() as s_con: with s_con.cursor() as s_cur: - for pk in range(n_records): + runtime_secs = 30 + started_at = time.time() + pk = 0 + while True: + pk += 1 + now = time.time() + if now - started_at > runtime_secs: + break p_cur.execute("insert into t (pk) values (%s)", (pk,)) + # an earlier version of this test was based on a fixed number of loop iterations + # and selected for pk=(random.randrange(1, fixed number of loop iterations)). + # => the probability of selection for a value that was never inserted changed from 99.9999% to 0% over the course of the test. + # + # We changed the test to where=(random.randrange(1, 2*pk)), which means the probability is now fixed to 50%. 
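+                        # Worked example: after inserting pks 1..pk, randrange(1, 2*pk)
+                        # draws from 1..2*pk-1, of which the pk-1 values pk+1..2*pk-1
+                        # were never inserted, a ratio that approaches 50% as pk grows.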
s_cur.execute( - "select * from t where pk=%s", (random.randrange(1, n_records),) + "select * from t where pk=%s", (random.randrange(1, 2 * pk),) ) diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index c2ea5b332a..7e676b5515 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -1,6 +1,6 @@ +from fixtures.common_types import TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import TimelineId from fixtures.utils import print_gc_result, query_scalar @@ -10,13 +10,10 @@ from fixtures.utils import print_gc_result, query_scalar # def test_pitr_gc(neon_env_builder: NeonEnvBuilder): # Set pitr interval such that we need to keep the data - neon_env_builder.pageserver_config_override = ( - "tenant_config={pitr_interval = '1 day', gc_horizon = 0}" + env = neon_env_builder.init_start( + initial_tenant_conf={"pitr_interval": "1 day", "gc_horizon": "0"} ) - - env = neon_env_builder.init_start() endpoint_main = env.endpoints.create_start("main") - log.info("postgres is running on 'main' branch") main_pg_conn = endpoint_main.connect() main_cur = main_pg_conn.cursor() diff --git a/test_runner/regress/test_postgres_version.py b/test_runner/regress/test_postgres_version.py new file mode 100644 index 0000000000..03e8c7c0df --- /dev/null +++ b/test_runner/regress/test_postgres_version.py @@ -0,0 +1,35 @@ +import json +import re +from pathlib import Path + +from fixtures.neon_fixtures import PgBin +from fixtures.pg_version import PgVersion + + +def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion): + """Test that Postgres version matches the one we expect""" + + with (base_dir / "vendor" / "revisions.json").open() as f: + expected_revisions = json.load(f) + + output_prefix = pg_bin.run_capture(["postgres", "--version"], with_command_header=False) + stdout = Path(f"{output_prefix}.stdout") + assert stdout.exists(), "postgres --version didn't print anything to stdout" + + with stdout.open() as f: + output = f.read().strip() + + # `postgres --version` prints something like "postgres (PostgreSQL) 15.6 (85d809c124a898847a97d66a211f7d5ef4f8e0cb)". 
+ pattern = r"postgres \(PostgreSQL\) (?P\d+\.\d+) \((?P[0-9a-f]{40})\)" + match = re.search(pattern, output, re.IGNORECASE) + assert match is not None, f"Can't parse {output} with {pattern}" + + version = match.group("version") + commit = match.group("commit") + + assert ( + pg_version.v_prefixed in expected_revisions + ), f"Version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional" + + msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional" + assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 1d62f09840..d2b8c2ed8b 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -2,6 +2,7 @@ import asyncio import json import subprocess import time +import urllib.parse from typing import Any, List, Optional, Tuple import psycopg2 @@ -275,6 +276,31 @@ def test_sql_over_http(static_proxy: NeonProxy): assert res["rowCount"] is None +def test_sql_over_http_db_name_with_space(static_proxy: NeonProxy): + db = "db with spaces" + static_proxy.safe_psql_many( + ( + f'create database "{db}"', + "create role http with login password 'http' superuser", + ) + ) + + def q(sql: str, params: Optional[List[Any]] = None) -> Any: + params = params or [] + connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/{urllib.parse.quote(db)}" + response = requests.post( + f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + data=json.dumps({"query": sql, "params": params}), + headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr}, + verify=str(static_proxy.test_output_dir / "proxy.crt"), + ) + assert response.status_code == 200, response.text + return response.json() + + rows = q("select 42 as answer")["rows"] + assert rows == [{"answer": 42}] + + def test_sql_over_http_output_options(static_proxy: NeonProxy): static_proxy.safe_psql("create role http2 with login password 'http2' superuser") @@ -390,14 +416,47 @@ def test_sql_over_http_batch(static_proxy: NeonProxy): assert result[0]["rows"] == [{"answer": 42}] +def test_sql_over_http_batch_output_options(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http with login password 'http' superuser") + + connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" + response = requests.post( + f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + data=json.dumps( + { + "queries": [ + {"query": "select $1 as answer", "params": [42], "arrayMode": True}, + {"query": "select $1 as answer", "params": [42], "arrayMode": False}, + ] + } + ), + headers={ + "Content-Type": "application/sql", + "Neon-Connection-String": connstr, + "Neon-Batch-Isolation-Level": "Serializable", + "Neon-Batch-Read-Only": "false", + "Neon-Batch-Deferrable": "false", + }, + verify=str(static_proxy.test_output_dir / "proxy.crt"), + ) + assert response.status_code == 200 + results = response.json()["results"] + + assert results[0]["rowAsArray"] + assert results[0]["rows"] == [["42"]] + + assert not results[1]["rowAsArray"] + assert results[1]["rows"] == [{"answer": "42"}] + + def test_sql_over_http_pool(static_proxy: NeonProxy): static_proxy.safe_psql("create user http_auth with password 'http' superuser") - def get_pid(status: int, pw: str) -> Any: + def get_pid(status: int, pw: str, 
user="http_auth") -> Any: return static_proxy.http_query( GET_CONNECTION_PID_QUERY, [], - user="http_auth", + user=user, password=pw, expected_code=status, ) @@ -418,23 +477,29 @@ def test_sql_over_http_pool(static_proxy: NeonProxy): static_proxy.safe_psql("alter user http_auth with password 'http2'") - # after password change, should open a new connection to verify it - pid2 = get_pid(200, "http2")["rows"][0]["pid"] - assert pid1 != pid2 + # after password change, shouldn't open a new connection because it checks password in proxy. + rows = get_pid(200, "http2")["rows"] + assert rows == [{"pid": pid1}] time.sleep(0.02) - # query should be on an existing connection - pid = get_pid(200, "http2")["rows"][0]["pid"] - assert pid in [pid1, pid2] - - time.sleep(0.02) - - # old password should not work - res = get_pid(400, "http") + # incorrect user shouldn't reveal that the user doesn't exists + res = get_pid(400, "http", user="http_auth2") assert "password authentication failed for user" in res["message"] +def test_sql_over_http_urlencoding(static_proxy: NeonProxy): + static_proxy.safe_psql("create user \"http+auth$$\" with password '%+$^&*@!' superuser") + + static_proxy.http_query( + "select 1", + [], + user="http+auth$$", + password="%+$^&*@!", + expected_code=200, + ) + + # Beginning a transaction should not impact the next query, # which might come from a completely different client. def test_http_pool_begin(static_proxy: NeonProxy): @@ -515,3 +580,49 @@ def test_sql_over_http_pool_custom_types(static_proxy: NeonProxy): "select array['foo'::foo, 'bar'::foo, 'baz'::foo] as data", ) assert response["rows"][0]["data"] == ["foo", "bar", "baz"] + + +@pytest.mark.asyncio +async def test_sql_over_http2(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http with login password 'http' superuser") + + resp = await static_proxy.http2_query( + "select 42 as answer", [], user="http", password="http", expected_code=200 + ) + assert resp["rows"] == [{"answer": 42}] + + +def test_sql_over_http_connection_cancel(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http with login password 'http' superuser") + + static_proxy.safe_psql("create table test_table ( id int primary key )") + + # insert into a table, with a unique constraint, after sleeping for n seconds + query = "WITH temp AS ( \ + SELECT pg_sleep($1) as sleep, $2::int as id \ + ) INSERT INTO test_table (id) SELECT id FROM temp" + + try: + # The request should complete before the proxy HTTP timeout triggers. + # Timeout and cancel the request on the client side before the query completes. 
+ static_proxy.http_query( + query, + [static_proxy.http_timeout_seconds - 1, 1], + user="http", + password="http", + timeout=2, + ) + except requests.exceptions.ReadTimeout: + pass + + # wait until the query _would_ have been complete + time.sleep(static_proxy.http_timeout_seconds) + + res = static_proxy.http_query(query, [1, 1], user="http", password="http", expected_code=200) + assert res["command"] == "INSERT", "HTTP query should insert" + assert res["rowCount"] == 1, "HTTP query should insert" + + res = static_proxy.http_query(query, [0, 1], user="http", password="http", expected_code=400) + assert ( + "duplicate key value violates unique constraint" in res["message"] + ), "HTTP query should conflict" diff --git a/test_runner/regress/test_proxy_allowed_ips.py b/test_runner/regress/test_proxy_allowed_ips.py index f533579811..7a804114ba 100644 --- a/test_runner/regress/test_proxy_allowed_ips.py +++ b/test_runner/regress/test_proxy_allowed_ips.py @@ -24,7 +24,7 @@ async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil with pytest.raises(psycopg2.Error) as exprinfo: static_proxy.safe_psql(**kwargs) text = str(exprinfo.value).strip() - assert "This IP address is not allowed to connect" in text + assert "not allowed to connect" in text # no SNI, deprecated `options=project` syntax (before we had several endpoint in project) check_cannot_connect(query="select 1", sslsni=0, options="project=private-project") diff --git a/test_runner/regress/test_proxy_rate_limiter.py b/test_runner/regress/test_proxy_rate_limiter.py deleted file mode 100644 index f39f0cad07..0000000000 --- a/test_runner/regress/test_proxy_rate_limiter.py +++ /dev/null @@ -1,84 +0,0 @@ -import asyncio -import time -from pathlib import Path -from typing import Iterator - -import pytest -from fixtures.neon_fixtures import ( - PSQL, - NeonProxy, -) -from fixtures.port_distributor import PortDistributor -from pytest_httpserver import HTTPServer -from werkzeug.wrappers.response import Response - - -def waiting_handler(status_code: int) -> Response: - # wait more than timeout to make sure that both (two) connections are open. - # It would be better to use a barrier here, but I don't know how to do that together with pytest-httpserver. 
- time.sleep(2) - return Response(status=status_code) - - -@pytest.fixture(scope="function") -def proxy_with_rate_limit( - port_distributor: PortDistributor, - neon_binpath: Path, - httpserver_listen_address, - test_output_dir: Path, -) -> Iterator[NeonProxy]: - """Neon proxy that routes directly to vanilla postgres.""" - - proxy_port = port_distributor.get_port() - mgmt_port = port_distributor.get_port() - http_port = port_distributor.get_port() - external_http_port = port_distributor.get_port() - (host, port) = httpserver_listen_address - endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" - - with NeonProxy( - neon_binpath=neon_binpath, - test_output_dir=test_output_dir, - proxy_port=proxy_port, - http_port=http_port, - mgmt_port=mgmt_port, - external_http_port=external_http_port, - auth_backend=NeonProxy.Console(endpoint, fixed_rate_limit=5), - ) as proxy: - proxy.start() - yield proxy - - -@pytest.mark.asyncio -async def test_proxy_rate_limit( - httpserver: HTTPServer, - proxy_with_rate_limit: NeonProxy, -): - uri = "/billing/api/v1/usage_events/proxy_get_role_secret" - # mock control plane service - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: Response(status=200) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(429) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(500) - ) - - psql = PSQL(host=proxy_with_rate_limit.host, port=proxy_with_rate_limit.proxy_port) - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - # Limit should be 2. - - # Run two queries in parallel. - f1, f2 = await asyncio.gather(psql.run("select 42;"), psql.run("select 42;")) - await proxy_with_rate_limit.find_auth_link(uri, f1) - await proxy_with_rate_limit.find_auth_link(uri, f2) - - # Now limit should be 0. - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - - # There last query shouldn't reach the http-server. 
- assert httpserver.assertions == [] diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py new file mode 100644 index 0000000000..6211446a40 --- /dev/null +++ b/test_runner/regress/test_proxy_websockets.py @@ -0,0 +1,196 @@ +import ssl + +import pytest +import websockets +from fixtures.neon_fixtures import NeonProxy + + +@pytest.mark.asyncio +async def test_websockets(static_proxy: NeonProxy): + static_proxy.safe_psql("create user ws_auth with password 'ws' superuser") + + user = "ws_auth" + password = "ws" + + version = b"\x00\x03\x00\x00" + params = { + "user": user, + "database": "postgres", + "client_encoding": "UTF8", + } + + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt")) + + async with websockets.connect( + f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + ssl=ssl_context, + ) as websocket: + startup_message = bytearray(version) + for key, value in params.items(): + startup_message.extend(key.encode("ascii")) + startup_message.extend(b"\0") + startup_message.extend(value.encode("ascii")) + startup_message.extend(b"\0") + startup_message.extend(b"\0") + length = (4 + len(startup_message)).to_bytes(4, byteorder="big") + + await websocket.send([length, startup_message]) + + startup_response = await websocket.recv() + assert isinstance(startup_response, bytes) + assert startup_response[0:1] == b"R", "should be authentication message" + assert startup_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert startup_response[5:9] == b"\x00\x00\x00\x03", "should be cleartext" + + auth_message = password.encode("utf-8") + b"\0" + length = (4 + len(auth_message)).to_bytes(4, byteorder="big") + await websocket.send([b"p", length, auth_message]) + + auth_response = await websocket.recv() + assert isinstance(auth_response, bytes) + assert auth_response[0:1] == b"R", "should be authentication message" + assert auth_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert auth_response[5:9] == b"\x00\x00\x00\x00", "should be authenticated" + + query_message = "SELECT 1".encode("utf-8") + b"\0" + length = (4 + len(query_message)).to_bytes(4, byteorder="big") + await websocket.send([b"Q", length, query_message]) + + query_response = await websocket.recv() + assert isinstance(query_response, bytes) + # 'T\x00\x00\x00!\x00\x01?column?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17\x00\x04\xff\xff\xff\xff\x00\x00' + # 'D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011' + # 'C\x00\x00\x00\rSELECT 1\x00' + # 'Z\x00\x00\x00\x05I' + + assert query_response[0:1] == b"T", "should be row description message" + row_description_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + row_description, query_response = ( + query_response[:row_description_len], + query_response[row_description_len:], + ) + assert row_description[5:7] == b"\x00\x01", "should have 1 column" + assert row_description[7:16] == b"?column?\0", "column should be named ?column?" 
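+        # Editor's note on the offsets checked below (not in the original test):
+        # after the field name, each RowDescription field carries table OID
+        # (int32), attnum (int16), type OID (int32), type length (int16), type
+        # modifier (int32) and format code (int16). Bytes 16..22 are thus the
+        # table OID and attnum (zero for a computed column), and bytes 22..26
+        # are the type OID, where 0x17 = 23 is int4.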
+ assert row_description[22:26] == b"\x00\x00\x00\x17", "column should be an int4" + + assert query_response[0:1] == b"D", "should be data row message" + data_row_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + data_row, query_response = query_response[:data_row_len], query_response[data_row_len:] + assert ( + data_row == b"D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011" + ), "should contain 1 column with text value 1" + + assert query_response[0:1] == b"C", "should be command complete message" + command_complete_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + command_complete, query_response = ( + query_response[:command_complete_len], + query_response[command_complete_len:], + ) + assert command_complete == b"C\x00\x00\x00\x0dSELECT 1\0" + + assert query_response[0:6] == b"Z\x00\x00\x00\x05I", "should be ready for query (idle)" + + # close + await websocket.send(b"X\x00\x00\x00\x04") + await websocket.wait_closed() + + +@pytest.mark.asyncio +async def test_websockets_pipelined(static_proxy: NeonProxy): + """ + Test whether we can send the startup + auth + query all in one go + """ + + static_proxy.safe_psql("create user ws_auth with password 'ws' superuser") + + user = "ws_auth" + password = "ws" + + version = b"\x00\x03\x00\x00" + params = { + "user": user, + "database": "postgres", + "client_encoding": "UTF8", + } + + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt")) + + async with websockets.connect( + f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + ssl=ssl_context, + ) as websocket: + startup_message = bytearray(version) + for key, value in params.items(): + startup_message.extend(key.encode("ascii")) + startup_message.extend(b"\0") + startup_message.extend(value.encode("ascii")) + startup_message.extend(b"\0") + startup_message.extend(b"\0") + length0 = (4 + len(startup_message)).to_bytes(4, byteorder="big") + + auth_message = password.encode("utf-8") + b"\0" + length1 = (4 + len(auth_message)).to_bytes(4, byteorder="big") + query_message = "SELECT 1".encode("utf-8") + b"\0" + length2 = (4 + len(query_message)).to_bytes(4, byteorder="big") + await websocket.send( + length0 + + startup_message + + b"p" + + length1 + + auth_message + + b"Q" + + length2 + + query_message + ) + + startup_response = await websocket.recv() + assert isinstance(startup_response, bytes) + assert startup_response[0:1] == b"R", "should be authentication message" + assert startup_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert startup_response[5:9] == b"\x00\x00\x00\x03", "should be cleartext" + + auth_response = await websocket.recv() + assert isinstance(auth_response, bytes) + assert auth_response[0:1] == b"R", "should be authentication message" + assert auth_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert auth_response[5:9] == b"\x00\x00\x00\x00", "should be authenticated" + + query_response = await websocket.recv() + assert isinstance(query_response, bytes) + # 'T\x00\x00\x00!\x00\x01?column?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17\x00\x04\xff\xff\xff\xff\x00\x00' + # 'D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011' + # 'C\x00\x00\x00\rSELECT 1\x00' + # 'Z\x00\x00\x00\x05I' + + assert query_response[0:1] == b"T", "should be row description message" + row_description_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + row_description, query_response = ( + query_response[:row_description_len], 
+ query_response[row_description_len:], + ) + assert row_description[5:7] == b"\x00\x01", "should have 1 column" + assert row_description[7:16] == b"?column?\0", "column should be named ?column?" + assert row_description[22:26] == b"\x00\x00\x00\x17", "column should be an int4" + + assert query_response[0:1] == b"D", "should be data row message" + data_row_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + data_row, query_response = query_response[:data_row_len], query_response[data_row_len:] + assert ( + data_row == b"D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011" + ), "should contain 1 column with text value 1" + + assert query_response[0:1] == b"C", "should be command complete message" + command_complete_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + command_complete, query_response = ( + query_response[:command_complete_len], + query_response[command_complete_len:], + ) + assert command_complete == b"C\x00\x00\x00\x0dSELECT 1\0" + + assert query_response[0:6] == b"Z\x00\x00\x00\x05I", "should be ready for query (idle)" + + # close + await websocket.send(b"X\x00\x00\x00\x04") + await websocket.wait_closed() diff --git a/test_runner/regress/test_read_trace.py b/test_runner/regress/test_read_trace.py deleted file mode 100644 index e6b3ccd7ec..0000000000 --- a/test_runner/regress/test_read_trace.py +++ /dev/null @@ -1,39 +0,0 @@ -from contextlib import closing - -from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.pageserver.utils import wait_for_last_record_lsn -from fixtures.types import Lsn -from fixtures.utils import query_scalar - - -# This test demonstrates how to collect a read trace. It's useful until -# it gets replaced by a test that actually does stuff with the trace. -# -# Additionally, tests that pageserver is able to create tenants with custom configs. 
-def test_read_request_tracing(neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 1 - env = neon_env_builder.init_start( - initial_tenant_conf={ - "trace_read_requests": "true", - } - ) - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - endpoint = env.endpoints.create_start("main") - - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("create table t (i integer);") - cur.execute(f"insert into t values (generate_series(1,{10000}));") - cur.execute("select count(*) from t;") - current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) - # wait until pageserver receives that data - pageserver_http = env.pageserver.http_client() - wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) - - # Stop postgres so we drop the connection and flush the traces - endpoint.stop() - - trace_path = env.pageserver.workdir / "traces" / str(tenant_id) / str(timeline_id) - assert trace_path.exists() diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index d695410efc..1ac881553f 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -17,13 +17,14 @@ def test_read_validation(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_read_validation", "empty") - endpoint = env.endpoints.create_start("test_read_validation") - log.info("postgres is running on 'test_read_validation' branch") + endpoint = env.endpoints.create_start( + "test_read_validation", + ) with closing(endpoint.connect()) as con: with con.cursor() as c: for e in extensions: - c.execute("create extension if not exists {};".format(e)) + c.execute(f"create extension if not exists {e};") c.execute("create table foo (c int) with (autovacuum_enabled = false)") c.execute("insert into foo values (1)") @@ -43,52 +44,48 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Test table is populated, validating buffer cache") cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries > 0, "No buffers cached for the test relation" c.execute( - "select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {}".format( - relfilenode - ) + f"select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {relfilenode}" ) reln = c.fetchone() assert reln is not None log.info("Clear buffer cache to ensure no stale pages are brought into the cache") - c.execute("select clear_buffer_cache()") + endpoint.clear_shared_buffers(cursor=c) cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries == 0, "Failed to clear buffer cache" log.info("Cache is clear, reading stale page version") c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{}'))".format( - first[0] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{first[0]}', NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn" cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from 
pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries == 0, "relation buffers detected after invalidation" log.info("Cache is clear, reading latest page version without cache") c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL, NULL))" ) direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries == 0, "relation buffers detected after invalidation" @@ -97,9 +94,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format( - reln[0], reln[1], reln[2], first[0] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}', NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -109,9 +104,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, NULL ))".format( - reln[0], reln[1], reln[2] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, NULL, NULL))" ) direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" @@ -123,9 +116,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format( - reln[0], reln[1], reln[2], first[0] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}', NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -135,7 +126,7 @@ def test_read_validation(neon_simple_env: NeonEnv): c.execute("select * from page_header(get_raw_page('foo', 'main', 0));") raise AssertionError("query should have failed") except UndefinedTable as e: - log.info("Caught an expected failure: {}".format(e)) + log.info(f"Caught an expected failure: {e}") def test_read_validation_neg(neon_simple_env: NeonEnv): @@ -144,22 +135,23 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*") - endpoint = env.endpoints.create_start("test_read_validation_neg") - log.info("postgres is running on 'test_read_validation_neg' branch") + endpoint = env.endpoints.create_start( + "test_read_validation_neg", + ) with closing(endpoint.connect()) as con: with con.cursor() as c: for e in extensions: - c.execute("create extension if not exists {};".format(e)) + c.execute(f"create extension if not exists {e};") log.info("read a page of a missing relation") try: c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0', NULL))" ) raise AssertionError("query should have failed") except UndefinedTable as e: - log.info("Caught an expected failure: {}".format(e)) + log.info(f"Caught an expected failure: {e}") c.execute("create table foo (c int) with 
(autovacuum_enabled = false)")
             c.execute("insert into foo values (1)")
 
@@ -167,31 +159,31 @@ def test_read_validation_neg(neon_simple_env: NeonEnv):
             log.info("read a page at lsn 0")
             try:
                 c.execute(
-                    "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0'))"
+                    "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0', NULL))"
                 )
                 raise AssertionError("query should have failed")
             except IoError as e:
-                log.info("Caught an expected failure: {}".format(e))
+                log.info(f"Caught an expected failure: {e}")
 
             log.info("Pass NULL as an input")
             expected = (None, None, None)
             c.execute(
-                "select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0'))"
+                "select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0', NULL))"
             )
             assert c.fetchone() == expected, "Expected null output"
 
             c.execute(
-                "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0'))"
+                "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0', NULL))"
             )
             assert c.fetchone() == expected, "Expected null output"
 
             c.execute(
-                "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0'))"
+                "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0', NULL))"
             )
             assert c.fetchone() == expected, "Expected null output"
 
             # This check is currently failing, reading beyond EOF is returning a 0-page
             log.info("Read beyond EOF")
             c.execute(
-                "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL))"
+                "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL, NULL))"
             )
diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py
index 2d641e36a7..368f60127e 100644
--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -1,8 +1,16 @@
+import time
+
 import pytest
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv
+from fixtures.neon_fixtures import (
+    Endpoint,
+    NeonEnv,
+    NeonEnvBuilder,
+    last_flush_lsn_upload,
+    tenant_get_shards,
+)
 from fixtures.pageserver.utils import wait_for_last_record_lsn
-from fixtures.types import Lsn
 from fixtures.utils import query_scalar
 
@@ -16,9 +24,13 @@ def test_readonly_node(neon_simple_env: NeonEnv):
     env = neon_simple_env
     env.neon_cli.create_branch("test_readonly_node", "empty")
     endpoint_main = env.endpoints.create_start("test_readonly_node")
-    log.info("postgres is running on 'test_readonly_node' branch")
 
-    env.pageserver.allowed_errors.append(".*basebackup .* failed: invalid basebackup lsn.*")
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*basebackup .* failed: invalid basebackup lsn.*",
+            ".*page_service.*handle_make_lsn_lease.*.*tried to request a page version that was garbage collected",
+        ]
+    )
 
     main_pg_conn = endpoint_main.connect()
     main_cur = main_pg_conn.cursor()
@@ -106,6 +118,103 @@ def test_readonly_node(neon_simple_env: NeonEnv):
     )
 
 
+def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that a static endpoint is protected from GC by acquiring and renewing LSN leases.
+    """
+
+    neon_env_builder.num_pageservers = 2
+    # GC is manually triggered.
+ env = neon_env_builder.init_start( + initial_tenant_conf={ + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", + # Short lease length to fit test. + "lsn_lease_length": "3s", + }, + initial_tenant_shard_count=2, + ) + + ROW_COUNT = 500 + + def generate_updates_on_main( + env: NeonEnv, + ep_main: Endpoint, + data: int, + start=1, + end=ROW_COUNT, + ) -> Lsn: + """ + Generates some load on main branch that results in some uploads. + """ + with ep_main.cursor() as cur: + cur.execute( + f"INSERT INTO t0 (v0, v1) SELECT g, '{data}' FROM generate_series({start}, {end}) g ON CONFLICT (v0) DO UPDATE SET v1 = EXCLUDED.v1" + ) + cur.execute("VACUUM t0") + last_flush_lsn = last_flush_lsn_upload( + env, ep_main, env.initial_tenant, env.initial_timeline + ) + return last_flush_lsn + + # Insert some records on main branch + with env.endpoints.create_start("main") as ep_main: + with ep_main.cursor() as cur: + cur.execute("CREATE TABLE t0(v0 int primary key, v1 text)") + lsn = None + for i in range(2): + lsn = generate_updates_on_main(env, ep_main, i) + + with env.endpoints.create_start( + branch_name="main", + endpoint_id="static", + lsn=lsn, + ) as ep_static: + with ep_static.cursor() as cur: + cur.execute("SELECT count(*) FROM t0") + assert cur.fetchone() == (ROW_COUNT,) + + time.sleep(3) + + generate_updates_on_main(env, ep_main, i, end=100) + + # Trigger GC + for shard, ps in tenant_get_shards(env, env.initial_tenant): + client = ps.http_client() + gc_result = client.timeline_gc(shard, env.initial_timeline, 0) + log.info(f"{gc_result=}") + + assert ( + gc_result["layers_removed"] == 0 + ), "No layers should be removed, old layers are guarded by leases." + + with ep_static.cursor() as cur: + cur.execute("SELECT count(*) FROM t0") + assert cur.fetchone() == (ROW_COUNT,) + + # Do some update so we can increment latest_gc_cutoff + generate_updates_on_main(env, ep_main, i, end=100) + + # Now trigger GC again, layers should be removed. + time.sleep(4) + for shard, ps in tenant_get_shards(env, env.initial_tenant): + client = ps.http_client() + gc_result = client.timeline_gc(shard, env.initial_timeline, 0) + log.info(f"{gc_result=}") + + assert gc_result["layers_removed"] > 0, "Old layers should be removed after leases expired." 
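+
+
+def trigger_gc_all_shards(env: NeonEnv, gc_horizon: int = 0):
+    """
+    Editor's sketch, not part of the original change: test_readonly_node_gc
+    above triggers GC on every shard twice; a helper like this, using only
+    names the test already imports, could factor that step out.
+    """
+    for shard, ps in tenant_get_shards(env, env.initial_tenant):
+        # Trigger a manual GC pass on this shard and log the result
+        gc_result = ps.http_client().timeline_gc(shard, env.initial_timeline, gc_horizon)
+        log.info(f"{gc_result=}")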
+
+
 # Similar test, but with more data, and we force checkpoints
 def test_timetravel(neon_simple_env: NeonEnv):
     env = neon_simple_env
diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py
index 9d7a4a8fd6..e21f9bb6f6 100644
--- a/test_runner/regress/test_recovery.py
+++ b/test_runner/regress/test_recovery.py
@@ -10,16 +10,24 @@ from fixtures.neon_fixtures import NeonEnvBuilder
 #
 def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
     # Override default checkpointer settings to run it more often
-    neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}"
-
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "checkpoint_distance": "1048576",
+        }
+    )
 
     env.pageserver.is_testing_enabled_or_skip()
 
+    # We expect the pageserver to exit, which will cause storage controller
+    # requests to fail and warn.
+    env.storage_controller.allowed_errors.append(".*management API still failed.*")
+    env.storage_controller.allowed_errors.append(
+        ".*Reconcile error.*error sending request for url.*"
+    )
+
     # Create a branch for us
     env.neon_cli.create_branch("test_pageserver_recovery", "main")
 
     endpoint = env.endpoints.create_start("test_pageserver_recovery")
-    log.info("postgres is running on 'test_pageserver_recovery' branch")
 
     with closing(endpoint.connect()) as conn:
         with conn.cursor() as cur:
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 98b2e856ec..2e5260ca78 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -1,6 +1,3 @@
-# It's possible to run any regular test with the local fs remote storage via
-# env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......
-
 import os
 import queue
 import shutil
@@ -9,6 +6,7 @@ import time
 from typing import Dict, List, Optional, Tuple
 
 import pytest
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -27,8 +25,14 @@ from fixtures.remote_storage import (
     RemoteStorageKind,
     available_remote_storages,
 )
-from fixtures.types import Lsn, TenantId, TimelineId
-from fixtures.utils import print_gc_result, query_scalar, wait_until
+from fixtures.utils import (
+    assert_eq,
+    assert_ge,
+    assert_gt,
+    print_gc_result,
+    query_scalar,
+    wait_until,
+)
 from requests import ReadTimeout
 
@@ -73,9 +77,6 @@ def test_remote_storage_backup_and_restore(
 
     env.pageserver.allowed_errors.extend(
         [
-            # FIXME: Is this expected?
- ".*marking .* as locally complete, while it doesnt exist in remote index.*", - ".*No timelines to attach received.*", ".*Failed to get local tenant state.*", # FIXME retry downloads without throwing errors ".*failed to load remote timeline.*", @@ -123,10 +124,10 @@ def test_remote_storage_backup_and_restore( log.info(f"upload of checkpoint {checkpoint_number} is done") # Check that we had to retry the uploads - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadLayer.*, will retry.*" ) - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) @@ -162,13 +163,14 @@ def test_remote_storage_backup_and_restore( "data": {"reason": "storage-sync-list-remote-timelines"}, } + # Even though the tenant is broken, subsequent calls to location_conf API will succeed, but + # the tenant will always end up in a broken state as a result of the failpoint. # Ensure that even though the tenant is broken, retrying the attachment fails - with pytest.raises(Exception, match="Tenant state is Broken"): - # Use same generation as in previous attempt - gen_state = env.attachment_service.inspect(tenant_id) - assert gen_state is not None - generation = gen_state[0] - env.pageserver.tenant_attach(tenant_id, generation=generation) + tenant_info = wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) + gen_state = env.storage_controller.inspect(tenant_id) + assert gen_state is not None + generation = gen_state[0] + env.pageserver.tenant_attach(tenant_id, generation=generation) # Restart again, this implicitly clears the failpoint. # test_remote_failures=1 remains active, though, as it's in the pageserver config. @@ -231,9 +233,9 @@ def test_remote_storage_upload_queue_retries( tenant_id, timeline_id = env.neon_cli.create_tenant( conf={ # small checkpointing and compaction targets to ensure we generate many upload operations - "checkpoint_distance": f"{128 * 1024}", + "checkpoint_distance": f"{64 * 1024}", "compaction_threshold": "1", - "compaction_target_size": f"{128 * 1024}", + "compaction_target_size": f"{64 * 1024}", # no PITR horizon, we specify the horizon when we request on-demand GC "pitr_interval": "0s", # disable background compaction and GC. We invoke it manually when we want it to happen. 
@@ -241,6 +243,7 @@ def test_remote_storage_upload_queue_retries( "compaction_period": "0s", # create image layers eagerly, so that GC can remove some layers "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", } ) @@ -259,33 +262,30 @@ def test_remote_storage_upload_queue_retries( ] ) + FOO_ROWS_COUNT = 4000 + def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data): # create initial set of layers & upload them with failpoints configured - endpoint.safe_psql_many( - [ - f""" - INSERT INTO foo (id, val) - SELECT g, '{data}' - FROM generate_series(1, 20000) g - ON CONFLICT (id) DO UPDATE - SET val = EXCLUDED.val - """, - # to ensure that GC can actually remove some layers - "VACUUM foo", - ] - ) + for _v in range(2): + endpoint.safe_psql_many( + [ + f""" + INSERT INTO foo (id, val) + SELECT g, '{data}' + FROM generate_series(1, {FOO_ROWS_COUNT}) g + ON CONFLICT (id) DO UPDATE + SET val = EXCLUDED.val + """, + # to ensure that GC can actually remove some layers + "VACUUM foo", + ] + ) wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) def get_queued_count(file_kind, op_kind): - val = client.get_remote_timeline_client_metric( - "pageserver_remote_timeline_client_calls_unfinished", - tenant_id, - timeline_id, - file_kind, - op_kind, + return client.get_remote_timeline_client_queue_count( + tenant_id, timeline_id, file_kind, op_kind ) - assert val is not None, "expecting metric to be present" - return int(val) # create some layers & wait for uploads to finish overwrite_data_and_wait_for_it_to_arrive_at_pageserver("a") @@ -298,9 +298,9 @@ def test_remote_storage_upload_queue_retries( print_gc_result(gc_result) assert gc_result["layers_removed"] > 0 - wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0) - wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0) - wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0) + wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) + wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # let all future operations queue up configure_storage_sync_failpoints("return") @@ -312,6 +312,7 @@ def test_remote_storage_upload_queue_retries( def churn_while_failpoints_active(result): overwrite_data_and_wait_for_it_to_arrive_at_pageserver("c") + # this call will wait for the failpoints to be turned off client.timeline_checkpoint(tenant_id, timeline_id) client.timeline_compact(tenant_id, timeline_id) overwrite_data_and_wait_for_it_to_arrive_at_pageserver("d") @@ -328,21 +329,22 @@ def test_remote_storage_upload_queue_retries( churn_while_failpoints_active_thread.start() # wait for churn thread's data to get stuck in the upload queue - wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="upload") > 0) - wait_until(10, 0.1, lambda: get_queued_count(file_kind="index", op_kind="upload") >= 2) - wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="delete") > 0) + # Exponential back-off in upload queue, so, gracious timeouts. 
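+    # Editor's note: assert_eq/assert_ge/assert_gt come from fixtures.utils
+    # (imported above). wait_until() keeps retrying its callable while it
+    # raises, so the check must raise on mismatch for the retry loop to
+    # actually wait. A minimal sketch of such a helper, assuming that is all
+    # it does:
+    #
+    #     def assert_eq(a, b) -> None:
+    #         assert a == b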
+
+    wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0))
+    wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1))
+    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
 
     # unblock churn operations
     configure_storage_sync_failpoints("off")
 
-    # ... and wait for them to finish. Exponential back-off in upload queue, so, gracious timeouts.
-    wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
-    wait_until(30, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
-    wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
+    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0))
+    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0))
+    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
 
     # The churn thread doesn't make progress once it blocks on the first wait_completion() call,
     # so, give it some time to wrap up.
-    churn_while_failpoints_active_thread.join(30)
+    churn_while_failpoints_active_thread.join(60)
     assert not churn_while_failpoints_active_thread.is_alive()
     assert churn_thread_result[0]
@@ -353,13 +355,6 @@
     env.pageserver.stop(immediate=True)
     env.endpoints.stop_all()
 
-    # We are about to forcibly drop local dirs. Attachment service will increment generation in re-attach before
-    # we later increment when actually attaching it again, leading to skipping a generation and potentially getting
-    # these warnings if there was a durable but un-executed deletion list at time of restart.
-    env.pageserver.allowed_errors.extend(
-        [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
-    )
-
     dir_to_clear = env.pageserver.tenant_dir()
     shutil.rmtree(dir_to_clear)
     os.mkdir(dir_to_clear)
@@ -374,7 +369,7 @@
     log.info("restarting postgres to validate")
     endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
     with endpoint.cursor() as cur:
-        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 20000
+        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == FOO_ROWS_COUNT
 
 
 def test_remote_timeline_client_calls_started_metric(
@@ -388,6 +383,7 @@
         initial_tenant_conf={
             # small checkpointing and compaction targets to ensure we generate many upload operations
             "checkpoint_distance": f"{128 * 1024}",
+            # ensure each timeline_checkpoint() call creates L1s
             "compaction_threshold": "1",
             "compaction_target_size": f"{128 * 1024}",
             # no PITR horizon, we specify the horizon when we request on-demand GC
@@ -395,8 +391,6 @@
             # disable background compaction and GC. We invoke it manually when we want it to happen.
"gc_period": "0s", "compaction_period": "0s", - # create image layers eagerly, so that GC can remove some layers - "image_creation_threshold": "1", } ) @@ -437,7 +431,7 @@ def test_remote_timeline_client_calls_started_metric( assert timeline_id is not None for (file_kind, op_kind), observations in calls_started.items(): val = client.get_metric_value( - name="pageserver_remote_timeline_client_calls_started_count", + name="pageserver_remote_timeline_client_calls_started_total", filter={ "file_kind": str(file_kind), "op_kind": str(op_kind), @@ -455,12 +449,17 @@ def test_remote_timeline_client_calls_started_metric( ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}" def churn(data_pass1, data_pass2): + # overwrite the same data in place, vacuum inbetween, and + # and create image layers; then run a gc(). + # this should + # - create new layers + # - delete some layers overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1) - client.timeline_checkpoint(tenant_id, timeline_id) - client.timeline_compact(tenant_id, timeline_id) overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2) - client.timeline_checkpoint(tenant_id, timeline_id) - client.timeline_compact(tenant_id, timeline_id) + client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True) + overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1) + overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2) + client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True) gc_result = client.timeline_gc(tenant_id, timeline_id, 0) print_gc_result(gc_result) assert gc_result["layers_removed"] > 0 @@ -540,16 +539,6 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( client = env.pageserver.http_client() - def get_queued_count(file_kind, op_kind): - val = client.get_remote_timeline_client_metric( - "pageserver_remote_timeline_client_calls_unfinished", - tenant_id, - timeline_id, - file_kind, - op_kind, - ) - return int(val) if val is not None else val - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) client.configure_failpoints(("before-upload-layer", "return")) @@ -583,9 +572,12 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( def assert_compacted_and_uploads_queued(): assert timeline_path.exists() assert len(list(timeline_path.glob("*"))) >= 8 - assert get_queued_count(file_kind="index", op_kind="upload") > 0 + assert ( + get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload") + > 0 + ) - wait_until(20, 0.1, assert_compacted_and_uploads_queued) + wait_until(200, 0.1, assert_compacted_and_uploads_queued) # Regardless, give checkpoint some time to block for good. # Not strictly necessary, but might help uncover failure modes in the future. @@ -621,10 +613,13 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( assert len(filtered) == 0 # timeline deletion should kill ongoing uploads, so, the metric will be gone - assert get_queued_count(file_kind="index", op_kind="upload") is None + assert ( + get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload") + is None + ) # timeline deletion should be unblocking checkpoint ops - checkpoint_thread.join(2.0) + checkpoint_thread.join(20.0) assert not checkpoint_thread.is_alive() # Just to be sure, unblock ongoing uploads. 
If the previous assert was incorrect, or the prometheus metric broken, @@ -707,10 +702,8 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv # index upload is now hitting the failpoint, it should block the shutdown env.pageserver.stop(immediate=True) - local_metadata = ( - env.pageserver.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata" - ) - assert local_metadata.is_file() + timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, new_branch_timeline_id) + assert timeline_dir.is_dir() assert isinstance(env.pageserver_remote_storage, LocalFsStorage) @@ -776,11 +769,11 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv create_thread.join() -def test_compaction_waits_for_upload( +def test_paused_upload_stalls_checkpoint( neon_env_builder: NeonEnvBuilder, ): """ - This test forces a race between upload and compaction. + This test checks that checkpoints block on uploads to remote storage. """ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) @@ -795,6 +788,10 @@ def test_compaction_waits_for_upload( } ) + env.pageserver.allowed_errors.append( + f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" + ) + tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -815,75 +812,9 @@ def test_compaction_waits_for_upload( endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)") wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - client.timeline_checkpoint(tenant_id, timeline_id) - deltas_at_first = len(client.layer_map_info(tenant_id, timeline_id).delta_layers()) - assert ( - deltas_at_first == 2 - ), "are you fixing #5863? just add one more checkpoint after 'CREATE TABLE bar ...' statement." - - endpoint.safe_psql("CREATE TABLE bar AS SELECT x FROM generate_series(1, 10000) g(x)") - endpoint.safe_psql("UPDATE foo SET x = 0 WHERE x = 1") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - - layers_before_last_checkpoint = client.layer_map_info(tenant_id, timeline_id).historic_by_name() - upload_stuck_layers = layers_before_last_checkpoint - layers_at_creation.historic_by_name() - - assert len(upload_stuck_layers) > 0 - - for name in upload_stuck_layers: - path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name - assert path.exists(), "while uploads are stuck the layers should be present on disk" - - # now this will do the L0 => L1 compaction and want to remove - # upload_stuck_layers and the original initdb L0 - client.timeline_checkpoint(tenant_id, timeline_id) - - # as uploads are paused, the the upload_stuck_layers should still be with us - for name in upload_stuck_layers: - path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name - assert path.exists(), "uploads are stuck still over compaction" - - compacted_layers = client.layer_map_info(tenant_id, timeline_id).historic_by_name() - overlap = compacted_layers.intersection(upload_stuck_layers) - assert len(overlap) == 0, "none of the L0's should remain after L0 => L1 compaction" - assert ( - len(compacted_layers) == 1 - ), "there should be one L1 after L0 => L1 compaction (without #5863 being fixed)" - - def layer_deletes_completed(): - m = client.get_metric_value("pageserver_layer_completed_deletes_total") - if m is None: - return 0 - return int(m) - - # if initdb created an initial delta layer, it might already be gc'd - # because it was uploaded before the failpoint was enabled. 
however, the - # deletion is not guaranteed to be complete. - assert layer_deletes_completed() <= 1 - - client.configure_failpoints(("before-upload-layer-pausable", "off")) - - # Ensure that this actually terminates - wait_upload_queue_empty(client, tenant_id, timeline_id) - - def until_layer_deletes_completed(): - deletes = layer_deletes_completed() - log.info(f"layer_deletes: {deletes}") - # ensure that initdb delta layer AND the previously stuck are now deleted - assert deletes >= len(upload_stuck_layers) + 1 - - wait_until(10, 1, until_layer_deletes_completed) - - for name in upload_stuck_layers: - path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name - assert ( - not path.exists() - ), "l0 should now be removed because of L0 => L1 compaction and completed uploads" - - # We should not have hit the error handling path in uploads where a uploaded file is gone - assert not env.pageserver.log_contains( - "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more." - ) + with pytest.raises(ReadTimeout): + client.timeline_checkpoint(tenant_id, timeline_id, timeout=5) + client.configure_failpoints(("before-upload-layer-pausable", "off")) def wait_upload_queue_empty( @@ -892,26 +823,23 @@ def wait_upload_queue_empty( wait_until( 2, 1, - lambda: get_queued_count( - client, tenant_id, timeline_id, file_kind="layer", op_kind="upload" - ) - == 0, + lambda: assert_eq( + get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"), 0 + ), ) wait_until( 2, 1, - lambda: get_queued_count( - client, tenant_id, timeline_id, file_kind="index", op_kind="upload" - ) - == 0, + lambda: assert_eq( + get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload"), 0 + ), ) wait_until( 2, 1, - lambda: get_queued_count( - client, tenant_id, timeline_id, file_kind="layer", op_kind="delete" - ) - == 0, + lambda: assert_eq( + get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"), 0 + ), ) @@ -922,16 +850,8 @@ def get_queued_count( file_kind: str, op_kind: str, ): - val = client.get_remote_timeline_client_metric( - "pageserver_remote_timeline_client_calls_unfinished", - tenant_id, - timeline_id, - file_kind, - op_kind, - ) - if val is None: - return val - return int(val) + """The most important aspect of this function is shorter name & no return type so asserts are more concise.""" + return client.get_remote_timeline_client_queue_count(tenant_id, timeline_id, file_kind, op_kind) def assert_nothing_to_upload( diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py new file mode 100644 index 0000000000..0d95109d6b --- /dev/null +++ b/test_runner/regress/test_replica_start.py @@ -0,0 +1,690 @@ +""" +In PostgreSQL, a standby always has to wait for a running-xacts WAL record to +arrive before it can start accepting queries. Furthermore, if there are +transactions with too many subxids (> 64) open to fit in the in-memory subxids +cache, the running-xacts record will be marked as "suboverflowed", and the +standby will need to also wait for the currently in-progress transactions to +finish. + +In Neon, we have an additional mechanism that scans the CLOG at server startup +to determine the list of running transactions, so that the standby can start up +immediately without waiting for the running-xacts record, but that mechanism +only works if the # of active (sub-)transactions is reasonably small. Otherwise +it falls back to waiting. 
Furthermore, it's somewhat optimistic in using up the +known-assigned XIDs array: if too many transactions with subxids are started in +the primary later, the replay in the replica will crash with "too many +KnownAssignedXids" error. + +This module contains tests for those various cases at standby startup: starting +from shutdown checkpoint, using the CLOG scanning mechanism, waiting for +running-xacts record and for in-progress transactions to finish etc. +""" + +import threading +from contextlib import closing + +import psycopg2 +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, wait_for_last_flush_lsn, wait_replica_caughtup +from fixtures.pg_version import PgVersion +from fixtures.utils import query_scalar, wait_until + +CREATE_SUBXACTS_FUNC = """ +create or replace function create_subxacts(n integer) returns void as $$ +declare + i integer; +begin + for i in 1..n loop + begin + insert into t (payload) values (0); + exception + when others then + raise exception 'caught something: %', sqlerrm; + end; + end loop; +end; $$ language plpgsql +""" + + +def test_replica_start_scan_clog(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup. There is one + transaction active in the primary when the standby is started. The primary + is killed before it has a chance to write a running-xacts record. The + CLOG-scanning at neon startup allows the standby to start up anyway. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + primary_cur.execute("select pg_switch_wal()") + + # Start a transaction in the primary. Leave the transaction open. + # + # The transaction has some subtransactions, but not too many to cause the + # CLOG-scanning mechanism to give up. + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(50)") + + # Wait for the WAL to be flushed, but then immediately kill the primary, + # before it has a chance to generate a running-xacts record. + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + primary.stop(mode="immediate") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (0,) + + +def test_replica_start_scan_clog_crashed_xids(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup, after + leaving behind crashed transactions. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. 
+ env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + primary_cur.execute("select pg_switch_wal()") + + # Consume a lot of XIDs, then kill Postgres without giving it a + # chance to write abort records for them. + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(100000)") + primary.stop(mode="immediate") + + # Restart the primary. Do some light work, and shut it down cleanly + primary.start() + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("insert into t (payload) values (0)") + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. (Restarting the primary writes a checkpoint and/or running-xacts + # record, which allows the standby to know that the crashed XIDs are aborted) + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (1,) + + +def test_replica_start_at_running_xacts(neon_simple_env: NeonEnv, pg_version): + """ + Test that starting a replica works right after the primary has + created a running-xacts record. This may seem like a trivial case, + but during development, we had a bug that was triggered by having + oldestActiveXid == nextXid. Starting right after a running-xacts + record is one way to test that case. + + See the module docstring for background. + """ + env = neon_simple_env + + if env.pg_version == PgVersion.V14 or env.pg_version == PgVersion.V15: + pytest.skip("pg_log_standby_snapshot() function is available only in PG16") + + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("select pg_log_standby_snapshot()") + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select 123") + assert secondary_cur.fetchone() == (123,) + + +def test_replica_start_wait_subxids_finish(neon_simple_env: NeonEnv): + """ + Test replica startup when there are a lot of (sub)transactions active in the + primary. That's too many for the CLOG-scanning mechanism to handle, so the + replica has to wait for the large transaction to finish before it starts to + accept queries. + + After replica startup, test MVCC with transactions that were in-progress + when the replica was started. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create + # lots of subtransactions. 
+    env = neon_simple_env
+    primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
+    primary_conn = primary.connect()
+    primary_cur = primary_conn.cursor()
+    primary_cur.execute("create table t(pk serial primary key, payload integer)")
+    primary_cur.execute(CREATE_SUBXACTS_FUNC)
+
+    # Start a transaction with 100000 subtransactions, and leave it open. That's
+    # too many to fit in the "known-assigned XIDs array" in the replica, and
+    # also too many to fit in the subxid caches, so the running-xacts record will
+    # also overflow.
+    primary_cur.execute("begin")
+    primary_cur.execute("select create_subxacts(100000)")
+
+    # Start another, smaller transaction in the primary. We'll come back to this
+    # later.
+    primary_conn2 = primary.connect()
+    primary_cur2 = primary_conn2.cursor()
+    primary_cur2.execute("begin")
+    primary_cur2.execute("insert into t (payload) values (0)")
+
+    # Create a replica, but before that, wait for the WAL to be flushed to
+    # safekeepers, so that the replica is started at a point where the large
+    # transaction is already active. (The whole transaction might not be flushed
+    # yet, but that's OK.)
+    #
+    # Start it in a separate thread, so that we can do other stuff while it's
+    # blocked waiting for the startup to finish.
+    wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline)
+    secondary = env.endpoints.new_replica(
+        origin=primary,
+        endpoint_id="secondary",
+        config_lines=["neon.running_xacts_overflow_policy='wait'"],
+    )
+    start_secondary_thread = threading.Thread(target=secondary.start)
+    start_secondary_thread.start()
+
+    # Verify that the replica has otherwise started up, but cannot start
+    # accepting queries yet.
+    log.info("Waiting 5 s to verify that the secondary does not start")
+    start_secondary_thread.join(5)
+    assert secondary.log_contains("consistent recovery state reached")
+    assert secondary.log_contains("started streaming WAL from primary")
+    # The "redo starts" message is printed when the first WAL record is
+    # received. It might or might not be present in the log depending on how
+    # far exactly the WAL was flushed when the replica was started, and whether
+    # background activity caused any more WAL records to be flushed on the
+    # primary afterwards.
+    #
+    # assert secondary.log_contains("redo starts")
+
+    # should not be open for connections yet
+    assert start_secondary_thread.is_alive()
+    assert not secondary.is_running()
+    assert not secondary.log_contains("database system is ready to accept read-only connections")
+
+    # Commit the large transaction in the primary.
+    #
+    # Within the next 15 s, the primary should write a new running-xacts record
+    # to the WAL which shows the transaction as completed. Once the replica
+    # replays that record, it will start accepting queries.
+    primary_cur.execute("commit")
+    start_secondary_thread.join()
+
+    # Verify that the large transaction is correctly visible in the secondary
+    # (but not the second, small transaction, which is still in-progress!)
+ secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + # Perform some more MVCC testing using the second transaction that was + # started in the primary before the replica was created + primary_cur2.execute("select create_subxacts(10000)") + + # The second transaction still hasn't committed + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("BEGIN ISOLATION LEVEL REPEATABLE READ") + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + # Commit the second transaction in the primary + primary_cur2.execute("commit") + + # Should still be invisible to the old snapshot + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + # Commit the REPEATABLE READ transaction in the replica. Both + # primary transactions should now be visible to a new snapshot. + secondary_cur.execute("commit") + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (110001,) + + +def test_replica_too_many_known_assigned_xids(neon_simple_env: NeonEnv): + """ + The CLOG-scanning mechanism fills the known-assigned XIDs array + optimistically at standby startup, betting that it can still fit + upcoming transactions replayed later from the WAL in the + array. This test tests what happens when that bet fails and the + known-assigned XID array fills up after the standby has already + been started. The WAL redo will fail with an error: + + FATAL: too many KnownAssignedXids + CONTEXT: WAL redo at 0/1895CB0 for neon/INSERT: off: 25, flags: 0x08; blkref #0: rel 1663/5/16385, blk 64 + + which causes the standby to shut down. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Determine how many connections we can use + primary_cur.execute("show max_connections") + max_connections = int(primary_cur.fetchall()[0][0]) + primary_cur.execute("show superuser_reserved_connections") + superuser_reserved_connections = int(primary_cur.fetchall()[0][0]) + n_connections = max_connections - superuser_reserved_connections + n_subxids = 200 + + # Start one top transaction in primary, with lots of subtransactions. This + # uses up much of the known-assigned XIDs space in the standby, but doesn't + # cause it to overflow. 
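+    # (Editor's note: in PostgreSQL the known-assigned XIDs array is sized to
+    # roughly (PGPROC_MAX_CACHED_SUBXIDS + 1) * max_connections, i.e. about
+    # 65 * max_connections entries (TOTAL_MAX_CACHED_SUBXIDS in procarray.c),
+    # so max_connections * 30 subxids fills about half of it.)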
+    large_p_conn = primary.connect()
+    large_p_cur = large_p_conn.cursor()
+    large_p_cur.execute("begin")
+    large_p_cur.execute(f"select create_subxacts({max_connections} * 30)")
+
+    with closing(primary.connect()) as small_p_conn:
+        with small_p_conn.cursor() as small_p_cur:
+            small_p_cur.execute("select create_subxacts(1)")
+
+    # Create a replica at this LSN
+    primary_cur.execute("select neon_xlogflush()")
+    wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline)
+    secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")
+    secondary_conn = secondary.connect()
+    secondary_cur = secondary_conn.cursor()
+
+    # The transaction in primary has not committed yet.
+    wait_replica_caughtup(primary, secondary)
+    secondary_cur.execute("select count(*) from t")
+    assert secondary_cur.fetchone() == (1,)
+
+    # Start max number of top transactions in primary, with a lot of
+    # subtransactions each. We add the subtransactions to each top transaction
+    # in a round-robin fashion, instead of adding a lot of subtransactions to
+    # one top transaction at a time. This way, we will have the max number of
+    # subtransactions in the in-memory subxid cache of each top transaction,
+    # until they all overflow.
+    #
+    # Currently, PGPROC_MAX_CACHED_SUBXIDS == 64, so this will overflow all
+    # the subxid caches after creating 64 subxids in each top transaction. The
+    # point just before the caches have overflowed is the most interesting point
+    # in time, but we'll keep going beyond that, to ensure that this test is
+    # robust even if PGPROC_MAX_CACHED_SUBXIDS changes.
+    p_curs = []
+    for _ in range(0, n_connections):
+        p_cur = primary.connect().cursor()
+        p_cur.execute("begin")
+        p_curs.append(p_cur)
+
+    for _subxid in range(0, n_subxids):
+        for i in range(0, n_connections):
+            p_curs[i].execute("select create_subxacts(1)")
+
+    # Commit all the transactions in the primary
+    for i in range(0, n_connections):
+        p_curs[i].execute("commit")
+    large_p_cur.execute("commit")
+
+    # Wait until the replica crashes with "too many KnownAssignedXids" error.
+    def check_replica_crashed():
+        try:
+            secondary.connect()
+        except psycopg2.Error:
+            # Once the connection fails, return success
+            return None
+        raise RuntimeError("connection succeeded")
+
+    wait_until(20, 0.5, check_replica_crashed)
+    assert secondary.log_contains("too many KnownAssignedXids")
+
+    # Replica is crashed, so ignore stop result
+    secondary.check_stop_result = False
+
+
+def test_replica_start_repro_visibility_bug(neon_simple_env: NeonEnv):
+    """
+    Before PR #7288, a hot standby in neon incorrectly started up
+    immediately, before it had received a running-xacts record. That
+    led to visibility bugs if there were active transactions in the
+    primary. This test reproduces the incorrect query results and
+    incorrectly set hint bits, before that was fixed.
+    """
+    env = neon_simple_env
+
+    primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
+    p_cur = primary.connect().cursor()
+
+    p_cur.execute("begin")
+    p_cur.execute("create table t(pk integer primary key, payload integer)")
+    p_cur.execute("insert into t values (generate_series(1,100000), 0)")
+
+    secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")
+    wait_replica_caughtup(primary, secondary)
+    s_cur = secondary.connect().cursor()
+
+    # Set hint bits for pg_class tuples.
+    # not marked as in-progress in the MVCC snapshot, the XMIN_INVALID
+    # hint bit will be set for table 't''s tuple, making it invisible
+    # even after the commit record is replayed later.
+    s_cur.execute("select * from pg_class")
+
+    p_cur.execute("commit")
+    wait_replica_caughtup(primary, secondary)
+    s_cur.execute("select * from t where pk = 1")
+    assert s_cur.fetchone() == (1, 0)
+
+
+@pytest.mark.parametrize("shutdown", [True, False])
+def test_replica_start_with_prepared_xacts(neon_simple_env: NeonEnv, shutdown: bool):
+    """
+    Test the CLOG-scanning mechanism at hot standby startup in the presence of
+    prepared transactions.
+
+    This test is run in two variants: one where the primary server is shut down
+    before starting the secondary, and one where it keeps running.
+    """
+
+    # Initialize the primary, a test table, and a helper function to create lots
+    # of subtransactions.
+    env = neon_simple_env
+    primary = env.endpoints.create_start(
+        branch_name="main", endpoint_id="primary", config_lines=["max_prepared_transactions=5"]
+    )
+    primary_conn = primary.connect()
+    primary_cur = primary_conn.cursor()
+    primary_cur.execute("CREATE EXTENSION neon_test_utils")
+    primary_cur.execute("create table t(pk serial primary key, payload integer)")
+    primary_cur.execute("create table t1(pk integer primary key)")
+    primary_cur.execute("create table t2(pk integer primary key)")
+    primary_cur.execute(CREATE_SUBXACTS_FUNC)
+
+    # Prepare a transaction for two-phase commit
+    primary_cur.execute("begin")
+    primary_cur.execute("insert into t1 values (1)")
+    primary_cur.execute("prepare transaction 't1'")
+
+    # Prepare another transaction for two-phase commit, with a subtransaction
+    primary_cur.execute("begin")
+    primary_cur.execute("insert into t2 values (2)")
+    primary_cur.execute("savepoint sp")
+    primary_cur.execute("insert into t2 values (3)")
+    primary_cur.execute("prepare transaction 't2'")
+
+    # Start a transaction in the primary. Leave the transaction open.
+    #
+    # The transaction has some subtransactions, but not so many that the
+    # CLOG-scanning mechanism gives up.
+    primary_cur.execute("begin")
+    primary_cur.execute("select create_subxacts(50)")
+
+    # Wait for the WAL to be flushed
+    primary_cur.execute("select neon_xlogflush()")
+    wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline)
+
+    if shutdown:
+        primary.stop(mode="fast")
+
+    # Create a replica. It should start up normally, thanks to the CLOG-scanning
+    # mechanism.
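+    # (In brief, and as a rough sketch of the mechanism rather than a spec:
+    # instead of waiting for a running-xacts record from the primary, startup
+    # scans the CLOG for the XIDs between the oldest active XID and nextXid,
+    # and treats every XID not recorded as committed or aborted, including
+    # the prepared ones, as still in progress.)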
+ secondary = env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary", config_lines=["max_prepared_transactions=5"] + ) + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (0,) + secondary_cur.execute("select count(*) from t1") + assert secondary_cur.fetchone() == (0,) + secondary_cur.execute("select count(*) from t2") + assert secondary_cur.fetchone() == (0,) + + if shutdown: + primary.start() + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + else: + primary_cur.execute("commit") + primary_cur.execute("commit prepared 't1'") + primary_cur.execute("commit prepared 't2'") + + wait_replica_caughtup(primary, secondary) + + secondary_cur.execute("select count(*) from t") + if shutdown: + assert secondary_cur.fetchone() == (0,) + else: + assert secondary_cur.fetchone() == (50,) + secondary_cur.execute("select * from t1") + assert secondary_cur.fetchall() == [(1,)] + secondary_cur.execute("select * from t2") + assert secondary_cur.fetchall() == [(2,), (3,)] + + +def test_replica_start_with_prepared_xacts_with_subxacts(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + prepared transactions, with subtransactions. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=["max_prepared_transactions=5"] + ) + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + + # Install extension containing function needed for test + primary_cur.execute("CREATE EXTENSION neon_test_utils") + + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Advance nextXid close to the beginning of the next pg_subtrans segment (2^16 XIDs) + # + # This is interesting, because it tests that pg_subtrans is initialized correctly + # at standby startup. (We had a bug where it didn't at one point during development.) + while True: + xid = int(query_scalar(primary_cur, "SELECT txid_current()")) + log.info(f"xid now {xid}") + # Consume 500 transactions at a time until we get close + if xid < 65535 - 600: + primary_cur.execute("select test_consume_xids(500);") + else: + break + primary_cur.execute("checkpoint") + + # Prepare a transaction for two-phase commit + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(1000)") + primary_cur.execute("prepare transaction 't1'") + + # Wait for the WAL to be flushed, and stop the primary + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. 
+ secondary = env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary", config_lines=["max_prepared_transactions=5"] + ) + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (0,) + + primary.start() + + # Open a lot of subtransactions in the primary, causing the subxids cache to overflow + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("select create_subxacts(100000)") + + wait_replica_caughtup(primary, secondary) + + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + primary_cur.execute("commit prepared 't1'") + + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (101000,) + + +def test_replica_start_with_prepared_xacts_with_many_subxacts(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + prepared transactions, with lots of subtransactions. + + Like test_replica_start_with_prepared_xacts_with_subxacts, but with more + subxacts, to test that the prepared transaction's subxids don't consume + space in the known-assigned XIDs array. (They are set in pg_subtrans + instead) + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=["max_prepared_transactions=5"] + ) + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + + # Install extension containing function needed for test + primary_cur.execute("CREATE EXTENSION neon_test_utils") + + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Prepare a transaction for two-phase commit, with lots of subxids + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(50000)") + + # to make things a bit more varied, intersperse a few other XIDs in between + # the prepared transaction's sub-XIDs + with primary.connect().cursor() as primary_cur2: + primary_cur2.execute("insert into t (payload) values (123)") + primary_cur2.execute("begin; insert into t (payload) values (-1); rollback") + + primary_cur.execute("select create_subxacts(50000)") + primary_cur.execute("prepare transaction 't1'") + + # Wait for the WAL to be flushed + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. 
+    secondary = env.endpoints.new_replica_start(
+        origin=primary, endpoint_id="secondary", config_lines=["max_prepared_transactions=5"]
+    )
+
+    # The transaction did not commit, so it should not be visible in the secondary
+    secondary_conn = secondary.connect()
+    secondary_cur = secondary_conn.cursor()
+    secondary_cur.execute("select count(*) from t")
+    assert secondary_cur.fetchone() == (1,)
+
+    primary.start()
+
+    # Open a lot of subtransactions in the primary, causing the subxids cache to overflow
+    primary_conn = primary.connect()
+    primary_cur = primary_conn.cursor()
+    primary_cur.execute("select create_subxacts(100000)")
+
+    wait_replica_caughtup(primary, secondary)
+
+    secondary_cur.execute("select count(*) from t")
+    assert secondary_cur.fetchone() == (100001,)
+
+    primary_cur.execute("commit prepared 't1'")
+
+    wait_replica_caughtup(primary, secondary)
+    secondary_cur.execute("select count(*) from t")
+    assert secondary_cur.fetchone() == (200001,)
+
+
+def test_replica_start_with_too_many_unused_xids(neon_simple_env: NeonEnv):
+    """
+    Test the CLOG-scanning mechanism at hot standby startup in the presence of
+    a large number of unused XIDs, caused by XID alignment and frequent primary
+    restarts.
+    """
+    n_restarts = 50
+
+    # Initialize the primary and a test table
+    env = neon_simple_env
+    primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
+    with primary.cursor() as primary_cur:
+        primary_cur.execute("create table t(pk serial primary key, payload integer)")
+
+    for _ in range(n_restarts):
+        with primary.cursor() as primary_cur:
+            primary_cur.execute("insert into t (payload) values (0)")
+        # Restart the primary
+        primary.stop("immediate")
+        primary.start()
+
+    # Wait for the WAL to be flushed
+    wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline)
+
+    # Stop the primary to check that we can start the replica without it
+    primary.stop(mode="immediate")
+
+    # Create a replica. It should start up normally, because the 'ignore'
+    # overflow policy lets startup proceed despite the unused XIDs.
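+    # (The assumption behind the knob, as exercised here: with
+    # neon.running_xacts_overflow_policy='ignore', an overflow of the
+    # known-assigned XIDs array at startup is tolerated instead of being
+    # treated as fatal. That is acceptable in this scenario because the
+    # unused XIDs all belong to primaries that were shut down and can
+    # never commit.)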
+    secondary = env.endpoints.new_replica_start(
+        origin=primary,
+        endpoint_id="secondary",
+        config_lines=["neon.running_xacts_overflow_policy='ignore'"],
+    )
+
+    # Check that the replica sees all changes
+    with secondary.cursor() as secondary_cur:
+        secondary_cur.execute("select count(*) from t")
+        assert secondary_cur.fetchone() == (n_restarts,)
diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py
new file mode 100644
index 0000000000..c1a80a54bc
--- /dev/null
+++ b/test_runner/regress/test_s3_restore.py
@@ -0,0 +1,128 @@
+import time
+from datetime import datetime, timezone
+
+from fixtures.common_types import Lsn
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    PgBin,
+)
+from fixtures.pageserver.utils import (
+    assert_prefix_empty,
+    enable_remote_storage_versioning,
+    many_small_layers_tenant_config,
+    wait_for_upload,
+)
+from fixtures.remote_storage import RemoteStorageKind, s3_storage
+from fixtures.utils import run_pg_bench_small
+
+
+def test_tenant_s3_restore(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+):
+    remote_storage_kind = s3_storage()
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+
+    # Mock S3 doesn't have versioning enabled by default, so enable it
+    # (and do it before there are any writes to the bucket)
+    if remote_storage_kind == RemoteStorageKind.MOCK_S3:
+        remote_storage = neon_env_builder.pageserver_remote_storage
+        assert remote_storage, "remote storage not configured"
+        enable_remote_storage_versioning(remote_storage)
+
+    # Use a modified tenant config for initdb and change it back afterwards:
+    # recovery doesn't work if the two index_part.json uploads happen in the
+    # same second or too close to each other.
+    initial_tenant_conf = many_small_layers_tenant_config()
+    del initial_tenant_conf["checkpoint_distance"]
+
+    env = neon_env_builder.init_start(initial_tenant_conf)
+    env.pageserver.allowed_errors.extend(
+        [
+            # The deletion queue will complain when it encounters simulated S3 errors
+            ".*deletion executor: DeleteObjects request failed.*",
+            # A lucky race with stopping can prevent a frozen layer flush from scheduling its uploads
+            ".*layer flush task.+: could not flush frozen layer: update_metadata_file",
+        ]
+    )
+
+    ps_http = env.pageserver.http_client()
+    tenant_id = env.initial_tenant
+
+    # Now let's create the small layers
+    ps_http.set_tenant_config(tenant_id, many_small_layers_tenant_config())
+
+    # Only the initial tenant we created is attached
+    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
+
+    # Create two timelines, one being the parent of the other, both with non-trivial data
+    parent = "main"
+    last_flush_lsns = []
+
+    for timeline in ["first", "second"]:
+        timeline_id = env.neon_cli.create_branch(
+            timeline, tenant_id=tenant_id, ancestor_branch_name=parent
+        )
+        with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
+            run_pg_bench_small(pg_bin, endpoint.connstr())
+            endpoint.safe_psql(f"CREATE TABLE created_{timeline}(id integer);")
+            last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+        last_flush_lsns.append(last_flush_lsn)
+        ps_http.timeline_checkpoint(tenant_id, timeline_id)
+        wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
+        log.info(f"{timeline} timeline {timeline_id} {last_flush_lsn=}")
+        parent = timeline
+
+    # These sleeps are important because they fend off differences in clocks between us and S3
+    time.sleep(4)
+    ts_before_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None)
+    time.sleep(4)
+
+    assert (
+        ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
+    ), "tenant removed before deletion was issued"
+    ps_http.tenant_delete(tenant_id)
+    ps_http.deletion_queue_flush(execute=True)
+    assert (
+        ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0
+    ), "tenant not removed after deletion was issued"
+    env.storage_controller.attach_hook_drop(tenant_id)
+
+    tenant_path = env.pageserver.tenant_dir(tenant_id)
+    assert not tenant_path.exists()
+
+    assert_prefix_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )
+
+    time.sleep(4)
+    ts_after_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None)
+    time.sleep(4)
+
+    ps_http.tenant_time_travel_remote_storage(
+        tenant_id, timestamp=ts_before_deletion, done_if_after=ts_after_deletion
+    )
+
+    generation = env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id)
+
+    ps_http.tenant_attach(tenant_id, generation=generation)
+    env.pageserver.quiesce_tenants()
+
+    for tline in ps_http.timeline_list(env.initial_tenant):
+        log.info(f"timeline detail: {tline}")
+
+    for i, timeline in enumerate(["first", "second"]):
+        with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
+            endpoint.safe_psql(f"SELECT * FROM created_{timeline};")
+            last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+            expected_last_flush_lsn = last_flush_lsns[i]
+            # There might be some activity that advances the LSN, so we can't use a strict equality check
+            assert last_flush_lsn >= expected_last_flush_lsn, "last_flush_lsn too old"
+
+    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
new file mode 100644
index 0000000000..bfd82242e9
--- /dev/null
+++ b/test_runner/regress/test_sharding.py
@@ -0,0 +1,1511 @@
+import os
+import time
+from collections import defaultdict
+from typing import Dict, List, Optional, Union
+
+import pytest
+import requests
+from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
+from fixtures.compute_reconfigure import ComputeReconfigure
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    StorageControllerApiException,
+    last_flush_lsn_upload,
+    tenant_get_shards,
+    wait_for_last_flush_lsn,
+)
+from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty
+from fixtures.remote_storage import s3_storage
+from fixtures.utils import wait_until
+from fixtures.workload import Workload
+from pytest_httpserver import HTTPServer
+from werkzeug.wrappers.request import Request
+from werkzeug.wrappers.response import Response
+
+
+def test_sharding_smoke(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Test the basic lifecycle of a sharded tenant:
+    - ingested data gets split up
+    - page service reads
+    - timeline creation and deletion
+    - splits
+    """
+
+    shard_count = 4
+    neon_env_builder.num_pageservers = shard_count
+
+    # 1MiB stripes: enable getting some meaningful data distribution without
+    # writing large quantities of data in this test. The stripe size is given
+    # in number of 8KiB pages.
+    stripe_size = 128
+
+    # Use S3-compatible remote storage so that we can scrub: this test validates
+    # that the scrubber doesn't barf when it sees a sharded tenant.
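+    # (s3_storage() resolves to real S3 when the test environment provides
+    # credentials and falls back to mock S3 otherwise, as the restore test
+    # above illustrates, so the scrubber check below is exercised in both
+    # configurations.)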
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size
+    )
+    tenant_id = env.initial_tenant
+
+    pageservers = dict((int(p.id), p) for p in env.pageservers)
+    shards = env.storage_controller.locate(tenant_id)
+
+    def get_sizes():
+        sizes = {}
+        for shard in shards:
+            node_id = int(shard["node_id"])
+            pageserver = pageservers[node_id]
+            sizes[node_id] = pageserver.http_client().tenant_status(shard["shard_id"])[
+                "current_physical_size"
+            ]
+        log.info(f"sizes = {sizes}")
+        return sizes
+
+    # The initdb data imported at timeline creation should not land in full on
+    # every shard. We use a 1MiB stripe size, so expect pretty good
+    # distribution: no one shard should have more than half the data.
+    sizes = get_sizes()
+    physical_initdb_total = sum(sizes.values())
+    expect_initdb_size = 20 * 1024 * 1024
+    assert physical_initdb_total > expect_initdb_size
+    assert all(s < expect_initdb_size // 2 for s in sizes.values())
+
+    # Test that timeline creation works on a sharded tenant
+    timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id)
+
+    # Test that we can write data to a sharded tenant
+    workload = Workload(env, tenant_id, timeline_b, branch_name="branch_b")
+    workload.init()
+
+    sizes_before = get_sizes()
+    workload.write_rows(256)
+
+    # Test that we can read data back from a sharded tenant
+    workload.validate()
+
+    # Validate that the data is spread across pageservers
+    sizes_after = get_sizes()
+    # Our sizes increased when we wrote data
+    assert sum(sizes_after.values()) > sum(sizes_before.values())
+    # That increase is present on all shards
+    assert all(sizes_after[ps.id] > sizes_before[ps.id] for ps in env.pageservers)
+
+    # Validate that the timeline list API works properly on all shards
+    for shard in shards:
+        node_id = int(shard["node_id"])
+        pageserver = pageservers[node_id]
+        timelines = set(
+            TimelineId(tl["timeline_id"])
+            for tl in pageserver.http_client().timeline_list(shard["shard_id"])
+        )
+        assert timelines == {env.initial_timeline, timeline_b}
+
+    env.storage_controller.consistency_check()
+
+    # Validate that deleting a sharded tenant removes all files in the prefix
+
+    # Before deleting, stop the client and check we have some objects to delete
+    workload.stop()
+    assert_prefix_not_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )
+
+    # Check the scrubber isn't confused by sharded content, then disable
+    # it during teardown because we'll have deleted everything by then
+    healthy, _ = env.storage_scrubber.scan_metadata()
+    assert healthy
+
+    env.storage_controller.pageserver_api().tenant_delete(tenant_id)
+    assert_prefix_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )
+
+    env.storage_controller.consistency_check()
+
+
+def test_sharding_split_unsharded(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Test that shard splitting works on a tenant created as unsharded (i.e. with
+    ShardCount(0)).
+ """ + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Check that we created with an unsharded TenantShardId: this is the default, + # but check it in case we change the default in future + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 0)) is not None + + workload = Workload(env, tenant_id, timeline_id, branch_name="main") + workload.init() + workload.write_rows(256) + workload.validate() + + # Split one shard into two + env.storage_controller.tenant_shard_split(tenant_id, shard_count=2) + + # Check we got the shard IDs we expected + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None + assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None + + workload.validate() + + env.storage_controller.consistency_check() + + +@pytest.mark.parametrize( + "failpoint", + [ + None, + "compact-shard-ancestors-localonly", + "compact-shard-ancestors-enqueued", + "compact-shard-ancestors-persistent", + ], +) +def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: Optional[str]): + """ + Test that after a split, we clean up parent layer data in the child shards via compaction. + """ + + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": 128 * 1024, + "compaction_threshold": 1, + "compaction_target_size": 128 * 1024, + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "3600s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # Disable automatic creation of image layers, as we will create them explicitly when we want them + "image_creation_threshold": 9999, + "image_layer_creation_check_threshold": 0, + } + + neon_env_builder.storage_controller_config = { + # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. 
+ "max_offline": "30s", + "max_warming_up": "300s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Check that we created with an unsharded TenantShardId: this is the default, + # but check it in case we change the default in future + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 0)) is not None + + workload = Workload(env, tenant_id, timeline_id, branch_name="main") + workload.init() + workload.write_rows(256) + workload.validate() + workload.stop() + + # Do a full image layer generation before splitting, so that when we compact after splitting + # we should only see sizes decrease (from post-split drops/rewrites), not increase (from image layer generation) + env.get_tenant_pageserver(tenant_id).http_client().timeline_checkpoint( + tenant_id, timeline_id, force_image_layer_creation=True, wait_until_uploaded=True + ) + + # Split one shard into two + shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2) + + # Let all shards move into their stable locations, so that during subsequent steps we + # don't have reconciles in progress (simpler to reason about what messages we expect in logs) + env.storage_controller.reconcile_until_idle() + + # Check we got the shard IDs we expected + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None + assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None + + workload.validate() + workload.stop() + + env.storage_controller.consistency_check() + + # Cleanup part 1: while layers are still in PITR window, we should only drop layers that are fully redundant + for shard in shards: + ps = env.get_tenant_pageserver(shard) + + # Invoke compaction: this should drop any layers that don't overlap with the shard's key stripes + detail_before = ps.http_client().timeline_detail(shard, timeline_id) + ps.http_client().timeline_compact(shard, timeline_id) + detail_after = ps.http_client().timeline_detail(shard, timeline_id) + + # Physical size should shrink because some layers have been dropped + assert detail_after["current_physical_size"] < detail_before["current_physical_size"] + + # Compaction shouldn't make anything unreadable + workload.validate() + + # Force a generation increase: layer rewrites are a long-term thing and only happen after + # the generation has increased. + env.pageserver.stop() + env.pageserver.start() + + # Cleanup part 2: once layers are outside the PITR window, they will be rewritten if they are partially redundant + updated_conf = TENANT_CONF.copy() + updated_conf["pitr_interval"] = "0s" + env.storage_controller.pageserver_api().set_tenant_config(tenant_id, updated_conf) + env.storage_controller.reconcile_until_idle() + + for shard in shards: + ps = env.get_tenant_pageserver(shard) + + # Apply failpoints for the layer-rewriting phase: this is the area of code that has sensitive behavior + # across restarts, as we will have local layer files that temporarily disagree with the remote metadata + # for the same local layer file name. + if failpoint is not None: + ps.http_client().configure_failpoints((failpoint, "exit")) + + # Do a GC to update gc_info (compaction uses this to decide whether a layer is to be rewritten) + # Set gc_horizon=0 to let PITR horizon control GC cutoff exclusively. 
+        ps.http_client().timeline_gc(shard, timeline_id, gc_horizon=0)
+
+        # We will compare stats before + after compaction
+        detail_before = ps.http_client().timeline_detail(shard, timeline_id)
+
+        # Invoke compaction: this should rewrite layers that are behind the pitr horizon
+        try:
+            ps.http_client().timeline_compact(shard, timeline_id)
+        except requests.ConnectionError as e:
+            if failpoint is None:
+                raise e
+            else:
+                log.info(f"Compaction failed (failpoint={failpoint}): {e}")
+
+            if failpoint in (
+                "compact-shard-ancestors-localonly",
+                "compact-shard-ancestors-enqueued",
+            ):
+                # If we left local files that don't match remote metadata, we expect warnings on next startup
+                env.pageserver.allowed_errors.append(
+                    ".*removing local file .+ because it has unexpected length.*"
+                )
+
+            # Post-failpoint: we check that the pageserver comes back online happily.
+            env.pageserver.running = False
+            env.pageserver.start()
+        else:
+            assert failpoint is None  # We shouldn't reach the success path if a failpoint was set
+
+            detail_after = ps.http_client().timeline_detail(shard, timeline_id)
+
+            # Physical size should shrink because layers are smaller
+            assert detail_after["current_physical_size"] < detail_before["current_physical_size"]
+
+    # Validate size statistics
+    for shard in shards:
+        ps = env.get_tenant_pageserver(shard)
+        timeline_info = ps.http_client().timeline_detail(shard, timeline_id)
+        reported_size = timeline_info["current_physical_size"]
+        layer_paths = ps.list_layers(shard, timeline_id)
+        measured_size = 0
+        for p in layer_paths:
+            abs_path = ps.timeline_dir(shard, timeline_id) / p
+            measured_size += os.stat(abs_path).st_size
+
+        log.info(
+            f"shard {shard} reported size {reported_size}, measured size {measured_size} ({len(layer_paths)} layers)"
+        )
+
+        if failpoint in (
+            "compact-shard-ancestors-localonly",
+            "compact-shard-ancestors-enqueued",
+        ):
+            # If we injected a failure between local rewrite and remote upload, then after
+            # restart we may end up with neither version of the file on local disk (the new file
+            # is cleaned up because it doesn't match remote metadata). So local size isn't
+            # necessarily going to match remote physical size.
+            continue
+
+        assert measured_size == reported_size
+
+    # Compaction shouldn't make anything unreadable
+    workload.validate()
+
+
+def test_sharding_split_smoke(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Test the basics of shard splitting:
+    - The API results in more shards than we started with
+    - The tenant's data remains readable
+    """
+
+    # We will start with 4 shards and split into 8, then migrate all those
+    # 8 shards onto separate pageservers
+    shard_count = 4
+    split_shard_count = 8
+    neon_env_builder.num_pageservers = split_shard_count * 2
+
+    # 1MiB stripes: enable getting some meaningful data distribution without
+    # writing large quantities of data in this test. The stripe size is given
+    # in number of 8KiB pages.
+    stripe_size = 128
+
+    # Use S3-compatible remote storage so that we can scrub: this test validates
+    # that the scrubber doesn't barf when it sees a sharded tenant.
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+
+    non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024}
+
+    env = neon_env_builder.init_configs(True)
+    neon_env_builder.start()
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+    env.neon_cli.create_tenant(
+        tenant_id,
+        timeline_id,
+        shard_count=shard_count,
+        shard_stripe_size=stripe_size,
+        placement_policy='{"Attached": 1}',
+        conf=non_default_tenant_config,
+    )
+
+    workload = Workload(env, tenant_id, timeline_id, branch_name="main")
+    workload.init()
+
+    # Initial data
+    workload.write_rows(256)
+
+    # Note which pageservers initially hold a shard after tenant creation
+    pre_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)]
+    log.info(f"Pre-split pageservers: {pre_split_pageserver_ids}")
+
+    # For pageservers holding a shard, validate that their ingest statistics
+    # reflect a proper splitting of the WAL.
+    for pageserver in env.pageservers:
+        if pageserver.id not in pre_split_pageserver_ids:
+            continue
+
+        metrics = pageserver.http_client().get_metrics_values(
+            [
+                "pageserver_wal_ingest_records_received_total",
+                "pageserver_wal_ingest_records_committed_total",
+                "pageserver_wal_ingest_records_filtered_total",
+            ]
+        )
+
+        log.info(f"Pageserver {pageserver.id} metrics: {metrics}")
+
+        # Not everything received was committed
+        assert (
+            metrics["pageserver_wal_ingest_records_received_total"]
+            > metrics["pageserver_wal_ingest_records_committed_total"]
+        )
+
+        # Something was committed
+        assert metrics["pageserver_wal_ingest_records_committed_total"] > 0
+
+        # Counts are self-consistent
+        assert (
+            metrics["pageserver_wal_ingest_records_received_total"]
+            == metrics["pageserver_wal_ingest_records_committed_total"]
+            + metrics["pageserver_wal_ingest_records_filtered_total"]
+        )
+
+    # TODO: validate that shards have different sizes
+
+    workload.validate()
+
+    assert len(pre_split_pageserver_ids) == 4
+
+    def shards_on_disk(shard_ids):
+        for pageserver in env.pageservers:
+            for shard_id in shard_ids:
+                if pageserver.tenant_dir(shard_id).exists():
+                    return True
+
+        return False
+
+    old_shard_ids = [TenantShardId(tenant_id, i, shard_count) for i in range(0, shard_count)]
+    # Before the split, the old shards exist
+    assert shards_on_disk(old_shard_ids)
+
+    # Before the split, we have done one reconcile for each shard
+    assert (
+        env.storage_controller.get_metric_value(
+            "storage_controller_reconcile_complete_total", filter={"status": "ok"}
+        )
+        == shard_count
+    )
+
+    # Make secondary downloads slow: this exercises the storage controller logic for not migrating an attachment
+    # during post-split optimization until the secondary is ready
+    for ps in env.pageservers:
+        ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")])
+
+    env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count)
+
+    post_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)]
+    # We should have split into 8 shards, on the same 4 pageservers we started on.
+    assert len(post_split_pageserver_ids) == split_shard_count
+    assert len(set(post_split_pageserver_ids)) == shard_count
+    assert set(post_split_pageserver_ids) == set(pre_split_pageserver_ids)
+
+    # The old parent shards should no longer exist on disk
+    assert not shards_on_disk(old_shard_ids)
+
+    # Enough background reconciliations should result in the shards being properly distributed.
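+    # (In these tests, reconcile_until_idle drives the controller's background
+    # reconciliations until a pass finds no pending work, which is what makes
+    # it safe to assert on shard placement immediately afterwards.)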
+ # Run this before the workload, because its LSN-waiting code presumes stable locations. + env.storage_controller.reconcile_until_idle(timeout_secs=60) + + workload.validate() + + workload.churn_rows(256) + + workload.validate() + + # Run GC on all new shards, to check they don't barf or delete anything that breaks reads + # (compaction was already run as part of churn_rows) + all_shards = tenant_get_shards(env, tenant_id) + for tenant_shard_id, pageserver in all_shards: + pageserver.http_client().timeline_gc(tenant_shard_id, timeline_id, None) + workload.validate() + + # Assert on how many reconciles happened during the process. This is something of an + # implementation detail, but it is useful to detect any bugs that might generate spurious + # extra reconcile iterations. + # + # We'll have: + # - shard_count reconciles for the original setup of the tenant + # - shard_count reconciles for detaching the original secondary locations during split + # - split_shard_count reconciles during shard splitting, for setting up secondaries. + # - shard_count of the child shards will need to fail over to their secondaries + # - shard_count of the child shard secondary locations will get moved to emptier nodes + expect_reconciles = shard_count * 2 + split_shard_count + shard_count * 2 + reconcile_ok = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + assert reconcile_ok == expect_reconciles + + # Check that no cancelled or errored reconciliations occurred: this test does no + # failure injection and should run clean. + cancelled_reconciles = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "cancel"} + ) + errored_reconciles = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "error"} + ) + assert cancelled_reconciles is not None and int(cancelled_reconciles) == 0 + assert errored_reconciles is not None and int(errored_reconciles) == 0 + + # We should see that the migration of shards after the split waited for secondaries to warm up + # before happening + assert env.storage_controller.log_contains(".*Skipping.*because secondary isn't ready.*") + + env.storage_controller.consistency_check() + + def get_node_shard_counts(env: NeonEnv, tenant_ids): + total: defaultdict[int, int] = defaultdict(int) + attached: defaultdict[int, int] = defaultdict(int) + for tid in tenant_ids: + for shard in env.storage_controller.tenant_describe(tid)["shards"]: + log.info( + f"{shard['tenant_shard_id']}: attached={shard['node_attached']}, secondary={shard['node_secondary']} " + ) + for node in shard["node_secondary"]: + total[int(node)] += 1 + attached[int(shard["node_attached"])] += 1 + total[int(shard["node_attached"])] += 1 + + return total, attached + + def check_effective_tenant_config(): + # Expect our custom tenant configs to have survived the split + for shard in env.storage_controller.tenant_describe(tenant_id)["shards"]: + node = env.get_pageserver(int(shard["node_attached"])) + config = node.http_client().tenant_config(TenantShardId.parse(shard["tenant_shard_id"])) + for k, v in non_default_tenant_config.items(): + assert config.effective_config[k] == v + + # Check that heatmap uploads remain enabled after shard split + # (https://github.com/neondatabase/neon/issues/8189) + assert ( + config.effective_config["heatmap_period"] + and config.effective_config["heatmap_period"] != "0s" + ) + + # Validate pageserver state: expect 
every child shard to have an attached and secondary location
+    (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id])
+    assert sum(attached.values()) == split_shard_count
+    assert sum(total.values()) == split_shard_count * 2
+    check_effective_tenant_config()
+
+    # More specific check: that we are fully balanced. It is deterministic that we will get exactly
+    # one shard on each pageserver, because for these small shards the utilization metric is
+    # dominated by shard count.
+    log.info(f"total: {total}")
+    assert total == {
+        1: 1,
+        2: 1,
+        3: 1,
+        4: 1,
+        5: 1,
+        6: 1,
+        7: 1,
+        8: 1,
+        9: 1,
+        10: 1,
+        11: 1,
+        12: 1,
+        13: 1,
+        14: 1,
+        15: 1,
+        16: 1,
+    }
+
+    # The controller is not required to lay out the attached locations in any particular way, but
+    # all the pageservers that originally held an attached shard should still hold one, otherwise
+    # it would indicate that we had done some unnecessary migration.
+    log.info(f"attached: {attached}")
+    for ps_id in pre_split_pageserver_ids:
+        log.info(f"Pre-split pageserver {ps_id} should still hold an attached location")
+        assert ps_id in attached
+
+    # Ensure post-split pageserver locations survive a restart (i.e. the child shards
+    # correctly wrote config to disk, and the storage controller responds correctly
+    # to /re-attach)
+    for pageserver in env.pageservers:
+        pageserver.stop()
+        pageserver.start()
+
+    # Validate pageserver state: expect every child shard to have an attached and secondary location
+    (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id])
+    assert sum(attached.values()) == split_shard_count
+    assert sum(total.values()) == split_shard_count * 2
+    check_effective_tenant_config()
+
+    workload.validate()
+
+
+@pytest.mark.parametrize("initial_stripe_size", [None, 65536])
+def test_sharding_split_stripe_size(
+    neon_env_builder: NeonEnvBuilder,
+    httpserver: HTTPServer,
+    httpserver_listen_address,
+    initial_stripe_size: int,
+):
+    """
+    Check that modifying the stripe size inline with a shard split works as expected
+    """
+    (host, port) = httpserver_listen_address
+    neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify"
+    neon_env_builder.num_pageservers = 1
+
+    # Set up a fake HTTP notify endpoint: we will use this to validate that we receive
+    # the correct stripe size after split.
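+    # (The exact shape of each notification is asserted below; the handler
+    # only needs to record request bodies, since pytest-httpserver serves the
+    # endpoint on a background thread while the test proceeds.)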
+    notifications = []
+
+    def handler(request: Request):
+        log.info(f"Notify request: {request}")
+        notifications.append(request.json)
+        return Response(status=200)
+
+    httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)
+
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=1, initial_tenant_shard_stripe_size=initial_stripe_size
+    )
+    tenant_id = env.initial_tenant
+
+    assert len(notifications) == 1
+    expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = {
+        "tenant_id": str(env.initial_tenant),
+        "stripe_size": None,
+        "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}],
+    }
+    assert notifications[0] == expect
+
+    new_stripe_size = 2048
+    env.storage_controller.tenant_shard_split(
+        tenant_id, shard_count=2, shard_stripe_size=new_stripe_size
+    )
+    env.storage_controller.reconcile_until_idle()
+
+    # Check that we ended up with the stripe size that we expected, both on the pageserver
+    # and in the notifications to compute
+    assert len(notifications) == 2
+    expect_after: Dict[str, Union[List[Dict[str, int]], str, None, int]] = {
+        "tenant_id": str(env.initial_tenant),
+        "stripe_size": new_stripe_size,
+        "shards": [
+            {"node_id": int(env.pageservers[0].id), "shard_number": 0},
+            {"node_id": int(env.pageservers[0].id), "shard_number": 1},
+        ],
+    }
+    log.info(f"Got notification: {notifications[1]}")
+    assert notifications[1] == expect_after
+
+    # Inspect the stripe size on the pageserver
+    shard_0_loc = (
+        env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 0, 2))
+    )
+    assert shard_0_loc["shard_stripe_size"] == new_stripe_size
+    shard_1_loc = (
+        env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 1, 2))
+    )
+    assert shard_1_loc["shard_stripe_size"] == new_stripe_size
+
+    # Ensure the stripe size survives a pageserver restart
+    env.pageservers[0].stop()
+    env.pageservers[0].start()
+    shard_0_loc = (
+        env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 0, 2))
+    )
+    assert shard_0_loc["shard_stripe_size"] == new_stripe_size
+    shard_1_loc = (
+        env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 1, 2))
+    )
+    assert shard_1_loc["shard_stripe_size"] == new_stripe_size
+
+    # Ensure the stripe size survives a storage controller restart
+    env.storage_controller.stop()
+    env.storage_controller.start()
+
+    def assert_restart_notification():
+        assert len(notifications) == 3
+        assert notifications[2] == expect_after
+
+    wait_until(10, 1, assert_restart_notification)
+
+
+@pytest.mark.skipif(
+    # The quantity of data isn't huge, but debug can be _very_ slow, and the things we're
+    # validating in this test don't benefit much from debug assertions.
+    os.getenv("BUILD_TYPE") == "debug",
+    reason="Avoid running bulkier ingest tests in debug mode",
+)
+def test_sharding_ingest_layer_sizes(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Check that when ingesting data to a sharded tenant, we properly respect layer size limits.
+    """
+
+    # Set a small stripe size and checkpoint distance, so that we can exercise rolling logic
+    # without writing a lot of data.
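+    # (For orientation: 131072 bytes is 128 KiB, and with 8 KiB pages the
+    # stripe size computed below as expect_layer_size // 8192 comes out to
+    # 16 pages, i.e. one stripe spans exactly one target layer's worth of
+    # data.)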
+    expect_layer_size = 131072
+    TENANT_CONF = {
+        # small checkpointing and compaction targets to ensure we generate many upload operations
+        "checkpoint_distance": f"{expect_layer_size}",
+        "compaction_target_size": f"{expect_layer_size}",
+        # aim to reduce flakiness; we are not doing explicit checkpointing
+        "compaction_period": "0s",
+        "gc_period": "0s",
+    }
+    shard_count = 4
+    neon_env_builder.num_pageservers = shard_count
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=TENANT_CONF,
+        initial_tenant_shard_count=shard_count,
+        # A stripe size the same order of magnitude as layer size: this ensures that
+        # within checkpoint_distance some shards will have no data to ingest, if the LSN
+        # range contains sequential page writes. This test checks that this kind of
+        # scenario doesn't result in some shards emitting empty/tiny layers.
+        initial_tenant_shard_stripe_size=expect_layer_size // 8192,
+    )
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    # Ignore the initdb layer(s) for the purposes of the size comparison, as an initdb
+    # image layer optimization will produce a lot more, smaller layers.
+    initial_layers_per_shard = {}
+    log.info("initdb distribution (not asserted on):")
+    for shard in env.storage_controller.locate(tenant_id):
+        pageserver = env.get_pageserver(shard["node_id"])
+        shard_id = shard["shard_id"]
+        layers = (
+            env.get_pageserver(shard["node_id"]).http_client().layer_map_info(shard_id, timeline_id)
+        )
+        for layer in layers.historic_layers:
+            log.info(
+                f"layer[{pageserver.id}]: {layer.layer_file_name} (size {layer.layer_file_size})"
+            )
+
+        initial_layers_per_shard[shard_id] = set(layers.historic_layers)
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init()
+    workload.write_rows(4096, upload=False)
+    workload.write_rows(4096, upload=False)
+    workload.write_rows(4096, upload=False)
+    workload.write_rows(4096, upload=False)
+
+    workload.validate()
+
+    small_layer_count = 0
+    ok_layer_count = 0
+    huge_layer_count = 0
+
+    # Inspect the resulting layer map, counting how many layers are undersized.
+    for shard in env.storage_controller.locate(tenant_id):
+        pageserver = env.get_pageserver(shard["node_id"])
+        shard_id = shard["shard_id"]
+        layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id)
+
+        historic_layers = sorted(layer_map.historic_layers, key=lambda layer: layer.lsn_start)
+
+        initial_layers = initial_layers_per_shard[shard_id]
+
+        for layer in historic_layers:
+            if layer in initial_layers:
+                # ignore the initdb image layers for the size histogram
+                continue
+
+            if layer.layer_file_size < expect_layer_size // 2:
+                classification = "Small"
+                small_layer_count += 1
+            elif layer.layer_file_size > expect_layer_size * 2:
+                classification = "Huge "
+                huge_layer_count += 1
+            else:
+                classification = "OK   "
+                ok_layer_count += 1
+
+            if layer.kind == "Delta":
+                assert layer.lsn_end is not None
+                lsn_size = Lsn(layer.lsn_end) - Lsn(layer.lsn_start)
+            else:
+                lsn_size = 0
+
+            log.info(
+                f"{classification} layer[{pageserver.id}]: {layer.layer_file_name} (size {layer.layer_file_size}, LSN distance {lsn_size})"
+            )
+
+    # Why an inexact check?
+    # - Because we roll layers on checkpoint_distance * shard_count, we expect to obey the target
+    #   layer size on average, but it is still possible to write some tiny layers.
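+    # - The Small/Huge boundaries used above (below half and above twice the
+    #   target, respectively) are deliberately loose for the same reason.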
+ log.info(f"Totals: {small_layer_count} small layers, {ok_layer_count} ok layers") + if small_layer_count <= shard_count: + # If each shard has <= 1 small layer + pass + else: + # General case: + # old limit was 0.25 but pg14 is right at the limit with 7/28 + assert float(small_layer_count) / float(ok_layer_count) < 0.3 + + # Each shard may emit up to one huge layer, because initdb ingest doesn't respect checkpoint_distance. + assert huge_layer_count <= shard_count + + +def test_sharding_ingest_gaps( + neon_env_builder: NeonEnvBuilder, +): + """ + Check ingest behavior when the incoming data results in some shards having gaps where + no data is ingested: they should advance their disk_consistent_lsn and remote_consistent_lsn + even if they aren't writing out layers. + """ + + # Set a small stripe size and checkpoint distance, so that we can exercise rolling logic + # without writing a lot of data. + expect_layer_size = 131072 + checkpoint_interval_secs = 5 + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{expect_layer_size}", + "compaction_target_size": f"{expect_layer_size}", + # Set a short checkpoint interval as we will wait for uploads to happen + "checkpoint_timeout": f"{checkpoint_interval_secs}s", + # Background checkpointing is done from compaction loop, so set that interval short too + "compaction_period": "1s", + } + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + initial_tenant_shard_stripe_size=128, + ) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Just a few writes: we aim to produce a situation where some shards are skipping + # ingesting some records and thereby won't have layer files that advance their + # consistent LSNs, to exercise the code paths that explicitly handle this case by + # advancing consistent LSNs in the background if there is no open layer. + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(128, upload=False) + workload.churn_rows(128, upload=False) + + # Checkpoint, so that we won't get a background checkpoint happening during the next step + workload.endpoint().safe_psql("checkpoint") + # Freeze + flush, so that subsequent writes will start from a position of no open layers + last_flush_lsn_upload(env, workload.endpoint(), tenant_id, timeline_id) + + # This write is tiny: at least some of the shards should find they don't have any + # data to ingest. This will exercise how they handle that. 
+    workload.churn_rows(1, upload=False)
+
+    # The LSN that has reached pageservers, but may not have been flushed to historic layers yet
+    expect_lsn = wait_for_last_flush_lsn(env, workload.endpoint(), tenant_id, timeline_id)
+
+    # Don't leave the endpoint running; we don't want it writing in the background
+    workload.stop()
+
+    log.info(f"Waiting for shards' consistent LSNs to reach {expect_lsn}")
+
+    shards = tenant_get_shards(env, tenant_id, None)
+
+    def assert_all_disk_consistent():
+        """
+        Assert that all the shards' disk_consistent_lsns have reached expect_lsn
+        """
+        for tenant_shard_id, pageserver in shards:
+            timeline_detail = pageserver.http_client().timeline_detail(tenant_shard_id, timeline_id)
+            log.info(f"{tenant_shard_id} (ps {pageserver.id}) detail: {timeline_detail}")
+            assert Lsn(timeline_detail["disk_consistent_lsn"]) >= expect_lsn
+
+    # We set a short checkpoint timeout: expect things to get frozen+flushed within that
+    wait_until(checkpoint_interval_secs * 3, 1, assert_all_disk_consistent)
+
+    def assert_all_remote_consistent():
+        """
+        Assert that all the shards' remote_consistent_lsns have reached expect_lsn
+        """
+        for tenant_shard_id, pageserver in shards:
+            timeline_detail = pageserver.http_client().timeline_detail(tenant_shard_id, timeline_id)
+            log.info(f"{tenant_shard_id} (ps {pageserver.id}) detail: {timeline_detail}")
+            assert Lsn(timeline_detail["remote_consistent_lsn"]) >= expect_lsn
+
+    # We set a short checkpoint timeout: expect things to get frozen+flushed within that
+    wait_until(checkpoint_interval_secs * 3, 1, assert_all_remote_consistent)
+
+    workload.validate()
+
+
+class Failure:
+    pageserver_id: Optional[int]
+
+    def apply(self, env: NeonEnv):
+        raise NotImplementedError()
+
+    def clear(self, env: NeonEnv):
+        """
+        Clear the failure, in a way that should enable the system to proceed
+        to a totally clean state (all nodes online and reconciled)
+        """
+        raise NotImplementedError()
+
+    def expect_available(self):
+        raise NotImplementedError()
+
+    def can_mitigate(self):
+        """Whether self.mitigate is available for use"""
+        return False
+
+    def mitigate(self, env: NeonEnv):
+        """
+        Mitigate the failure in a way that should allow the shard split to
+        complete and service to resume, but does not guarantee to leave
+        the whole world in a clean state (e.g. an Offline node might have
+        junk LocationConfigs on it)
+        """
+        raise NotImplementedError()
+
+    def fails_forward(self, env: NeonEnv):
+        """
+        If true, this failure results in a state that eventually completes the split.
+        """
+        return False
+
+    def expect_exception(self):
+        """
+        How do we expect a call to the split API to fail?
+ """ + return StorageControllerApiException + + +class PageserverFailpoint(Failure): + def __init__(self, failpoint, pageserver_id, mitigate): + self.failpoint = failpoint + self.pageserver_id = pageserver_id + self._mitigate = mitigate + + def apply(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.allowed_errors.extend( + [".*failpoint.*", ".*Resetting.*after shard split failure.*"] + ) + pageserver.http_client().configure_failpoints((self.failpoint, "return(1)")) + + def clear(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.http_client().configure_failpoints((self.failpoint, "off")) + if self._mitigate: + env.storage_controller.node_configure(self.pageserver_id, {"availability": "Active"}) + + def expect_available(self): + return True + + def can_mitigate(self): + return self._mitigate + + def mitigate(self, env): + env.storage_controller.node_configure(self.pageserver_id, {"availability": "Offline"}) + + +class StorageControllerFailpoint(Failure): + def __init__(self, failpoint, action): + self.failpoint = failpoint + self.pageserver_id = None + self.action = action + + def apply(self, env: NeonEnv): + env.storage_controller.configure_failpoints((self.failpoint, self.action)) + + def clear(self, env: NeonEnv): + if "panic" in self.action: + log.info("Restarting storage controller after panic") + env.storage_controller.stop() + env.storage_controller.start() + else: + env.storage_controller.configure_failpoints((self.failpoint, "off")) + + def expect_available(self): + # Controller panics _do_ leave pageservers available, but our test code relies + # on using the locate API to update configurations in Workload, so we must skip + # these actions when the controller has been panicked. + return "panic" not in self.action + + def can_mitigate(self): + return False + + def fails_forward(self, env): + # Edge case: the very last failpoint that simulates a DB connection error, where + # the abort path will fail-forward and result in a complete split. + fail_forward = self.failpoint == "shard-split-post-complete" + + # If the failure was a panic, then if we expect split to eventually (after restart) + # complete, we must restart before checking that. + if fail_forward and "panic" in self.action: + log.info("Restarting storage controller after panic") + env.storage_controller.stop() + env.storage_controller.start() + + return fail_forward + + def expect_exception(self): + if "panic" in self.action: + return requests.exceptions.ConnectionError + else: + return StorageControllerApiException + + +class NodeKill(Failure): + def __init__(self, pageserver_id, mitigate): + self.pageserver_id = pageserver_id + self._mitigate = mitigate + + def apply(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.stop(immediate=True) + + def clear(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.start() + + def expect_available(self): + return False + + def mitigate(self, env): + env.storage_controller.node_configure(self.pageserver_id, {"availability": "Offline"}) + + +class CompositeFailure(Failure): + """ + Wrapper for failures in multiple components (e.g. 
a failpoint in the storage controller, *and*
+    stop a pageserver to interfere with rollback)
+    """
+
+    def __init__(self, failures: list[Failure]):
+        self.failures = failures
+
+        self.pageserver_id = None
+        for f in failures:
+            if f.pageserver_id is not None:
+                self.pageserver_id = f.pageserver_id
+                break
+
+    def apply(self, env: NeonEnv):
+        for f in self.failures:
+            f.apply(env)
+
+    def clear(self, env):
+        for f in self.failures:
+            f.clear(env)
+
+    def expect_available(self):
+        return all(f.expect_available() for f in self.failures)
+
+    def mitigate(self, env):
+        for f in self.failures:
+            f.mitigate(env)
+
+    def expect_exception(self):
+        expect = set(f.expect_exception() for f in self.failures)
+
+        # We can't give a sensible response if our failures have different expectations
+        assert len(expect) == 1
+
+        return list(expect)[0]
+
+
+@pytest.mark.parametrize(
+    "failure",
+    [
+        PageserverFailpoint("api-500", 1, False),
+        NodeKill(1, False),
+        PageserverFailpoint("api-500", 1, True),
+        NodeKill(1, True),
+        PageserverFailpoint("shard-split-pre-prepare", 1, False),
+        PageserverFailpoint("shard-split-post-prepare", 1, False),
+        PageserverFailpoint("shard-split-pre-hardlink", 1, False),
+        PageserverFailpoint("shard-split-post-hardlink", 1, False),
+        PageserverFailpoint("shard-split-post-child-conf", 1, False),
+        PageserverFailpoint("shard-split-lsn-wait", 1, False),
+        PageserverFailpoint("shard-split-pre-finish", 1, False),
+        StorageControllerFailpoint("shard-split-validation", "return(1)"),
+        StorageControllerFailpoint("shard-split-post-begin", "return(1)"),
+        StorageControllerFailpoint("shard-split-post-remote", "return(1)"),
+        StorageControllerFailpoint("shard-split-post-complete", "return(1)"),
+        StorageControllerFailpoint("shard-split-validation", "panic(failpoint)"),
+        StorageControllerFailpoint("shard-split-post-begin", "panic(failpoint)"),
+        StorageControllerFailpoint("shard-split-post-remote", "panic(failpoint)"),
+        StorageControllerFailpoint("shard-split-post-complete", "panic(failpoint)"),
+        CompositeFailure(
+            [NodeKill(1, True), StorageControllerFailpoint("shard-split-post-begin", "return(1)")]
+        ),
+        CompositeFailure(
+            [NodeKill(1, False), StorageControllerFailpoint("shard-split-post-begin", "return(1)")]
+        ),
+    ],
+)
+def test_sharding_split_failures(
+    neon_env_builder: NeonEnvBuilder,
+    compute_reconfigure_listener: ComputeReconfigure,
+    failure: Failure,
+):
+    neon_env_builder.num_pageservers = 4
+    neon_env_builder.control_plane_compute_hook_api = (
+        compute_reconfigure_listener.control_plane_compute_hook_api
+    )
+    initial_shard_count = 2
+    split_shard_count = 4
+
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+
+    # Create a tenant with secondary locations enabled
+    env.neon_cli.create_tenant(
+        tenant_id, timeline_id, shard_count=initial_shard_count, placement_policy='{"Attached":1}'
+    )
+
+    env.storage_controller.allowed_errors.extend(
+        [
+            # All split failures log a warning when they enqueue the abort operation
+            ".*Enqueuing background abort.*",
+            # We exercise failure cases where abort itself will also fail (node offline)
+            ".*abort_tenant_shard_split.*",
+            ".*Failed to abort.*",
+            # Tolerate any error logs that mention a failpoint
+            ".*failpoint.*",
+            # Node offline cases will fail to send requests
+            ".*Reconcile error: receive body: error sending request for url.*",
+            # Node offline cases will fail inside reconciler when detaching secondaries
+            ".*Reconcile error on shard.*: receive 
body: error sending request for url.*", + # Node offline cases may eventually cancel reconcilers when the heartbeater realizes nodes are offline + ".*Reconcile error.*Cancelled.*", + # While parent shard's client is stopped during split, flush loop updating LSNs will emit this warning + ".*Failed to schedule metadata upload after updating disk_consistent_lsn.*", + ] + ) + + for ps in env.pageservers: + # If we're using a failure that will panic the storage controller, all background + # upcalls from the pageserver can fail + ps.allowed_errors.append(".*calling control plane generation validation API failed.*") + + # Make sure the node we're failing has a shard on it, otherwise the test isn't testing anything + assert ( + failure.pageserver_id is None + or len( + env.get_pageserver(failure.pageserver_id) + .http_client() + .tenant_list_locations()["tenant_shards"] + ) + > 0 + ) + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100) + + # Put the environment into a failing state (exact meaning depends on `failure`) + failure.apply(env) + + with pytest.raises(failure.expect_exception()): + env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + + # We expect that the overall operation will fail, but some split requests + # will have succeeded: the net result should be to return to a clean state, including + # detaching any child shards. + def assert_rolled_back(exclude_ps_id=None) -> None: + secondary_count = 0 + attached_count = 0 + for ps in env.pageservers: + if exclude_ps_id is not None and ps.id == exclude_ps_id: + continue + + locations = ps.http_client().tenant_list_locations()["tenant_shards"] + for loc in locations: + tenant_shard_id = TenantShardId.parse(loc[0]) + log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") + assert tenant_shard_id.shard_count == initial_shard_count + if loc[1]["mode"] == "Secondary": + secondary_count += 1 + else: + attached_count += 1 + + if exclude_ps_id is not None: + # For a node failure case, we expect there to be a secondary location + # scheduled on the offline node, so expect one fewer secondary in total + assert secondary_count == initial_shard_count - 1 + else: + assert secondary_count == initial_shard_count + + assert attached_count == initial_shard_count + + def assert_split_done(exclude_ps_id=None) -> None: + secondary_count = 0 + attached_count = 0 + for ps in env.pageservers: + if exclude_ps_id is not None and ps.id == exclude_ps_id: + continue + + locations = ps.http_client().tenant_list_locations()["tenant_shards"] + for loc in locations: + tenant_shard_id = TenantShardId.parse(loc[0]) + log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") + assert tenant_shard_id.shard_count == split_shard_count + if loc[1]["mode"] == "Secondary": + secondary_count += 1 + else: + attached_count += 1 + assert attached_count == split_shard_count + assert secondary_count == split_shard_count + + def finish_split(): + # Having failed+rolled back, we should be able to split again + # No failures this time; it will succeed + env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count) + env.storage_controller.reconcile_until_idle(timeout_secs=30) + + workload.churn_rows(10) + workload.validate() + + if failure.expect_available(): + # Even though the split failed partway through, this should not leave the tenant in + # an unavailable state. 
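+        # Two caveats about how we exercise the workload while the failure is still active: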
+ # - Disable waiting for pageservers in the workload helper, because our + # failpoints may prevent API access. This only applies for failure modes that + # leave pageserver page_service API available. + # - This is a wait_until because clients may see transient errors in some split error cases, + # e.g. while waiting for a storage controller to re-attach a parent shard if we failed + # inside the pageserver and the storage controller responds by detaching children and attaching + # parents concurrently (https://github.com/neondatabase/neon/issues/7148) + wait_until(10, 1, lambda: workload.churn_rows(10, upload=False, ingest=False)) # type: ignore + + workload.validate() + + if failure.fails_forward(env): + log.info("Fail-forward failure, checking split eventually completes...") + # A failure type which results in eventual completion of the split + wait_until(30, 1, assert_split_done) + elif failure.can_mitigate(): + log.info("Mitigating failure...") + # Mitigation phase: we expect to be able to proceed with a successful shard split + failure.mitigate(env) + + # The split should appear to be rolled back from the point of view of all pageservers + # apart from the one that is offline + wait_until(30, 1, lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id)) + + finish_split() + wait_until(30, 1, lambda: assert_split_done(exclude_ps_id=failure.pageserver_id)) + + # Having cleared the failure, everything should converge to a pristine state + failure.clear(env) + wait_until(30, 1, assert_split_done) + else: + # Once we restore the faulty pageserver's API to good health, rollback should + # eventually complete. + log.info("Clearing failure...") + failure.clear(env) + + wait_until(30, 1, assert_rolled_back) + + # Having rolled back, the tenant should be working + workload.churn_rows(10) + workload.validate() + + # Splitting again should work, since we cleared the failure + finish_split() + assert_split_done() + + if isinstance(failure, StorageControllerFailpoint) and "post-complete" in failure.failpoint: + # On a post-complete failure, the controller will recover the post-split state + # after restart, but it will have missed the optimization part of the split function + # where secondary downloads are kicked off. This means that reconcile_until_idle + # will take a very long time if we wait for all optimizations to complete, because + # those optimizations will wait for secondary downloads. + # + # Avoid that by configuring the tenant into Essential scheduling mode, so that it will + # skip optimizations when we're exercising this particular failpoint. + env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Essential"}) + + # Having completed the split, pump the background reconciles to ensure that + # the scheduler reaches an idle state + env.storage_controller.reconcile_until_idle(timeout_secs=30) + + env.storage_controller.consistency_check() + + +def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): + """ + Check a scenario when one of the shards is much slower than others. + Without backpressure, this would lead to the slow shard falling behind + and eventually causing WAL timeouts. + """ + + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + + # 256KiB stripes: enable getting some meaningful data distribution without + # writing large quantities of data in this test. The stripe size is given + # in number of 8KiB pages. 
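+    # (stripe_size is in units of 8KiB pages, so 32 pages * 8KiB = 256KiB per stripe.)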
+ stripe_size = 32 + + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size + ) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + pageservers = dict((int(p.id), p) for p in env.pageservers) + shards = env.storage_controller.locate(tenant_id) + + # Slow down one of the shards, around ~1MB/s + pageservers[4].http_client().configure_failpoints(("wal-ingest-record-sleep", "5%sleep(1)")) + + def shards_info(): + infos = [] + for shard in shards: + node_id = int(shard["node_id"]) + pageserver = pageservers[node_id] + shard_info = pageserver.http_client().timeline_detail(shard["shard_id"], timeline_id) + infos.append(shard_info) + last_record_lsn = shard_info["last_record_lsn"] + current_physical_size = shard_info["current_physical_size"] + log.info( + f"Shard on pageserver {node_id}: lsn={last_record_lsn}, size={current_physical_size}" + ) + return infos + + shards_info() + + workload = Workload( + env, + tenant_id, + timeline_id, + branch_name="main", + endpoint_opts={ + "config_lines": [ + # Tip: set to 100MB to make the test fail + "max_replication_write_lag=1MB", + ], + }, + ) + workload.init() + + endpoint = workload.endpoint() + + # on 2024-03-05, the default config on prod was [15MB, 10GB, null] + res = endpoint.safe_psql_many( + [ + "SHOW max_replication_write_lag", + "SHOW max_replication_flush_lag", + "SHOW max_replication_apply_lag", + ] + ) + log.info(f"backpressure config: {res}") + + last_flush_lsn = None + last_timestamp = None + + def update_write_lsn(): + nonlocal last_flush_lsn + nonlocal last_timestamp + + res = endpoint.safe_psql( + """ + SELECT + pg_wal_lsn_diff(pg_current_wal_flush_lsn(), received_lsn) as received_lsn_lag, + received_lsn, + pg_current_wal_flush_lsn() as flush_lsn, + neon.backpressure_throttling_time() as throttling_time + FROM neon.backpressure_lsns(); + """, + dbname="postgres", + )[0] + log.info( + f"received_lsn_lag = {res[0]}, received_lsn = {res[1]}, flush_lsn = {res[2]}, throttling_time = {res[3]}" + ) + + lsn = Lsn(res[2]) + now = time.time() + + if last_timestamp is not None: + delta = now - last_timestamp + delta_bytes = lsn - last_flush_lsn + avg_speed = delta_bytes / delta / 1024 / 1024 + log.info( + f"flush_lsn {lsn}, written {delta_bytes/1024}kb for {delta:.3f}s, avg_speed {avg_speed:.3f} MiB/s" + ) + + last_flush_lsn = lsn + last_timestamp = now + + update_write_lsn() + + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.validate() + + update_write_lsn() + shards_info() + + for _write_iter in range(30): + # approximately 1MB of data + workload.write_rows(8000, upload=False) + update_write_lsn() + infos = shards_info() + min_lsn = min(Lsn(info["last_record_lsn"]) for info in infos) + max_lsn = max(Lsn(info["last_record_lsn"]) for info in infos) + diff = max_lsn - min_lsn + assert diff < 2 * 1024 * 1024, f"LSN diff={diff}, expected diff < 2MB due to backpressure" + + +def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder): + """ + Check that an unlogged relation is handled properly on a sharded tenant + + Reproducer for https://github.com/neondatabase/neon/issues/7451 + """ + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + neon_env_builder.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id, 
shard_count=8) + + # We will create many tables to ensure it's overwhelmingly likely that at least one + # of them doesn't land on shard 0 + table_names = [f"my_unlogged_{i}" for i in range(0, 16)] + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + for table_name in table_names: + ep.safe_psql(f"CREATE UNLOGGED TABLE {table_name} (id integer, value varchar(64));") + ep.safe_psql(f"INSERT INTO {table_name} VALUES (1, 'foo')") + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [(1, "foo")] + ep.safe_psql(f"CREATE INDEX ON {table_name} USING btree (value);") + + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + for table_name in table_names: + # Check that table works: we can select and insert + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [] + ep.safe_psql(f"INSERT INTO {table_name} VALUES (2, 'bar');") + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [(2, "bar")] + + # Ensure that post-endpoint-restart modifications are ingested happily by pageserver + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) + + +def test_top_tenants(neon_env_builder: NeonEnvBuilder): + """ + The top_tenants API is used in shard auto-splitting to find candidates. + """ + + env = neon_env_builder.init_configs() + neon_env_builder.start() + + tenants = [] + n_tenants = 8 + for i in range(0, n_tenants): + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id) + + # Write a different amount of data to each tenant + w = Workload(env, tenant_id, timeline_id) + w.init() + w.write_rows(i * 1000) + w.stop() + + logical_size = env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)[ + "current_logical_size" + ] + tenants.append((tenant_id, timeline_id, logical_size)) + + log.info(f"Created {tenant_id}/{timeline_id} with size {logical_size}") + + # Ask for 1 largest tenant + top_1 = env.pageserver.http_client().top_tenants("max_logical_size", 1, 8, 0) + assert len(top_1["shards"]) == 1 + assert top_1["shards"][0]["id"] == str(tenants[-1][0]) + assert top_1["shards"][0]["max_logical_size"] == tenants[-1][2] + + # Apply a lower bound limit + top = env.pageserver.http_client().top_tenants( + "max_logical_size", 100, 8, where_gt=tenants[3][2] + ) + assert len(top["shards"]) == n_tenants - 4 + assert set(i["id"] for i in top["shards"]) == set(str(i[0]) for i in tenants[4:]) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py new file mode 100644 index 0000000000..92cd74eba5 --- /dev/null +++ b/test_runner/regress/test_storage_controller.py @@ -0,0 +1,2566 @@ +import concurrent.futures +import json +import threading +import time +from collections import defaultdict +from datetime import datetime, timezone +from typing import Any, Dict, List, Optional, Set, Tuple, Union + +import pytest +from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.compute_reconfigure import ComputeReconfigure +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PageserverAvailability, + PageserverSchedulingPolicy, + PgBin, + StorageControllerApiException, + StorageControllerLeadershipStatus, + TokenScope, + last_flush_lsn_upload, +) +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient +from fixtures.pageserver.utils import 
(
+    assert_prefix_empty,
+    assert_prefix_not_empty,
+    enable_remote_storage_versioning,
+    list_prefix,
+    many_small_layers_tenant_config,
+    remote_storage_delete_key,
+    timeline_delete_wait_completed,
+)
+from fixtures.pg_version import PgVersion, run_only_on_default_postgres
+from fixtures.port_distributor import PortDistributor
+from fixtures.remote_storage import RemoteStorageKind, s3_storage
+from fixtures.storage_controller_proxy import StorageControllerProxy
+from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until
+from fixtures.workload import Workload
+from mypy_boto3_s3.type_defs import (
+    ObjectTypeDef,
+)
+from pytest_httpserver import HTTPServer
+from urllib3 import Retry
+from werkzeug.wrappers.request import Request
+from werkzeug.wrappers.response import Response
+
+
+def get_node_shard_counts(env: NeonEnv, tenant_ids):
+    counts: defaultdict[int, int] = defaultdict(int)
+    for tid in tenant_ids:
+        for shard in env.storage_controller.locate(tid):
+            counts[shard["node_id"]] += 1
+    return counts
+
+
+def test_storage_controller_smoke(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Test the basic lifecycle of a storage controller:
+    - Restarting
+    - Restarting a pageserver
+    - Creating and deleting tenants and timelines
+    - Marking a pageserver offline
+    """
+
+    neon_env_builder.num_pageservers = 3
+    env = neon_env_builder.init_configs()
+
+    # Start services by hand so that we can skip a pageserver (this will start + register later)
+    env.broker.try_start()
+    env.storage_controller.start()
+    env.pageservers[0].start()
+    env.pageservers[1].start()
+    for sk in env.safekeepers:
+        sk.start()
+
+    # The pageservers we started should have registered with the sharding service on startup
+    nodes = env.storage_controller.node_list()
+    assert len(nodes) == 2
+    assert set(n["id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id}
+
+    # Starting an additional pageserver should register successfully
+    env.pageservers[2].start()
+    nodes = env.storage_controller.node_list()
+    assert len(nodes) == 3
+    assert set(n["id"] for n in nodes) == {ps.id for ps in env.pageservers}
+
+    # Use a multiple of pageservers to get a nice even number of shards on each one
+    tenant_shard_count = len(env.pageservers) * 4
+    tenant_count = len(env.pageservers) * 2
+    shards_per_tenant = tenant_shard_count // tenant_count
+    tenant_ids = set(TenantId.generate() for i in range(0, tenant_count))
+
+    # Creating several tenants should spread out across the pageservers
+    for tid in tenant_ids:
+        env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant)
+
+    # Repeating a creation should be idempotent (we are just testing it doesn't return an error)
+    env.storage_controller.tenant_create(
+        tenant_id=next(iter(tenant_ids)), shard_count=shards_per_tenant
+    )
+
+    for node_id, count in get_node_shard_counts(env, tenant_ids).items():
+        # we used a multiple of pageservers for the total shard count,
+        # so expect an equal number on all pageservers
+        assert count == tenant_shard_count / len(
+            env.pageservers
+        ), f"Node {node_id} has bad count {count}"
+
+    # Creating and deleting timelines should work, using identical API to pageserver
+    timeline_crud_tenant = next(iter(tenant_ids))
+    timeline_id = TimelineId.generate()
+    env.storage_controller.pageserver_api().timeline_create(
+        pg_version=PgVersion.NOT_SET, tenant_id=timeline_crud_tenant, new_timeline_id=timeline_id
+    )
+    timelines = env.storage_controller.pageserver_api().timeline_list(timeline_crud_tenant)
+    assert len(timelines) == 2
+    assert timeline_id in set(TimelineId(t["timeline_id"]) for t in timelines)
+    # virtual_ps_http.timeline_delete(tenant_id=timeline_crud_tenant, timeline_id=timeline_id)
+    timeline_delete_wait_completed(
+        env.storage_controller.pageserver_api(), timeline_crud_tenant, timeline_id
+    )
+    timelines = env.storage_controller.pageserver_api().timeline_list(timeline_crud_tenant)
+    assert len(timelines) == 1
+    assert timeline_id not in set(TimelineId(t["timeline_id"]) for t in timelines)
+
+    # Marking a pageserver offline should migrate tenants away from it.
+    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"})
+
+    def node_evacuated(node_id: int) -> None:
+        counts = get_node_shard_counts(env, tenant_ids)
+        assert counts[node_id] == 0
+
+    wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id))
+
+    # Let all the reconciliations after marking the node offline complete
+    env.storage_controller.reconcile_until_idle()
+
+    # Marking the pageserver active should not immediately migrate anything to it
+    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Active"})
+    time.sleep(1)
+    assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 0
+
+    # Restarting a pageserver should not detach any tenants (i.e. /re-attach works)
+    before_restart = env.pageservers[1].http_client().tenant_list_locations()
+    env.pageservers[1].stop()
+    env.pageservers[1].start()
+    after_restart = env.pageservers[1].http_client().tenant_list_locations()
+    assert len(after_restart) == len(before_restart)
+
+    # Locations should be the same before & after restart, apart from generations
+    for _shard_id, tenant in after_restart["tenant_shards"]:
+        del tenant["generation"]
+    for _shard_id, tenant in before_restart["tenant_shards"]:
+        del tenant["generation"]
+    assert before_restart == after_restart
+
+    # Delete all the tenants
+    for tid in tenant_ids:
+        env.storage_controller.pageserver_api().tenant_delete(tid)
+
+    env.storage_controller.consistency_check()
+
+    # Set a scheduling policy on one node, create all the tenants, observe
+    # that the scheduling policy is respected.
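+    # (Here "respected" means: the node set to "Draining" receives no new shards, and
+    # the other two nodes split the new shards evenly, as asserted below.)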
+    env.storage_controller.node_configure(env.pageservers[1].id, {"scheduling": "Draining"})
+
+    # Create some fresh tenants
+    tenant_ids = set(TenantId.generate() for i in range(0, tenant_count))
+    for tid in tenant_ids:
+        env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant)
+
+    counts = get_node_shard_counts(env, tenant_ids)
+    # Nothing should have been scheduled on the node in Draining
+    assert counts[env.pageservers[1].id] == 0
+    assert counts[env.pageservers[0].id] == tenant_shard_count // 2
+    assert counts[env.pageservers[2].id] == tenant_shard_count // 2
+
+    env.storage_controller.consistency_check()
+
+
+def test_node_status_after_restart(
+    neon_env_builder: NeonEnvBuilder,
+):
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_start()
+
+    # Initially we have two online pageservers
+    nodes = env.storage_controller.node_list()
+    assert len(nodes) == 2
+
+    env.pageservers[1].stop()
+    env.storage_controller.allowed_errors.extend([".*Could not scan node"])
+
+    env.storage_controller.stop()
+    env.storage_controller.start()
+
+    def is_ready():
+        assert env.storage_controller.ready() is True
+
+    wait_until(30, 1, is_ready)
+
+    # We loaded nodes from database on restart
+    nodes = env.storage_controller.node_list()
+    assert len(nodes) == 2
+
+    # We should still be able to create a tenant, because the pageserver which is still online
+    # should have had its availability state set to Active.
+    env.storage_controller.tenant_create(TenantId.generate())
+
+    env.storage_controller.consistency_check()
+
+
+def test_storage_controller_passthrough(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    For simple timeline/tenant GET APIs that don't require coordination across
+    shards, the sharding service implements a proxy to shard zero. This test
+    calls those APIs.
+    """
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_start()
+
+    # We will talk to the storage controller as if it were a pageserver, using the pageserver
+    # HTTP client
+    client = PageserverHttpClient(env.storage_controller_port, lambda: True)
+    timelines = client.timeline_list(tenant_id=env.initial_tenant)
+    assert len(timelines) == 1
+
+    status = client.tenant_status(env.initial_tenant)
+    assert TenantId(status["id"]) == env.initial_tenant
+    assert set(TimelineId(t) for t in status["timelines"]) == {
+        env.initial_timeline,
+    }
+    assert status["state"]["slug"] == "Active"
+
+    (synthetic_size, size_inputs) = client.tenant_size_and_modelinputs(env.initial_tenant)
+    assert synthetic_size > 0
+    assert "segments" in size_inputs
+
+    env.storage_controller.consistency_check()
+
+
+def test_storage_controller_restart(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+    tenant_a = env.initial_tenant
+    tenant_b = TenantId.generate()
+    env.storage_controller.tenant_create(tenant_b)
+    env.pageserver.tenant_detach(tenant_a)
+
+    # TODO: extend this test to use multiple pageservers, and check that locations don't move around
+    # on restart.
+
+    # Storage controller restart
+    env.storage_controller.stop()
+    env.storage_controller.start()
+
+    observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list())
+
+    # Tenant A should still be detached
+    assert tenant_a not in observed
+
+    # Tenant B should remain attached
+    assert tenant_b in observed
+
+    # Pageserver restart
+    env.pageserver.stop()
+    env.pageserver.start()
+
+    # Same assertions as above: restarting either service should not perturb things
+    observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list())
+    assert tenant_a not in observed
+    assert tenant_b in observed
+
+    env.storage_controller.consistency_check()
+
+
+@pytest.mark.parametrize("warm_up", [True, False])
+def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool):
+    """
+    We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
+    which provides the /location_config API. This is similar to creating a tenant,
+    but imports the generation number.
+    """
+
+    # One pageserver to simulate legacy environment, two to be managed by storage controller
+    neon_env_builder.num_pageservers = 3
+
+    # Start services by hand so that we can skip registration on one of the pageservers
+    env = neon_env_builder.init_configs()
+    env.broker.try_start()
+    env.storage_controller.start()
+
+    # This is the pageserver where we'll initially create the tenant. Run it in emergency
+    # mode so that it doesn't talk to storage controller, and do not register it.
+    env.pageservers[0].allowed_errors.append(".*Emergency mode!.*")
+    env.pageservers[0].patch_config_toml_nonrecursive(
+        {
+            "control_plane_emergency_mode": True,
+        }
+    )
+    env.pageservers[0].start()
+    origin_ps = env.pageservers[0]
+
+    # These are the pageservers managed by the sharding service, where the tenant
+    # will be attached after onboarding
+    env.pageservers[1].start()
+    env.pageservers[2].start()
+    virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
+
+    for sk in env.safekeepers:
+        sk.start()
+
+    # Create a tenant directly via pageserver HTTP API, skipping the storage controller
+    tenant_id = TenantId.generate()
+    generation = 123
+    origin_ps.tenant_create(tenant_id, generation=generation)
+
+    # As if doing a live migration, first configure origin into stale mode
+    r = origin_ps.http_client().tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "AttachedStale",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": generation,
+        },
+    )
+    assert len(r["shards"]) == 1
+
+    if warm_up:
+        origin_ps.http_client().tenant_heatmap_upload(tenant_id)
+
+    # We expect to be called via live migration code, which may try to configure the tenant into secondary
+    # mode before attaching it.
+    virtual_ps_http.tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "Secondary",
+            "secondary_conf": {"warm": True},
+            "tenant_conf": {},
+            "generation": None,
+        },
+    )
+
+    virtual_ps_http.tenant_secondary_download(tenant_id)
+    warm_up_ps = env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+        "node_secondary"
+    ][0]
+
+    # Call into storage controller to onboard the tenant
+    generation += 1
+    r = virtual_ps_http.tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "AttachedMulti",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": generation,
+        },
+    )
+    assert len(r["shards"]) == 1
+
+    describe = env.storage_controller.tenant_describe(tenant_id)["shards"][0]
+    dest_ps_id = describe["node_attached"]
+    dest_ps = env.get_pageserver(dest_ps_id)
+    if warm_up:
+        # The storage controller should have attached the tenant to the same place
+        # where it had a secondary location, otherwise there was no point warming it up
+        assert dest_ps_id == warm_up_ps
+
+        # It should have been given a new secondary location as well
+        assert len(describe["node_secondary"]) == 1
+        assert describe["node_secondary"][0] != warm_up_ps
+
+    # As if doing a live migration, detach the original pageserver
+    origin_ps.http_client().tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "Detached",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": None,
+        },
+    )
+
+    # As if doing a live migration, call into the storage controller to
+    # set it to AttachedSingle: this is a no-op, but we test it because the
+    # cloud control plane may call this for symmetry with live migration to
+    # an individual pageserver
+    r = virtual_ps_http.tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "AttachedSingle",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": generation,
+        },
+    )
+    assert len(r["shards"]) == 1
+
+    # We should see the tenant is now attached to the pageserver managed
+    # by the sharding service
+    origin_tenants = origin_ps.http_client().tenant_list()
+    assert len(origin_tenants) == 0
+    dest_tenants = dest_ps.http_client().tenant_list()
+    assert len(dest_tenants) == 1
+    assert TenantId(dest_tenants[0]["id"]) == tenant_id
+
+    # sharding service advances generation by 1 when it first attaches. We started
+    # with a nonzero generation so this equality also proves that the generation
+    # was properly carried over during onboarding.
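+    # (Concretely: we created at generation 123, onboarded with 124, so the controller's
+    # first attach should leave the tenant at generation 125.)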
+    assert dest_tenants[0]["generation"] == generation + 1
+
+    # The onboarded tenant should survive a restart of sharding service
+    env.storage_controller.stop()
+    env.storage_controller.start()
+
+    # The onboarded tenant should survive a restart of pageserver
+    dest_ps.stop()
+    dest_ps.start()
+
+    # Having onboarded via /location_config, we should also be able to update the
+    # TenantConf part of LocationConf, without inadvertently resetting the generation
+    modified_tenant_conf = {"max_lsn_wal_lag": 1024 * 1024 * 1024 * 100}
+    dest_tenant_before_conf_change = dest_ps.http_client().tenant_status(tenant_id)
+
+    # The generation has moved on since we onboarded
+    assert generation != dest_tenant_before_conf_change["generation"]
+
+    r = virtual_ps_http.tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "AttachedSingle",
+            "secondary_conf": None,
+            "tenant_conf": modified_tenant_conf,
+            # This is intentionally a stale generation
+            "generation": generation,
+        },
+    )
+    assert len(r["shards"]) == 1
+    dest_tenant_after_conf_change = dest_ps.http_client().tenant_status(tenant_id)
+    assert (
+        dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"]
+    )
+    dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id)
+
+    # Storage controller auto-sets heatmap period, ignore it for the comparison
+    del dest_tenant_conf_after.tenant_specific_overrides["heatmap_period"]
+    assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf
+
+    env.storage_controller.consistency_check()
+
+
+def test_storage_controller_compute_hook(
+    httpserver: HTTPServer,
+    neon_env_builder: NeonEnvBuilder,
+    httpserver_listen_address,
+):
+    """
+    Test that the sharding service calls out to the configured HTTP endpoint on attachment changes
+    """
+
+    # We will run two pageservers so that we can migrate between them, and check that the
+    # storage controller sends notifications when migrating.
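+    # Each notification is a PUT to /notify with a JSON body of the shape
+    # {"tenant_id": ..., "stripe_size": ..., "shards": [{"node_id": ..., "shard_number": ...}]}
+    # (shape inferred from the expected payloads asserted below).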
+ neon_env_builder.num_pageservers = 2 + (host, port) = httpserver_listen_address + neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + + # Set up fake HTTP notify endpoint + notifications = [] + + handle_params = {"status": 200} + + def handler(request: Request): + status = handle_params["status"] + log.info(f"Notify request[{status}]: {request}") + notifications.append(request.json) + return Response(status=status) + + httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + + # Start running + env = neon_env_builder.init_start() + + # Initial notification from tenant creation + assert len(notifications) == 1 + expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { + "tenant_id": str(env.initial_tenant), + "stripe_size": None, + "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + } + assert notifications[0] == expect + + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) + + def node_evacuated(node_id: int) -> None: + counts = get_node_shard_counts(env, [env.initial_tenant]) + assert counts[node_id] == 0 + + wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id)) + + # Additional notification from migration + log.info(f"notifications: {notifications}") + expect = { + "tenant_id": str(env.initial_tenant), + "stripe_size": None, + "shards": [{"node_id": int(env.pageservers[1].id), "shard_number": 0}], + } + + def received_migration_notification(): + assert len(notifications) == 2 + assert notifications[1] == expect + + wait_until(20, 0.25, received_migration_notification) + + # When we restart, we should re-emit notifications for all tenants + env.storage_controller.stop() + env.storage_controller.start() + + def received_restart_notification(): + assert len(notifications) == 3 + assert notifications[2] == expect + + wait_until(10, 1, received_restart_notification) + + # Splitting a tenant should cause its stripe size to become visible in the compute notification + env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=2) + expect = { + "tenant_id": str(env.initial_tenant), + "stripe_size": 32768, + "shards": [ + {"node_id": int(env.pageservers[1].id), "shard_number": 0}, + {"node_id": int(env.pageservers[1].id), "shard_number": 1}, + ], + } + + def received_split_notification(): + assert len(notifications) == 4 + assert notifications[3] == expect + + wait_until(10, 1, received_split_notification) + + # If the compute hook is unavailable, that should not block creating a tenant and + # creating a timeline. This simulates a control plane refusing to accept notifications + handle_params["status"] = 423 + degraded_tenant_id = TenantId.generate() + degraded_timeline_id = TimelineId.generate() + env.storage_controller.tenant_create(degraded_tenant_id) + env.storage_controller.pageserver_api().timeline_create( + PgVersion.NOT_SET, degraded_tenant_id, degraded_timeline_id + ) + + # Ensure we hit the handler error path + env.storage_controller.allowed_errors.append( + ".*Failed to notify compute of attached pageserver.*tenant busy.*" + ) + env.storage_controller.allowed_errors.append(".*Reconcile error.*tenant busy.*") + assert notifications[-1] is not None + assert notifications[-1]["tenant_id"] == str(degraded_tenant_id) + + env.storage_controller.consistency_check() + + +def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): + """ + Verify that occasional-use debug APIs work as expected. 
This is a lightweight test + that just hits the endpoints to check that they don't bitrot. + """ + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_start() + + tenant_id = TenantId.generate() + env.storage_controller.tenant_create(tenant_id, shard_count=2, shard_stripe_size=8192) + + # Check that the consistency check passes on a freshly setup system + env.storage_controller.consistency_check() + + # These APIs are intentionally not implemented as methods on NeonStorageController, as + # they're just for use in unanticipated circumstances. + + # Initial tenant (1 shard) and the one we just created (2 shards) should be visible + response = env.storage_controller.request( + "GET", + f"{env.storage_controller_api}/debug/v1/tenant", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + assert len(response.json()) == 3 + + # Scheduler should report the expected nodes and shard counts + response = env.storage_controller.request( + "GET", f"{env.storage_controller_api}/debug/v1/scheduler" + ) + # Two nodes, in a dict of node_id->node + assert len(response.json()["nodes"]) == 2 + assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3 + assert all(v["may_schedule"] for v in response.json()["nodes"].values()) + + response = env.storage_controller.request( + "POST", + f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + assert len(env.storage_controller.node_list()) == 1 + + response = env.storage_controller.request( + "POST", + f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + + # Tenant drop should be reflected in dump output + response = env.storage_controller.request( + "GET", + f"{env.storage_controller_api}/debug/v1/tenant", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + assert len(response.json()) == 1 + + # Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're + # meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind. 
+    env.storage_controller.consistency_check()
+
+
+def test_storage_controller_s3_time_travel_recovery(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+):
+    """
+    Test S3 time travel recovery, driven through the storage controller's passthrough API.
+    """
+
+    remote_storage_kind = s3_storage()
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+
+    # Mock S3 doesn't have versioning enabled by default, enable it
+    # (also do it before there are any writes to the bucket)
+    if remote_storage_kind == RemoteStorageKind.MOCK_S3:
+        remote_storage = neon_env_builder.pageserver_remote_storage
+        assert remote_storage, "remote storage not configured"
+        enable_remote_storage_versioning(remote_storage)
+
+    neon_env_builder.num_pageservers = 1
+
+    env = neon_env_builder.init_start()
+    virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
+
+    tenant_id = TenantId.generate()
+    env.storage_controller.tenant_create(
+        tenant_id,
+        shard_count=2,
+        shard_stripe_size=8192,
+        tenant_config=many_small_layers_tenant_config(),
+    )
+
+    # Check that the consistency check passes
+    env.storage_controller.consistency_check()
+
+    branch_name = "main"
+    timeline_id = env.neon_cli.create_timeline(
+        branch_name,
+        tenant_id=tenant_id,
+    )
+    # Write some nontrivial amount of data into the endpoint and wait until it is uploaded
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        run_pg_bench_small(pg_bin, endpoint.connstr())
+        endpoint.safe_psql("CREATE TABLE created_foo(id integer);")
+        # last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+
+    # Give the data time to be uploaded
+    time.sleep(4)
+
+    # Detach the tenant
+    virtual_ps_http.tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "Detached",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": None,
+        },
+    )
+
+    time.sleep(4)
+    ts_before_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None)
+    time.sleep(4)
+
+    # Simulate a "disaster": delete every other object from remote storage for one of the shards
+    assert env.pageserver_remote_storage
+    shard_id_for_list = "0002"
+    objects: List[ObjectTypeDef] = list_prefix(
+        env.pageserver_remote_storage,
+        f"tenants/{tenant_id}-{shard_id_for_list}/timelines/{timeline_id}/",
+    ).get("Contents", [])
+    assert len(objects) > 1
+    log.info(f"Found {len(objects)} objects in remote storage")
+    should_delete = False
+    for obj in objects:
+        obj_key = obj["Key"]
+        should_delete = not should_delete
+        if not should_delete:
+            log.info(f"Keeping key on remote storage: {obj_key}")
+            continue
+        log.info(f"Deleting key from remote storage: {obj_key}")
+        remote_storage_delete_key(env.pageserver_remote_storage, obj_key)
+
+    time.sleep(4)
+    ts_after_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None)
+    time.sleep(4)
+
+    # Do time travel recovery
+    virtual_ps_http.tenant_time_travel_remote_storage(
+        tenant_id, ts_before_disaster, ts_after_disaster, shard_counts=[2]
+    )
+    time.sleep(4)
+
+    # Attach the tenant again
+    virtual_ps_http.tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "AttachedSingle",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": 100,
+        },
+    )
+
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        endpoint.safe_psql("SELECT * FROM created_foo;")
+
+    env.storage_controller.consistency_check()
+
+
+def test_storage_controller_auth(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.auth_enabled = True
+    env = neon_env_builder.init_start()
+    svc = env.storage_controller
+    api = env.storage_controller_api
+
+    tenant_id = 
TenantId.generate() + body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)} + + env.storage_controller.allowed_errors.append(".*Unauthorized.*") + env.storage_controller.allowed_errors.append(".*Forbidden.*") + + # No token + with pytest.raises( + StorageControllerApiException, + match="Unauthorized: missing authorization header", + ): + svc.request("POST", f"{env.storage_controller_api}/v1/tenant", json=body) + + # Token with incorrect scope + with pytest.raises( + StorageControllerApiException, + match="Forbidden: JWT authentication error", + ): + svc.request( + "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.SAFEKEEPER_DATA) + ) + + # Token with correct scope + svc.request( + "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.PAGE_SERVER_API) + ) + + # Token with admin scope should also be permitted + svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN)) + + # No token + with pytest.raises( + StorageControllerApiException, + match="Unauthorized: missing authorization header", + ): + svc.request("GET", f"{api}/debug/v1/tenant") + + # Token with incorrect scope + with pytest.raises( + StorageControllerApiException, + match="Forbidden: JWT authentication error", + ): + svc.request( + "GET", f"{api}/debug/v1/tenant", headers=svc.headers(TokenScope.GENERATIONS_API) + ) + + # No token + with pytest.raises( + StorageControllerApiException, + match="Unauthorized: missing authorization header", + ): + svc.request("POST", f"{api}/upcall/v1/re-attach") + + # Token with incorrect scope + with pytest.raises( + StorageControllerApiException, + match="Forbidden: JWT authentication error", + ): + svc.request( + "POST", f"{api}/upcall/v1/re-attach", headers=svc.headers(TokenScope.PAGE_SERVER_API) + ) + + +def test_storage_controller_tenant_conf(neon_env_builder: NeonEnvBuilder): + """ + Validate the pageserver-compatible API endpoints for setting and getting tenant conf, without + supplying the whole LocationConf. + """ + + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + + http = env.storage_controller.pageserver_api() + + default_value = "7days" + new_value = "1h" + http.set_tenant_config(tenant_id, {"pitr_interval": new_value}) + + # Ensure the change landed on the storage controller + readback_controller = http.tenant_config(tenant_id) + assert readback_controller.effective_config["pitr_interval"] == new_value + assert readback_controller.tenant_specific_overrides["pitr_interval"] == new_value + + # Ensure the change made it down to the pageserver + readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id) + assert readback_ps.effective_config["pitr_interval"] == new_value + assert readback_ps.tenant_specific_overrides["pitr_interval"] == new_value + + # Omitting a value clears it. This looks different in storage controller + # vs. pageserver API calls, because pageserver has defaults. 
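+    # (e.g. after clearing, the controller reports pitr_interval as None, while the
+    # pageserver's effective config falls back to its built-in default of "7days".)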
+ http.set_tenant_config(tenant_id, {}) + readback_controller = http.tenant_config(tenant_id) + assert readback_controller.effective_config["pitr_interval"] is None + assert readback_controller.tenant_specific_overrides["pitr_interval"] is None + readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id) + assert readback_ps.effective_config["pitr_interval"] == default_value + assert "pitr_interval" not in readback_ps.tenant_specific_overrides + + env.storage_controller.consistency_check() + + +def test_storage_controller_tenant_deletion( + neon_env_builder: NeonEnvBuilder, + compute_reconfigure_listener: ComputeReconfigure, +): + """ + Validate that: + - Deleting a tenant deletes all its shards + - Deletion does not require the compute notification hook to be responsive + - Deleting a tenant also removes all secondary locations + """ + neon_env_builder.num_pageservers = 4 + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.control_plane_compute_hook_api = ( + compute_reconfigure_listener.control_plane_compute_hook_api + ) + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, timeline_id, shard_count=2, placement_policy='{"Attached":1}' + ) + + # Ensure all the locations are configured, including secondaries + env.storage_controller.reconcile_until_idle() + + shard_ids = [ + TenantShardId.parse(shard["shard_id"]) for shard in env.storage_controller.locate(tenant_id) + ] + + # Assert attachments all have local content + for shard_id in shard_ids: + pageserver = env.get_tenant_pageserver(shard_id) + assert pageserver.tenant_dir(shard_id).exists() + + # Assert all shards have some content in remote storage + for shard_id in shard_ids: + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(shard_id), + ) + ), + ) + + # Break the compute hook: we are checking that deletion does not depend on the compute hook being available + def break_hook(): + raise RuntimeError("Unexpected call to compute hook") + + compute_reconfigure_listener.register_on_notify(break_hook) + + # No retry loop: deletion should complete in one shot without polling for 202 responses, because + # it cleanly detaches all the shards first, and then deletes them in remote storage + env.storage_controller.pageserver_api().tenant_delete(tenant_id) + + # Assert no pageservers have any local content + for pageserver in env.pageservers: + for shard_id in shard_ids: + assert not pageserver.tenant_dir(shard_id).exists() + + for shard_id in shard_ids: + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(shard_id), + ) + ), + ) + + # Assert the tenant is not visible in storage controller API + with pytest.raises(StorageControllerApiException): + env.storage_controller.tenant_describe(tenant_id) + + +class Failure: + pageserver_id: int + offline_timeout: int + must_detect_after: int + + def apply(self, env: NeonEnv): + raise NotImplementedError() + + def clear(self, env: NeonEnv): + raise NotImplementedError() + + def nodes(self): + raise NotImplementedError() + + +class NodeStop(Failure): + def __init__(self, pageserver_ids, immediate, offline_timeout, must_detect_after): + self.pageserver_ids = pageserver_ids + self.immediate = immediate + self.offline_timeout = offline_timeout + self.must_detect_after = must_detect_after + + def apply(self, env: NeonEnv): + 
for ps_id in self.pageserver_ids: + pageserver = env.get_pageserver(ps_id) + pageserver.stop(immediate=self.immediate) + + def clear(self, env: NeonEnv): + for ps_id in self.pageserver_ids: + pageserver = env.get_pageserver(ps_id) + pageserver.start() + + def nodes(self): + return self.pageserver_ids + + +class NodeRestartWithSlowReattach(Failure): + def __init__(self, pageserver_id, offline_timeout, must_detect_after): + self.pageserver_id = pageserver_id + self.offline_timeout = offline_timeout + self.must_detect_after = must_detect_after + self.thread = None + + def apply(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.stop(immediate=False) + + def start_ps(): + pageserver.start( + extra_env_vars={"FAILPOINTS": "control-plane-client-re-attach=return(30000)"} + ) + + self.thread = threading.Thread(target=start_ps) + self.thread.start() + + def clear(self, env: NeonEnv): + if self.thread is not None: + self.thread.join() + + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.http_client().configure_failpoints(("control-plane-client-re-attach", "off")) + + def nodes(self): + return [self.pageserver_id] + + +class PageserverFailpoint(Failure): + def __init__(self, failpoint, pageserver_id, offline_timeout, must_detect_after): + self.failpoint = failpoint + self.pageserver_id = pageserver_id + self.offline_timeout = offline_timeout + self.must_detect_after = must_detect_after + + def apply(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.http_client().configure_failpoints((self.failpoint, "return(1)")) + + def clear(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.http_client().configure_failpoints((self.failpoint, "off")) + + def nodes(self): + return [self.pageserver_id] + + +def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: + tenants = env.storage_controller.tenant_list() + + node_to_tenants: dict[int, list[TenantId]] = {} + for t in tenants: + for node_id, loc_state in t["observed"]["locations"].items(): + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] == "AttachedSingle" + ): + crnt = node_to_tenants.get(int(node_id), []) + crnt.append(TenantId(t["tenant_shard_id"])) + node_to_tenants[int(node_id)] = crnt + + return node_to_tenants + + +@pytest.mark.parametrize( + "failure", + [ + NodeStop(pageserver_ids=[1], immediate=False, offline_timeout=20, must_detect_after=5), + NodeStop(pageserver_ids=[1], immediate=True, offline_timeout=20, must_detect_after=5), + NodeStop(pageserver_ids=[1, 2], immediate=True, offline_timeout=20, must_detect_after=5), + PageserverFailpoint( + pageserver_id=1, + failpoint="get-utilization-http-handler", + offline_timeout=20, + must_detect_after=5, + ), + # Instrument a scenario where the node is slow to re-attach. The re-attach request itself + # should serve as a signal to the storage controller to use a more lenient heartbeat timeout. 
+ NodeRestartWithSlowReattach(pageserver_id=1, offline_timeout=60, must_detect_after=15), + ], +) +def test_storage_controller_heartbeats( + neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, failure: Failure +): + neon_env_builder.storage_controller_config = { + "max_offline": "10s", + "max_warming_up": "20s", + } + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + # Default log allow list permits connection errors, but this test will use error responses on + # the utilization endpoint. + env.storage_controller.allowed_errors.append( + ".*Call to node.*management API.*failed.*failpoint.*" + ) + + # Initially we have two online pageservers + nodes = env.storage_controller.node_list() + assert len(nodes) == 2 + assert all([n["availability"] == "Active" for n in nodes]) + + # ... then we create two tenants and write some data into them + def create_tenant(tid: TenantId): + env.storage_controller.tenant_create(tid) + + branch_name = "main" + env.neon_cli.create_timeline( + branch_name, + tenant_id=tid, + ) + + with env.endpoints.create_start("main", tenant_id=tid) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + endpoint.safe_psql("CREATE TABLE created_foo(id integer);") + + tenant_ids = [TenantId.generate(), TenantId.generate()] + for tid in tenant_ids: + create_tenant(tid) + + # ... expecting that each tenant will be placed on a different node + def tenants_placed(): + node_to_tenants = build_node_to_tenants_map(env) + log.info(f"{node_to_tenants=}") + + # Check that all the tenants have been attached + assert sum((len(ts) for ts in node_to_tenants.values())) == len(tenant_ids) + # Check that each node got one tenant + assert all((len(ts) == 1 for ts in node_to_tenants.values())) + + wait_until(10, 1, tenants_placed) + + # ... then we apply the failure + offline_node_ids = set(failure.nodes()) + online_node_ids = set(range(1, len(env.pageservers) + 1)) - offline_node_ids + + for node_id in offline_node_ids: + if len(offline_node_ids) > 1: + env.get_pageserver(node_id).allowed_errors.append( + ".*Scheduling error when marking pageserver.*offline.*", + ) + + failure.apply(env) + + # ... expecting the heartbeats to mark it offline + def nodes_offline(): + nodes = env.storage_controller.node_list() + log.info(f"{nodes=}") + for node in nodes: + if node["id"] in offline_node_ids: + assert node["availability"] == "Offline" + + start = time.time() + wait_until(failure.offline_timeout, 1, nodes_offline) + detected_after = time.time() - start + log.info(f"Detected node failures after {detected_after}s") + + assert detected_after >= failure.must_detect_after + + # .. expecting the tenant on the offline node to be migrated + def tenant_migrated(): + if len(online_node_ids) == 0: + time.sleep(5) + return + + node_to_tenants = build_node_to_tenants_map(env) + log.info(f"{node_to_tenants=}") + + observed_tenants = set() + for node_id in online_node_ids: + observed_tenants |= set(node_to_tenants[node_id]) + + assert observed_tenants == set(tenant_ids) + + wait_until(10, 1, tenant_migrated) + + # ... then we clear the failure + failure.clear(env) + + # ... expecting the offline node to become active again + def nodes_online(): + nodes = env.storage_controller.node_list() + for node in nodes: + if node["id"] in online_node_ids: + assert node["availability"] == "Active" + + wait_until(10, 1, nodes_online) + + time.sleep(5) + + node_to_tenants = build_node_to_tenants_map(env) + log.info(f"Back online: {node_to_tenants=}") + + # ... 
expecting the storage controller to reach a consistent state
+    def storage_controller_consistent():
+        env.storage_controller.consistency_check()
+
+    wait_until(30, 1, storage_controller_consistent)
+
+
+def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder):
+    """
+    Exercise the behavior of the /re-attach endpoint on pageserver startup when
+    pageservers have a mixture of attached and secondary locations
+    """
+
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    # We'll have two tenants.
+    tenant_a = TenantId.generate()
+    env.neon_cli.create_tenant(tenant_a, placement_policy='{"Attached":1}')
+    tenant_b = TenantId.generate()
+    env.neon_cli.create_tenant(tenant_b, placement_policy='{"Attached":1}')
+
+    # Each pageserver will have one attached and one secondary location
+    env.storage_controller.tenant_shard_migrate(
+        TenantShardId(tenant_a, 0, 0), env.pageservers[0].id
+    )
+    env.storage_controller.tenant_shard_migrate(
+        TenantShardId(tenant_b, 0, 0), env.pageservers[1].id
+    )
+
+    # Hard-fail a pageserver
+    victim_ps = env.pageservers[1]
+    survivor_ps = env.pageservers[0]
+    victim_ps.stop(immediate=True)
+
+    # Heartbeater will notice it's offline, and consequently attachments move to the other pageserver
+    def failed_over():
+        locations = survivor_ps.http_client().tenant_list_locations()["tenant_shards"]
+        log.info(f"locations: {locations}")
+        assert len(locations) == 2
+        assert all(loc[1]["mode"] == "AttachedSingle" for loc in locations)
+
+    # We could pre-empt this by configuring the node to Offline, but it's preferable to test
+    # the realistic path we would take when a node restarts uncleanly.
+    # The delay here will be ~NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL in neon_local
+    wait_until(30, 1, failed_over)
+
+    reconciles_before_restart = env.storage_controller.get_metric_value(
+        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
+    )
+
+    # Restart the failed pageserver
+    victim_ps.start()
+
+    # We expect that the re-attach call correctly tipped off the pageserver that its locations
+    # are all secondaries now.
+    locations = victim_ps.http_client().tenant_list_locations()["tenant_shards"]
+    assert len(locations) == 2
+    assert all(loc[1]["mode"] == "Secondary" for loc in locations)
+
+    # We expect that this situation resulted from the re_attach call, and not any explicit
+    # Reconciler runs: assert that the reconciliation count has not gone up since we restarted.
+    reconciles_after_restart = env.storage_controller.get_metric_value(
+        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
+    )
+    assert reconciles_after_restart == reconciles_before_restart
+
+
+def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBuilder):
+    """
+    Check that emergency hooks for disabling rogue tenants' reconcilers work as expected.
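+    A tenant scheduling policy of "Stop" should cause reconciles to be skipped silently;
+    switching back to "Active" should let reconciliation proceed and the tenant attach.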
+ """ + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + + env.storage_controller.allowed_errors.extend( + [ + # We will intentionally cause reconcile errors + ".*Reconcile error.*", + # Message from using a scheduling policy + ".*Scheduling is disabled by policy.*", + ".*Skipping reconcile for policy.*", + # Message from a node being offline + ".*Call to node .* management API .* failed", + ] + ) + + # Stop pageserver so that reconcile cannot complete + env.pageserver.stop() + + env.storage_controller.tenant_create(tenant_id, placement_policy="Detached") + + # Try attaching it: we should see reconciles failing + env.storage_controller.tenant_policy_update( + tenant_id, + { + "placement": {"Attached": 0}, + }, + ) + + def reconcile_errors() -> int: + return int( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "error"} + ) + or 0 + ) + + def reconcile_ok() -> int: + return int( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + or 0 + ) + + def assert_errors_gt(n) -> int: + e = reconcile_errors() + assert e > n + return e + + errs = wait_until(10, 1, lambda: assert_errors_gt(0)) + + # Try reconciling again, it should fail again + with pytest.raises(StorageControllerApiException): + env.storage_controller.reconcile_all() + errs = wait_until(10, 1, lambda: assert_errors_gt(errs)) + + # Configure the tenant to disable reconciles + env.storage_controller.tenant_policy_update( + tenant_id, + { + "scheduling": "Stop", + }, + ) + + # Try reconciling again, it should not cause an error (silently skip) + env.storage_controller.reconcile_all() + assert reconcile_errors() == errs + + # Start the pageserver and re-enable reconciles + env.pageserver.start() + env.storage_controller.tenant_policy_update( + tenant_id, + { + "scheduling": "Active", + }, + ) + + def assert_ok_gt(n) -> int: + o = reconcile_ok() + assert o > n + return o + + # We should see a successful reconciliation + wait_until(10, 1, lambda: assert_ok_gt(0)) + + # And indeed the tenant should be attached + assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 + + +def test_storcon_cli(neon_env_builder: NeonEnvBuilder): + """ + The storage controller command line interface (storcon-cli) is an internal tool. Most tests + just use the APIs directly: this test exercises some basics of the CLI as a regression test + that the client remains usable as the server evolves. 
+ """ + output_dir = neon_env_builder.test_output_dir + shard_count = 4 + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + base_args = [env.neon_binpath / "storcon_cli", "--api", env.storage_controller_api] + + def storcon_cli(args): + """ + CLI wrapper: returns stdout split into a list of non-empty strings + """ + (output_path, stdout, status_code) = subprocess_capture( + output_dir, + [str(s) for s in base_args + args], + echo_stderr=True, + echo_stdout=True, + env={}, + check=False, + capture_stdout=True, + timeout=10, + ) + if status_code: + log.warning(f"Command {args} failed") + log.warning(f"Output at: {output_path}") + + raise RuntimeError("CLI failure (check logs for stderr)") + + assert stdout is not None + return [line.strip() for line in stdout.split("\n") if line.strip()] + + # List nodes + node_lines = storcon_cli(["nodes"]) + # Table header, footer, and one line of data + assert len(node_lines) == 5 + assert "localhost" in node_lines[3] + + # Pause scheduling onto a node + storcon_cli(["node-configure", "--node-id", "1", "--scheduling", "pause"]) + assert "Pause" in storcon_cli(["nodes"])[3] + + # We will simulate a node death and then marking it offline + env.pageservers[0].stop(immediate=True) + # Sleep to make it unlikely that the controller's heartbeater will race handling + # a /utilization response internally, such that it marks the node back online. IRL + # there would always be a longer delay than this before a node failing and a human + # intervening. + time.sleep(2) + + storcon_cli(["node-configure", "--node-id", "1", "--availability", "offline"]) + assert "Offline" in storcon_cli(["nodes"])[3] + + # List tenants + tenant_lines = storcon_cli(["tenants"]) + assert len(tenant_lines) == 5 + assert str(env.initial_tenant) in tenant_lines[3] + + # Setting scheduling policies intentionally result in warnings, they're for rare use. + env.storage_controller.allowed_errors.extend( + [".*Skipping reconcile for policy.*", ".*Scheduling is disabled by policy.*"] + ) + + # Describe a tenant + tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)]) + assert len(tenant_lines) == 3 + shard_count * 2 + assert str(env.initial_tenant) in tenant_lines[3] + + # Pause changes on a tenant + storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"]) + assert "Stop" in storcon_cli(["tenants"])[3] + + # Change a tenant's placement + storcon_cli( + ["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"] + ) + assert "Secondary" in storcon_cli(["tenants"])[3] + + # Modify a tenant's config + storcon_cli( + [ + "tenant-config", + "--tenant-id", + str(env.initial_tenant), + "--config", + json.dumps({"pitr_interval": "1m"}), + ] + ) + + # Quiesce any background reconciliation before doing consistency check + env.storage_controller.reconcile_until_idle(timeout_secs=10) + env.storage_controller.consistency_check() + + +def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder): + """ + Check that when lock on resource (tenants, nodes) is held for too long it is + traced in logs. 
+ """ + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + env.storage_controller.allowed_errors.extend( + [ + ".*Exclusive lock by.*", + ".*Shared lock by.*", + ".*Scheduling is disabled by policy.*", + f".*Operation TimelineCreate on key {tenant_id} has waited.*", + ] + ) + + # Apply failpoint + env.storage_controller.configure_failpoints( + ("tenant-update-policy-exclusive-lock", "return(35000)") + ) + + # This will hold the exclusive for enough time to cause an warning + def update_tenent_policy(): + env.storage_controller.tenant_policy_update( + tenant_id=tenant_id, + body={ + "scheduling": "Stop", + }, + ) + + thread_update_tenant_policy = threading.Thread(target=update_tenent_policy) + thread_update_tenant_policy.start() + + # Make sure the update policy thread has started + time.sleep(1) + # This will not be able to access and will log a warning + timeline_id = TimelineId.generate() + env.storage_controller.pageserver_api().timeline_create( + pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id + ) + thread_update_tenant_policy.join() + + env.storage_controller.assert_log_contains("Exclusive lock by UpdatePolicy was held for") + _, last_log_cursor = env.storage_controller.assert_log_contains( + f"Operation TimelineCreate on key {tenant_id} has waited" + ) + + # Test out shared lock + env.storage_controller.configure_failpoints( + ("tenant-create-timeline-shared-lock", "return(31000)") + ) + + timeline_id = TimelineId.generate() + # This will hold the shared lock for enough time to cause an warning + env.storage_controller.pageserver_api().timeline_create( + pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id + ) + env.storage_controller.assert_log_contains( + "Shared lock by TimelineCreate was held for", offset=last_log_cursor + ) + + +@pytest.mark.parametrize("remote_storage", [RemoteStorageKind.LOCAL_FS, s3_storage()]) +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_storage): + """ + Tenant import is a support/debug tool for recovering a tenant from remote storage + if we don't have any metadata for it in the storage controller. + """ + + # This test is parametrized on remote storage because it exercises the relatively rare + # code path of listing with a prefix that is not a directory name: this helps us notice + # quickly if local_fs or s3_bucket implementations diverge. + neon_env_builder.enable_pageserver_remote_storage(remote_storage) + + # Use multiple pageservers because some test helpers assume single sharded tenants + # if there is only one pageserver. 
+    neon_env_builder.num_pageservers = 2
+
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+    tenant_id = env.initial_tenant
+
+    # Create a second timeline to ensure that import finds both
+    timeline_a = env.initial_timeline
+    timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id)
+
+    workload_a = Workload(env, tenant_id, timeline_a, branch_name="main")
+    workload_a.init()
+
+    workload_b = Workload(env, tenant_id, timeline_b, branch_name="branch_b")
+    workload_b.init()
+
+    # Write some data
+    workload_a.write_rows(72)
+    expect_rows_a = workload_a.expect_rows
+    workload_a.stop()
+    del workload_a
+
+    # Bump generation to make sure generation recovery works properly
+    for pageserver in env.pageservers:
+        pageserver.stop()
+        pageserver.start()
+
+    # Write some data in the higher generation into the other branch
+    workload_b.write_rows(107)
+    expect_rows_b = workload_b.expect_rows
+    workload_b.stop()
+    del workload_b
+
+    # Detach from pageservers
+    env.storage_controller.tenant_policy_update(
+        tenant_id,
+        {
+            "placement": "Detached",
+        },
+    )
+    env.storage_controller.reconcile_until_idle(timeout_secs=10)
+
+    # Force-drop it from the storage controller
+    env.storage_controller.request(
+        "POST",
+        f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop",
+        headers=env.storage_controller.headers(TokenScope.ADMIN),
+    )
+
+    # Now import it again
+    env.neon_cli.import_tenant(tenant_id)
+
+    # Check we found the shards
+    describe = env.storage_controller.tenant_describe(tenant_id)
+    literal_shard_count = 1 if shard_count is None else shard_count
+    assert len(describe["shards"]) == literal_shard_count
+
+    # Check the data is still there: this implicitly proves that we recovered generation numbers
+    # properly, for the timeline which was written to after a generation bump.
+    for timeline, branch, expect_rows in [
+        (timeline_a, "main", expect_rows_a),
+        (timeline_b, "branch_b", expect_rows_b),
+    ]:
+        workload = Workload(env, tenant_id, timeline, branch_name=branch)
+        workload.expect_rows = expect_rows
+        workload.validate()
+
+
+def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
+    """
+    Graceful restarts of storage controller clusters use the drain and
+    fill hooks in order to migrate attachments away from pageservers before
+    restarting. In practice, Ansible will drive this process.
+    """
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_count = 5
+    shard_count_per_tenant = 8
+    total_shards = tenant_count * shard_count_per_tenant
+    tenant_ids = []
+
+    for _ in range(0, tenant_count):
+        tid = TenantId.generate()
+        tenant_ids.append(tid)
+        env.neon_cli.create_tenant(
+            tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant
+        )
+
+    # Give things a chance to settle.
+ env.storage_controller.reconcile_until_idle(timeout_secs=30) + + nodes = env.storage_controller.node_list() + assert len(nodes) == 2 + + def assert_shard_counts_balanced(env: NeonEnv, shard_counts, total_shards): + # Assert that all nodes have some attached shards + assert len(shard_counts) == len(env.pageservers) + + min_shard_count = min(shard_counts.values()) + max_shard_count = max(shard_counts.values()) + + flake_factor = 5 / 100 + assert max_shard_count - min_shard_count <= int(total_shards * flake_factor) + + # Perform a graceful rolling restart + for ps in env.pageservers: + env.storage_controller.warm_up_all_secondaries() + + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 + ) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=6, + backoff=5, + ) + + shard_counts = get_node_shard_counts(env, tenant_ids) + log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") + # Assert that we've drained the node + assert shard_counts[ps.id] == 0 + # Assert that those shards actually went somewhere + assert sum(shard_counts.values()) == total_shards + + ps.restart() + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=10, + backoff=1, + ) + + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 + ) + env.storage_controller.poll_node_status( + ps.id, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=6, + backoff=5, + ) + + shard_counts = get_node_shard_counts(env, tenant_ids) + log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") + assert_shard_counts_balanced(env, shard_counts, total_shards) + + # Now check that shards are reasonably balanced + shard_counts = get_node_shard_counts(env, tenant_ids) + log.info(f"Shard counts after rolling restart: {shard_counts}") + assert_shard_counts_balanced(env, shard_counts, total_shards) + + +def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + """ + Artificially make a tenant shard's secondary location lag behind the primary + and check that storage controller driven node drains skip the lagging tenant shard. + Finally, validate that the tenant shard is migrated when a new drain request comes + in and it's no longer lagging. + """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.storage_controller_config = { + "max_secondary_lag_bytes": 1 * 1024 * 1024, + } + + env = neon_env_builder.init_configs() + env.start() + + tid, timeline_id = env.neon_cli.create_tenant(placement_policy='{"Attached":1}') + + # Give things a chance to settle. 
+    env.storage_controller.reconcile_until_idle(timeout_secs=30)
+
+    locations = env.storage_controller.locate(tid)
+    assert len(locations) == 1
+    primary: int = locations[0]["node_id"]
+    not_primary = [ps.id for ps in env.pageservers if ps.id != primary]
+    assert len(not_primary) == 1
+    secondary = not_primary[0]
+
+    log.info(f"Pausing secondary downloads on {secondary}")
+    env.get_pageserver(secondary).http_client().configure_failpoints(
+        ("secondary-layer-download-pausable", "pause")
+    )
+
+    log.info(f"Ingesting some data for {tid}")
+
+    with env.endpoints.create_start("main", tenant_id=tid) as endpoint:
+        run_pg_bench_small(pg_bin, endpoint.connstr())
+        endpoint.safe_psql("CREATE TABLE created_foo(id integer);")
+        last_flush_lsn_upload(env, endpoint, tid, timeline_id)
+
+    log.info(f"Uploading heatmap from {primary} and requesting download from {secondary}")
+
+    env.get_pageserver(primary).http_client().tenant_heatmap_upload(tid)
+    env.get_pageserver(secondary).http_client().tenant_secondary_download(tid, wait_ms=100)
+
+    def secondary_is_lagging():
+        resp = env.get_pageserver(secondary).http_client().tenant_secondary_status(tid)
+        lag = resp["bytes_total"] - resp["bytes_downloaded"]
+
+        if lag <= 1 * 1024 * 1024:
+            raise Exception(f"Secondary lag not big enough: {lag}")
+
+    log.info(f"Looking for lag to develop on the secondary {secondary}")
+    wait_until(10, 1, secondary_is_lagging)
+
+    log.info(f"Starting drain of primary {primary} with laggy secondary {secondary}")
+    env.storage_controller.retryable_node_operation(
+        lambda ps_id: env.storage_controller.node_drain(ps_id), primary, max_attempts=3, backoff=2
+    )
+
+    env.storage_controller.poll_node_status(
+        primary,
+        PageserverAvailability.ACTIVE,
+        PageserverSchedulingPolicy.PAUSE_FOR_RESTART,
+        max_attempts=6,
+        backoff=5,
+    )
+
+    locations = env.storage_controller.locate(tid)
+    assert len(locations) == 1
+    assert locations[0]["node_id"] == primary
+
+    log.info(f"Unpausing secondary downloads on {secondary}")
+    env.get_pageserver(secondary).http_client().configure_failpoints(
+        ("secondary-layer-download-pausable", "off")
+    )
+    env.get_pageserver(secondary).http_client().tenant_secondary_download(tid, wait_ms=100)
+
+    log.info(f"Waiting for lag to reduce on {secondary}")
+
+    def lag_is_acceptable():
+        resp = env.get_pageserver(secondary).http_client().tenant_secondary_status(tid)
+        lag = resp["bytes_total"] - resp["bytes_downloaded"]
+
+        if lag > 1 * 1024 * 1024:
+            raise Exception(f"Secondary lag still too high: {lag}")
+
+    wait_until(10, 1, lag_is_acceptable)
+
+    env.storage_controller.node_configure(primary, {"scheduling": "Active"})
+
+    log.info(f"Starting drain of primary {primary} with non-laggy secondary {secondary}")
+
+    env.storage_controller.retryable_node_operation(
+        lambda ps_id: env.storage_controller.node_drain(ps_id), primary, max_attempts=3, backoff=2
+    )
+
+    env.storage_controller.poll_node_status(
+        primary,
+        PageserverAvailability.ACTIVE,
+        PageserverSchedulingPolicy.PAUSE_FOR_RESTART,
+        max_attempts=6,
+        backoff=5,
+    )
+
+    locations = env.storage_controller.locate(tid)
+    assert len(locations) == 1
+    assert locations[0]["node_id"] == secondary
+
+
+def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_count = 10
+    shard_count_per_tenant = 8
+    tenant_ids = []
+
+    for _ in range(0, tenant_count):
+        tid = TenantId.generate()
+        tenant_ids.append(tid)
+        env.neon_cli.create_tenant(
+            tid,
placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant + ) + + # See sleep comment in the test above. + time.sleep(2) + + nodes = env.storage_controller.node_list() + assert len(nodes) == 2 + + env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(2000)")) + + ps_id_to_drain = env.pageservers[0].id + + env.storage_controller.warm_up_all_secondaries() + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), + ps_id_to_drain, + max_attempts=3, + backoff=2, + ) + + env.storage_controller.poll_node_status( + ps_id_to_drain, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.DRAINING, + max_attempts=6, + backoff=2, + ) + + env.storage_controller.cancel_node_drain(ps_id_to_drain) + + env.storage_controller.poll_node_status( + ps_id_to_drain, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=6, + backoff=2, + ) + + +@pytest.mark.parametrize("while_offline", [True, False]) +def test_storage_controller_node_deletion( + neon_env_builder: NeonEnvBuilder, + compute_reconfigure_listener: ComputeReconfigure, + while_offline: bool, +): + """ + Test that deleting a node works & properly reschedules everything that was on the node. + """ + neon_env_builder.num_pageservers = 3 + env = neon_env_builder.init_configs() + env.start() + + tenant_count = 10 + shard_count_per_tenant = 8 + tenant_ids = [] + for _ in range(0, tenant_count): + tid = TenantId.generate() + tenant_ids.append(tid) + env.neon_cli.create_tenant( + tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant + ) + + victim = env.pageservers[-1] + + # The procedure a human would follow is: + # 1. Mark pageserver scheduling=pause + # 2. Mark pageserver availability=offline to trigger migrations away from it + # 3. Wait for attachments to all move elsewhere + # 4. Call deletion API + # 5. Stop the node. 
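+    #
+    # As a sketch of steps 1-4 against the controller API (the calls below are the same
+    # fixtures this test uses; `all_shards_moved_off` is a hypothetical helper name):
+    #
+    #   env.storage_controller.node_configure(node_id, {"scheduling": "Pause"})
+    #   env.storage_controller.node_configure(node_id, {"availability": "Offline"})
+    #   wait_until(30, 1, lambda: all_shards_moved_off(node_id))
+    #   env.storage_controller.node_delete(node_id)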
+
+    env.storage_controller.node_configure(victim.id, {"scheduling": "Pause"})
+
+    if while_offline:
+        victim.stop(immediate=True)
+        env.storage_controller.node_configure(victim.id, {"availability": "Offline"})
+
+        def assert_shards_migrated():
+            counts = get_node_shard_counts(env, tenant_ids)
+            elsewhere = sum(v for (k, v) in counts.items() if k != victim.id)
+            log.info(f"Shards on nodes other than on victim: {elsewhere}")
+            assert elsewhere == tenant_count * shard_count_per_tenant
+
+        wait_until(30, 1, assert_shards_migrated)
+
+    log.info(f"Deleting pageserver {victim.id}")
+    env.storage_controller.node_delete(victim.id)
+
+    if not while_offline:
+
+        def assert_victim_evacuated():
+            counts = get_node_shard_counts(env, tenant_ids)
+            count = counts[victim.id]
+            log.info(f"Shards on node {victim.id}: {count}")
+            assert count == 0
+
+        wait_until(30, 1, assert_victim_evacuated)
+
+    # The node should be gone from the list API
+    assert victim.id not in [n["id"] for n in env.storage_controller.node_list()]
+
+    # No tenants should refer to the node in their intent
+    for tenant_id in tenant_ids:
+        describe = env.storage_controller.tenant_describe(tenant_id)
+        for shard in describe["shards"]:
+            assert shard["node_attached"] != victim.id
+            assert victim.id not in shard["node_secondary"]
+
+    # Reconciles running during deletion should all complete
+    # FIXME: this currently doesn't work because the deletion schedules shards without a proper ScheduleContext, resulting
+    # in states that background_reconcile wants to optimize, but can't proceed with migrations yet because this is a short
+    # test that hasn't uploaded any heatmaps for secondaries.
+    # In the interim, just do a reconcile_all to enable the consistency check.
+    # env.storage_controller.reconcile_until_idle()
+    env.storage_controller.reconcile_all()
+
+    # Controller should pass its own consistency checks
+    env.storage_controller.consistency_check()
+
+    # The node should stay gone across a restart
+    env.storage_controller.stop()
+    env.storage_controller.start()
+    assert victim.id not in [n["id"] for n in env.storage_controller.node_list()]
+    env.storage_controller.reconcile_all()  # FIXME: workaround for optimizations happening on startup, see FIXME above.
+    env.storage_controller.consistency_check()
+
+
+@pytest.mark.parametrize("shard_count", [None, 2])
+def test_storage_controller_metadata_health(
+    neon_env_builder: NeonEnvBuilder,
+    shard_count: Optional[int],
+):
+    """
+    Create three tenants A, B, C.
+
+    Phase 1:
+    - A: Post healthy status.
+    - B: Post unhealthy status.
+    - C: No updates.
+
+    Phase 2:
+    - B: Post healthy status.
+    - C: Post healthy status.
+
+    Phase 3:
+    - A: Post unhealthy status.
+
+    Phase 4:
+    - Delete tenant A, metadata health status should be deleted as well.
+    """
+
+    def update_and_query_metadata_health(
+        env: NeonEnv,
+        healthy: List[TenantShardId],
+        unhealthy: List[TenantShardId],
+        outdated_duration: str = "1h",
+    ) -> Tuple[Set[str], Set[str]]:
+        """
+        Update metadata health. Then list tenant shards with unhealthy and
+        outdated metadata health status.
+ """ + if healthy or unhealthy: + env.storage_controller.metadata_health_update(healthy, unhealthy) + result = env.storage_controller.metadata_health_list_unhealthy() + unhealthy_res = set(result["unhealthy_tenant_shards"]) + result = env.storage_controller.metadata_health_list_outdated(outdated_duration) + outdated_res = set(record["tenant_shard_id"] for record in result["health_records"]) + + return unhealthy_res, outdated_res + + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_start() + + # Mock tenant (`initial_tenant``) with healthy scrubber scan result + tenant_a_shard_ids = ( + env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=shard_count) + if shard_count is not None + else [TenantShardId(env.initial_tenant, 0, 0)] + ) + + # Mock tenant with unhealthy scrubber scan result + tenant_b, _ = env.neon_cli.create_tenant(shard_count=shard_count) + tenant_b_shard_ids = ( + env.storage_controller.tenant_shard_split(tenant_b, shard_count=shard_count) + if shard_count is not None + else [TenantShardId(tenant_b, 0, 0)] + ) + + # Mock tenant that never gets a health update from scrubber + tenant_c, _ = env.neon_cli.create_tenant(shard_count=shard_count) + + tenant_c_shard_ids = ( + env.storage_controller.tenant_shard_split(tenant_c, shard_count=shard_count) + if shard_count is not None + else [TenantShardId(tenant_c, 0, 0)] + ) + + # Metadata health table also updated as tenant shards are created. + assert env.storage_controller.metadata_health_is_healthy() + + # post "fake" updates to storage controller db + + unhealthy, outdated = update_and_query_metadata_health( + env, healthy=tenant_a_shard_ids, unhealthy=tenant_b_shard_ids + ) + + log.info(f"After Phase 1: {unhealthy=}, {outdated=}") + assert len(unhealthy) == len(tenant_b_shard_ids) + for t in tenant_b_shard_ids: + assert str(t) in unhealthy + assert len(outdated) == 0 + + unhealthy, outdated = update_and_query_metadata_health( + env, healthy=tenant_b_shard_ids + tenant_c_shard_ids, unhealthy=[] + ) + + log.info(f"After Phase 2: {unhealthy=}, {outdated=}") + assert len(unhealthy) == 0 + assert len(outdated) == 0 + + unhealthy, outdated = update_and_query_metadata_health( + env, healthy=[], unhealthy=tenant_a_shard_ids + ) + + log.info(f"After Phase 3: {unhealthy=}, {outdated=}") + assert len(unhealthy) == len(tenant_a_shard_ids) + for t in tenant_a_shard_ids: + assert str(t) in unhealthy + assert len(outdated) == 0 + + # Phase 4: Delete A + env.storage_controller.pageserver_api().tenant_delete(env.initial_tenant) + + # A's unhealthy metadata health status should be deleted as well. + assert env.storage_controller.metadata_health_is_healthy() + + # All shards from B and C are not fresh if set outdated duration to 0 seconds. + unhealthy, outdated = update_and_query_metadata_health( + env, healthy=[], unhealthy=tenant_a_shard_ids, outdated_duration="0s" + ) + assert len(unhealthy) == 0 + for t in tenant_b_shard_ids + tenant_c_shard_ids: + assert str(t) in outdated + + +def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): + """ + Test the `/control/v1/step_down` storage controller API. Upon receiving such + a request, the storage controller cancels any on-going reconciles and replies + with 503 to all requests apart from `/control/v1/step_down`, `/status` and `/metrics`. 
+ """ + env = neon_env_builder.init_configs() + env.start() + + tid = TenantId.generate() + tsid = str(TenantShardId(tid, shard_number=0, shard_count=0)) + env.storage_controller.tenant_create(tid) + + env.storage_controller.reconcile_until_idle() + env.storage_controller.configure_failpoints(("sleep-on-reconcile-epilogue", "return(10000)")) + + # Make a change to the tenant config to trigger a slow reconcile + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + virtual_ps_http.patch_tenant_config_client_side(tid, {"compaction_threshold": 5}, None) + env.storage_controller.allowed_errors.append( + ".*Accepted configuration update but reconciliation failed.*" + ) + + observed_state = env.storage_controller.step_down() + log.info(f"Storage controller stepped down with {observed_state=}") + + # Validate that we waited for the slow reconcile to complete + # and updated the observed state in the storcon before stepping down. + node_id = str(env.pageserver.id) + assert tsid in observed_state + assert node_id in observed_state[tsid]["locations"] + assert "conf" in observed_state[tsid]["locations"][node_id] + assert "tenant_conf" in observed_state[tsid]["locations"][node_id]["conf"] + + tenant_conf = observed_state[tsid]["locations"][node_id]["conf"]["tenant_conf"] + assert "compaction_threshold" in tenant_conf + assert tenant_conf["compaction_threshold"] == 5 + + # Validate that we propagated the change to the pageserver + ps_tenant_conf = env.pageserver.http_client().tenant_config(tid) + assert "compaction_threshold" in ps_tenant_conf.effective_config + assert ps_tenant_conf.effective_config["compaction_threshold"] == 5 + + # Validate that the storcon is not replying to the usual requests + # once it has stepped down. + with pytest.raises(StorageControllerApiException, match="stepped_down"): + env.storage_controller.tenant_list() + + # Validate that we can step down multiple times and the observed state + # doesn't change. + observed_state_again = env.storage_controller.step_down() + assert observed_state == observed_state_again + + assert ( + env.storage_controller.get_metric_value( + "storage_controller_leadership_status", filter={"status": "leader"} + ) + == 0 + ) + + assert ( + env.storage_controller.get_metric_value( + "storage_controller_leadership_status", filter={"status": "stepped_down"} + ) + == 1 + ) + + assert ( + env.storage_controller.get_metric_value( + "storage_controller_leadership_status", filter={"status": "candidate"} + ) + == 0 + ) + + +# This is a copy of NeonEnv.start which injects the instance id and port +# into the call to NeonStorageController.start +def start_env(env: NeonEnv, storage_controller_port: int): + timeout_in_seconds = 30 + + # Storage controller starts first, so that pageserver /re-attach calls don't + # bounce through retries on startup + env.storage_controller.start(timeout_in_seconds, 1, storage_controller_port) + + # Wait for storage controller readiness to prevent unnecessary post start-up + # reconcile. 
+ env.storage_controller.wait_until_ready() + + # Start up broker, pageserver and all safekeepers + futs = [] + with concurrent.futures.ThreadPoolExecutor( + max_workers=2 + len(env.pageservers) + len(env.safekeepers) + ) as executor: + futs.append( + executor.submit(lambda: env.broker.try_start() or None) + ) # The `or None` is for the linter + + for pageserver in env.pageservers: + futs.append( + executor.submit( + lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) + ) + ) + + for safekeeper in env.safekeepers: + futs.append( + executor.submit( + lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds) + ) + ) + + for f in futs: + f.result() + + +@pytest.mark.parametrize("step_down_times_out", [False, True]) +def test_storage_controller_leadership_transfer( + neon_env_builder: NeonEnvBuilder, + storage_controller_proxy: StorageControllerProxy, + port_distributor: PortDistributor, + step_down_times_out: bool, +): + neon_env_builder.auth_enabled = True + + neon_env_builder.num_pageservers = 3 + + neon_env_builder.storage_controller_config = { + "database_url": f"127.0.0.1:{port_distributor.get_port()}", + "start_as_candidate": True, + } + + neon_env_builder.storage_controller_port_override = storage_controller_proxy.port() + + storage_controller_1_port = port_distributor.get_port() + storage_controller_2_port = port_distributor.get_port() + + storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") + + env = neon_env_builder.init_configs() + start_env(env, storage_controller_1_port) + + assert ( + env.storage_controller.get_leadership_status() == StorageControllerLeadershipStatus.LEADER + ) + leader = env.storage_controller.get_leader() + assert leader["address"] == f"http://127.0.0.1:{storage_controller_1_port}/" + + if step_down_times_out: + env.storage_controller.configure_failpoints( + ("sleep-on-step-down-handling", "return(10000)") + ) + env.storage_controller.allowed_errors.append(".*request was dropped before completing.*") + + tenant_count = 2 + shard_count = 4 + tenants = set(TenantId.generate() for _ in range(0, tenant_count)) + + for tid in tenants: + env.storage_controller.tenant_create( + tid, shard_count=shard_count, placement_policy={"Attached": 1} + ) + env.storage_controller.reconcile_until_idle() + + env.storage_controller.start( + timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port + ) + + if not step_down_times_out: + + def previous_stepped_down(): + assert ( + env.storage_controller.get_leadership_status() + == StorageControllerLeadershipStatus.STEPPED_DOWN + ) + + wait_until(5, 1, previous_stepped_down) + + storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}") + + def new_becomes_leader(): + assert ( + env.storage_controller.get_leadership_status() + == StorageControllerLeadershipStatus.LEADER + ) + + wait_until(15, 1, new_becomes_leader) + leader = env.storage_controller.get_leader() + assert leader["address"] == f"http://127.0.0.1:{storage_controller_2_port}/" + + env.storage_controller.wait_until_ready() + env.storage_controller.consistency_check() + + if step_down_times_out: + env.storage_controller.allowed_errors.extend( + [ + ".*Leader.*did not respond to step-down request.*", + ".*Send step down request failed.*", + ".*Send step down request still failed.*", + ] + ) + + +def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder): + # single unsharded tenant, two locations + neon_env_builder.num_pageservers = 2 + + env = 
neon_env_builder.init_start()
+
+    env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 1}})
+    env.storage_controller.reconcile_until_idle()
+
+    attached_id = int(env.storage_controller.locate(env.initial_tenant)[0]["node_id"])
+    attached = next((ps for ps in env.pageservers if ps.id == attached_id))
+
+    def attached_is_draining():
+        details = env.storage_controller.node_status(attached.id)
+        assert details["scheduling"] == "Draining"
+
+    env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(10000)"))
+    env.storage_controller.node_drain(attached.id)
+
+    wait_until(10, 0.5, attached_is_draining)
+
+    attached.restart()
+
+    # We are unable to reconfigure the node while the operation is still ongoing
+    with pytest.raises(
+        StorageControllerApiException,
+        match="Precondition failed: Ongoing background operation forbids configuring: drain.*",
+    ):
+        env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"})
+    with pytest.raises(
+        StorageControllerApiException,
+        match="Precondition failed: Ongoing background operation forbids configuring: drain.*",
+    ):
+        env.storage_controller.node_configure(attached.id, {"availability": "Offline"})
+
+    env.storage_controller.cancel_node_drain(attached.id)
+
+    def reconfigure_node_again():
+        env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"})
+
+    # Allow for a small delay between actually having cancelled and being able to reconfigure again
+    wait_until(4, 0.5, reconfigure_node_again)
+
+
+def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder):
+    """
+    The storage controller is meant to handle the case where a timeline CRUD operation races
+    with a generation-incrementing change to the tenant: this should trigger a retry so that
+    the operation lands on the highest-generation'd tenant location.
+    """
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+    tenant_id = TenantId.generate()
+    env.storage_controller.tenant_create(tenant_id)
+
+    # Set up a failpoint so that a timeline creation will be very slow
+    failpoint = "timeline-creation-after-uninit"
+    for ps in env.pageservers:
+        ps.http_client().configure_failpoints((failpoint, "sleep(10000)"))
+
+    # Start a timeline creation in the background
+    create_timeline_id = TimelineId.generate()
+    futs = []
+    with concurrent.futures.ThreadPoolExecutor(
+        max_workers=2 + len(env.pageservers) + len(env.safekeepers)
+    ) as executor:
+        futs.append(
+            executor.submit(
+                env.storage_controller.pageserver_api(
+                    retries=Retry(
+                        status=0,
+                        connect=0,  # Disable retries: we want to see the 503
+                    )
+                ).timeline_create,
+                PgVersion.NOT_SET,
+                tenant_id,
+                create_timeline_id,
+            )
+        )
+
+        def has_hit_failpoint():
+            assert any(
+                ps.log_contains(f"at failpoint {failpoint}") is not None for ps in env.pageservers
+            )
+
+        wait_until(10, 1, has_hit_failpoint)
+
+        # Migrate the tenant while the timeline creation is in progress: this migration will complete once it
+        # can detach from the old pageserver, which will happen once the failpoint completes.
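+        # (The expected behavior, asserted below: the pageserver fails the in-flight
+        # creation with "Tenant attachment changed, please retry" once its attachment is
+        # superseded, so the operation can be retried against the new location.)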
+ env.storage_controller.tenant_shard_migrate( + TenantShardId(tenant_id, 0, 0), env.pageservers[1].id + ) + + with pytest.raises(PageserverApiException, match="Tenant attachment changed, please retry"): + futs[0].result(timeout=20) + + # Timeline creation should work when there isn't a concurrent migration, even though it's + # slow (our failpoint is still enabled) + env.storage_controller.pageserver_api( + retries=Retry( + status=0, + connect=0, # Disable retries: we want to see the 503 + ) + ).timeline_create(PgVersion.NOT_SET, tenant_id, create_timeline_id) + + +def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvBuilder): + """ + A correctness edge case: while we are live migrating and a shard's generation is + visible to the Reconciler but not to the central Service, the generation validation + API should still prevent stale generations from doing deletions. + """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + env = neon_env_builder.init_configs() + env.start() + + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": 128 * 1024, + "compaction_threshold": 1, + "compaction_target_size": 128 * 1024, + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + } + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + env.neon_cli.create_tenant(tenant_id, timeline_id) + env.storage_controller.pageserver_api().set_tenant_config(tenant_id, TENANT_CONF) + + # Write enough data that a compaction would do some work (deleting some L0s) + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(64) + for _i in range(0, 2): + workload.churn_rows(64, upload=False) + + # Upload but don't compact + origin_pageserver = env.get_tenant_pageserver(tenant_id) + dest_ps_id = [p.id for p in env.pageservers if p.id != origin_pageserver.id][0] + origin_pageserver.http_client().timeline_checkpoint( + tenant_id, timeline_id, wait_until_uploaded=True, compact=False + ) + + # Start a compaction that will pause on a failpoint. 
+    compaction_failpoint = "before-upload-index-pausable"
+    origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "pause"))
+
+    # This failpoint can also cause migration code to time out trying to politely flush
+    # during migrations
+    origin_pageserver.allowed_errors.append(".*Timed out waiting for flush to remote storage.*")
+
+    # Defined before the try block so that the except clause can always reference it
+    migration_failpoint = "reconciler-live-migrate-post-notify"
+
+    try:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+            compact_fut = executor.submit(
+                origin_pageserver.http_client().timeline_compact,
+                tenant_id,
+                timeline_id,
+                wait_until_uploaded=True,
+            )
+
+            # Let the compaction start and then get stuck uploading an index: when we live migrate, the new generation's
+            # index will be initialized from the pre-compaction index, referencing layers that the compaction will try to delete
+            def has_hit_compaction_failpoint():
+                assert origin_pageserver.log_contains(f"at failpoint {compaction_failpoint}")
+
+            wait_until(10, 1, has_hit_compaction_failpoint)
+
+            # While the compaction is running, start a live migration which will pause long enough for the compaction to sleep,
+            # after incrementing generation and attaching the new location
+            env.storage_controller.configure_failpoints((migration_failpoint, "pause"))
+            migrate_fut = executor.submit(
+                env.storage_controller.tenant_shard_migrate,
+                TenantShardId(tenant_id, 0, 0),
+                dest_ps_id,
+            )
+
+            def has_hit_migration_failpoint():
+                assert env.storage_controller.log_contains(f"at failpoint {migration_failpoint}")
+
+            # Long wait because the migration will have to time out during transition to AttachedStale
+            # before it reaches this point. The timeout is because the AttachedStale transition includes
+            # a flush of remote storage, and if the compaction already enqueued an index upload this cannot
+            # make progress.
+            wait_until(60, 1, has_hit_migration_failpoint)
+
+            # The origin pageserver has succeeded with compaction before the migration completed: it has done
+            # all the writes it wanted to do in its own (stale) generation.
+            origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off"))
+            compact_fut.result()
+            origin_pageserver.http_client().deletion_queue_flush(execute=True)
+
+            # Eventually the migration completes
+            env.storage_controller.configure_failpoints((migration_failpoint, "off"))
+            migrate_fut.result()
+    except:
+        # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown
+        env.storage_controller.configure_failpoints((migration_failpoint, "off"))
+        origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off"))
+        raise
+
+    # Ensure the destination of the migration writes an index, so that if it has corrupt state,
+    # that state is visible to the scrubber.
+    workload.write_rows(1, upload=False)
+    env.get_pageserver(dest_ps_id).http_client().timeline_checkpoint(
+        tenant_id, timeline_id, wait_until_uploaded=True, compact=False
+    )
+
+    # The destination of the live migration would now have a corrupt index (referencing deleted L0s) if
+    # the controller had not properly applied validation rules.
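+    # (Mechanism, per this test's docstring: queued deletions are validated against the
+    # controller before execution, so deletions enqueued by the stale generation are
+    # dropped rather than applied to objects the new generation's index still references.)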
+    healthy, _summary = env.storage_scrubber.scan_metadata()
+    try:
+        log.info(f"scrubbed, healthy={healthy}")
+        assert healthy
+    except:
+        # On failures, we want to report them FAIL during the test, not as ERROR during teardown
+        neon_env_builder.enable_scrub_on_exit = False
+        raise
+
+
+@run_only_on_default_postgres("this is like a 'unit test' against storcon db")
+def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    fake_id = 5
+
+    target = env.storage_controller
+
+    assert target.get_safekeeper(fake_id) is None
+
+    body = {
+        "active": True,
+        "id": fake_id,
+        "created_at": "2023-10-25T09:11:25Z",
+        "updated_at": "2024-08-28T11:32:43Z",
+        "region_id": "aws-us-east-2",
+        "host": "safekeeper-333.us-east-2.aws.neon.build",
+        "port": 6401,
+        "http_port": 7676,
+        "version": 5957,
+        "availability_zone_id": "us-east-2b",
+    }
+
+    target.on_safekeeper_deploy(fake_id, body)
+
+    inserted = target.get_safekeeper(fake_id)
+    assert inserted is not None
+    assert eq_safekeeper_records(body, inserted)
+
+    # Error out if the primary key is changed (unexpected)
+    with pytest.raises(StorageControllerApiException) as exc:
+        different_pk = dict(body)
+        different_pk["id"] = 4
+        assert different_pk["id"] != body["id"]
+        target.on_safekeeper_deploy(fake_id, different_pk)
+    assert exc.value.status_code == 400
+
+    inserted_again = target.get_safekeeper(fake_id)
+    assert inserted_again is not None
+    assert eq_safekeeper_records(inserted, inserted_again)
+
+    # The most common case: the version goes up
+    assert isinstance(body["version"], int)
+    body["version"] += 1
+    target.on_safekeeper_deploy(fake_id, body)
+    inserted_now = target.get_safekeeper(fake_id)
+    assert inserted_now is not None
+
+    assert eq_safekeeper_records(body, inserted_now)
+
+
+def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
+    compared = [dict(a), dict(b)]
+
+    masked_keys = ["created_at", "updated_at"]
+
+    for d in compared:
+        # Keep deleting these in case we are comparing a body as it will be uploaded by real scripts
+        for key in masked_keys:
+            if key in d:
+                del d[key]
+
+    return compared[0] == compared[1]
+
+
+@run_only_on_default_postgres("this is like a 'unit test' against storcon db")
+def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder):
+    def assign_az(ps_cfg):
+        az = f"az-{ps_cfg['id']}"
+        ps_cfg["availability_zone"] = az
+
+    neon_env_builder.pageserver_config_override = assign_az
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tids = [TenantId.generate() for _ in range(0, 3)]
+    for tid in tids:
+        env.storage_controller.tenant_create(tid)
+
+        shards = env.storage_controller.tenant_describe(tid)["shards"]
+        assert len(shards) == 1
+        attached_to = shards[0]["node_attached"]
+        expected_az = env.get_pageserver(attached_to).az_id
+
+        assert shards[0]["preferred_az_id"] == expected_az
+
+    updated = env.storage_controller.set_preferred_azs(
+        {TenantShardId(tid, 0, 0): "foo" for tid in tids}
+    )
+
+    assert set(updated) == set([TenantShardId(tid, 0, 0) for tid in tids])
+
+    for tid in tids:
+        shards = env.storage_controller.tenant_describe(tid)["shards"]
+        assert len(shards) == 1
+        assert shards[0]["preferred_az_id"] == "foo"
+
+    # Generate a layer so that shard split handling on the pageserver doesn't trip
+    # up on a debug assert.
+ timeline_id = TimelineId.generate() + env.neon_cli.create_timeline("bar", tids[0], timeline_id) + + workload = Workload(env, tids[0], timeline_id, branch_name="bar") + workload.init() + workload.write_rows(256) + workload.validate() + + env.storage_controller.tenant_shard_split(tids[0], shard_count=2) + shards = env.storage_controller.tenant_describe(tids[0])["shards"] + assert len(shards) == 2 + for shard in shards: + attached_to = shard["node_attached"] + expected_az = env.get_pageserver(attached_to).az_id + assert shard["preferred_az_id"] == expected_az diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py new file mode 100644 index 0000000000..848e214c5e --- /dev/null +++ b/test_runner/regress/test_storage_scrubber.py @@ -0,0 +1,572 @@ +import os +import pprint +import shutil +import threading +import time +from concurrent.futures import ThreadPoolExecutor +from typing import Optional + +import pytest +from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, +) +from fixtures.pg_version import PgVersion +from fixtures.remote_storage import S3Storage, s3_storage +from fixtures.utils import wait_until +from fixtures.workload import Workload + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + """ + Test the `tenant-snapshot` subcommand, which grabs data from remote storage + + This is only a support/debug tool, but worth testing to ensure the tool does not regress. + """ + + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1 + + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + branch = "main" + + # Do some work + workload = Workload(env, tenant_id, timeline_id, branch) + workload.init() + + # Multiple write/flush passes to generate multiple layers + for _n in range(0, 3): + workload.write_rows(128) + + # Do some more work after a restart, so that we have multiple generations + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + for _n in range(0, 3): + workload.write_rows(128) + + # If we're doing multiple shards, split: this is important to exercise + # the scrubber's ability to understand the references from child shards to parent shard's layers + if shard_count is not None: + tenant_shard_ids = env.storage_controller.tenant_shard_split( + tenant_id, shard_count=shard_count + ) + + # Write after shard split: this will result in shards containing a mixture of owned + # and parent layers in their index. 
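+        # (Child shards initially serve reads from layer files written by their parent
+        # shard, so their indices legitimately reference objects under the parent's key
+        # prefix until compaction rewrites that data; the snapshot must capture both.)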
+ workload.write_rows(128) + else: + tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)] + + output_path = neon_env_builder.test_output_dir / "snapshot" + os.makedirs(output_path) + + env.storage_scrubber.tenant_snapshot(tenant_id, output_path) + + assert len(os.listdir(output_path)) > 0 + + workload.stop() + + # Stop pageservers + for pageserver in env.pageservers: + pageserver.stop() + + # Drop all shards' local storage + for tenant_shard_id in tenant_shard_ids: + pageserver = env.get_tenant_pageserver(tenant_shard_id) + shutil.rmtree(pageserver.timeline_dir(tenant_shard_id, timeline_id)) + + # Replace remote storage contents with the snapshot we downloaded + assert isinstance(env.pageserver_remote_storage, S3Storage) + + remote_tenant_path = env.pageserver_remote_storage.tenant_path(tenant_id) + + # Delete current remote storage contents + bucket = env.pageserver_remote_storage.bucket_name + remote_client = env.pageserver_remote_storage.client + deleted = 0 + for object in remote_client.list_objects_v2(Bucket=bucket, Prefix=remote_tenant_path)[ + "Contents" + ]: + key = object["Key"] + remote_client.delete_object(Key=key, Bucket=bucket) + deleted += 1 + assert deleted > 0 + + # Upload from snapshot + for root, _dirs, files in os.walk(output_path): + for file in files: + full_local_path = os.path.join(root, file) + full_remote_path = ( + env.pageserver_remote_storage.tenants_path() + + "/" + + full_local_path.removeprefix(f"{output_path}/") + ) + remote_client.upload_file(full_local_path, bucket, full_remote_path) + + for pageserver in env.pageservers: + pageserver.start() + + # Check we can read everything + workload.validate() + + +def drop_local_state(env: NeonEnv, tenant_id: TenantId): + env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) + env.storage_controller.reconcile_until_idle() + + env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}}) + env.storage_controller.reconcile_until_idle() + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id, shard_count=shard_count) + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + + # We will end up with an index per shard, per cycle, plus one for the initial startup + n_cycles = 4 + expect_indices_per_shard = n_cycles + 1 + shard_count = 1 if shard_count is None else shard_count + + # For each cycle, detach and attach the tenant to bump the generation, and do some writes to generate uploads + for _i in range(0, n_cycles): + drop_local_state(env, tenant_id) + + # This write includes remote upload, will generate an index in this generation + workload.write_rows(1) + + # We will use a min_age_secs=1 threshold for deletion, let it pass + time.sleep(2) + + # With a high min_age, the scrubber should decline to delete anything + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600) + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + + # If targeting a different tenant, the scrubber shouldn't do anything + gc_summary = env.storage_scrubber.pageserver_physical_gc( + min_age_secs=1, tenant_ids=[TenantId.generate()] + ) + assert 
gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + + # With a low min_age, the scrubber should go ahead and clean up all but the latest 2 generations + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1) + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == (expect_indices_per_shard - 2) * shard_count + + +@pytest.mark.parametrize("shard_count", [None, 2]) +def test_scrubber_physical_gc_ancestors( + neon_env_builder: NeonEnvBuilder, shard_count: Optional[int] +): + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=shard_count, + conf={ + # Small layers and low compaction thresholds, so that when we split we can expect some to + # be dropped by child shards + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + # Disable automatic creation of image layers, as future image layers can result in layers in S3 that + # aren't referenced by children, earlier than the test expects such layers to exist + "image_creation_threshold": "9999", + "image_layer_creation_check_threshold": "0", + # Disable background compaction, we will do it explicitly + "compaction_period": "0s", + # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas + # and makes them GC'able + "pitr_interval": "0s", + }, + ) + + # Create an extra timeline, to ensure the scrubber isn't confused by multiple timelines + env.storage_controller.pageserver_api().timeline_create( + env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate() + ) + + # Make sure the original shard has some layers + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100) + + # Issue a deletion queue flush so that the parent shard can't leave behind layers + # that will look like unexpected garbage to the scrubber + for pre_split_shard in env.storage_controller.locate(tenant_id): + env.get_pageserver(pre_split_shard["node_id"]).http_client().deletion_queue_flush( + execute=True + ) + + new_shard_count = 4 + assert shard_count is None or new_shard_count > shard_count + shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) + env.storage_controller.reconcile_until_idle() # Move shards to their final locations immediately + + # Create a timeline after split, to ensure scrubber can handle timelines that exist in child shards but not ancestors + env.storage_controller.pageserver_api().timeline_create( + env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate() + ) + + # Make sure child shards have some layers. Do not force upload, because the test helper calls checkpoint, which + # compacts, and we only want to do tha explicitly later in the test. 
+ workload.write_rows(100, upload=False) + for shard in shards: + ps = env.get_tenant_pageserver(shard) + log.info(f"Waiting for shard {shard} on pageserver {ps.id}") + ps.http_client().timeline_checkpoint( + shard, timeline_id, compact=False, wait_until_uploaded=True + ) + + # Flush deletion queue so that we don't leave any orphan layers in the parent that will confuse subsequent checks: once + # a shard is split, any layers in its prefix that aren't referenced by a child will be considered GC'able, even + # if they were logically deleted before the shard split, just not physically deleted yet because of the queue. + for ps in env.pageservers: + ps.http_client().deletion_queue_flush(execute=True) + + # Before compacting, all the layers in the ancestor should still be referenced by the children: the scrubber + # should not erase any ancestor layers + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] == 0 + + # Write some data and compact: compacting, some ancestor layers should no longer be needed by children + # (the compaction is part of the checkpoint that Workload does for us) + workload.churn_rows(100) + workload.churn_rows(100) + workload.churn_rows(100) + for shard in shards: + ps = env.get_tenant_pageserver(shard) + ps.http_client().timeline_compact(shard, timeline_id, force_image_layer_creation=True) + ps.http_client().timeline_gc(shard, timeline_id, 0) + + # We will use a min_age_secs=1 threshold for deletion, let it pass + time.sleep(2) + + # Our time threshold should be respected: check that with a high threshold we delete nothing + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] == 0 + + # Now run with a low time threshold: deletions of ancestor layers should be executed + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] > 0 + + # We deleted some layers: now check we didn't corrupt the tenant by doing so. Detach and + # attach it, to drop any local state, then check it's still readable. + workload.stop() + drop_local_state(env, tenant_id) + workload.validate() + + +def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder): + """ + When we delete a timeline after a shard split, the child shards do not directly delete the + layers in the ancestor shards. They rely on the scrubber to clean up. 
+ """ + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=None, + conf={ + # Small layers and low compaction thresholds, so that when we split we can expect some to + # be dropped by child shards + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + "image_creation_threshold": "2", + "image_layer_creation_check_threshold": "0", + # Disable background compaction, we will do it explicitly + "compaction_period": "0s", + # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas + # and makes them GC'able + "pitr_interval": "0s", + }, + ) + + # Make sure the original shard has some layers + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100, upload=False) + workload.stop() + + # Issue a deletion queue flush so that the parent shard can't leave behind layers + # that will look like unexpected garbage to the scrubber + env.get_tenant_pageserver(tenant_id).http_client().deletion_queue_flush(execute=True) + + new_shard_count = 4 + shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) + for shard in shards: + ps = env.get_tenant_pageserver(shard) + log.info(f"Waiting for shard {shard} on pageserver {ps.id}") + ps.http_client().timeline_checkpoint( + shard, timeline_id, compact=False, wait_until_uploaded=True + ) + + ps.http_client().deletion_queue_flush(execute=True) + + # Create a second timeline so that when we delete the first one, child shards still have some content in S3. + # + # This is a limitation of the scrubber: if a shard isn't in S3 (because it has no timelines), then the scrubber + # doesn't know about it, and won't perceive its ancestors as ancestors. + other_timeline_id = TimelineId.generate() + env.storage_controller.pageserver_api().timeline_create( + PgVersion.NOT_SET, tenant_id, other_timeline_id + ) + + # The timeline still exists in child shards and they reference its layers, so scrubbing + # now shouldn't delete anything. + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] == 0 + + # Delete the timeline + env.storage_controller.pageserver_api().timeline_delete(tenant_id, timeline_id) + + # Subsequently doing physical GC should clean up the ancestor layers + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["ancestor_layers_deleted"] > 0 + + +def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder): + """ + Exercise ancestor GC while a tenant is partly split: this test ensures that if we have some child shards + which don't reference an ancestor, but some child shards that don't exist yet, then we do not incorrectly + GC any ancestor layers. 
+ """ + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + initial_shard_count = 2 + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=initial_shard_count, + conf={ + # Small layers and low compaction thresholds, so that when we split we can expect some to + # be dropped by child shards + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + "image_creation_threshold": "2", + "image_layer_creation_check_threshold": "0", + # Disable background compaction, we will do it explicitly + "compaction_period": "0s", + # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas + # and makes them GC'able + "pitr_interval": "0s", + }, + ) + + unstuck = threading.Event() + + def stuck_split(): + # Pause our shard split after the first shard but before the second, such that when we run + # the scrub, the S3 bucket contains shards 0002, 0101, 0004, 0204 (but not 0104, 0304). + env.storage_controller.configure_failpoints( + ("shard-split-post-remote-sleep", "return(3600000)") + ) + try: + split_response = env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + except Exception as e: + log.info(f"Split failed with {e}") + else: + if not unstuck.is_set(): + raise RuntimeError(f"Split succeeded unexpectedly ({split_response})") + + with ThreadPoolExecutor(max_workers=1) as threads: + log.info("Starting hung shard split") + stuck_split_fut = threads.submit(stuck_split) + + # Let the controller reach the failpoint + wait_until( + 10, + 1, + lambda: env.storage_controller.assert_log_contains( + 'failpoint "shard-split-post-remote-sleep": sleeping' + ), + ) + + # Run compaction on the new child shards, so that they drop some refs to their parent + child_shards = [ + TenantShardId(tenant_id, 0, 4), + TenantShardId(tenant_id, 2, 4), + ] + log.info("Compacting first two children") + for child in child_shards: + env.get_tenant_pageserver( + TenantShardId(tenant_id, 0, initial_shard_count) + ).http_client().timeline_compact(child, timeline_id) + + # Check that the other child shards weren't created + assert env.get_tenant_pageserver(TenantShardId(tenant_id, 1, 4)) is None + assert env.get_tenant_pageserver(TenantShardId(tenant_id, 3, 4)) is None + + # Run scrubber: it should not incorrectly interpret the **04 shards' lack of refs to all + # ancestor layers as a reason to GC them, because it should realize that a split is in progress. + # (GC requires that controller does not indicate split in progress, and that if we see the highest + # shard count N, then there are N shards present with that shard count). 
+ gc_output = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")
+ log.info(f"Ran physical GC partway through split: {gc_output}")
+ assert gc_output["ancestor_layers_deleted"] == 0
+ assert gc_output["remote_storage_errors"] == 0
+ assert gc_output["controller_api_errors"] == 0
+
+ # Storage controller shutdown lets our split request client complete
+ log.info("Stopping storage controller")
+ unstuck.set()
+ env.storage_controller.allowed_errors.append(".*Timed out joining HTTP server task.*")
+ env.storage_controller.stop()
+ stuck_split_fut.result()
+
+ # Restart the controller and retry the split with the failpoint disabled; this should
+ # complete successfully and result in an S3 state that allows the scrubber to proceed with removing ancestor layers
+ log.info("Starting & retrying split")
+ env.storage_controller.start()
+ env.storage_controller.tenant_shard_split(tenant_id, shard_count=4)
+
+ # The other child shards exist now; we can compact them to drop refs to the ancestor
+ log.info("Compacting second two children")
+ for child in [
+ TenantShardId(tenant_id, 1, 4),
+ TenantShardId(tenant_id, 3, 4),
+ ]:
+ env.get_tenant_pageserver(child).http_client().timeline_compact(child, timeline_id)
+
+ gc_output = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")
+ log.info(f"Ran physical GC after split completed: {gc_output}")
+ assert gc_output["ancestor_layers_deleted"] > 0
+ assert gc_output["remote_storage_errors"] == 0
+ assert gc_output["controller_api_errors"] == 0
+
+
+@pytest.mark.parametrize("shard_count", [None, 4])
+def test_scrubber_scan_pageserver_metadata(
+ neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]
+):
+ """
+ Create some layers. Delete an object listed in the index. Run the scrubber and check that it detects the defect.
+ """
+
+ # Use s3_storage so that we can exercise the scrubber.
+ neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+ neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1
+ env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+
+ # Create some layers.
+
+ workload = Workload(env, env.initial_tenant, env.initial_timeline)
+ workload.init()
+
+ for _ in range(3):
+ workload.write_rows(128)
+
+ for pageserver in env.pageservers:
+ pageserver.stop()
+ pageserver.start()
+
+ for _ in range(3):
+ workload.write_rows(128)
+
+ # Get the latest index for a particular timeline.
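+ # "Latest" here is a generation comparison: index_part objects carry the hex
+ # generation that wrote them as a suffix (e.g. "index_part.json-00000002"), while a
+ # bare "index_part.json" predates generation numbers. A rough sketch of that
+ # ordering, as an assumed illustration of what get_latest_index_key does:
+ def index_generation(key: str) -> int:
+     suffix = key.rsplit("-", 1)[-1]
+     try:
+         return int(suffix, 16)
+     except ValueError:
+         return -1  # bare index_part.json sorts before any generation
+
+ # e.g. max(index_keys, key=index_generation) would pick the newest index.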
+
+ tenant_shard_id = TenantShardId(env.initial_tenant, 0, shard_count if shard_count else 0)
+
+ assert isinstance(env.pageserver_remote_storage, S3Storage)
+ timeline_path = env.pageserver_remote_storage.timeline_path(
+ tenant_shard_id, env.initial_timeline
+ )
+
+ client = env.pageserver_remote_storage.client
+ bucket = env.pageserver_remote_storage.bucket_name
+ objects = client.list_objects_v2(Bucket=bucket, Prefix=f"{timeline_path}/", Delimiter="").get(
+ "Contents", []
+ )
+ keys = [obj["Key"] for obj in objects]
+ index_keys = list(filter(lambda s: s.startswith(f"{timeline_path}/index_part"), keys))
+ assert len(index_keys) > 0
+
+ latest_index_key = env.pageserver_remote_storage.get_latest_index_key(index_keys)
+ log.info(f"{latest_index_key=}")
+
+ index = env.pageserver_remote_storage.download_index_part(latest_index_key)
+
+ assert len(index.layer_metadata) > 0
+ it = iter(index.layer_metadata.items())
+
+ healthy, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
+ assert healthy
+
+ assert env.storage_controller.metadata_health_is_healthy()
+
+ # Delete a layer file that is listed in the index.
+ layer, metadata = next(it)
+ log.info(f"Deleting {timeline_path}/{layer.to_str()}")
+ delete_response = client.delete_object(
+ Bucket=bucket,
+ Key=f"{timeline_path}/{layer.to_str()}-{metadata.generation:08x}",
+ )
+ log.info(f"delete response: {delete_response}")
+
+ # Check scan summary without posting to the storage controller. Expect it to be an L0 layer, so it should only emit warnings.
+ _, scan_summary = env.storage_scrubber.scan_metadata()
+ log.info(f"{pprint.pformat(scan_summary)}")
+ assert len(scan_summary["with_warnings"]) > 0
+
+ assert env.storage_controller.metadata_health_is_healthy()
+
+ # Now post to the storage controller and expect to see one unhealthy health record
+ _, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
+ log.info(f"{pprint.pformat(scan_summary)}")
+ assert len(scan_summary["with_warnings"]) > 0
+
+ unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"]
+ assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id)
+
+ neon_env_builder.disable_scrub_on_exit()
diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py
new file mode 100644
index 0000000000..91caad7220
--- /dev/null
+++ b/test_runner/regress/test_subscriber_restart.py
@@ -0,0 +1,57 @@
+import threading
+import time
+
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.utils import wait_until
+
+
+# This test checks that the logical replication subscriber is able to correctly restart replication without receiving duplicates.
+# It requires tracking information about replication origins on the pageserver side.
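+#
+# As a point of reference, the subscriber's restart position lives in its replication
+# origin, which Postgres exposes via pg_replication_origin_status; losing that state
+# across a restart is exactly what would make the subscriber re-request
+# already-applied WAL and produce duplicates. A hypothetical helper (not used by
+# this test) to read it, given a cursor on the subscriber:
+def get_origin_remote_lsn(cursor):
+    # remote_lsn reports how far this subscriber has applied from the publisher
+    cursor.execute("SELECT remote_lsn FROM pg_replication_origin_status")
+    row = cursor.fetchone()
+    return row[0] if row else None
+
+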
+def test_subscriber_restart(neon_simple_env: NeonEnv):
+ env = neon_simple_env
+ env.neon_cli.create_branch("publisher")
+ pub = env.endpoints.create("publisher")
+ pub.start()
+
+ env.neon_cli.create_branch("subscriber")
+ sub = env.endpoints.create("subscriber")
+ sub.start()
+
+ n_records = 100000
+ n_restarts = 100
+
+ def check_that_changes_propagated():
+ scur.execute("SELECT count(*) FROM t")
+ res = scur.fetchall()
+ assert res[0][0] == n_records
+
+ def insert_data(pub):
+ with pub.cursor() as pcur:
+ for i in range(0, n_records):
+ pcur.execute("INSERT into t values (%s,random()*100000)", (i,))
+
+ with pub.cursor() as pcur:
+ with sub.cursor() as scur:
+ pcur.execute("CREATE TABLE t (pk integer primary key, sk integer)")
+ pcur.execute("CREATE PUBLICATION pub FOR TABLE t")
+ scur.execute("CREATE TABLE t (pk integer primary key, sk integer)")
+ # scur.execute("CREATE INDEX on t(sk)") # slowdown applying WAL at replica
+ pub_conn = f"host=localhost port={pub.pg_port} dbname=postgres user=cloud_admin"
+ query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub"
+ scur.execute(query)
+ time.sleep(2) # let initial table sync complete
+
+ thread = threading.Thread(target=insert_data, args=(pub,), daemon=True)
+ thread.start()
+
+ for _ in range(n_restarts):
+ # restart subscriber
+ # time.sleep(2)
+ sub.stop("immediate")
+ sub.start()
+
+ thread.join()
+ pcur.execute(f"INSERT into t values ({n_records}, 0)")
+ n_records += 1
+ with sub.cursor() as scur:
+ wait_until(60, 0.5, check_that_changes_propagated)
diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py
index eb96a8faa4..10cb00c780 100644
--- a/test_runner/regress/test_subxacts.py
+++ b/test_runner/regress/test_subxacts.py
@@ -1,4 +1,3 @@
-from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
@@ -13,15 +12,10 @@ def test_subxacts(neon_simple_env: NeonEnv, test_output_dir):
 env.neon_cli.create_branch("test_subxacts", "empty")
 endpoint = env.endpoints.create_start("test_subxacts")
- log.info("postgres is running on 'test_subxacts' branch")
 pg_conn = endpoint.connect()
 cur = pg_conn.cursor()
- cur.execute(
- """
- CREATE TABLE t1(i int, j int);
- """
- )
+ cur.execute("CREATE TABLE t1(i int, j int);")
 cur.execute("select pg_switch_wal();")
diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index 2ed22cabc4..9fb7324fa1 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -1,29 +1,34 @@
 import json
-from contextlib import closing
+from typing import Any, Dict
-import psycopg2.extras
-from fixtures.log_helper import log
+from fixtures.common_types import Lsn
 from fixtures.neon_fixtures import (
 NeonEnvBuilder,
 )
 from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
-from fixtures.types import Lsn
 from fixtures.utils import wait_until
+from fixtures.workload import Workload
 def test_tenant_config(neon_env_builder: NeonEnvBuilder):
 """Test per tenant configuration"""
- # set some non-default global config
- neon_env_builder.pageserver_config_override = """
-page_cache_size=444;
-wait_lsn_timeout='111 s';
-[tenant_config]
-checkpoint_distance = 10000
-compaction_target_size = 1048576
-evictions_low_residence_duration_metric_threshold = "2 days"
-eviction_policy = { 
"kind" = "LayerAccessThreshold", period = "20s", threshold = "23 hours" } -""" + + def set_some_nondefault_global_config(ps_cfg: Dict[str, Any]): + ps_cfg["page_cache_size"] = 444 + ps_cfg["wait_lsn_timeout"] = "111 s" + + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config["checkpoint_distance"] = 10000 + tenant_config["compaction_target_size"] = 1048576 + tenant_config["evictions_low_residence_duration_metric_threshold"] = "2 days" + tenant_config["eviction_policy"] = { + "kind": "LayerAccessThreshold", + "period": "20s", + "threshold": "23 hours", + } + + neon_env_builder.pageserver_config_override = set_some_nondefault_global_config env = neon_env_builder.init_start() # we configure eviction but no remote storage, there might be error lines @@ -56,25 +61,6 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = # check the configuration of the default tenant # it should match global configuration - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - log.info(f"show {env.initial_tenant}") - pscur.execute(f"show {env.initial_tenant}") - res = pscur.fetchone() - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 10000, - "compaction_target_size": 1048576, - "compaction_period": 20, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 60 * 60, - "image_creation_threshold": 3, - "pitr_interval": 604800, # 7 days - }.items() - ), f"Unexpected res: {res}" default_tenant_config = http_client.tenant_config(tenant_id=env.initial_tenant) assert ( not default_tenant_config.tenant_specific_overrides @@ -96,25 +82,6 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = } # check the configuration of the new tenant - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 20000, - "compaction_target_size": 1048576, - "compaction_period": 20, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 30, - "image_creation_threshold": 3, - "pitr_interval": 604800, - }.items() - ), f"Unexpected res: {res}" new_tenant_config = http_client.tenant_config(tenant_id=tenant) new_specific_config = new_tenant_config.tenant_specific_overrides assert new_specific_config["checkpoint_distance"] == 20000 @@ -159,25 +126,6 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = conf=conf_update, ) - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"after config res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 15000, - "compaction_target_size": 1048576, - "compaction_period": 80, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 80, - "image_creation_threshold": 2, - "pitr_interval": 604800, - }.items() - ), f"Unexpected res: {res}" updated_tenant_config = http_client.tenant_config(tenant_id=tenant) updated_specific_config = updated_tenant_config.tenant_specific_overrides assert updated_specific_config["checkpoint_distance"] == 15000 @@ -215,25 +163,6 @@ eviction_policy = { "kind" = 
"LayerAccessThreshold", period = "20s", threshold = env.pageserver.stop() env.pageserver.start() - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"after restart res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "checkpoint_distance": 15000, - "compaction_target_size": 1048576, - "compaction_period": 80, - "compaction_threshold": 10, - "gc_horizon": 67108864, - "gc_period": 80, - "image_creation_threshold": 2, - "pitr_interval": 604800, - }.items() - ), f"Unexpected res: {res}" restarted_tenant_config = http_client.tenant_config(tenant_id=tenant) assert ( restarted_tenant_config == updated_tenant_config @@ -270,25 +199,16 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = "period": "20s", "threshold": "23h", } - assert final_effective_config["max_lsn_wal_lag"] == 10 * 1024 * 1024 + assert final_effective_config["max_lsn_wal_lag"] == 1024 * 1024 * 1024 # restart the pageserver and ensure that the config is still correct env.pageserver.stop() env.pageserver.start() - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: - pscur.execute(f"show {tenant}") - res = pscur.fetchone() - log.info(f"after restart res: {res}") - assert res is not None - assert all( - i in res.items() - for i in { - "compaction_period": 20, - "pitr_interval": 60, - }.items() - ), f"Unexpected res: {res}" + restarted_final_tenant_config = http_client.tenant_config(tenant_id=tenant) + assert ( + restarted_final_tenant_config == final_tenant_config + ), "Updated config should not change after the restart" def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): @@ -299,8 +219,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): # tenant is created with defaults, as in without config file (tenant_id, timeline_id) = env.neon_cli.create_tenant() - config_path = env.pageserver.tenant_dir(tenant_id) / "config" - assert config_path.exists(), "config file is always initially created" + config_path = env.pageserver.tenant_dir(tenant_id) / "config-v1" http_client = env.pageserver.http_client() @@ -314,10 +233,6 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): assert not config_path.exists(), "detach did not remove config file" - # The re-attach's increment of the generation number may invalidate deletion queue - # updates in flight from the previous attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - env.pageserver.tenant_attach(tenant_id) wait_until( number_of_iterations=5, @@ -351,6 +266,13 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( (tenant_id, timeline_id) = env.initial_tenant, env.initial_timeline ps_http = env.pageserver.http_client() + # When we evict/download layers, we will use this Workload to generate getpage requests + # that touch some layers, as otherwise the pageserver doesn't report totally unused layers + # as problems when they have short residence duration. 
+ workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100) + def get_metric(): metrics = ps_http.get_metrics() metric = metrics.query_one( @@ -371,6 +293,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( assert default_value == "1day" ps_http.download_all_layers(tenant_id, timeline_id) + workload.validate() ps_http.evict_all_layers(tenant_id, timeline_id) metric = get_metric() assert int(metric.value) > 0, "metric is updated" @@ -391,6 +314,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( assert int(metric.value) == 0 ps_http.download_all_layers(tenant_id, timeline_id) + workload.validate() ps_http.evict_all_layers(tenant_id, timeline_id) metric = get_metric() assert int(metric.labels["low_threshold_secs"]) == 2 * 24 * 60 * 60 @@ -404,6 +328,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( assert int(metric.value) == 0, "value resets if label changes" ps_http.download_all_layers(tenant_id, timeline_id) + workload.validate() ps_http.evict_all_layers(tenant_id, timeline_id) metric = get_metric() assert int(metric.labels["low_threshold_secs"]) == 2 * 60 * 60 diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index b4e5a550f3..7ee949e8d3 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -1,32 +1,50 @@ -import concurrent.futures -import enum -import os -import shutil +import json from threading import Thread import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, - last_flush_lsn_upload, wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( - MANY_SMALL_LAYERS_TENANT_CONFIG, assert_prefix_empty, assert_prefix_not_empty, - poll_for_remote_storage_iterations, - tenant_delete_wait_completed, - wait_tenant_status_404, - wait_until_tenant_active, - wait_until_tenant_state, + many_small_layers_tenant_config, + wait_for_upload, ) -from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage -from fixtures.types import TenantId, TimelineId +from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.utils import run_pg_bench_small, wait_until from requests.exceptions import ReadTimeout +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + + +def error_tolerant_delete(ps_http, tenant_id): + """ + For tests that inject 500 errors, we must retry repeatedly when issuing deletions + """ + while True: + try: + ps_http.tenant_delete(tenant_id=tenant_id) + except PageserverApiException as e: + if e.status_code == 500: + # This test uses failure injection, which can produce 500s as the pageserver expects + # the object store to always be available, and the ListObjects during deletion is generally + # an infallible operation. 
This can show up as a clear simulated error, or as a general + # error during delete_objects() + assert ( + "simulated failure of remote operation" in e.message + or "failed to delete" in e.message + ) + else: + raise + else: + # Success, drop out + break def test_tenant_delete_smoke( @@ -52,17 +70,21 @@ def test_tenant_delete_smoke( # first try to delete non existing tenant tenant_id = TenantId.generate() - env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*") - with pytest.raises(PageserverApiException, match=f"NotFound: tenant {tenant_id}"): - ps_http.tenant_delete(tenant_id=tenant_id) + env.pageserver.allowed_errors.extend( + [".*NotFound.*", ".*simulated failure.*", ".*failed to delete .+ objects.*"] + ) + + # Check that deleting a non-existent tenant gives the expected result: this is a loop because we + # may need to retry on some remote storage errors injected by the test harness + error_tolerant_delete(ps_http, tenant_id) env.neon_cli.create_tenant( tenant_id=tenant_id, - conf=MANY_SMALL_LAYERS_TENANT_CONFIG, + conf=many_small_layers_tenant_config(), ) # Default tenant and the one we created - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2 # create two timelines one being the parent of another parent = None @@ -86,11 +108,12 @@ def test_tenant_delete_smoke( parent = timeline - iterations = poll_for_remote_storage_iterations(remote_storage_kind) + # Upload a heatmap so that we exercise deletion of that too + ps_http.tenant_heatmap_upload(tenant_id) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2 - tenant_delete_wait_completed(ps_http, tenant_id, iterations) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2 + error_tolerant_delete(ps_http, tenant_id) + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 tenant_path = env.pageserver.tenant_dir(tenant_id) assert not tenant_path.exists() @@ -106,287 +129,11 @@ def test_tenant_delete_smoke( ) # Deletion updates the tenant count: the one default tenant remains - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0 - -class Check(enum.Enum): - RETRY_WITHOUT_RESTART = enum.auto() - RETRY_WITH_RESTART = enum.auto() - - -FAILPOINTS = [ - "tenant-delete-before-shutdown", - "tenant-delete-before-create-remote-mark", - "tenant-delete-before-create-local-mark", - "tenant-delete-before-background", - "tenant-delete-before-polling-ongoing-deletions", - "tenant-delete-before-cleanup-remaining-fs-traces", - "tenant-delete-before-remove-timelines-dir", - "tenant-delete-before-remove-deleted-mark", - "tenant-delete-before-remove-tenant-dir", - # Some failpoints from timeline deletion - "timeline-delete-before-index-deleted-at", - "timeline-delete-before-rm", - "timeline-delete-before-index-delete", - "timeline-delete-after-rm-dir", -] - -FAILPOINTS_BEFORE_BACKGROUND = [ - "timeline-delete-before-schedule", - "tenant-delete-before-shutdown", - "tenant-delete-before-create-remote-mark", - "tenant-delete-before-create-local-mark", - "tenant-delete-before-background", -] - - -def combinations(): - result = [] - - remotes = 
available_s3_storages() - - for remote_storage_kind in remotes: - for delete_failpoint in FAILPOINTS: - # Simulate failures for only one type of remote storage - # to avoid log pollution and make tests run faster - if remote_storage_kind is RemoteStorageKind.MOCK_S3: - simulate_failures = True - else: - simulate_failures = False - result.append((remote_storage_kind, delete_failpoint, simulate_failures)) - return result - - -@pytest.mark.parametrize("check", list(Check)) -@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations()) -def test_delete_tenant_exercise_crash_safety_failpoints( - neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, - failpoint: str, - simulate_failures: bool, - check: Check, - pg_bin: PgBin, -): - if simulate_failures: - neon_env_builder.pageserver_config_override = "test_remote_failures=1" - - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - - env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) - - tenant_id = env.initial_tenant - - env.pageserver.allowed_errors.extend( - [ - # From deletion polling - f".*NotFound: tenant {env.initial_tenant}.*", - # allow errors caused by failpoints - f".*failpoint: {failpoint}", - # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", - # We may leave some upload tasks in the queue. They're likely deletes. - # For uploads we explicitly wait with `last_flush_lsn_upload` below. - # So by ignoring these instead of waiting for empty upload queue - # we execute more distinct code paths. - '.*stopping left-over name="remote upload".*', - ] - ) - - if simulate_failures: - env.pageserver.allowed_errors.append( - # The deletion queue will complain when it encounters simulated S3 errors - ".*deletion executor: DeleteObjects request failed.*", - ) - - ps_http = env.pageserver.http_client() - - timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id) - with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint: - # generate enough layers - run_pg_bench_small(pg_bin, endpoint.connstr()) - last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) - - assert_prefix_not_empty( - neon_env_builder.pageserver_remote_storage, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) - - ps_http.configure_failpoints((failpoint, "return")) - - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - - # These failpoints are earlier than background task is spawned. - # so they result in api request failure. 
- if failpoint in FAILPOINTS_BEFORE_BACKGROUND: - with pytest.raises(PageserverApiException, match=failpoint): - ps_http.tenant_delete(tenant_id) - - else: - ps_http.tenant_delete(tenant_id) - tenant_info = wait_until_tenant_state( - pageserver_http=ps_http, - tenant_id=tenant_id, - expected_state="Broken", - iterations=iterations, - ) - - reason = tenant_info["state"]["data"]["reason"] - log.info(f"tenant broken: {reason}") - - # failpoint may not be the only error in the stack - assert reason.endswith(f"failpoint: {failpoint}"), reason - - if check is Check.RETRY_WITH_RESTART: - env.pageserver.restart() - - if failpoint in ( - "tenant-delete-before-shutdown", - "tenant-delete-before-create-remote-mark", - ): - wait_until_tenant_active( - ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25 - ) - tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations) - else: - # Pageserver should've resumed deletion after restart. - wait_tenant_status_404(ps_http, tenant_id, iterations=iterations + 10) - elif check is Check.RETRY_WITHOUT_RESTART: - # this should succeed - # this also checks that delete can be retried even when tenant is in Broken state - ps_http.configure_failpoints((failpoint, "off")) - - tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations) - - tenant_dir = env.pageserver.tenant_dir(tenant_id) - # Check local is empty - assert not tenant_dir.exists() - - # Check remote is empty - assert_prefix_empty( - neon_env_builder.pageserver_remote_storage, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - allowed_postfix="initdb.tar.zst", - ) - - -def test_tenant_delete_is_resumed_on_attach( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, -): - remote_storage_kind = s3_storage() - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - - env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) - env.pageserver.allowed_errors.append( - # lucky race with stopping from flushing a layer we fail to schedule any uploads - ".*layer flush task.+: could not flush frozen layer: update_metadata_file" - ) - - tenant_id = env.initial_tenant - - ps_http = env.pageserver.http_client() - # create two timelines - for timeline in ["first", "second"]: - timeline_id = env.neon_cli.create_timeline(timeline, tenant_id=tenant_id) - with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint: - run_pg_bench_small(pg_bin, endpoint.connstr()) - wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id) - - # sanity check, data should be there - assert_prefix_not_empty( - neon_env_builder.pageserver_remote_storage, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) - - # failpoint before we remove index_part from s3 - failpoint = "timeline-delete-before-index-delete" - ps_http.configure_failpoints((failpoint, "return")) - - env.pageserver.allowed_errors.extend( - ( - # allow errors caused by failpoints - f".*failpoint: {failpoint}", - # From deletion polling - f".*NotFound: tenant {env.initial_tenant}.*", - # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", - # error from http response is also logged - ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", - '.*shutdown_pageserver{exit_code=0}: stopping left-over 
name="remote upload".*', - ) - ) - - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - - ps_http.tenant_delete(tenant_id) - - tenant_info = wait_until_tenant_state( - pageserver_http=ps_http, - tenant_id=tenant_id, - expected_state="Broken", - iterations=iterations, - ) - - assert_prefix_not_empty( - neon_env_builder.pageserver_remote_storage, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) - - reason = tenant_info["state"]["data"]["reason"] - # failpoint may not be the only error in the stack - assert reason.endswith(f"failpoint: {failpoint}"), reason - - # now we stop pageserver and remove local tenant state - env.endpoints.stop_all() env.pageserver.stop() - dir_to_clear = env.pageserver.tenant_dir() - shutil.rmtree(dir_to_clear) - os.mkdir(dir_to_clear) - - env.pageserver.start() - - # now we call attach - env.pageserver.tenant_attach(tenant_id=tenant_id) - - # delete should be resumed - wait_tenant_status_404(ps_http, tenant_id, iterations) - - # we shouldn've created tenant dir on disk - tenant_path = env.pageserver.tenant_dir(tenant_id) - assert not tenant_path.exists() - - ps_http.deletion_queue_flush(execute=True) - assert_prefix_empty( - neon_env_builder.pageserver_remote_storage, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) - def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder): """Reproduction of 2023-11-23 stuck tenants investigation""" @@ -459,105 +206,10 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE if deletion is not None: deletion.join() - -def test_tenant_delete_concurrent( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, -): - """ - Validate that concurrent delete requests to the same tenant behave correctly: - exactly one should succeed. - - This is a reproducer for https://github.com/neondatabase/neon/issues/5936 - """ - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) - env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) - ps_http = env.pageserver.http_client() - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - - # Populate some data - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: - run_pg_bench_small(pg_bin, endpoint.connstr()) - last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) - - CONFLICT_MESSAGE = "Precondition failed: Invalid state Stopping. Expected Active or Broken" - - env.pageserver.allowed_errors.extend( - [ - # lucky race with stopping from flushing a layer we fail to schedule any uploads - ".*layer flush task.+: could not flush frozen layer: update_metadata_file", - # Errors logged from our 4xx requests - f".*{CONFLICT_MESSAGE}.*", - ] - ) - - BEFORE_REMOVE_FAILPOINT = "tenant-delete-before-map-remove" - BEFORE_RUN_FAILPOINT = "tenant-delete-before-run" - - # We will let the initial delete run until right before it would remove - # the tenant's TenantSlot. This pauses it in a state where the tenant - # is visible in Stopping state, and concurrent requests should fail with 4xx. 
- ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "pause")) - - def delete_tenant(): - return ps_http.tenant_delete(tenant_id) - - def hit_remove_failpoint(): - assert env.pageserver.log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}") - - def hit_run_failpoint(): - assert env.pageserver.log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}") - - with concurrent.futures.ThreadPoolExecutor() as executor: - background_200_req = executor.submit(delete_tenant) - assert background_200_req.result(timeout=10).status_code == 202 - - # Wait until the first request completes its work and is blocked on removing - # the TenantSlot from tenant manager. - wait_until(100, 0.1, hit_remove_failpoint) - - # Start another request: this should fail when it sees a tenant in Stopping state - with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE): - ps_http.tenant_delete(tenant_id) - - # Start another background request, which will pause after acquiring a TenantSlotGuard - # but before completing. - ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "pause")) - background_4xx_req = executor.submit(delete_tenant) - wait_until(100, 0.1, hit_run_failpoint) - - # The TenantSlot is still present while the original request is hung before - # final removal - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 - - # Permit the original request to run to success - ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "off")) - - # Permit the duplicate background request to run to completion and fail. - ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off")) - with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE): - background_4xx_req.result(timeout=10) - - # Physical deletion should have happened - assert_prefix_empty( - neon_env_builder.pageserver_remote_storage, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) - - # Zero tenants remain (we deleted the default tenant) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0 + env.pageserver.stop() -def test_tenant_delete_races_timeline_creation( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, -): +def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder): """ Validate that timeline creation executed in parallel with deletion works correctly. 
@@ -567,7 +219,7 @@ def test_tenant_delete_races_timeline_creation(
 # (and there is no way to reconstruct the used remote storage kind)
 remote_storage_kind = RemoteStorageKind.MOCK_S3
 neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
- env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+ env = neon_env_builder.init_start(initial_tenant_conf=many_small_layers_tenant_config())
 ps_http = env.pageserver.http_client()
 tenant_id = env.initial_tenant
@@ -611,12 +263,12 @@ def test_tenant_delete_races_timeline_creation(
 Thread(target=timeline_create).start()
 def hit_initdb_upload_failpoint():
- assert env.pageserver.log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
+ env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
 wait_until(100, 0.1, hit_initdb_upload_failpoint)
 def creation_connection_timed_out():
- assert env.pageserver.log_contains(
+ env.pageserver.assert_log_contains(
 "POST.*/timeline.* request was dropped before completing"
 )
@@ -635,7 +287,7 @@ def test_tenant_delete_races_timeline_creation(
 Thread(target=tenant_delete).start()
 def deletion_arrived():
- assert env.pageserver.log_contains(
+ env.pageserver.assert_log_contains(
 f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
 )
@@ -646,9 +298,7 @@ def test_tenant_delete_races_timeline_creation(
 # Disable the failpoint and wait for deletion to finish
 ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))
- iterations = poll_for_remote_storage_iterations(remote_storage_kind)
-
- tenant_delete_wait_completed(ps_http, tenant_id, iterations, ignore_errors=True)
+ ps_http.tenant_delete(tenant_id)
 # Physical deletion should have happened
 assert_prefix_empty(
@@ -662,10 +312,95 @@ def test_tenant_delete_races_timeline_creation(
 )
 # Ensure that creation cancelled and deletion didn't end up in broken state or encountered the leftover temp file
- assert env.pageserver.log_contains(CANCELLED_ERROR)
+ env.pageserver.assert_log_contains(CANCELLED_ERROR)
 assert not env.pageserver.log_contains(
 ".*ERROR.*delete_tenant.*Timelines directory is not empty after all timelines deletion"
 )
 # Zero tenants remain (we deleted the default tenant)
- assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0
+ assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0
+
+ # We deleted our only tenant, and the scrubber fails if it detects nothing
+ neon_env_builder.disable_scrub_on_exit()
+
+ env.pageserver.stop()
+
+
+def test_tenant_delete_scrubber(pg_bin: PgBin, make_httpserver, neon_env_builder: NeonEnvBuilder):
+ """
+ Validate that creating and then deleting a tenant both survive the scrubber,
+ and that one can run the scrubber without problems.
+ """
+
+ remote_storage_kind = RemoteStorageKind.MOCK_S3
+ neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+ env = neon_env_builder.init_start(initial_tenant_conf=many_small_layers_tenant_config())
+
+ ps_http = env.pageserver.http_client()
+ # create a tenant separate from the main tenant so that we have one remaining
+ # after we delete it, as the scrubber treats empty buckets as an error.
+ (tenant_id, timeline_id) = env.neon_cli.create_tenant()
+
+ with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+ run_pg_bench_small(pg_bin, endpoint.connstr())
+ last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+ ps_http.timeline_checkpoint(tenant_id, timeline_id)
+ wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
+ env.stop()
+
+ healthy, _ = env.storage_scrubber.scan_metadata()
+ assert healthy
+
+ timeline_lsns = {
+ "tenant_id": f"{tenant_id}",
+ "timeline_id": f"{timeline_id}",
+ "timeline_start_lsn": f"{last_flush_lsn}",
+ "backup_lsn": f"{last_flush_lsn}",
+ }
+
+ cloud_admin_url = f"http://{make_httpserver.host}:{make_httpserver.port}/"
+ cloud_admin_token = ""
+
+ def get_branches(request: Request):
+ # Compare definition with `BranchData` struct
+ dummy_data = {
+ "id": "test-branch-id",
+ "created_at": "", # TODO
+ "updated_at": "", # TODO
+ "name": "testbranchname",
+ "project_id": "test-project-id",
+ "timeline_id": f"{timeline_id}",
+ "default": False,
+ "deleted": False,
+ "logical_size": 42000,
+ "physical_size": 42000,
+ "written_size": 42000,
+ }
+ # Log the request for debugging and reply with the static, well-formed payload,
+ # which is all this stubbed cloud admin endpoint needs to provide.
+ log.info(f"got get_branches request: {request.json}")
+ return Response(json.dumps(dummy_data), content_type="application/json", status=200)
+
+ make_httpserver.expect_request("/branches", method="GET").respond_with_handler(get_branches)
+
+ healthy, _ = env.storage_scrubber.scan_metadata_safekeeper(
+ timeline_lsns=[timeline_lsns],
+ cloud_admin_api_url=cloud_admin_url,
+ cloud_admin_api_token=cloud_admin_token,
+ )
+ assert healthy
+
+ env.start()
+ ps_http = env.pageserver.http_client()
+ ps_http.tenant_delete(tenant_id)
+ env.stop()
+
+ healthy, _ = env.storage_scrubber.scan_metadata()
+ assert healthy
+
+ healthy, _ = env.storage_scrubber.scan_metadata_safekeeper(
+ timeline_lsns=[timeline_lsns],
+ cloud_admin_api_url=cloud_admin_url,
+ cloud_admin_api_token=cloud_admin_token,
+ )
+ assert healthy
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 8d5ef4e3c4..b165588636 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -7,6 +7,7 @@ from typing import List, Optional
 import asyncpg
 import pytest
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
 Endpoint,
@@ -22,7 +23,6 @@ from fixtures.pageserver.utils import (
 from fixtures.remote_storage import (
 RemoteStorageKind,
 )
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, wait_until
 from prometheus_client.samples import Sample
@@ -76,10 +76,6 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str):
 env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
- # Our re-attach may race with the deletion queue processing LSN updates
- # from the original attachment. 
- env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: with endpoint.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") @@ -92,10 +88,10 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str): wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) # Check that we had to retry the uploads - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadLayer.*, will retry.*" ) - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) @@ -132,7 +128,7 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str): assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 # Check that we had to retry the downloads - assert env.pageserver.log_contains(".*list timelines.*failed, will retry.*") + assert env.pageserver.log_contains(".*list identifiers.*failed, will retry.*") assert env.pageserver.log_contains(".*download.*failed, will retry.*") @@ -275,16 +271,6 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # first check for non existing tenant - tenant_id = TenantId.generate() - with pytest.raises( - expected_exception=PageserverApiException, - match=f"NotFound: tenant {tenant_id}", - ) as excinfo: - pageserver_http.tenant_detach(tenant_id) - - assert excinfo.value.status_code == 404 - # create new nenant tenant_id, timeline_id = env.neon_cli.create_tenant() @@ -302,7 +288,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # gc should not try to even start on a timeline that doesn't exist with pytest.raises( - expected_exception=PageserverApiException, match="gc target timeline does not exist" + expected_exception=PageserverApiException, match="NotFound: Timeline not found" ): bogus_timeline_id = TimelineId.generate() pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0) @@ -310,7 +296,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ # the error will be printed to the log too - ".*gc target timeline does not exist.*", + ".*NotFound: Timeline not found.*", # Timelines get stopped during detach, ignore the gc calls that error, witnessing that ".*InternalServerError\\(timeline is Stopping.*", ] @@ -344,94 +330,6 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): pageserver_http.timeline_gc(tenant_id, timeline_id, 0) -# Creates and ignores a tenant, then detaches it: first, with no parameters (should fail), -# then with parameters to force ignored tenant detach (should not fail). 
-def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv): - env = neon_simple_env - client = env.pageserver.http_client() - - # create a new tenant - tenant_id, _ = env.neon_cli.create_tenant() - - env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - - # assert tenant exists on disk - assert env.pageserver.tenant_dir(tenant_id).exists() - - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - # we rely upon autocommit after each statement - endpoint.safe_psql_many( - queries=[ - "CREATE TABLE t(key int primary key, value text)", - "INSERT INTO t SELECT generate_series(1,100000), 'payload'", - ] - ) - - # ignore tenant - client.tenant_ignore(tenant_id) - env.pageserver.allowed_errors.append(".*NotFound: tenant .*") - # ensure tenant couldn't be detached without the special flag for ignored tenant - log.info("detaching ignored tenant WITHOUT required flag") - with pytest.raises( - expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}" - ): - client.tenant_detach(tenant_id) - - log.info("tenant detached failed as expected") - - # ensure tenant is detached with ignore state - log.info("detaching ignored tenant with required flag") - client.tenant_detach(tenant_id, True) - log.info("ignored tenant detached without error") - - # check that nothing is left on disk for deleted tenant - assert not env.pageserver.tenant_dir(tenant_id).exists() - - # assert the tenant does not exists in the Pageserver - tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()] - assert ( - tenant_id not in tenants_after_detach - ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory" - - -# Creates a tenant, and detaches it with extra paremeter that forces ignored tenant detach. -# Tenant should be detached without issues. -def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): - env = neon_simple_env - client = env.pageserver.http_client() - - # create a new tenant - tenant_id, _ = env.neon_cli.create_tenant() - - env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - - # assert tenant exists on disk - assert env.pageserver.tenant_dir(tenant_id).exists() - - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - # we rely upon autocommit after each statement - endpoint.safe_psql_many( - queries=[ - "CREATE TABLE t(key int primary key, value text)", - "INSERT INTO t SELECT generate_series(1,100000), 'payload'", - ] - ) - - log.info("detaching regular tenant with detach ignored flag") - client.tenant_detach(tenant_id, True) - - log.info("regular tenant detached without error") - - # check that nothing is left on disk for deleted tenant - assert not env.pageserver.tenant_dir(tenant_id).exists() - - # assert the tenant does not exists in the Pageserver - tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()] - assert ( - tenant_id not in tenants_after_detach - ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory" - - def test_detach_while_attaching( neon_env_builder: NeonEnvBuilder, ): @@ -447,10 +345,6 @@ def test_detach_while_attaching( env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - # Create table, and insert some rows. 
Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point @@ -500,153 +394,6 @@ def test_detach_while_attaching( cur.execute("SELECT COUNT(*) FROM foo") -# Tests that `ignore` and `get` operations' combination is able to remove and restore the tenant in pageserver's memory. -# * writes some data into tenant's timeline -# * ensures it's synced with the remote storage -# * `ignore` the tenant -# * verify that ignored tenant files are generally unchanged, only an ignored mark had appeared -# * verify the ignored tenant is gone from pageserver's memory -# * restart the pageserver and verify that ignored tenant is still not loaded -# * `load` the same tenant -# * ensure that it's status is `Active` and it's present in pageserver's memory with all timelines -def test_ignored_tenant_reattach(neon_env_builder: NeonEnvBuilder): - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) - env = neon_env_builder.init_start() - pageserver_http = env.pageserver.http_client() - - ignored_tenant_id, _ = env.neon_cli.create_tenant() - tenant_dir = env.pageserver.tenant_dir(ignored_tenant_id) - tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] - tenants_before_ignore.sort() - timelines_before_ignore = [ - timeline["timeline_id"] - for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id) - ] - files_before_ignore = [tenant_path for tenant_path in tenant_dir.glob("**/*")] - - # ignore the tenant and veirfy it's not present in pageserver replies, with its files still on disk - pageserver_http.tenant_ignore(ignored_tenant_id) - - files_after_ignore_with_retain = [tenant_path for tenant_path in tenant_dir.glob("**/*")] - new_files = set(files_after_ignore_with_retain) - set(files_before_ignore) - disappeared_files = set(files_before_ignore) - set(files_after_ignore_with_retain) - assert ( - len(disappeared_files) == 0 - ), f"Tenant ignore should not remove files from disk, missing: {disappeared_files}" - assert ( - len(new_files) == 1 - ), f"Only tenant ignore file should appear on disk but got: {new_files}" - - tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] - assert ignored_tenant_id not in tenants_after_ignore, "Ignored tenant should be missing" - assert len(tenants_after_ignore) + 1 == len( - tenants_before_ignore - ), "Only ignored tenant should be missing" - - # restart the pageserver to ensure we don't load the ignore timeline - env.pageserver.stop() - env.pageserver.start() - tenants_after_restart = [tenant["id"] for tenant in pageserver_http.tenant_list()] - tenants_after_restart.sort() - assert ( - tenants_after_restart == tenants_after_ignore - ), "Ignored tenant should not be reloaded after pageserver restart" - - # now, load it from the local files and expect it works - env.pageserver.tenant_load(tenant_id=ignored_tenant_id) - wait_until_tenant_state(pageserver_http, ignored_tenant_id, "Active", 5) - - tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()] - tenants_after_attach.sort() - assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back" - - timelines_after_ignore = [ - timeline["timeline_id"] - for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id) - ] - assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back" - - -# Tests that it's possible to 
`load` tenants with missing layers and get them restored: -# * writes some data into tenant's timeline -# * ensures it's synced with the remote storage -# * `ignore` the tenant -# * removes all timeline's local layers -# * `load` the same tenant -# * ensure that it's status is `Active` -# * check that timeline data is restored -def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder): - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() - pageserver_http = env.pageserver.http_client() - endpoint = env.endpoints.create_start("main") - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - - env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - - data_id = 1 - data_secret = "very secret secret" - insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) - - tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] - tenants_before_ignore.sort() - timelines_before_ignore = [ - timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id) - ] - - # ignore the tenant and remove its layers - pageserver_http.tenant_ignore(tenant_id) - timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id) - layers_removed = False - for dir_entry in timeline_dir.iterdir(): - if dir_entry.name.startswith("00000"): - # Looks like a layer file. Remove it - dir_entry.unlink() - layers_removed = True - assert layers_removed, f"Found no layers for tenant {timeline_dir}" - - # now, load it from the local files and expect it to work due to remote storage restoration - env.pageserver.tenant_load(tenant_id=tenant_id) - wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) - - tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()] - tenants_after_attach.sort() - assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back" - - timelines_after_ignore = [ - timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id) - ] - assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back" - - endpoint.stop() - endpoint.start() - ensure_test_data(data_id, data_secret, endpoint) - - -# Tests that attach is never working on a tenant, ignored or not, as long as it's not absent locally -# Similarly, tests that it's not possible to schedule a `load` for tenat that's not ignored. -def test_load_negatives(neon_env_builder: NeonEnvBuilder): - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() - pageserver_http = env.pageserver.http_client() - env.endpoints.create_start("main") - - tenant_id = env.initial_tenant - - env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - - env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*") - with pytest.raises( - expected_exception=PageserverApiException, - match=f"tenant {tenant_id} already exists, state: Active", - ): - env.pageserver.tenant_load(tenant_id) - - pageserver_http.tenant_ignore(tenant_id) - - def test_detach_while_activating( neon_env_builder: NeonEnvBuilder, ): @@ -667,10 +414,6 @@ def test_detach_while_activating( env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. 
- env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - data_id = 1 data_secret = "very secret secret" insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) @@ -742,8 +485,6 @@ def ensure_test_data(data_id: int, data: str, endpoint: Endpoint): def test_metrics_while_ignoring_broken_tenant_and_reloading( neon_env_builder: NeonEnvBuilder, ): - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() client = env.pageserver.http_client() @@ -761,56 +502,37 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( client.tenant_break(env.initial_tenant) - found_broken = False - active, broken, broken_set = ([], [], []) - for _ in range(10): + def found_broken(): m = client.get_metrics() active = m.query_all("pageserver_tenant_states_count", {"state": "Active"}) broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"}) broken_set = m.query_all( "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)} ) - found_broken = only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1 + assert only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1 - if found_broken: - break - log.info(f"active: {active}, broken: {broken}, broken_set: {broken_set}") - time.sleep(0.5) - assert ( - found_broken - ), f"tenant shows up as broken; active={active}, broken={broken}, broken_set={broken_set}" + wait_until(10, 0.5, found_broken) - client.tenant_ignore(env.initial_tenant) + client.tenant_detach(env.initial_tenant) - found_broken = False - broken, broken_set = ([], []) - for _ in range(10): + def found_cleaned_up(): m = client.get_metrics() broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"}) broken_set = m.query_all( "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)} ) - found_broken = only_int(broken) == 0 and only_int(broken_set) == 1 + assert only_int(broken) == 0 and len(broken_set) == 0 - if found_broken: - break - time.sleep(0.5) - assert found_broken, f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}" + wait_until(10, 0.5, found_cleaned_up) - env.pageserver.tenant_load(env.initial_tenant) + env.pageserver.tenant_attach(env.initial_tenant) - found_active = False - active, broken_set = ([], []) - for _ in range(10): + def found_active(): m = client.get_metrics() active = m.query_all("pageserver_tenant_states_count", {"state": "Active"}) broken_set = m.query_all( "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)} ) - found_active = only_int(active) == 1 and len(broken_set) == 0 + assert only_int(active) == 1 and len(broken_set) == 0 - if found_active: - break - time.sleep(0.5) - - assert found_active, f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}" + wait_until(10, 0.5, found_active) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 80b4fab1d3..43e9a0d36e 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import Any, Dict, Optional, Tuple import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder, NeonPageserver from 
fixtures.pageserver.http import PageserverHttpClient @@ -14,17 +15,13 @@ from fixtures.pageserver.utils import ( assert_tenant_state, wait_for_last_record_lsn, wait_for_upload, - wait_tenant_status_404, ) -from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import ( LocalFsStorage, RemoteStorageKind, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( query_scalar, - subprocess_capture, wait_until, ) @@ -157,10 +154,7 @@ def switch_pg_to_new_pageserver( timeline_to_detach_local_path = origin_ps.timeline_dir(tenant_id, timeline_id) files_before_detach = os.listdir(timeline_to_detach_local_path) assert ( - "metadata" in files_before_detach - ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file, but got: {files_before_detach}" - assert ( - len(files_before_detach) >= 2 + len(files_before_detach) >= 1 ), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file, but got {files_before_detach}" return timeline_to_detach_local_path @@ -187,20 +181,14 @@ def post_migration_check(endpoint: Endpoint, sum_before_migration: int, old_loca # A minor migration involves no storage breaking changes. # It is done by attaching the tenant to a new pageserver. "minor", - # A major migration involves exporting a postgres datadir - # basebackup and importing it into the new pageserver. - # This kind of migration can tolerate breaking changes - # to storage format - "major", + # In the unlikely and unfortunate event that we have to break + # the storage format, extend this test with the param below. + # "major", ], ) @pytest.mark.parametrize("with_load", ["with_load", "without_load"]) def test_tenant_relocation( neon_env_builder: NeonEnvBuilder, - port_distributor: PortDistributor, - test_output_dir: Path, - neon_binpath: Path, - base_dir: Path, method: str, with_load: str, ): @@ -213,12 +201,8 @@ def test_tenant_relocation( env.pageservers[0].allowed_errors.extend( [ - # FIXME: Is this expected? 
- ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", # Needed for detach polling on the original pageserver f".*NotFound: tenant {tenant_id}.*", - # We will dual-attach in this test, so stale generations are expected - ".*Dropped remote consistent LSN updates.*", ] ) @@ -304,40 +288,7 @@ def test_tenant_relocation( current_lsn=current_lsn_second, ) - # Migrate either by attaching from s3 or import/export basebackup - if method == "major": - cmd = [ - "poetry", - "run", - "python", - str(base_dir / "scripts/export_import_between_pageservers.py"), - "--tenant-id", - str(tenant_id), - "--from-host", - "localhost", - "--from-http-port", - str(origin_http.port), - "--from-pg-port", - str(origin_ps.service_port.pg), - "--to-host", - "localhost", - "--to-http-port", - str(destination_http.port), - "--to-pg-port", - str(destination_ps.service_port.pg), - "--pg-distrib-dir", - str(neon_env_builder.pg_distrib_dir), - "--work-dir", - str(test_output_dir), - "--tmp-pg-port", - str(port_distributor.get_port()), - ] - subprocess_capture(test_output_dir, cmd, check=True) - - destination_ps.allowed_errors.append( - ".*ignored .* unexpected bytes after the tar archive.*" - ) - elif method == "minor": + if method == "minor": # call to attach timeline to new pageserver destination_ps.tenant_attach(tenant_id) @@ -394,9 +345,6 @@ def test_tenant_relocation( # is no longer involved, and if it is, we will see the error origin_http.tenant_detach(tenant_id) - # Wait a little, so that the detach operation has time to finish. - wait_tenant_status_404(origin_http, tenant_id, iterations=100, interval=1) - post_migration_check(ep_main, 500500, old_local_path_main) post_migration_check(ep_second, 1001000, old_local_path_second) @@ -500,7 +448,7 @@ def test_emergency_relocate_with_branches_slow_replay( assert cur.fetchall() == [("before pause",), ("after pause",)] # Sanity check that the failpoint was reached - assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') + env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') assert time.time() - before_attach_time > 5 # Clean up @@ -637,7 +585,7 @@ def test_emergency_relocate_with_branches_createdb( assert query_scalar(cur, "SELECT count(*) FROM test_migrate_one") == 200 # Sanity check that the failpoint was reached - assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') + env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') assert time.time() - before_attach_time > 5 # Clean up diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 7cea301a9c..f872116a1c 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,27 +1,32 @@ +import os +from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import List, Tuple import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, NeonEnv, NeonEnvBuilder, + flush_ep_to_pageserver, wait_for_last_flush_lsn, wait_for_wal_insert_lsn, ) -from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_until_tenant_active, ) from fixtures.pg_version import PgVersion -from 
fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until -@pytest.mark.xfail -def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): - env = neon_simple_env +def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_configs() + env.start() + (tenant_id, _) = env.neon_cli.create_tenant() http_client = env.pageserver.http_client() initial_size = http_client.tenant_size(tenant_id) @@ -34,66 +39,25 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0] assert branch_name == main_branch_name - with env.endpoints.create_start( + endpoint = env.endpoints.create_start( main_branch_name, tenant_id=tenant_id, config_lines=["autovacuum=off", "checkpoint_timeout=10min"], - ) as endpoint: - with endpoint.cursor() as cur: - cur.execute("SELECT 1") - row = cur.fetchone() - assert row is not None - assert row[0] == 1 - size = http_client.tenant_size(tenant_id) - # we've disabled the autovacuum and checkpoint - # so background processes should not change the size. - # If this test will flake we should probably loosen the check - assert ( - size == initial_size - ), f"starting idle compute should not change the tenant size (Currently {size}, expected {initial_size})" + ) - # the size should be the same, until we increase the size over the - # gc_horizon - size, inputs = http_client.tenant_size_and_modelinputs(tenant_id) - assert ( - size == initial_size - ), f"tenant_size should not be affected by shutdown of compute (Currently {size}, expected {initial_size})" + with endpoint.cursor() as cur: + cur.execute("SELECT 1") + row = cur.fetchone() + assert row is not None + assert row[0] == 1 - expected_inputs = { - "segments": [ - { - "segment": {"parent": None, "lsn": 23694408, "size": 25362432, "needed": True}, - "timeline_id": f"{main_timeline_id}", - "kind": "BranchStart", - }, - { - "segment": {"parent": 0, "lsn": 23694528, "size": None, "needed": True}, - "timeline_id": f"{main_timeline_id}", - "kind": "BranchEnd", - }, - ], - "timeline_inputs": [ - { - "timeline_id": f"{main_timeline_id}", - "ancestor_id": None, - "ancestor_lsn": "0/0", - "last_record": "0/1698CC0", - "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/0", - "pitr_cutoff": "0/0", - "next_gc_cutoff": "0/0", - "retention_param_cutoff": None, - } - ], - } - expected_inputs = mask_model_inputs(expected_inputs) - actual_inputs = mask_model_inputs(inputs) + # The transaction above will make the compute generate a checkpoint. + # In turn, the pageserver persists the checkpoint. This should only be + # one key with a size of a couple hundred bytes. 
+ wait_for_last_flush_lsn(env, endpoint, tenant_id, main_timeline_id) + size = http_client.tenant_size(tenant_id) - assert expected_inputs == actual_inputs - - size_debug_file = open(test_output_dir / "size_debug.html", "w") - size_debug = http_client.tenant_size_debug(tenant_id) - size_debug_file.write(size_debug) + assert size >= initial_size and size - initial_size < 1024 def test_branched_empty_timeline_size(neon_simple_env: NeonEnv, test_output_dir: Path): @@ -189,7 +153,6 @@ def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv, test_ou size_debug_file.write(size_debug) -@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = 15 @@ -232,7 +195,6 @@ def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: size_debug_file.write(size_debug) -@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = 5 @@ -281,7 +243,6 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): size_debug_file.write(size_debug) -@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = small @@ -326,7 +287,7 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa size_debug_file.write(size_debug) -@pytest.mark.xfail +@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") def test_single_branch_get_tenant_size_grows( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion ): @@ -334,25 +295,15 @@ def test_single_branch_get_tenant_size_grows( Operate on single branch reading the tenants size after each transaction. """ - # Disable automatic gc and compaction. - # The pitr_interval here is quite problematic, so we cannot really use it. - # it'd have to be calibrated per test executing env. + # Disable automatic compaction and GC, and set a long PITR interval: we will expect + # size to always increase with writes as all writes remain within the PITR + tenant_config = { + "compaction_period": "0s", + "gc_period": "0s", + "pitr_interval": "3600s", + } - # there was a bug which was hidden if the create table and first batch of - # inserts is larger than gc_horizon. for example 0x20000 here hid the fact - # that there next_gc_cutoff could be smaller than initdb_lsn, which will - # obviously lead to issues when calculating the size. - gc_horizon = 0x3BA00 - - # it's a bit of a hack, but different versions of postgres have different - # amount of WAL generated for the same amount of data. so we need to - # adjust the gc_horizon accordingly. 
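The rewritten size-growth test passes its settings through `init_start(initial_tenant_conf=...)` rather than a `pageserver_config_override` string. With compaction and GC disabled and a one-hour PITR window, everything the test writes stays retained, so the synthetic size can only grow. A sketch of the pattern, with `neon_env_builder` being the usual fixture and the helper name hypothetical:

```python
from fixtures.neon_fixtures import NeonEnvBuilder


def start_env_with_full_retention(neon_env_builder: NeonEnvBuilder):
    # Disable background compaction and GC and keep a long PITR window: all
    # WAL written during the test stays within the PITR, so synthetic size
    # is expected to increase monotonically with writes.
    tenant_config = {
        "compaction_period": "0s",
        "gc_period": "0s",
        "pitr_interval": "3600s",
    }
    return neon_env_builder.init_start(initial_tenant_conf=tenant_config)
```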
- if pg_version == PgVersion.V14: - gc_horizon = 0x4A000 - - neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}" - - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf=tenant_config) tenant_id = env.initial_tenant branch_name, timeline_id = env.neon_cli.list_timelines(tenant_id)[0] @@ -363,18 +314,6 @@ def test_single_branch_get_tenant_size_grows( size_debug_file = open(test_output_dir / "size_debug.html", "w") - def check_size_change( - current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev_size: int - ): - if current_lsn - initdb_lsn >= gc_horizon: - assert ( - size >= prev_size - ), f"tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})" - else: - assert ( - size > prev_size - ), f"tenant_size should grow, because we continue to add WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})" - def get_current_consistent_size( env: NeonEnv, endpoint: Endpoint, @@ -405,6 +344,7 @@ def test_single_branch_get_tenant_size_grows( current_lsn = after_lsn size_debug_file.write(size_debug) assert size > 0 + log.info(f"size: {size} at lsn {current_lsn}") return (current_lsn, size) with env.endpoints.create_start( @@ -442,14 +382,6 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - # branch start shouldn't be past gc_horizon yet - # thus the size should grow as we insert more data - # "gc_horizon" is tuned so that it kicks in _after_ the - # insert phase, but before the update phase ends. - assert ( - current_lsn - initdb_lsn <= gc_horizon - ), "Tuning of GC window is likely out-of-date" assert size > prev_size collected_responses.append(("INSERT", current_lsn, size)) @@ -469,8 +401,7 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + assert size > prev_size collected_responses.append(("UPDATE", current_lsn, size)) @@ -487,30 +418,42 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + assert size > prev_size collected_responses.append(("DELETE", current_lsn, size)) + size_before_drop = get_current_consistent_size( + env, endpoint, size_debug_file, http_client, tenant_id, timeline_id + )[1] + with endpoint.cursor() as cur: cur.execute("DROP TABLE t0") - # The size of the tenant should still be as large as before we dropped - # the table, because the drop operation can still be undone in the PITR - # defined by gc_horizon. + # Dropping the table doesn't reclaim any space + # from the user's point of view, because the DROP transaction is still + # within pitr_interval. 
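Each measurement in this test goes through `get_current_consistent_size`, because the synthetic size is only meaningful once the pageserver has ingested everything the compute flushed. A condensed sketch of that measurement pattern (the helper name here is hypothetical; the real function also writes a size-debug snapshot):

```python
from fixtures.neon_fixtures import Endpoint, NeonEnv, wait_for_last_flush_lsn


def measure_consistent_size(env: NeonEnv, endpoint: Endpoint, tenant_id, timeline_id) -> int:
    # Wait until the pageserver has caught up to the compute's last flushed
    # LSN; only then does tenant_size() reflect the writes we just made.
    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
    return env.pageserver.http_client().tenant_size(tenant_id)
```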
 (current_lsn, size) = get_current_consistent_size(
     env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
 )
+        assert size >= prev_size
+        prev_size = size
-        prev_size = collected_responses[-1][2]
-
-        check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)
+        # Set a zero PITR interval to allow the DROP to impact the synthetic size
+        # Because synthetic size calculation uses pitr interval when available,
+        # when our tenant is configured with a tiny pitr interval, dropping a table should
+        # cause synthetic size to go down immediately
+        tenant_config["pitr_interval"] = "0s"
+        env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config)
+        (current_lsn, size) = get_current_consistent_size(
+            env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
+        )
+        assert size < size_before_drop
+        # With the PITR interval zeroed, the DROP is no longer protected, so
+        # the synthetic size falls below its pre-drop value.
         collected_responses.append(("DROP", current_lsn, size))
-    # Should have gone past gc_horizon, otherwise gc_horizon is too large
-    assert current_lsn - initdb_lsn > gc_horizon
-
     # this isn't too many lines to forget for a while. observed while
     # developing these tests that locally the value is a bit more than what we
     # get in the ci.
@@ -559,9 +502,14 @@ def test_get_tenant_size_with_multiple_branches(
     gc_horizon = 128 * 1024
-    neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}"
-
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "compaction_period": "0s",
+            "gc_period": "0s",
+            "pitr_interval": "0sec",
+            "gc_horizon": gc_horizon,
+        }
+    )
     # FIXME: we have a race condition between GC and delete timeline. GC might fail with this
     # error. Similar to https://github.com/neondatabase/neon/issues/2671
@@ -676,6 +624,64 @@ def test_get_tenant_size_with_multiple_branches(
     size_debug_file.write(size_debug)
+
+
+def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder):
+    """
+    Makes sure synthetic size can still be calculated even if one of the
+    timelines is deleted or the tenant is deleted.
+    """
+
+    env = neon_env_builder.init_start()
+    failpoint = "Timeline::find_gc_cutoffs-pausable"
+    client = env.pageserver.http_client()
+
+    orig_size = client.tenant_size(env.initial_tenant)
+
+    branch_id = env.neon_cli.create_branch(
+        tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch"
+    )
+    client.configure_failpoints((failpoint, "pause"))
+
+    with ThreadPoolExecutor(max_workers=1) as exec:
+        completion = exec.submit(client.tenant_size, env.initial_tenant)
+        _, last_offset = wait_until(
+            10, 1.0, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
+        )
+
+        timeline_delete_wait_completed(client, env.initial_tenant, branch_id)
+
+        client.configure_failpoints((failpoint, "off"))
+        size = completion.result()
+
+        assert_size_approx_equal(orig_size, size)
+
+    branch_id = env.neon_cli.create_branch(
+        tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch2"
+    )
+    client.configure_failpoints((failpoint, "pause"))
+
+    with ThreadPoolExecutor(max_workers=1) as exec:
+        completion = exec.submit(client.tenant_size, env.initial_tenant)
+        wait_until(
+            10,
+            1.0,
+            lambda: env.pageserver.assert_log_contains(
+                f"at failpoint {failpoint}", offset=last_offset
+            ),
+        )
+
+        client.tenant_delete(env.initial_tenant)
+
+        client.configure_failpoints((failpoint, "off"))
+
+        # accept both, because the deletion might still complete first
+        matcher = "(Failed to refresh gc_info before gathering inputs|NotFound: tenant)"
+        with pytest.raises(PageserverApiException, match=matcher):
+            completion.result()
+
+    # this happens only in the case of deletion (http response logging)
+    env.pageserver.allowed_errors.append(".*Failed to refresh gc_info before gathering inputs.*")
+
+
 # Helper for tests that compare timeline_inputs
 # We don't want to compare the exact values, because they can be unstable
 # and cause flaky tests. So replace the values with useful invariants.
@@ -705,3 +711,118 @@ def mask_model_inputs(x):
         return newlist
     else:
         return x
+
+
+@pytest.mark.parametrize("zero_gc", [True, False])
+def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, zero_gc: bool):
+    """
+    Compare an LSN lease to a read-only branch for synthetic size calculation.
+    They should have the same effect.
+    """
+
+    def assert_size_approx_equal_for_lease_test(size_lease, size_branch):
+        """
+        Tests that evaluate sizes are checking the pageserver space consumption
+        that sits many layers below the user input. The exact space needed
+        varies slightly depending on postgres behavior.
+
+        Rather than expecting postgres to be deterministic and occasionally
+        failing the test, we permit sizes for the same data to vary by a few pages.
+        """
+
+        # FIXME(yuchen): The delta is too large, used as a temporary solution to pass the test reliably.
+        # Investigate and reduce the threshold.
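`test_synthetic_size_while_deleting` above makes the race deterministic: pause a failpoint inside the size calculation, run `tenant_size` on a worker thread, delete the timeline (or the whole tenant) while the calculation is parked, then release the failpoint. A condensed sketch of that choreography, assuming the fixture methods used in the test:

```python
from concurrent.futures import Future, ThreadPoolExecutor


def race_deletion_against_size(client, pageserver, tenant_id, failpoint: str, delete_fn) -> Future:
    # Park the size calculation at the failpoint.
    client.configure_failpoints((failpoint, "pause"))
    executor = ThreadPoolExecutor(max_workers=1)
    pending = executor.submit(client.tenant_size, tenant_id)
    # The real test polls this with wait_until(); the log line proves the
    # calculation actually reached the failpoint before we delete anything.
    pageserver.assert_log_contains(f"at failpoint {failpoint}")
    delete_fn()  # e.g. timeline_delete_wait_completed(...) or client.tenant_delete(...)
    client.configure_failpoints((failpoint, "off"))
    return pending  # caller inspects .result(), which may raise PageserverApiException
```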
+        threshold = 22 * 8272
+
+        log.info(
+            f"delta: size_branch({size_branch}) - size_lease({size_lease}) = {size_branch - size_lease}"
+        )
+
+        assert size_lease == pytest.approx(size_branch, abs=threshold)
+
+    conf = {
+        "pitr_interval": "0s" if zero_gc else "3600s",
+        "gc_period": "0s",
+        "compaction_period": "0s",
+    }
+
+    env = neon_env_builder.init_start(initial_tenant_conf=conf)
+
+    ro_branch_res = insert_with_action(
+        env, env.initial_tenant, env.initial_timeline, test_output_dir, action="branch"
+    )
+
+    tenant, timeline = env.neon_cli.create_tenant(conf=conf)
+    lease_res = insert_with_action(env, tenant, timeline, test_output_dir, action="lease")
+
+    assert_size_approx_equal_for_lease_test(lease_res, ro_branch_res)
+
+    # we are writing a lot, and flushing all of that to disk is not important for this test
+    env.stop(immediate=True)
+
+
+def insert_with_action(
+    env: NeonEnv,
+    tenant: TenantId,
+    timeline: TimelineId,
+    test_output_dir: Path,
+    action: str,
+) -> int:
+    """
+    Inserts some data on the timeline, performs an action, and then inserts
+    more data on the same timeline. Returns the size at the end of the insertion.
+
+    Valid actions:
+    - "lease": Acquires a lease.
+    - "branch": Creates a child branch but never writes to it.
+    """
+
+    client = env.pageserver.http_client()
+    with env.endpoints.create_start(
+        "main",
+        tenant_id=tenant,
+        config_lines=["autovacuum=off"],
+    ) as ep:
+        initial_size = client.tenant_size(tenant)
+        log.info(f"initial size: {initial_size}")
+
+        with ep.cursor() as cur:
+            cur.execute(
+                "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
+            )
+        last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline)
+
+        if action == "lease":
+            res = client.timeline_lsn_lease(tenant, timeline, last_flush_lsn)
+            log.info(f"result from lsn_lease api: {res}")
+        elif action == "branch":
+            ro_branch = env.neon_cli.create_branch(
+                "ro_branch", tenant_id=tenant, ancestor_start_lsn=last_flush_lsn
+            )
+            log.info(f"{ro_branch=} created")
+        else:
+            raise AssertionError("Invalid action type, only `lease` and `branch` are accepted")
+
+        with ep.cursor() as cur:
+            cur.execute(
+                "CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
+            )
+            cur.execute(
+                "CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
+            )
+            cur.execute(
+                "CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
+            )
+
+        last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline)
+
+        # Avoid flakiness when calculating logical size.
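The lease-versus-branch comparison above deliberately avoids exact equality: postgres does not produce byte-identical WAL across runs, so the sizes are compared with `pytest.approx` and an absolute tolerance measured in pages. A runnable illustration of the same check, with the constants taken from the diff:

```python
import pytest


def assert_sizes_close(size_lease: int, size_branch: int) -> None:
    # Allow the two measurements to differ by up to 22 pages of 8272 bytes
    # (the FIXME threshold above) instead of demanding exact equality.
    threshold = 22 * 8272
    assert size_lease == pytest.approx(size_branch, abs=threshold)


assert_sizes_close(10_000_000, 10_050_000)  # delta 50_000 < 181_984, so this passes
```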
+ flush_ep_to_pageserver(env, ep, tenant, timeline) + + size_after_action_and_insert = client.tenant_size(tenant) + log.info(f"{size_after_action_and_insert=}") + + size_debug_file = open(test_output_dir / f"size_debug_{action}.html", "w") + size_debug = client.tenant_size_debug(tenant) + size_debug_file.write(size_debug) + return size_after_action_and_insert diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 75e5c2c91c..d08ad3cd2e 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -1,3 +1,4 @@ +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.utils import ( @@ -5,7 +6,6 @@ from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_until_tenant_active, ) -from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 5164bda470..b63ff7f6bd 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -9,6 +9,7 @@ from typing import List import pytest import requests +from fixtures.common_types import Lsn, TenantId from fixtures.log_helper import log from fixtures.metrics import ( PAGESERVER_GLOBAL_METRICS, @@ -18,12 +19,12 @@ from fixtures.metrics import ( from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import timeline_delete_wait_completed, wait_until_tenant_active from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId from fixtures.utils import wait_until from prometheus_client.samples import Sample @@ -35,21 +36,33 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): ) [d for d in tenants_dir.iterdir()] - neon_simple_env.pageserver.allowed_errors.append(".*tenant-config-before-write.*") + error_regexes = [".*tenant-config-before-write.*"] + neon_simple_env.pageserver.allowed_errors.extend(error_regexes) + neon_simple_env.storage_controller.allowed_errors.extend(error_regexes) pageserver_http = neon_simple_env.pageserver.http_client() - pageserver_http.configure_failpoints(("tenant-config-before-write", "return")) - with pytest.raises(Exception, match="tenant-config-before-write"): - _ = neon_simple_env.neon_cli.create_tenant() + # Failure to write a config to local disk makes the pageserver assume that local disk is bad and abort the process + pageserver_http.configure_failpoints(("tenant-config-before-write", "return")) + + tenant_id = TenantId.generate() + + with pytest.raises(requests.exceptions.ConnectionError, match="Connection aborted"): + neon_simple_env.pageserver.http_client().tenant_attach(tenant_id=tenant_id, generation=1) + + # Any files left behind on disk during failed creation do not prevent + # a retry from succeeding. Restart pageserver with no failpoints. + neon_simple_env.pageserver.running = False + neon_simple_env.pageserver.start() + + # The failed creation should not be present in list of tenants, as when we start up we'll see + # an empty tenant dir with no config in it. 
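The reworked `test_tenant_creation_fails` in the hunk above leans on the pageserver treating a failed tenant-config write as a fatal local-disk problem: the process aborts, so the client observes a dropped connection rather than an error response, and the test restarts the pageserver to retry. A sketch of that expectation, assuming the fixture API used in the diff:

```python
import pytest
import requests


def expect_abort_then_restart(pageserver, attach_fn) -> None:
    # With the "tenant-config-before-write" failpoint set to "return", the
    # config write fails and the pageserver aborts, killing the in-flight
    # HTTP request mid-connection.
    with pytest.raises(requests.exceptions.ConnectionError, match="Connection aborted"):
        attach_fn()
    # The process is gone; mark it stopped and bring it back (failpoints do
    # not survive a restart), after which a retry is expected to succeed.
    pageserver.running = False
    pageserver.start()
```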
+ neon_simple_env.pageserver.allowed_errors.append(".*Failed to load tenant config.*") new_tenants = sorted( map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) ) assert initial_tenants == new_tenants, "should not create new tenants" - # Any files left behind on disk during failed creation do not prevent - # a retry from succeeding. - pageserver_http.configure_failpoints(("tenant-config-before-write", "off")) neon_simple_env.neon_cli.create_tenant() @@ -285,7 +298,6 @@ def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ - ".*marking .* as locally complete, while it doesnt exist in remote index.*", ".*load failed.*list timelines directory.*", ] ) @@ -360,27 +372,20 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): tenant_id: TenantId = env.initial_tenant timeline_id = env.initial_timeline - # Multiple creation requests which race will generate this error + # Multiple creation requests which race will generate this error on the pageserver + # and storage controller respectively env.pageserver.allowed_errors.append(".*Conflict: Tenant is already being modified.*") + env.storage_controller.allowed_errors.append(".*Conflict: Tenant is already being modified.*") # Tenant creation requests which arrive out of order will generate complaints about # generation nubmers out of order. env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+") - # Our multiple creation requests will advance generation quickly, and when we skip - # a generation number we can generate these warnings - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates for tenant .+") - # Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of # an incomplete attach, or some other problem. In the field this should be rare, # so we allow it to log at WARN, even if it is occasionally a false positive. env.pageserver.allowed_errors.append(".*failed to freeze and flush.*") - # When we shut down a tenant during a timeline creation, initdb is not cancelled, we wait - # for it to complete (since https://github.com/neondatabase/neon/pull/6451). This means - # that shutdown can be delayed by >=1s on debug builds where initdb takes a long time to run. - env.pageserver.allowed_errors.append(".*still waiting, taking longer than expected... 
gate.*") - def create_bg(delay_ms): time.sleep(delay_ms / 1000.0) try: @@ -392,6 +397,9 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): if e.status_code == 409: log.info(f"delay_ms={delay_ms} 409") pass + elif e.status_code == 429: + log.info(f"delay_ms={delay_ms} 429") + pass elif e.status_code == 400: if "is less than existing" in e.message: # We send creation requests very close together in time: it is expected that these @@ -420,3 +428,50 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): # The tenant should end up active wait_until_tenant_active(env.pageserver.http_client(), tenant_id, iterations=10, period=1) + + +def test_pageserver_metrics_many_relations(neon_env_builder: NeonEnvBuilder): + """Test for the directory_entries_count metric""" + + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + endpoint_tenant = env.endpoints.create_start("main", tenant_id=env.initial_tenant) + + # Not sure why but this many tables creates more relations than our limit + TABLE_COUNT = 1600 + COUNT_AT_LEAST_EXPECTED = 5500 + + with endpoint_tenant.connect() as conn: + with conn.cursor() as cur: + # Wrapping begin; commit; around this and the loop below keeps the reproduction + # but it also doesn't have a performance benefit + cur.execute("CREATE TABLE template_tbl(key int primary key, value text);") + for i in range(TABLE_COUNT): + cur.execute(f"CREATE TABLE tbl_{i}(like template_tbl INCLUDING ALL);") + wait_for_last_flush_lsn(env, endpoint_tenant, env.initial_tenant, env.initial_timeline) + endpoint_tenant.stop() + + m = ps_http.get_metrics() + directory_entries_count_metric = m.query_all( + "pageserver_directory_entries_count", {"tenant_id": str(env.initial_tenant)} + ) + + def only_int(samples: List[Sample]) -> int: + assert len(samples) == 1 + return int(samples[0].value) + + directory_entries_count = only_int(directory_entries_count_metric) + + log.info(f"pageserver_directory_entries_count metric value: {directory_entries_count}") + + assert directory_entries_count > COUNT_AT_LEAST_EXPECTED + + timeline_detail = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline) + + counts = timeline_detail["directory_entries_counts"] + assert counts + log.info(f"directory counts: {counts}") + assert counts[2] > COUNT_AT_LEAST_EXPECTED diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 6f05d7f7cb..168876b711 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -11,6 +11,7 @@ import os from pathlib import Path from typing import List, Tuple +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -18,6 +19,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, last_flush_lsn_upload, ) +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_tenant_state, wait_for_last_record_lsn, @@ -27,7 +29,6 @@ from fixtures.remote_storage import ( LocalFsStorage, RemoteStorageKind, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, wait_until @@ -61,11 +62,6 @@ async def all_tenants_workload(env: NeonEnv, tenants_endpoints): def test_tenants_many(neon_env_builder: NeonEnvBuilder): env = 
neon_env_builder.init_start() - # FIXME: Is this expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - tenants_endpoints: List[Tuple[TenantId, Endpoint]] = [] for _ in range(1, 5): @@ -117,14 +113,6 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() - env.pageserver.allowed_errors.extend( - [ - # FIXME: Are these expected? - ".*No timelines to attach received.*", - ".*marking .* as locally complete, while it doesnt exist in remote index.*", - ] - ) - pageserver_http = env.pageserver.http_client() endpoint = env.endpoints.create_start("main") @@ -160,10 +148,10 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): log.info(f"upload of checkpoint {checkpoint_number} is done") # Check that we had to retry the uploads - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadLayer.*, will retry.*" ) - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) @@ -223,9 +211,6 @@ def test_tenant_redownloads_truncated_file_on_startup( env.pageserver.allowed_errors.extend( [ ".*removing local file .* because .*", - # FIXME: Are these expected? - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", - ".*No timelines to attach received.*", ] ) @@ -262,7 +247,10 @@ def test_tenant_redownloads_truncated_file_on_startup( # ensure the same size is found from the index_part.json index_part = env.pageserver_remote_storage.index_content(tenant_id, timeline_id) - assert index_part["layer_metadata"][path.name]["file_size"] == expected_size + assert ( + index_part["layer_metadata"][parse_layer_file_name(path.name).to_str()]["file_size"] + == expected_size + ) ## Start the pageserver. It will notice that the file size doesn't match, and ## rename away the local file. It will be re-downloaded when it's needed. 
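`index_part.json` keys layers by their canonical layer file name, which is not necessarily the literal on-disk name, hence the new `parse_layer_file_name(path.name).to_str()` round-trip before any lookup. A small sketch of that normalization (dict shape as seen in the test below):

```python
from fixtures.pageserver.common_types import parse_layer_file_name


def layer_metadata_for_local_file(index_part: dict, local_file_name: str) -> dict:
    # Normalize the on-disk name into the canonical form used as the key in
    # index_part.json, then fetch its metadata (e.g. "file_size").
    canonical = parse_layer_file_name(local_file_name).to_str()
    return index_part["layer_metadata"][canonical]
```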
@@ -292,7 +280,7 @@ def test_tenant_redownloads_truncated_file_on_startup( # the remote side of local_layer_truncated remote_layer_path = env.pageserver_remote_storage.remote_layer_path( - tenant_id, timeline_id, path.name + tenant_id, timeline_id, parse_layer_file_name(path.name).to_str() ) # if the upload ever was ongoing, this check would be racy, but at least one diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 5f72cfd747..840c7159ad 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -48,13 +48,12 @@ def test_threshold_based_eviction( tenant_id, timeline_id = env.initial_tenant, env.initial_timeline ps_http = env.pageserver.http_client() - assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { - "kind": "NoEviction" - } + vps_http = env.storage_controller.pageserver_api() + assert vps_http.tenant_config(tenant_id).effective_config["eviction_policy"] is None - eviction_threshold = 5 - eviction_period = 1 - ps_http.set_tenant_config( + eviction_threshold = 10 + eviction_period = 2 + vps_http.set_tenant_config( tenant_id, { "eviction_policy": { @@ -64,7 +63,7 @@ def test_threshold_based_eviction( }, }, ) - assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { + assert vps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { "kind": "LayerAccessThreshold", "threshold": f"{eviction_threshold}s", "period": f"{eviction_period}s", @@ -73,7 +72,7 @@ def test_threshold_based_eviction( # restart because changing tenant config is not instant env.pageserver.restart() - assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { + assert vps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { "kind": "LayerAccessThreshold", "threshold": f"{eviction_threshold}s", "period": f"{eviction_period}s", @@ -81,7 +80,7 @@ def test_threshold_based_eviction( # create a bunch of L1s, only the least of which will need to be resident compaction_threshold = 3 # create L1 layers quickly - ps_http.patch_tenant_config_client_side( + vps_http.patch_tenant_config_client_side( tenant_id, inserts={ # Disable gc and compaction to avoid on-demand downloads from their side. 
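In the eviction test below, tenant configuration now flows through the storage controller's pageserver API (`vps_http`) instead of hitting a pageserver directly, so the setting is not lost to reconciliation. The policy payload itself is unchanged in shape; a sketch with the values from the diff (helper name hypothetical):

```python
def set_threshold_eviction(vps_http, tenant_id, threshold_s: int = 10, period_s: int = 2) -> None:
    # A layer not accessed for `threshold_s` seconds becomes a candidate for
    # eviction; the policy is evaluated every `period_s` seconds.
    vps_http.set_tenant_config(
        tenant_id,
        {
            "eviction_policy": {
                "kind": "LayerAccessThreshold",
                "threshold": f"{threshold_s}s",
                "period": f"{period_s}s",
            },
        },
    )
```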
@@ -154,7 +153,7 @@ def test_threshold_based_eviction( while time.time() - started_waiting_at < observation_window: current = ( time.time(), - MapInfoProjection(ps_http.layer_map_info(tenant_id, timeline_id)), + MapInfoProjection(vps_http.layer_map_info(tenant_id, timeline_id)), ) last = map_info_changes[-1] if map_info_changes else (0, None) if last[1] is None or current[1] != last[1]: @@ -179,6 +178,6 @@ def test_threshold_based_eviction( assert len(post.remote_layers) > 0, "some layers should be evicted once it's stabilized" assert len(post.local_layers) > 0, "the imitate accesses should keep some layers resident" - assert env.pageserver.log_contains( - metrics_refused_log_line + assert ( + env.pageserver.log_contains(metrics_refused_log_line) is not None ), "ensure the metrics collection worker ran" diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py new file mode 100644 index 0000000000..de43e51c9e --- /dev/null +++ b/test_runner/regress/test_timeline_archive.py @@ -0,0 +1,113 @@ +import pytest +from fixtures.common_types import TenantId, TimelineArchivalState, TimelineId +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) +from fixtures.pageserver.http import PageserverApiException + + +@pytest.mark.parametrize("shard_count", [0, 4]) +def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): + unsharded = shard_count == 0 + if unsharded: + env = neon_env_builder.init_start() + # If we run the unsharded version, talk to the pageserver directly + ps_http = env.pageserver.http_client() + else: + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + # If we run the unsharded version, talk to the storage controller + ps_http = env.storage_controller.pageserver_api() + + # first try to archive a non existing timeline for an existing tenant: + invalid_timeline_id = TimelineId.generate() + with pytest.raises(PageserverApiException, match="timeline not found") as exc: + ps_http.timeline_archival_config( + env.initial_tenant, + invalid_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + assert exc.value.status_code == 404 + + # for a non existing tenant: + invalid_tenant_id = TenantId.generate() + with pytest.raises( + PageserverApiException, + match="NotFound: [tT]enant", + ) as exc: + ps_http.timeline_archival_config( + invalid_tenant_id, + invalid_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + assert exc.value.status_code == 404 + + # construct a pair of branches to validate that pageserver prohibits + # archival of ancestor timelines when they have non-archived child branches + parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_archive_parent") + + leaf_timeline_id = env.neon_cli.create_branch( + "test_ancestor_branch_archive_branch1", "test_ancestor_branch_archive_parent" + ) + + with pytest.raises( + PageserverApiException, + match="Cannot archive timeline which has non-archived child timelines", + ) as exc: + ps_http.timeline_archival_config( + env.initial_tenant, + parent_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + assert exc.value.status_code == 412 + + leaf_detail = ps_http.timeline_detail( + env.initial_tenant, + timeline_id=leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is False + + # Test that archiving the leaf timeline and then the parent works + ps_http.timeline_archival_config( + env.initial_tenant, + leaf_timeline_id, + state=TimelineArchivalState.ARCHIVED, 
+ ) + leaf_detail = ps_http.timeline_detail( + env.initial_tenant, + leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is True + + ps_http.timeline_archival_config( + env.initial_tenant, + parent_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + # Test that the leaf can't be unarchived + with pytest.raises( + PageserverApiException, + match="ancestor is archived", + ) as exc: + ps_http.timeline_archival_config( + env.initial_tenant, + leaf_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + + # Unarchive works for the leaf if the parent gets unarchived first + ps_http.timeline_archival_config( + env.initial_tenant, + parent_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + + ps_http.timeline_archival_config( + env.initial_tenant, + leaf_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 352b82d525..328131cd08 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -6,6 +6,7 @@ import threading import pytest import requests +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -15,9 +16,9 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( - MANY_SMALL_LAYERS_TENANT_CONFIG, assert_prefix_empty, assert_prefix_not_empty, + many_small_layers_tenant_config, poll_for_remote_storage_iterations, timeline_delete_wait_completed, wait_for_last_record_lsn, @@ -31,7 +32,6 @@ from fixtures.remote_storage import ( RemoteStorageKind, s3_storage, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, run_pg_bench_small, wait_until from urllib3.util.retry import Retry @@ -89,6 +89,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): assert timeline_path.exists() # retry deletes when compaction or gc is running in pageserver + # TODO: review whether this wait_until is actually necessary, we do an await() internally wait_until( number_of_iterations=3, interval=0.2, @@ -136,12 +137,9 @@ DELETE_FAILPOINTS = [ "timeline-delete-before-index-deleted-at", "timeline-delete-before-schedule", "timeline-delete-before-rm", - "timeline-delete-during-rm", "timeline-delete-after-rm", "timeline-delete-before-index-delete", "timeline-delete-after-index-delete", - "timeline-delete-after-rm-metadata", - "timeline-delete-after-rm-dir", ] @@ -206,7 +204,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( [ f".*{timeline_id}.*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # This happens when we fail before scheduling background operation. # Timeline is left in stopping state and retry tries to stop it again. 
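The archival test above encodes an ordering invariant: archiving must proceed leaf-first (a parent with a non-archived child is rejected with 412), and unarchiving is the mirror image, parent-first. A compact sketch of both directions using the same API:

```python
from fixtures.common_types import TimelineArchivalState


def archive_chain(ps_http, tenant_id, parent_id, leaf_id) -> None:
    # Leaf first: archiving the parent while the leaf is live fails with
    # "Cannot archive timeline which has non-archived child timelines".
    ps_http.timeline_archival_config(tenant_id, leaf_id, state=TimelineArchivalState.ARCHIVED)
    ps_http.timeline_archival_config(tenant_id, parent_id, state=TimelineArchivalState.ARCHIVED)


def unarchive_chain(ps_http, tenant_id, parent_id, leaf_id) -> None:
    # Parent first: unarchiving the leaf under an archived parent is rejected
    # because its "ancestor is archived".
    ps_http.timeline_archival_config(tenant_id, parent_id, state=TimelineArchivalState.UNARCHIVED)
    ps_http.timeline_archival_config(tenant_id, leaf_id, state=TimelineArchivalState.UNARCHIVED)
```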
".*Ignoring new state, equal to the existing one: Stopping", @@ -215,7 +213,9 @@ def test_delete_timeline_exercise_crash_safety_failpoints( # This happens when timeline remains are cleaned up during loading ".*Timeline dir entry become invalid.*", # In one of the branches we poll for tenant to become active. Polls can generate this log message: - f".*Tenant {env.initial_tenant} is not active*", + f".*Tenant {env.initial_tenant} is not active.*", + # an on-demand is cancelled by shutdown + ".*initial size calculation failed: downloading failed, possibly for shutdown", ] ) @@ -398,7 +398,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild ".*failpoint: timeline-delete-before-rm", ".*Ignoring new state, equal to the existing one: Stopping", # this happens, because the stuck timeline is visible to shutdown - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", ] ) @@ -485,6 +485,9 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage), ) + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + @pytest.mark.parametrize( "stuck_failpoint", @@ -534,7 +537,7 @@ def test_concurrent_timeline_delete_stuck_on( try: def first_call_hit_failpoint(): - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( f".*{child_timeline_id}.*at failpoint {stuck_failpoint}" ) @@ -605,7 +608,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): at_failpoint_log_message = f".*{child_timeline_id}.*at failpoint {failpoint_name}.*" def hit_failpoint(): - assert env.pageserver.log_contains(at_failpoint_log_message) + env.pageserver.assert_log_contains(at_failpoint_log_message) wait_until(50, 0.1, hit_failpoint) @@ -615,7 +618,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.append(hangup_log_message) def got_hangup_log_message(): - assert env.pageserver.log_contains(hangup_log_message) + env.pageserver.assert_log_contains(hangup_log_message) wait_until(50, 0.1, got_hangup_log_message) @@ -627,7 +630,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): def first_request_finished(): message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished" - assert env.pageserver.log_contains(message) + env.pageserver.assert_log_contains(message) wait_until(50, 0.1, first_request_finished) @@ -651,9 +654,7 @@ def test_timeline_delete_works_for_remote_smoke( timeline_ids = [env.initial_timeline] for i in range(2): branch_timeline_id = env.neon_cli.create_branch(f"new{i}", "main") - pg = env.endpoints.create_start(f"new{i}") - - with pg.cursor() as cur: + with env.endpoints.create_start(f"new{i}") as pg, pg.cursor() as cur: cur.execute("CREATE TABLE f (i integer);") cur.execute("INSERT INTO f VALUES (generate_series(1,1000));") current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) @@ -705,6 +706,9 @@ def test_timeline_delete_works_for_remote_smoke( # Assume it is mock server inconsistency and check twice. 
wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage)) + # We deleted our only tenant, and the scrubber fails if it detects nothing + neon_env_builder.disable_scrub_on_exit() + def test_delete_orphaned_objects( neon_env_builder: NeonEnvBuilder, @@ -764,7 +768,7 @@ def test_delete_orphaned_objects( for orphan in orphans: assert not orphan.exists() - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( f"deleting a file not referenced from index_part.json name={orphan.stem}" ) @@ -778,7 +782,7 @@ def test_timeline_delete_resumed_on_attach( remote_storage_kind = s3_storage() neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) + env = neon_env_builder.init_start(initial_tenant_conf=many_small_layers_tenant_config()) tenant_id = env.initial_tenant @@ -803,7 +807,7 @@ def test_timeline_delete_resumed_on_attach( ) # failpoint before we remove index_part from s3 - failpoint = "timeline-delete-during-rm" + failpoint = "timeline-delete-after-rm" ps_http.configure_failpoints((failpoint, "return")) env.pageserver.allowed_errors.extend( @@ -811,7 +815,7 @@ def test_timeline_delete_resumed_on_attach( # allow errors caused by failpoints f".*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # error from http response is also logged ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", # Polling after attach may fail with this diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py new file mode 100644 index 0000000000..d152d0f41f --- /dev/null +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -0,0 +1,1493 @@ +import datetime +import enum +import threading +import time +from concurrent.futures import ThreadPoolExecutor +from queue import Empty, Queue +from threading import Barrier +from typing import List, Set, Tuple + +import pytest +from fixtures.common_types import Lsn, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + LogCursor, + NeonEnvBuilder, + PgBin, + flush_ep_to_pageserver, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_timeline_detail_404 +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind +from fixtures.utils import assert_pageserver_backups_equal, wait_until +from requests import ReadTimeout + + +def by_end_lsn(info: HistoricLayerInfo) -> Lsn: + assert info.lsn_end is not None + return Lsn(info.lsn_end) + + +def layer_name(info: HistoricLayerInfo) -> str: + return info.layer_file_name + + +@enum.unique +class Branchpoint(str, enum.Enum): + """ + Have branches at these Lsns possibly relative to L0 layer boundary. 
+ """ + + EARLIER = "earlier" + AT_L0 = "at" + AFTER_L0 = "after" + LAST_RECORD_LSN = "head" + + def __str__(self) -> str: + return self.value + + @staticmethod + def all() -> List["Branchpoint"]: + return [ + Branchpoint.EARLIER, + Branchpoint.AT_L0, + Branchpoint.AFTER_L0, + Branchpoint.LAST_RECORD_LSN, + ] + + +SHUTDOWN_ALLOWED_ERRORS = [ + ".*initial size calculation failed: downloading failed, possibly for shutdown", + ".*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*logical_size_calculation_task:panic.*: Sequential get failed with Bad state \\(not active\\).*", + ".*Task 'initial size calculation' .* panicked.*", +] + + +@pytest.mark.parametrize("branchpoint", Branchpoint.all()) +@pytest.mark.parametrize("restart_after", [True, False]) +@pytest.mark.parametrize("write_to_branch_first", [True, False]) +def test_ancestor_detach_branched_from( + test_output_dir, + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + branchpoint: Branchpoint, + restart_after: bool, + write_to_branch_first: bool, +): + """ + Creates a branch relative to L0 lsn boundary according to Branchpoint. Later the timeline is detached. + """ + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + client = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("CREATE TABLE foo (i BIGINT);") + + after_first_tx = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);") + + # create a single layer for us to remote copy + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);") + flush_ep_to_pageserver(env, ep, env.initial_tenant, env.initial_timeline) + + deltas = client.layer_map_info(env.initial_tenant, env.initial_timeline).delta_layers() + # there is also the in-mem layer, but ignore it for now + assert len(deltas) == 2, "expecting there to be two deltas: initdb and checkpointed" + later_delta = max(deltas, key=by_end_lsn) + assert later_delta.lsn_end is not None + + # -1 as the lsn_end is exclusive. 
+ last_lsn = Lsn(later_delta.lsn_end).lsn_int - 1 + + if branchpoint == Branchpoint.EARLIER: + branch_at = after_first_tx + rows = 0 + truncated_layers = 1 + elif branchpoint == Branchpoint.AT_L0: + branch_at = Lsn(last_lsn) + rows = 8192 + truncated_layers = 0 + elif branchpoint == Branchpoint.AFTER_L0: + branch_at = Lsn(last_lsn + 8) + rows = 8192 + # as there is no 8 byte walrecord, nothing should get copied from the straddling layer + truncated_layers = 0 + else: + # this case also covers the implicit flush of ancestor as the inmemory hasn't been flushed yet + assert branchpoint == Branchpoint.LAST_RECORD_LSN + branch_at = None + rows = 16384 + truncated_layers = 0 + + name = "new main" + + timeline_id = env.neon_cli.create_branch( + name, "main", env.initial_tenant, ancestor_start_lsn=branch_at + ) + + recorded = Lsn(client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_lsn"]) + if branch_at is None: + # fix it up if we need it later (currently unused) + branch_at = recorded + else: + assert branch_at == recorded, "the test should not use unaligned lsns" + + if write_to_branch_first: + with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + # make sure the ep is writable + # with BEFORE_L0, AFTER_L0 there will be a gap in Lsns caused by accurate end_lsn on straddling layers + ep.safe_psql("CREATE TABLE audit AS SELECT 1 as starts;") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id) + + # branch must have a flush for "PREV_LSN: none" + client.timeline_checkpoint(env.initial_tenant, timeline_id) + branch_layers = set( + map(layer_name, client.layer_map_info(env.initial_tenant, timeline_id).historic_layers) + ) + else: + branch_layers = set() + + # run fullbackup to make sure there are no off by one errors + # take this on the parent + fullbackup_before = test_output_dir / "fullbackup-before.tar" + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, env.initial_timeline, branch_at, fullbackup_before + ) + + all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) + assert all_reparented == set() + + if restart_after: + env.pageserver.stop() + env.pageserver.start() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == 16384 + + with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + + old_main_info = client.layer_map_info(env.initial_tenant, env.initial_timeline) + old_main = set(map(layer_name, old_main_info.historic_layers)) + + new_main_info = client.layer_map_info(env.initial_tenant, timeline_id) + new_main = set(map(layer_name, new_main_info.historic_layers)) + + new_main_copied_or_truncated = new_main - branch_layers + new_main_truncated = new_main_copied_or_truncated - old_main + + assert len(new_main_truncated) == truncated_layers + # could additionally check that the symmetric difference has layers starting at the same lsn + # but if nothing was copied, then there is no nice rule. + # there could be a hole in LSNs between copied from the "old main" and the first branch layer. 
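The set arithmetic above is how the test classifies layers on the detached timeline: anything that was neither written by the branch itself nor inherited verbatim from the old main must be a copy truncated at the branch LSN. The same logic as a standalone sketch:

```python
from typing import Set


def count_truncated_layers(old_main: Set[str], new_main: Set[str], branch_layers: Set[str]) -> int:
    # Layers on the detached timeline fall into three buckets:
    #   1. written by the branch itself (branch_layers),
    #   2. copied verbatim from the ancestor (still present in old_main),
    #   3. copied with truncation at the branch point (everything else).
    copied_or_truncated = new_main - branch_layers
    truncated = copied_or_truncated - old_main
    return len(truncated)
```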
+ + # take this on the detached, at same lsn + fullbackup_after = test_output_dir / "fullbackup-after.tar" + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, timeline_id, branch_at, fullbackup_after + ) + + client.timeline_delete(env.initial_tenant, env.initial_timeline) + wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + + # because we do the fullbackup from ancestor at the branch_lsn, the zenith.signal is always different + # as there is always "PREV_LSN: invalid" for "before" + skip_files = {"zenith.signal"} + + assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, skip_files) + + +def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): + """ + The case from RFC: + + +-> another branch with same ancestor_lsn as new main + | + old main -------|---------X---------> + | | | + | | +-> after + | | + | +-> new main + | + +-> reparented + + Ends up as: + + old main ---------------------------> + | + +-> after + + +-> another branch with same ancestor_lsn as new main + | + new main -------|---------|-> + | + +-> reparented + + We confirm the end result by being able to delete "old main" after deleting "after". + """ + + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + client = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("CREATE TABLE foo (i BIGINT);") + ep.safe_psql("CREATE TABLE audit AS SELECT 1 as starts;") + + branchpoint_pipe = wait_for_last_flush_lsn( + env, ep, env.initial_tenant, env.initial_timeline + ) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);") + + branchpoint_x = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + # as this only gets reparented, we don't need to write to it like new main + reparented = env.neon_cli.create_branch( + "reparented", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_pipe + ) + + same_branchpoint = env.neon_cli.create_branch( + "same_branchpoint", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_x + ) + + timeline_id = env.neon_cli.create_branch( + "new main", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_x + ) + + after = env.neon_cli.create_branch("after", "main", env.initial_tenant, ancestor_start_lsn=None) + + all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) + assert set(all_reparented) == {reparented, same_branchpoint} + + env.pageserver.quiesce_tenants() + + # checking the ancestor after is much faster than waiting for the endpoint not start + expected_result = [ + ("main", env.initial_timeline, None, 16384, 1), + ("after", after, env.initial_timeline, 16384, 1), + ("new main", timeline_id, None, 8192, 1), + ("same_branchpoint", same_branchpoint, timeline_id, 8192, 1), + ("reparented", reparented, timeline_id, 0, 1), + ] + + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + + for _, queried_timeline, expected_ancestor, _, _ in expected_result: + details = client.timeline_detail(env.initial_tenant, queried_timeline) + ancestor_timeline_id = details["ancestor_timeline_id"] + if expected_ancestor is None: + assert ancestor_timeline_id is None + else: + 
+            assert TimelineId(ancestor_timeline_id) == expected_ancestor
+
+        index_part = env.pageserver_remote_storage.index_content(
+            env.initial_tenant, queried_timeline
+        )
+        lineage = index_part["lineage"]
+        assert lineage is not None
+
+        assert lineage.get("reparenting_history_overflown", "false") == "false"
+
+        if queried_timeline == timeline_id:
+            original_ancestor = lineage["original_ancestor"]
+            assert original_ancestor is not None
+            assert original_ancestor[0] == str(env.initial_timeline)
+            assert original_ancestor[1] == str(branchpoint_x)
+
+            # this does not contain Z in the end, so fromisoformat accepts it
+            # it is to be in line with the deletion timestamp.. well, almost.
+            when = original_ancestor[2][:26]
+            when_ts = datetime.datetime.fromisoformat(when)
+            assert when_ts < datetime.datetime.now()
+            assert len(lineage.get("reparenting_history", [])) == 0
+        elif expected_ancestor == timeline_id:
+            assert len(lineage.get("original_ancestor", [])) == 0
+            assert lineage["reparenting_history"] == [str(env.initial_timeline)]
+        else:
+            assert len(lineage.get("original_ancestor", [])) == 0
+            assert len(lineage.get("reparenting_history", [])) == 0
+
+    for name, _, _, rows, starts in expected_result:
+        with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep:
+            assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
+            assert ep.safe_psql(f"SELECT count(*) FROM audit WHERE starts = {starts}")[0][0] == 1
+
+    # delete the timelines to confirm detach actually worked
+    client.timeline_delete(env.initial_tenant, after)
+    wait_timeline_detail_404(client, env.initial_tenant, after, 10, 1.0)
+
+    client.timeline_delete(env.initial_tenant, env.initial_timeline)
+    wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0)
+
+
+def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder):
+    """
+    Makes sure that the timeline is able to receive writes throughout the detach process.
+    """
+
+    env = neon_env_builder.init_start()
+
+    client = env.pageserver.http_client()
+
+    # row counts have been manually verified to cause reconnections and getpage
+    # requests when restart_after=False with pg16
+    def insert_rows(n: int, ep) -> int:
+        ep.safe_psql(
+            f"INSERT INTO foo SELECT i::bigint, 'more info!! this is a long string' || i FROM generate_series(0, {n - 1}) g(i);"
+        )
+        return n
+
+    with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep:
+        ep.safe_psql("CREATE EXTENSION neon_test_utils;")
+        ep.safe_psql("CREATE TABLE foo (i BIGINT, aux TEXT NOT NULL);")
+
+        rows = insert_rows(256, ep)
+
+        branchpoint = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)
+
+    timeline_id = env.neon_cli.create_branch(
+        "new main", "main", tenant_id=env.initial_tenant, ancestor_start_lsn=branchpoint
+    )
+
+    log.info("starting the new main endpoint")
+    ep = env.endpoints.create_start("new main", tenant_id=env.initial_tenant)
+    assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
+
+    def small_txs(ep, queue: Queue[str], barrier):
+        extra_rows = 0
+
+        with ep.connect() as conn:
+            while True:
+                try:
+                    queue.get_nowait()
+                    break
+                except Empty:
+                    pass
+
+                if barrier is not None:
+                    barrier.wait()
+                    barrier = None
+
+                cursor = conn.cursor()
+                cursor.execute(
+                    "INSERT INTO foo(i, aux) VALUES (1, 'more info!! this is a long string' || 1);"
+                )
+                extra_rows += 1
+        return extra_rows
+
+    with ThreadPoolExecutor(max_workers=1) as exec:
+        queue: Queue[str] = Queue()
+        barrier = Barrier(2)
+
+        completion = exec.submit(small_txs, ep, queue, barrier)
+        barrier.wait()
+
+        reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
+        assert len(reparented) == 0
+
+        env.pageserver.quiesce_tenants()
+
+        queue.put("done")
+        extra_rows = completion.result()
+        assert extra_rows > 0, "some rows should have been written"
+        rows += extra_rows
+
+    assert client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_timeline_id"] is None
+
+    ep.clear_shared_buffers()
+    assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
+    assert ep.safe_psql("SELECT SUM(LENGTH(aux)) FROM foo")[0][0] != 0
+    ep.stop()
+
+    # finally restart the endpoint and make sure we still have the same answer
+    with env.endpoints.create_start("new main", tenant_id=env.initial_tenant) as ep:
+        assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
+
+    env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+
+
+def test_compaction_induced_by_detaches_in_history(
+    neon_env_builder: NeonEnvBuilder, test_output_dir, pg_bin: PgBin
+):
+    """
+    Assuming the tree of timelines:
+
+    root
+    |- child1
+    |- ...
+    |- wanted_detached_child
+
+    Each detach can add N more L0s per level; this is effectively unbounded, because
+    compaction can be arbitrarily delayed (or a detach can happen right before one
+    starts). If "wanted_detached_child" has already made progress and compacted
+    L1s, we want to make sure "compaction in the history" does not leave the
+    timeline broken.
+    """
+
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            # we want to create layers manually so we don't branch on an arbitrary
+            # Lsn, but we also do not want to compact L0 -> L1.
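+            # (a note: 99999 effectively disables threshold-based L0 -> L1
+            # compaction; the test later lowers it to 5 via
+            # patch_tenant_config_client_side to trigger compaction on demand)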
+ "compaction_threshold": "99999", + "compaction_period": "0s", + # shouldn't matter, but just in case + "gc_period": "0s", + } + ) + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + env.pageserver.allowed_errors.append( + ".*await_initial_logical_size: can't get semaphore cancel token, skipping" + ) + client = env.pageserver.http_client() + + def delta_layers(timeline_id: TimelineId): + # shorthand for more readable formatting + return client.layer_map_info(env.initial_tenant, timeline_id).delta_layers() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("create table integers (i bigint not null);") + ep.safe_psql("insert into integers (i) values (42)") + branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + assert len(delta_layers(env.initial_timeline)) == 2 + + more_good_numbers = range(0, 3) + + branches: List[Tuple[str, TimelineId]] = [("main", env.initial_timeline)] + + for num in more_good_numbers: + branch_name = f"br-{len(branches)}" + branch_timeline_id = env.neon_cli.create_branch( + branch_name, + ancestor_branch_name=branches[-1][0], + tenant_id=env.initial_tenant, + ancestor_start_lsn=branch_lsn, + ) + branches.append((branch_name, branch_timeline_id)) + + with env.endpoints.create_start(branches[-1][0], tenant_id=env.initial_tenant) as ep: + ep.safe_psql( + f"insert into integers (i) select i from generate_series({num}, {num + 100}) as s(i)" + ) + branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id) + client.timeline_checkpoint(env.initial_tenant, branch_timeline_id) + + assert len(delta_layers(branch_timeline_id)) == 1 + + # now fill in the final, most growing timeline + + branch_name, branch_timeline_id = branches[-1] + with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep: + ep.safe_psql("insert into integers (i) select i from generate_series(50, 500) s(i)") + + last_suffix = None + for suffix in range(0, 4): + ep.safe_psql(f"create table other_table_{suffix} as select * from integers") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id) + client.timeline_checkpoint(env.initial_tenant, branch_timeline_id) + last_suffix = suffix + + assert last_suffix is not None + + assert len(delta_layers(branch_timeline_id)) == 5 + + client.patch_tenant_config_client_side( + env.initial_tenant, {"compaction_threshold": 5}, None + ) + + client.timeline_compact(env.initial_tenant, branch_timeline_id) + + # one more layer + ep.safe_psql(f"create table other_table_{last_suffix + 1} as select * from integers") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id) + + # we need to wait here, because the detaches will do implicit tenant restart, + # and we could get unexpected layer counts + client.timeline_checkpoint(env.initial_tenant, branch_timeline_id, wait_until_uploaded=True) + + assert len([filter(lambda x: x.l0, delta_layers(branch_timeline_id))]) == 1 + + skip_main = branches[1:] + + branch_lsn = client.timeline_detail(env.initial_tenant, branch_timeline_id)["ancestor_lsn"] + + # take the fullbackup before and after inheriting the new L0s + fullbackup_before = test_output_dir / "fullbackup-before.tar" + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_before + ) + + # force initial logical sizes, so we can evict all layers from all + # timelines and exercise 
+    client.timeline_detail(
+        env.initial_tenant, env.initial_timeline, force_await_initial_logical_size=True
+    )
+    client.evict_all_layers(env.initial_tenant, env.initial_timeline)
+
+    for _, timeline_id in skip_main:
+        reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
+        assert reparented == set(), "we have no earlier branches at any level"
+
+    post_detach_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id)))
+    assert len(post_detach_l0s) == 5, "should have inherited 4 L0s, 5 in total"
+
+    # checkpoint does compaction, which in turn decides to run, because
+    # there are now, in total, the threshold number of L0s, even if they are not
+    # adjacent in Lsn space:
+    #
+    # inherited   flushed during this checkpoint
+    #       \\\\  /
+    #       1234X5---> lsn
+    #           |
+    #           l1 layers from "fill in the final, most growing timeline"
+    #
+    # branch_lsn is between 4 and the first X.
+    client.timeline_checkpoint(env.initial_tenant, branch_timeline_id)
+
+    post_compact_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id)))
+    assert len(post_compact_l0s) == 1, "only the consecutive inherited L0s should be compacted"
+
+    fullbackup_after = test_output_dir / "fullbackup_after.tar"
+    pg_bin.take_fullbackup(
+        env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_after
+    )
+
+    # we don't need to skip any files, because zenith.signal will be identical
+    assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set())
+
+
+@pytest.mark.parametrize("sharded", [True, False])
+def test_timeline_ancestor_detach_idempotent_success(
+    neon_env_builder: NeonEnvBuilder, sharded: bool
+):
+    shards = 2 if sharded else 1
+
+    neon_env_builder.num_pageservers = shards
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shards if sharded else None)
+
+    pageservers = dict((int(p.id), p) for p in env.pageservers)
+
+    for ps in pageservers.values():
+        ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+
+    if sharded:
+        # FIXME: should this be in the neon_env_builder.init_start?
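+        # (reconcile_until_idle drains any pending storage controller
+        # reconciliations, so the requests below observe a settled shard placement)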
+        env.storage_controller.reconcile_until_idle()
+        client = env.storage_controller.pageserver_api()
+    else:
+        client = env.pageserver.http_client()
+
+    first_branch = env.neon_cli.create_branch("first_branch")
+
+    _ = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch")
+
+    # these two will be reparented, and they should be returned in stable order
+    # from pageservers OR otherwise there will be an `error!` logging from
+    # the storage controller
+    reparented1 = env.neon_cli.create_branch("first_reparented", ancestor_branch_name="main")
+    reparented2 = env.neon_cli.create_branch("second_reparented", ancestor_branch_name="main")
+
+    first_reparenting_response = client.detach_ancestor(env.initial_tenant, first_branch)
+    assert set(first_reparenting_response) == {reparented1, reparented2}
+
+    # FIXME: this should be done by the http req handler
+    for ps in pageservers.values():
+        ps.quiesce_tenants()
+
+    for _ in range(5):
+        # once completed, we can retry this any number of times
+        assert (
+            client.detach_ancestor(env.initial_tenant, first_branch) == first_reparenting_response
+        )
+
+    client.tenant_delete(env.initial_tenant)
+
+    with pytest.raises(PageserverApiException) as e:
+        client.detach_ancestor(env.initial_tenant, first_branch)
+    assert e.value.status_code == 404
+
+
+@pytest.mark.parametrize("sharded", [True, False])
+def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, sharded: bool):
+    # this test is split from test_timeline_ancestor_detach_idempotent_success because only these
+    # error cases should create "request was dropped before completing",
+    # given the current first-error handling
+    shards = 2 if sharded else 1
+
+    neon_env_builder.num_pageservers = shards
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shards if sharded else None)
+
+    pageservers = dict((int(p.id), p) for p in env.pageservers)
+
+    for ps in pageservers.values():
+        ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+        ps.allowed_errors.extend(
+            [
+                ".* WARN .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: request was dropped before completing",
+                # rare error logging, which is hard to reproduce without instrumenting responding with random sleep
+                '.* ERROR .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: Cancelled request finished with an error: Conflict\\("no ancestors"\\)',
+            ]
+        )
+
+    client = (
+        env.pageserver.http_client() if not sharded else env.storage_controller.pageserver_api()
+    )
+
+    with pytest.raises(PageserverApiException, match=".* no ancestors") as info:
+        client.detach_ancestor(env.initial_tenant, env.initial_timeline)
+    assert info.value.status_code == 409
+
+    _ = env.neon_cli.create_branch("first_branch")
+
+    second_branch = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch")
+
+    # funnily enough, this error does not have a prefix
+    with pytest.raises(PageserverApiException, match="too many ancestors") as info:
+        client.detach_ancestor(env.initial_tenant, second_branch)
+    assert info.value.status_code == 400
+
+
+def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
+    """
+    Sharded timeline detach ancestor; 4 nodes: 1 stuck, 1 restarted, 2 normal.
+
+    The stuck node gets stuck on a pause failpoint for the first storage controller request.
+    The restarted node remains stuck until an explicit restart from test code.
+
+    We retry the request until the storage controller gets 200 OK from all nodes.
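+
+    (The final request is retried in a loop because the storage controller
+    returns 503 while the restarted node is still re-activating the tenant.)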
+    """
+    branch_name = "soon_detached"
+    shard_count = 4
+    neon_env_builder.num_pageservers = shard_count
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
+
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+    for ps in env.pageservers:
+        ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+
+    # FIXME: should this be in the neon_env_builder.init_start?
+    env.storage_controller.reconcile_until_idle()
+    # as we will stop a node, make sure there is no clever rebalancing
+    env.storage_controller.tenant_policy_update(env.initial_tenant, body={"scheduling": "Stop"})
+    env.storage_controller.allowed_errors.append(".*: Scheduling is disabled by policy Stop .*")
+
+    shards = env.storage_controller.locate(env.initial_tenant)
+
+    utilized_pageservers = {x["node_id"] for x in shards}
+    assert len(utilized_pageservers) > 1, "all shards got placed on a single pageserver?"
+
+    branch_timeline_id = env.neon_cli.create_branch(branch_name, tenant_id=env.initial_tenant)
+
+    with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep:
+        ep.safe_psql(
+            "create table foo as select 1::bigint, i::bigint from generate_series(1, 10000) v(i)"
+        )
+        lsn = flush_ep_to_pageserver(env, ep, env.initial_tenant, branch_timeline_id)
+
+    pageservers = dict((int(p.id), p) for p in env.pageservers)
+
+    for shard_info in shards:
+        node_id = int(shard_info["node_id"])
+        shard_id = shard_info["shard_id"]
+        detail = pageservers[node_id].http_client().timeline_detail(shard_id, branch_timeline_id)
+
+        assert Lsn(detail["last_record_lsn"]) >= lsn
+        assert Lsn(detail["initdb_lsn"]) < lsn
+        assert TimelineId(detail["ancestor_timeline_id"]) == env.initial_timeline
+
+    # make one of the nodes get stuck, but continue the initial operation
+    # make another of the nodes get stuck, then restart
+
+    stuck = pageservers[int(shards[0]["node_id"])]
+    log.info(f"stuck pageserver is id={stuck.id}")
+    stuck_http = stuck.http_client()
+    stuck_http.configure_failpoints(
+        ("timeline-detach-ancestor::before_starting_after_locking-pausable", "pause")
+    )
+
+    restarted = pageservers[int(shards[1]["node_id"])]
+    log.info(f"restarted pageserver is id={restarted.id}")
+    # this might be hit; see `restart_restarted`
+    restarted.allowed_errors.append(".*: Cancelled request finished with an error: ShuttingDown")
+    assert restarted.id != stuck.id
+    restarted_http = restarted.http_client()
+    restarted_http.configure_failpoints(
+        [
+            ("timeline-detach-ancestor::before_starting_after_locking-pausable", "pause"),
+        ]
+    )
+
+    for info in shards:
+        pageserver = pageservers[int(info["node_id"])]
+        # the first request can cause these, but not repeatedly
+        pageserver.allowed_errors.append(".*: request was dropped before completing")
+
+    # the first request can cause this on the storage controller as well
+    env.storage_controller.allowed_errors.append(".*: request was dropped before completing")
+
+    target = env.storage_controller.pageserver_api()
+
+    with pytest.raises(ReadTimeout):
+        target.detach_ancestor(env.initial_tenant, branch_timeline_id, timeout=1)
+
+    stuck_http.configure_failpoints(
+        ("timeline-detach-ancestor::before_starting_after_locking-pausable", "off")
+    )
+
+    barrier = threading.Barrier(2)
+
+    def restart_restarted():
+        barrier.wait()
+        # graceful shutdown should just work, because simultaneously unpaused
+        restarted.stop()
+        # this does not always happen; it depends on how fast we exit after unpausing
+        # restarted.assert_log_contains("Cancelled request finished with an error: ShuttingDown")
+        restarted.start()
+
+    with ThreadPoolExecutor(max_workers=1) as pool:
+        fut = pool.submit(restart_restarted)
+        barrier.wait()
+        # we have 10s; let's use half of that to help the shutdown start
+        time.sleep(5)
+        restarted_http.configure_failpoints(
+            ("timeline-detach-ancestor::before_starting_after_locking-pausable", "off")
+        )
+        fut.result()
+
+    # detach ancestor request handling is not sensitive to http cancellation.
+    # this means that the "stuck" node is on its way to complete the detach, but the restarted one is off.
+    # now the detach can either be complete on all nodes, or still in progress on one.
+    without_retrying = target.without_status_retrying()
+
+    # this retry loop will be long enough that the tenant can always activate
+    reparented = None
+    for _ in range(10):
+        try:
+            reparented = without_retrying.detach_ancestor(env.initial_tenant, branch_timeline_id)
+        except PageserverApiException as info:
+            assert info.status_code == 503
+            time.sleep(2)
+        else:
+            break
+
+    assert reparented == set(), "too many retries (None) or unexpected reparentings"
+
+    for shard_info in shards:
+        node_id = int(shard_info["node_id"])
+        shard_id = shard_info["shard_id"]
+
+        # TODO: ensure quiescing is done on the pageserver?
+        pageservers[node_id].quiesce_tenants()
+        detail = pageservers[node_id].http_client().timeline_detail(shard_id, branch_timeline_id)
+        wait_for_last_record_lsn(
+            pageservers[node_id].http_client(), shard_id, branch_timeline_id, lsn
+        )
+        assert detail.get("ancestor_timeline_id") is None
+
+    with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep:
+        count = int(ep.safe_psql("select count(*) from foo")[0][0])
+        assert count == 10000
+
+
+@pytest.mark.parametrize("mode", ["delete_timeline", "delete_tenant"])
+@pytest.mark.parametrize("sharded", [False, True])
+def test_timeline_detach_ancestor_interrupted_by_deletion(
+    neon_env_builder: NeonEnvBuilder, mode: str, sharded: bool
+):
+    """
+    Timeline ancestor detach interrupted by deleting either:
+    - the detached timeline
+    - the whole tenant
+
+    after starting the detach.
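+
+    In both modes the deletion must win: the detach request fails, the deletion
+    completes, and in the delete_timeline mode the test additionally verifies
+    that gc is unblocked afterwards.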
+
+    What remains untested by this:
+    - shutdown winning over complete, see test_timeline_is_deleted_before_timeline_detach_ancestor_completes
+    """
+
+    if sharded and mode == "delete_tenant":
+        # the shared/exclusive lock on the tenant is blocking this:
+        # timeline detach ancestor takes shared, delete tenant takes exclusive
+        pytest.skip("tenant deletion while timeline ancestor detach is underway cannot happen")
+
+    shard_count = 2 if sharded else 1
+
+    neon_env_builder.num_pageservers = shard_count
+
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=shard_count if sharded else None,
+        initial_tenant_conf={
+            "gc_period": "1s",
+            "lsn_lease_length": "0s",
+        },
+    )
+
+    for ps in env.pageservers:
+        ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+
+    pageservers = dict((int(p.id), p) for p in env.pageservers)
+
+    detached_timeline = env.neon_cli.create_branch("detached soon", "main")
+
+    pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable"
+
+    env.storage_controller.reconcile_until_idle()
+    shards = env.storage_controller.locate(env.initial_tenant)
+
+    assert len(set(info["node_id"] for info in shards)) == shard_count
+
+    target = env.storage_controller.pageserver_api() if sharded else env.pageserver.http_client()
+    target = target.without_status_retrying()
+
+    victim = pageservers[int(shards[-1]["node_id"])]
+    victim_http = victim.http_client()
+    victim_http.configure_failpoints((pausepoint, "pause"))
+
+    def detach_ancestor():
+        target.detach_ancestor(env.initial_tenant, detached_timeline)
+
+    def at_failpoint() -> LogCursor:
+        msg, offset = victim.assert_log_contains(f"at failpoint {pausepoint}")
+        log.info(f"found {msg}")
+        msg, offset = victim.assert_log_contains(
+            ".* gc_loop.*: Skipping GC: .*",
+            offset,
+        )
+        log.info(f"found {msg}")
+        return offset
+
+    def start_delete():
+        if mode == "delete_timeline":
+            target.timeline_delete(env.initial_tenant, detached_timeline)
+        elif mode == "delete_tenant":
+            target.tenant_delete(env.initial_tenant)
+        else:
+            raise RuntimeError(f"unimplemented mode {mode}")
+
+    def at_waiting_on_gate_close(start_offset: LogCursor) -> LogCursor:
+        _, offset = victim.assert_log_contains(
+            "closing is taking longer than expected", offset=start_offset
+        )
+        return offset
+
+    def is_deleted():
+        try:
+            if mode == "delete_timeline":
+                target.timeline_detail(env.initial_tenant, detached_timeline)
+            elif mode == "delete_tenant":
+                target.tenant_status(env.initial_tenant)
+            else:
+                return False
+        except PageserverApiException as e:
+            assert e.status_code == 404
+            return True
+        else:
+            raise RuntimeError("waiting for 404")
+
+    with ThreadPoolExecutor(max_workers=2) as pool:
+        try:
+            fut = pool.submit(detach_ancestor)
+            offset = wait_until(10, 1.0, at_failpoint)
+
+            delete = pool.submit(start_delete)
+
+            offset = wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset))
+
+            victim_http.configure_failpoints((pausepoint, "off"))
+
+            delete.result()
+
+            assert wait_until(10, 1.0, is_deleted), f"unimplemented mode {mode}"
+
+            # TODO: match the error
+            with pytest.raises(PageserverApiException) as exc:
+                fut.result()
+            log.info(f"TODO: match this error: {exc.value}")
+            assert exc.value.status_code == 503
+        finally:
+            victim_http.configure_failpoints((pausepoint, "off"))
+
+    if mode != "delete_timeline":
+        return
+
+    # make sure the gc is unblocked
+    time.sleep(2)
+    victim.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset)
+
+    if not sharded:
+        # we have the other node only while sharded
+        return
+
+    other = pageservers[int(shards[0]["node_id"])]
+    log.info(f"other is {other.id}")
+    _, offset = other.assert_log_contains(
+        ".*INFO request\\{method=PUT path=/v1/tenant/\\S+/timeline/\\S+/detach_ancestor .*\\}: Request handled, status: 200 OK",
+    )
+    # this might be a lot earlier than the victim's line, but that is okay.
+    _, offset = other.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset)
+
+
+@pytest.mark.parametrize("mode", ["delete_reparentable_timeline", "create_reparentable_timeline"])
+def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnvBuilder, mode: str):
+    """
+    Technically possible interleavings on the storage controller: a concurrent
+    timeline deletion or timeline creation while a timeline ancestor detach is
+    in progress.
+
+    Deletion is fine, as the sharded pageservers all reach the same end state, but
+    creating a reparentable timeline creates an issue, as the two nodes would
+    never agree. There is a solution though: the created reparentable timeline
+    must itself be detached.
+    """
+
+    shard_count = 2
+    neon_env_builder.num_pageservers = shard_count
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+
+    for ps in env.pageservers:
+        ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+
+    pageservers = dict((int(p.id), p) for p in env.pageservers)
+
+    env.storage_controller.reconcile_until_idle()
+    shards = env.storage_controller.locate(env.initial_tenant)
+    assert len(set(x["node_id"] for x in shards)) == shard_count
+
+    with env.endpoints.create_start("main") as ep:
+        ep.safe_psql("create table foo as select i::bigint from generate_series(1, 1000) t(i)")
+
+        # as the interleaved operation, we will delete this timeline, which was a reparenting candidate
+        first_branch_lsn = wait_for_last_flush_lsn(
+            env, ep, env.initial_tenant, env.initial_timeline
+        )
+        for ps, shard_id in [(pageservers[int(x["node_id"])], x["shard_id"]) for x in shards]:
+            ps.http_client().timeline_checkpoint(shard_id, env.initial_timeline)
+
+        ep.safe_psql("create table bar as select i::bigint from generate_series(1, 2000) t(i)")
+        detached_branch_lsn = flush_ep_to_pageserver(
+            env, ep, env.initial_tenant, env.initial_timeline
+        )
+
+    for ps, shard_id in [(pageservers[int(x["node_id"])], x["shard_id"]) for x in shards]:
+        ps.http_client().timeline_checkpoint(shard_id, env.initial_timeline)
+
+    def create_reparentable_timeline() -> TimelineId:
+        return env.neon_cli.create_branch(
+            "first_branch", ancestor_branch_name="main", ancestor_start_lsn=first_branch_lsn
+        )
+
+    if mode == "delete_reparentable_timeline":
+        first_branch = create_reparentable_timeline()
+    else:
+        first_branch = None
+
+    detached_branch = env.neon_cli.create_branch(
+        "detached_branch", ancestor_branch_name="main", ancestor_start_lsn=detached_branch_lsn
+    )
+
+    pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable"
+
+    stuck = pageservers[int(shards[0]["node_id"])]
+    stuck_http = stuck.http_client().without_status_retrying()
+    stuck_http.configure_failpoints((pausepoint, "pause"))
+
+    victim = pageservers[int(shards[-1]["node_id"])]
+    victim_http = victim.http_client().without_status_retrying()
+    victim_http.configure_failpoints(
+        (pausepoint, "pause"),
+    )
+
+    # interleaving a create_timeline which could be reparented will produce two
+    # permanently different reparentings: one node has reparented, the other has
+    # not
+    #
+    # with deletion there is no such problem
+    def detach_timeline():
+        env.storage_controller.pageserver_api().detach_ancestor(env.initial_tenant, detached_branch)
+
+    def paused_at_failpoint():
+        stuck.assert_log_contains(f"at failpoint {pausepoint}")
+        victim.assert_log_contains(f"at failpoint {pausepoint}")
+
+    def first_completed():
+        detail = stuck_http.timeline_detail(shards[0]["shard_id"], detached_branch)
+        log.info(detail)
+        assert detail.get("ancestor_lsn") is None
+
+    def first_branch_gone():
+        assert first_branch is not None
+        try:
+            env.storage_controller.pageserver_api().timeline_detail(
+                env.initial_tenant, first_branch
+            )
+        except PageserverApiException as e:
+            log.info(f"error {e}")
+            assert e.status_code == 404
+        else:
+            log.info("still ok")
+            raise RuntimeError("not done yet")
+
+    with ThreadPoolExecutor(max_workers=1) as pool:
+        try:
+            fut = pool.submit(detach_timeline)
+            wait_until(10, 1.0, paused_at_failpoint)
+
+            # let stuck complete
+            stuck_http.configure_failpoints((pausepoint, "off"))
+            wait_until(10, 1.0, first_completed)
+
+            if mode == "delete_reparentable_timeline":
+                assert first_branch is not None
+                env.storage_controller.pageserver_api().timeline_delete(
+                    env.initial_tenant, first_branch
+                )
+                victim_http.configure_failpoints((pausepoint, "off"))
+                wait_until(10, 1.0, first_branch_gone)
+            elif mode == "create_reparentable_timeline":
+                first_branch = create_reparentable_timeline()
+                victim_http.configure_failpoints((pausepoint, "off"))
+            else:
+                raise RuntimeError(f"unimplemented mode {mode}")
+
+            # the detach now passes, and we should get an error message about mixed reparenting, as the stuck node still had something to reparent
+            mixed_results = "pageservers returned mixed results for ancestor detach; manual intervention is required."
+            with pytest.raises(PageserverApiException, match=mixed_results):
+                fut.result()
+
+            msg, offset = env.storage_controller.assert_log_contains(
+                ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*"
+            )
+            log.info(f"expected error message: {msg.rstrip()}")
+            env.storage_controller.allowed_errors.extend(
+                [
+                    ".*: shards returned different results matching=0 .*",
+                    f".*: InternalServerError\\({mixed_results}",
+                ]
+            )
+
+            if mode == "create_reparentable_timeline":
+                with pytest.raises(PageserverApiException, match=mixed_results):
+                    detach_timeline()
+            else:
+                # it is a bit of a shame to flag the error and then have the retry succeed,
+                # but most likely there would be a retry loop in cplane which would
+                # take care of this
+                detach_timeline()
+
+            retried = env.storage_controller.log_contains(
+                ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*",
+                offset,
+            )
+            if mode == "delete_reparentable_timeline":
+                assert (
+                    retried is None
+                ), "detaching should have converged after both nodes saw the deletion"
+            elif mode == "create_reparentable_timeline":
+                assert retried is not None, "detaching should not have converged"
+                _, offset = retried
+        finally:
+            stuck_http.configure_failpoints((pausepoint, "off"))
+            victim_http.configure_failpoints((pausepoint, "off"))
+
+    if mode == "create_reparentable_timeline":
+        assert first_branch is not None
+        # now we have mixed ancestry
+        assert (
+            TimelineId(
+                stuck_http.timeline_detail(shards[0]["shard_id"], first_branch)[
+                    "ancestor_timeline_id"
+                ]
+            )
+            == env.initial_timeline
+        )
+        assert (
+            TimelineId(
+                victim_http.timeline_detail(shards[-1]["shard_id"], first_branch)[
+                    "ancestor_timeline_id"
+                ]
+            )
+            == detached_branch
+        )
+
+        # make sure we are still able to repair this by detaching the ancestor on the storage controller, in case this ever happens;
+        # if the ancestor were deleted instead, we would partially fail, making the deletion stuck.
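+        # (a note on why this converges: the detach below runs on both shards;
+        # each shard detaches first_branch from whichever ancestor it currently
+        # has, so both end up agreeing that it has no ancestor, as asserted below)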
+        env.storage_controller.pageserver_api().detach_ancestor(env.initial_tenant, first_branch)
+
+        # and we should now have good results
+        not_found = env.storage_controller.log_contains(
+            ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*",
+            offset,
+        )
+
+        assert not_found is None
+        assert (
+            stuck_http.timeline_detail(shards[0]["shard_id"], first_branch)["ancestor_timeline_id"]
+            is None
+        )
+        assert (
+            victim_http.timeline_detail(shards[-1]["shard_id"], first_branch)[
+                "ancestor_timeline_id"
+            ]
+            is None
+        )
+
+
+def test_retryable_500_hit_through_storcon_during_timeline_detach_ancestor(
+    neon_env_builder: NeonEnvBuilder,
+):
+    shard_count = 2
+    neon_env_builder.num_pageservers = shard_count
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+
+    for ps in env.pageservers:
+        ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+
+    pageservers = dict((int(p.id), p) for p in env.pageservers)
+
+    env.storage_controller.reconcile_until_idle()
+    shards = env.storage_controller.locate(env.initial_tenant)
+    assert len(set(x["node_id"] for x in shards)) == shard_count
+
+    detached_branch = env.neon_cli.create_branch("detached_branch", ancestor_branch_name="main")
+
+    pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable"
+    failpoint = "timeline-detach-ancestor::before_starting_after_locking"
+
+    stuck = pageservers[int(shards[0]["node_id"])]
+    stuck_http = stuck.http_client().without_status_retrying()
+    stuck_http.configure_failpoints(
+        (pausepoint, "pause"),
+    )
+
+    env.storage_controller.allowed_errors.append(
+        f".*Error processing HTTP request: .* failpoint: {failpoint}"
+    )
+    http = env.storage_controller.pageserver_api()
+
+    victim = pageservers[int(shards[-1]["node_id"])]
+    victim.allowed_errors.append(
+        f".*Error processing HTTP request: InternalServerError\\(failpoint: {failpoint}"
+    )
+    victim_http = victim.http_client().without_status_retrying()
+    victim_http.configure_failpoints([(pausepoint, "pause"), (failpoint, "return")])
+
+    def detach_timeline():
+        http.detach_ancestor(env.initial_tenant, detached_branch)
+
+    def paused_at_failpoint():
+        stuck.assert_log_contains(f"at failpoint {pausepoint}")
+        victim.assert_log_contains(f"at failpoint {pausepoint}")
+
+    def first_completed():
+        detail = stuck_http.timeline_detail(shards[0]["shard_id"], detached_branch)
+        log.info(detail)
+        assert detail.get("ancestor_lsn") is None
+
+    with ThreadPoolExecutor(max_workers=1) as pool:
+        try:
+            fut = pool.submit(detach_timeline)
+            wait_until(10, 1.0, paused_at_failpoint)
+
+            # let stuck complete
+            stuck_http.configure_failpoints((pausepoint, "off"))
+            wait_until(10, 1.0, first_completed)
+
+            victim_http.configure_failpoints((pausepoint, "off"))
+
+            with pytest.raises(
+                PageserverApiException,
+                match=f".*failpoint: {failpoint}",
+            ) as exc:
+                fut.result()
+            assert exc.value.status_code == 500
+
+        finally:
+            stuck_http.configure_failpoints((pausepoint, "off"))
+            victim_http.configure_failpoints((pausepoint, "off"))
+
+    victim_http.configure_failpoints((failpoint, "off"))
+    detach_timeline()
+
+
+def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: NeonEnvBuilder):
+    """
+    Using a failpoint, force the completion step of timeline ancestor detach to
+    fail after reparenting a single timeline.
+
+    Retrying should reparent the remaining timelines until all reparentings are
+    done, blocking gc the whole time, even across restarts (first round).
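+
+    (The allow_one_reparented failpoint used below lets exactly one reparenting
+    succeed per attempt, which is what forces the retries.)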
+
+    A completion failpoint is used to inhibit completion on the second to last
+    round.
+
+    On the last round, the completion takes a path where no reparentings can happen,
+    because the original ancestor has been deleted, and there is a completion to unblock
+    gc without a restart.
+    """
+
+    # to get the remote storage metrics
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "gc_period": "1s",
+            "lsn_lease_length": "0s",
+        }
+    )
+
+    env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+
+    env.pageserver.allowed_errors.extend(
+        [
+            ".* reparenting failed: failpoint: timeline-detach-ancestor::allow_one_reparented",
+            ".* Error processing HTTP request: InternalServerError\\(failed to reparent all candidate timelines, please retry",
+            ".* Error processing HTTP request: InternalServerError\\(failpoint: timeline-detach-ancestor::complete_before_uploading",
+        ]
+    )
+
+    http = env.pageserver.http_client()
+
+    def remote_storage_copy_requests():
+        return http.get_metric_value(
+            "remote_storage_s3_request_seconds_count",
+            {"request_type": "copy_object", "result": "ok"},
+        )
+
+    def reparenting_progress(timelines: List[TimelineId]) -> Tuple[int, Set[TimelineId]]:
+        reparented = 0
+        not_reparented = set()
+        for timeline in timelines:
+            detail = http.timeline_detail(env.initial_tenant, timeline)
+            ancestor = TimelineId(detail["ancestor_timeline_id"])
+            if ancestor == detached:
+                reparented += 1
+            else:
+                not_reparented.add(timeline)
+        return (reparented, not_reparented)
+
+    # main ------A-----B-----C-----D-----E> lsn
+    timelines = []
+    with env.endpoints.create_start("main") as ep:
+        for counter in range(5):
+            ep.safe_psql(
+                f"create table foo_{counter} as select i::bigint from generate_series(1, 10000) t(i)"
+            )
+            branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)
+            http.timeline_checkpoint(env.initial_tenant, env.initial_timeline)
+            branch = env.neon_cli.create_branch(
+                f"branch_{counter}", "main", ancestor_start_lsn=branch_lsn
+            )
+            timelines.append(branch)
+
+        flush_ep_to_pageserver(env, ep, env.initial_tenant, env.initial_timeline)
+
+    # detach "E", which has the most reparentable timelines under it
+    detached = timelines.pop()
+    assert len(timelines) == 4
+
+    http = http.without_status_retrying()
+
+    http.configure_failpoints(("timeline-detach-ancestor::allow_one_reparented", "return"))
+
+    not_reparented: Set[TimelineId] = set()
+    # tracked offset in the pageserver log which is at least at the most recent activation
+    offset = None
+
+    def try_detach():
+        with pytest.raises(
+            PageserverApiException,
+            match=".*failed to reparent all candidate timelines, please retry",
+        ) as exc:
+            http.detach_ancestor(env.initial_tenant, detached)
+        assert exc.value.status_code == 503
+
+    # first round -- do more checking to make sure the gc gets paused
+    try_detach()
+
+    assert (
+        http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is None
+    ), "the first round should have detached 'detached'"
+
+    reparented, not_reparented = reparenting_progress(timelines)
+    assert reparented == 1
+
+    time.sleep(2)
+    _, offset = env.pageserver.assert_log_contains(
+        ".*INFO request\\{method=PUT path=/v1/tenant/[0-9a-f]{32}/timeline/[0-9a-f]{32}/detach_ancestor .*\\}: Handling request",
+        offset,
+    )
+    _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset)
+    _, offset = env.pageserver.assert_log_contains(
+        ".* gc_loop.*: Skipping GC: .*",
+        offset,
+    )
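+    # the first round performed the actual layer copies to remote storage, so
+    # the copy_object metric must be non-zero here (later rounds only reparent)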
+    metric = remote_storage_copy_requests()
+    assert metric != 0
+    # make sure the gc blocking is persistent over a restart
+    env.pageserver.restart()
+    env.pageserver.quiesce_tenants()
+    time.sleep(2)
+    _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset)
+    assert env.pageserver.log_contains(".* gc_loop.*: [0-9] timelines need GC", offset) is None
+    _, offset = env.pageserver.assert_log_contains(
+        ".* gc_loop.*: Skipping GC: .*",
+        offset,
+    )
+    # restore the failpoint for the next reparenting
+    http.configure_failpoints(("timeline-detach-ancestor::allow_one_reparented", "return"))
+
+    reparented_before = reparented
+
+    # do two more rounds
+    for _ in range(2):
+        try_detach()
+
+        assert (
+            http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is None
+        ), "'detached' should stay detached on the later rounds"
+
+        reparented, not_reparented = reparenting_progress(timelines)
+        assert reparented == reparented_before + 1
+        reparented_before = reparented
+
+        _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset)
+        metric = remote_storage_copy_requests()
+        assert metric == 0, "copies happen in the first round"
+
+    assert offset is not None
+    assert len(not_reparented) == 1
+
+    http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "return"))
+
+    # almost the final round: the reparenting failpoint is no longer hit, because only one
+    # timeline is left to reparent and one reparenting always gets to succeed.
+    # the tenant is restarted once more, but we fail during completion.
+    with pytest.raises(
+        PageserverApiException, match=".* timeline-detach-ancestor::complete_before_uploading"
+    ) as exc:
+        http.detach_ancestor(env.initial_tenant, detached)
+    assert exc.value.status_code == 500
+    _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset)
+
+    # delete the previous ancestor to take a different path to completion. all
+    # other tests take the "detach? reparent complete" path, but this one only hits
+    # "complete".
+    http.timeline_delete(env.initial_tenant, env.initial_timeline)
+    wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline, 20)
+
+    http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "off"))
+
+    reparented_resp = http.detach_ancestor(env.initial_tenant, detached)
+    assert reparented_resp == set(timelines)
+    # no need to quiesce_tenants anymore, because completion does that
+
+    reparented, not_reparented = reparenting_progress(timelines)
+    assert reparented == len(timelines)
+
+    time.sleep(2)
+    assert (
+        env.pageserver.log_contains(".*: attach finished, activating", offset) is None
+    ), "there should be no restart with the final detach_ancestor as it only completed"
+
+    # gc is unblocked
+    env.pageserver.assert_log_contains(".* gc_loop.*: 5 timelines need GC", offset)
+
+    metric = remote_storage_copy_requests()
+    assert metric == 0
+
+
+def test_timeline_is_deleted_before_timeline_detach_ancestor_completes(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Make sure that a timeline deleted after restart will unpause gc blocking.
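+
+    The detach is paused at a failpoint right after the tenant restart; deleting
+    the detached timeline then makes the detach fail with 404, and gc must resume.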
+    """
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "gc_period": "1s",
+            "lsn_lease_length": "0s",
+        }
+    )
+
+    env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+
+    http = env.pageserver.http_client()
+
+    detached = env.neon_cli.create_branch("detached")
+
+    failpoint = "timeline-detach-ancestor::after_activating_before_finding-pausable"
+
+    http.configure_failpoints((failpoint, "pause"))
+
+    def detach_and_get_stuck():
+        return http.detach_ancestor(env.initial_tenant, detached)
+
+    def request_processing_noted_in_log():
+        _, offset = env.pageserver.assert_log_contains(
+            ".*INFO request\\{method=PUT path=/v1/tenant/[0-9a-f]{32}/timeline/[0-9a-f]{32}/detach_ancestor .*\\}: Handling request",
+        )
+        return offset
+
+    def delete_detached():
+        return http.timeline_delete(env.initial_tenant, detached)
+
+    try:
+        with ThreadPoolExecutor(max_workers=1) as pool:
+            detach = pool.submit(detach_and_get_stuck)
+
+            offset = wait_until(10, 1.0, request_processing_noted_in_log)
+
+            # make this a named fn for clearer failure output in the test log
+            def pausepoint_hit_with_gc_paused() -> LogCursor:
+                env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
+                _, at = env.pageserver.assert_log_contains(
+                    ".* gc_loop.*: Skipping GC: .*",
+                    offset,
+                )
+                return at
+
+            offset = wait_until(10, 1.0, pausepoint_hit_with_gc_paused)
+
+            delete_detached()
+
+            wait_timeline_detail_404(http, env.initial_tenant, detached, 10, 1.0)
+
+            http.configure_failpoints((failpoint, "off"))
+
+            with pytest.raises(
+                PageserverApiException, match="NotFound: Timeline .* was not found"
+            ) as exc:
+                detach.result()
+            assert exc.value.status_code == 404
+    finally:
+        http.configure_failpoints((failpoint, "off"))
+
+    # make sure gc has been unblocked
+    time.sleep(2)
+
+    env.pageserver.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset)
+
+
+# TODO:
+# - branch near existing L1 boundary, image layers?
+# - investigate: why are layers started at uneven lsn? not just after branching, but in general.
+#
+# TEST: 1. tad which partially succeeds, one returns 500
+#       2. create branch below timeline? ~or delete reparented timeline~ (done)
+#       3. on retry all should report the same reparented timelines
diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py
new file mode 100644
index 0000000000..24de894687
--- /dev/null
+++ b/test_runner/regress/test_timeline_gc_blocking.py
@@ -0,0 +1,67 @@
+import time
+
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+)
+from fixtures.pageserver.utils import wait_timeline_detail_404
+
+
+def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"}
+    )
+    ps = env.pageserver
+    http = ps.http_client()
+
+    foo_branch = env.neon_cli.create_branch("foo", "main", env.initial_tenant)
+
+    gc_active_line = ".* gc_loop.*: [12] timelines need GC"
+    gc_skipped_line = ".* gc_loop.*: Skipping GC: .*"
+    init_gc_skipped = ".*: initialized with gc blocked.*"
+
+    tenant_before = http.tenant_status(env.initial_tenant)
+
+    wait_for_another_gc_round()
+    _, offset = ps.assert_log_contains(gc_active_line)
+
+    assert ps.log_contains(gc_skipped_line, offset) is None
+
+    http.timeline_block_gc(env.initial_tenant, foo_branch)
+
+    tenant_after = http.tenant_status(env.initial_tenant)
+    assert tenant_before != tenant_after
+    gc_blocking = tenant_after["gc_blocking"]
+    assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }"
+
+    wait_for_another_gc_round()
+    _, offset = ps.assert_log_contains(gc_skipped_line, offset)
+
+    ps.restart()
+    ps.quiesce_tenants()
+
+    _, offset = env.pageserver.assert_log_contains(init_gc_skipped, offset)
+
+    wait_for_another_gc_round()
+    _, offset = ps.assert_log_contains(gc_skipped_line, offset)
+
+    # deletion unblocks gc
+    http.timeline_delete(env.initial_tenant, foo_branch)
+    wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0)
+
+    wait_for_another_gc_round()
+    _, offset = ps.assert_log_contains(gc_active_line, offset)
+
+    http.timeline_block_gc(env.initial_tenant, env.initial_timeline)
+
+    wait_for_another_gc_round()
+    _, offset = ps.assert_log_contains(gc_skipped_line, offset)
+
+    # removing the manual block also unblocks gc
+    http.timeline_unblock_gc(env.initial_tenant, env.initial_timeline)
+
+    wait_for_another_gc_round()
+    _, offset = ps.assert_log_contains(gc_active_line, offset)
+
+
+def wait_for_another_gc_round():
+    time.sleep(2)
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 4c5cb32caa..9bf5f8680b 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -1,9 +1,8 @@
 import concurrent.futures
 import math
-import queue
 import random
-import threading
 import time
+from collections import defaultdict
 from contextlib import closing
 from pathlib import Path
 from typing import Optional
@@ -11,27 +10,27 @@ from typing import Optional
 import psycopg2.errors
 import psycopg2.extras
 import pytest
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
     NeonEnv,
     NeonEnvBuilder,
+    NeonPageserver,
     PgBin,
     VanillaPostgres,
     wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
+from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
     assert_tenant_state,
     timeline_delete_wait_completed,
     wait_for_upload_queue_empty,
-    wait_tenant_status_404,
     wait_until_tenant_active,
 )
 from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import TenantId, TimelineId
 from fixtures.utils import get_timeline_dir_size, wait_until
@@ -40,10 +39,9 @@ def test_timeline_size(neon_simple_env: NeonEnv):
     new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty")
 
     client = env.pageserver.http_client()
-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
 
     endpoint_main = env.endpoints.create_start("test_timeline_size")
-    log.info("postgres is running on 'test_timeline_size' branch")
 
     with closing(endpoint_main.connect()) as conn:
         with conn.cursor() as cur:
@@ -73,13 +71,12 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv):
     new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "empty")
 
     client = env.pageserver.http_client()
-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
     timeline_details = client.timeline_detail(
         env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True
     )
 
     endpoint_main = env.endpoints.create_start("test_timeline_size_createdropdb")
-    log.info("postgres is running on 'test_timeline_size_createdropdb' branch")
 
     with closing(endpoint_main.connect()) as conn:
         with conn.cursor() as cur:
@@ -153,34 +150,56 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder):
     client = env.pageserver.http_client()
 
     new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota_on_startup")
-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
+
+    size_limit_mb = 30
 
     endpoint_main = env.endpoints.create(
         "test_timeline_size_quota_on_startup",
         # Set small limit for the test
-        config_lines=["neon.max_cluster_size=30MB"],
+        config_lines=[f"neon.max_cluster_size={size_limit_mb}MB"],
    )
    endpoint_main.start()
 
-    log.info("postgres is running on 'test_timeline_size_quota_on_startup' branch")
-
     with closing(endpoint_main.connect()) as conn:
         with conn.cursor() as cur:
             cur.execute("CREATE TABLE foo (t text)")
 
             # Insert many rows. This query must fail because of space limit
             try:
-                for _i in range(5000):
-                    cur.execute(
-                        """
-                        INSERT INTO foo
-                            SELECT 'long string to consume some space' || g
-                            FROM generate_series(1, 100) g
-                        """
-                    )
-                # If we get here, the timeline size limit failed
-                log.error("Query unexpectedly succeeded")
+
+                def write_rows(count):
+                    for _i in range(count):
+                        cur.execute(
+                            """
+                            INSERT INTO foo
+                                SELECT 'long string to consume some space' || g
+                                FROM generate_series(1, 100) g
+                            """
+                        )
+
+                # Write some data that exceeds the limit, then let the pageserver ingest it to guarantee that some feedback has made it to
+                # the safekeeper, then try to write some more. We expect either the initial writes or the ones after
+                # the wait_for_last_flush_lsn to generate an exception.
+                #
+                # Without the wait_for_last_flush_lsn, the size limit sometimes isn't enforced (see https://github.com/neondatabase/neon/issues/6562)
+                write_rows(2500)
+                wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id)
+                logical_size = env.pageserver.http_client().timeline_detail(
+                    env.initial_tenant, new_timeline_id
+                )["current_logical_size"]
+                assert logical_size > size_limit_mb * 1024 * 1024
+                write_rows(2500)
+
+                # If we get here, the timeline size limit failed. Find out from the pageserver how large it
+                # thinks the timeline is.
+                wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id)
+                logical_size = env.pageserver.http_client().timeline_detail(
+                    env.initial_tenant, new_timeline_id
+                )["current_logical_size"]
+                log.error(
+                    f"Query unexpectedly succeeded, pageserver logical size is {logical_size}"
+                )
                 raise AssertionError()
 
             except psycopg2.errors.DiskFull as err:
@@ -219,7 +238,7 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder):
     client = env.pageserver.http_client()
 
     new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota")
-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
 
     endpoint_main = env.endpoints.create(
         "test_timeline_size_quota",
@@ -231,8 +250,6 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder):
     endpoint_main.respec(skip_pg_catalog_updates=False)
     endpoint_main.start()
 
-    log.info("postgres is running on 'test_timeline_size_quota' branch")
-
     with closing(endpoint_main.connect()) as conn:
         with conn.cursor() as cur:
             cur.execute("CREATE TABLE foo (t text)")
@@ -337,41 +354,18 @@ def test_timeline_initial_logical_size_calculation_cancellation(
     assert_size_calculation_not_done()
 
     log.info(
-        f"try to delete the timeline using {deletion_method}, this should cancel size computation tasks and wait for them to finish"
+        f"delete the timeline using {deletion_method}, this should cancel size computation tasks and wait for them to finish"
     )
-    delete_timeline_success: queue.Queue[bool] = queue.Queue(maxsize=1)
 
-    def delete_timeline_thread_fn():
-        try:
-            if deletion_method == "tenant_detach":
-                client.tenant_detach(tenant_id)
-            elif deletion_method == "timeline_delete":
-                timeline_delete_wait_completed(client, tenant_id, timeline_id)
-            delete_timeline_success.put(True)
-        except PageserverApiException:
-            delete_timeline_success.put(False)
-            raise
+    if deletion_method == "tenant_detach":
+        client.tenant_detach(tenant_id)
+    elif deletion_method == "timeline_delete":
+        timeline_delete_wait_completed(client, tenant_id, timeline_id)
+    else:
+        raise RuntimeError(deletion_method)
 
-    delete_timeline_thread = threading.Thread(target=delete_timeline_thread_fn)
-    delete_timeline_thread.start()
-    # give it some time to settle in the state where it waits for size computation task
-    time.sleep(5)
-    if not delete_timeline_success.empty():
-        raise AssertionError(
-            f"test is broken, the {deletion_method} should be stuck waiting for size computation task, got result {delete_timeline_success.get()}"
-        )
-
-    log.info(
-        "resume the size calculation. The failpoint checks that the timeline directory still exists."
-    )
-    client.configure_failpoints(("timeline-calculate-logical-size-check-dir-exists", "return"))
-    client.configure_failpoints(("timeline-calculate-logical-size-pause", "off"))
-
-    log.info("wait for delete timeline thread to finish and assert that it succeeded")
-    assert delete_timeline_success.get()
-
-    # if the implementation is incorrect, the teardown would complain about an error log
-    # message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists"
+    # timeline-calculate-logical-size-pause is still paused, but it doesn't
+    # matter because it's a pausable_failpoint, which can be cancelled by drop.
 
 
 def test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder):
@@ -444,11 +438,12 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
     # Disable background compaction as we don't want it to happen after `get_physical_size` request
     # and before checking the expected size on disk, which makes the assertion failed
-    neon_env_builder.pageserver_config_override = (
-        "tenant_config={checkpoint_distance=100000, compaction_period='10m'}"
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "checkpoint_distance": "100000",
+            "compaction_period": "10m",
+        }
     )
-
-    env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
 
     new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction")
@@ -491,9 +486,14 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
 
     # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request
     # and before checking the expected size on disk, which makes the assertion failed
-    neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='0s', gc_period='0s', pitr_interval='1s'}"
-
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "checkpoint_distance": "100000",
+            "compaction_period": "0s",
+            "gc_period": "0s",
+            "pitr_interval": "1s",
+        }
+    )
     pageserver_http = env.pageserver.http_client()
 
     new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc")
@@ -585,7 +585,6 @@ def test_timeline_size_metrics(
     pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version)
     port = port_distributor.get_port()
     with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg:
-        vanilla_pg.configure([f"port={port}"])
         vanilla_pg.start()
 
         # Create database based on template0 because we can't connect to template0
@@ -680,7 +679,7 @@ def get_physical_size_values(
     client = env.pageserver.http_client()
 
     res.layer_map_file_size_sum = sum(
-        layer.layer_file_size or 0
+        layer.layer_file_size
         for layer in client.layer_map_info(tenant_id, timeline_id).historic_layers
     )
@@ -715,26 +714,11 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
     # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
 
 
-# Timeline logical size initialization is an asynchronous background task that runs once,
-# try a few times to ensure it's activated properly
-def wait_for_timeline_size_init(
-    client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
-):
-    for i in range(10):
-        timeline_details = client.timeline_detail(
-            tenant, timeline, include_non_incremental_logical_size=True
-        )
-        current_logical_size = timeline_details["current_logical_size"]
-        non_incremental = timeline_details["current_logical_size_non_incremental"]
-        if current_logical_size == non_incremental:
-            return
-        log.info(
-            f"waiting for current_logical_size of a timeline to be calculated, iteration {i}: {current_logical_size} vs {non_incremental}"
-        )
-        time.sleep(1)
-    raise Exception(
-        f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}"
-    )
+def wait_for_tenant_startup_completions(client: PageserverHttpClient, count: int):
+    def condition():
+        assert client.get_metric_value("pageserver_tenant_startup_complete_total") == count
+
+    wait_until(5, 1.0, condition)
 
 
 def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
@@ -749,7 +733,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
 
     # We will run with the limit set to 1, so that once we have one tenant stuck
     # in a pausable failpoint, the rest are prevented from proceeding through warmup.
-    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
+    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1"
 
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
@@ -820,10 +804,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
     # That one that we successfully accessed is now Active
     expect_activated += 1
     assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active"
-    assert (
-        pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
-        == expect_activated - 1
-    )
+    wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1)
 
     # The ones we didn't touch are still in Attaching
     assert (
@@ -843,10 +824,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
         == n_tenants - expect_activated
     )
 
-    assert (
-        pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
-        == expect_activated - 1
-    )
+    wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1)
 
     # When we unblock logical size calculation, all tenants should proceed to active state via
     # the warmup route.
@@ -866,7 +844,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): assert ( pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants ) - assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants + wait_for_tenant_startup_completions(pageserver_http, count=n_tenants) # Check that tenant deletion/detach proactively wakes tenants: this is done separately to the main # body of the test because it will disrupt tenant counts @@ -886,7 +864,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # Detaching a stuck tenant should proceed promptly # (reproducer for https://github.com/neondatabase/neon/pull/6430) - env.pageserver.http_client().tenant_detach(detach_tenant_id, timeout_secs=10) + env.pageserver.http_client().tenant_detach(detach_tenant_id) tenant_ids.remove(detach_tenant_id) # FIXME: currently the mechanism for cancelling attach is to set state to broken, which is reported spuriously at error level env.pageserver.allowed_errors.append( @@ -894,36 +872,9 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): ) # Deleting a stuck tenant should prompt it to go active - with concurrent.futures.ThreadPoolExecutor() as executor: - log.info("Starting background delete") - - def delete_tenant(): - env.pageserver.http_client().tenant_delete(delete_tenant_id) - - background_delete = executor.submit(delete_tenant) - - # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating - # logical size is paused in a failpoint. So instead we will use a log observation to check that - # on-demand activation was triggered by the tenant deletion - log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000}}: Activating tenant \\(on-demand\\).*" - - def activated_on_demand(): - assert env.pageserver.log_contains(log_match) is not None - - log.info(f"Waiting for activation message '{log_match}'") - try: - wait_until(10, 1, activated_on_demand) - finally: - log.info("Clearing failpoint") - pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) - - # Deletion should complete successfully now that failpoint is unblocked - log.info("Joining background delete") - background_delete.result(timeout=10) - - # Poll for deletion to complete - wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40) - tenant_ids.remove(delete_tenant_id) + # in some cases, it has already been activated because it's behind the detach + delete_lazy_activating(delete_tenant_id, env.pageserver, expect_attaching=False) + tenant_ids.remove(delete_tenant_id) # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one # we detached) @@ -931,6 +882,39 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): assert len(get_tenant_states()) == n_tenants - 2 +def delete_lazy_activating( + delete_tenant_id: TenantId, pageserver: NeonPageserver, expect_attaching: bool +): + pageserver_http = pageserver.http_client() + + if expect_attaching: + assert pageserver_http.tenant_status(delete_tenant_id)["state"]["slug"] == "Attaching" + + with concurrent.futures.ThreadPoolExecutor() as executor: + log.info("Starting background delete") + + def shutting_down(): + assert pageserver.log_contains(".*Waiting for timelines.*") is not None + + def delete_tenant(): + pageserver_http.tenant_delete(delete_tenant_id) + + background_delete = executor.submit(delete_tenant) + + # We expect deletion to 
enter shutdown of the tenant even though it's in the attaching state + try: + # Deletion will get to the point in shutdown where it's waiting for timeline shutdown, then + # hang because of our failpoint blocking activation. + wait_until(10, 1, shutting_down) + finally: + log.info("Clearing failpoint") + pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) + + # Deletion should complete successfully now that failpoint is unblocked and shutdown can complete + log.info("Joining background delete") + background_delete.result(timeout=10) + + def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): """ /v1/tenant/:tenant_shard_id/timeline and /v1/tenant/:tenant_shard_id @@ -952,6 +936,9 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.initial_timeline + # just make sure this doesn't hit an assertion + client.timeline_detail(tenant_id, timeline_id, force_await_initial_logical_size=True) + # load in some data endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) endpoint.safe_psql_many( @@ -994,3 +981,166 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): client.configure_failpoints( [("initial-size-calculation-permit-pause", "off"), ("walreceiver-after-ingest", "off")] ) + + +def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1" + + env = neon_env_builder.init_start() + + # the supporting_second does nothing except queue behind env.initial_tenant + # for purposes of showing that eager_tenant breezes past the queue + supporting_second, _ = env.neon_cli.create_tenant() + eager_tenant, _ = env.neon_cli.create_tenant() + + client = env.pageserver.http_client() + client.tenant_location_conf( + eager_tenant, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + env.pageserver.stop() + + # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation + env.pageserver.start( + extra_env_vars={ + "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause" + } + ) + + tenant_ids = [env.initial_tenant, supporting_second] + + def get_tenant_states() -> dict[str, list[TenantId]]: + states = defaultdict(list) + for id in tenant_ids: + state = client.tenant_status(id)["state"]["slug"] + states[state].append(id) + return dict(states) + + def one_is_active(): + states = get_tenant_states() + log.info(f"{states}") + assert len(states["Active"]) == 1 + + wait_until(10, 1, one_is_active) + + def other_is_attaching(): + states = get_tenant_states() + assert len(states["Attaching"]) == 1 + + wait_until(10, 1, other_is_attaching) + + def eager_tenant_is_active(): + resp = client.tenant_status(eager_tenant) + assert resp["state"]["slug"] == "Active" + + gen = env.storage_controller.attach_hook_issue(eager_tenant, env.pageserver.id) + client.tenant_location_conf( + eager_tenant, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": gen, + }, + lazy=False, + ) + wait_until(10, 1, eager_tenant_is_active) + + other_is_attaching() + + client.configure_failpoints( + [("timeline-calculate-logical-size-pause", "off"), ("walreceiver-after-ingest", "off")] + ) + + +@pytest.mark.parametrize("activation_method", ["endpoint", "branch", "delete"]) +def 
test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_method: str): + # env.initial_tenant will take up this permit when attaching with lazy because of a failpoint activated after restart + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1" + + env = neon_env_builder.init_start() + + # because this returns (also elsewhere in this file), we know that SpawnMode::Create skips the queue + lazy_tenant, _ = env.neon_cli.create_tenant() + + client = env.pageserver.http_client() + client.tenant_location_conf( + lazy_tenant, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + env.pageserver.stop() + + # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation + env.pageserver.start( + extra_env_vars={ + "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause" + } + ) + + def initial_tenant_is_active(): + resp = client.tenant_status(env.initial_tenant) + assert resp["state"]["slug"] == "Active" + + wait_until(10, 1, initial_tenant_is_active) + + # even though the initial tenant is now active, because it was startup time + # attach, it will consume the only permit because logical size calculation + # is paused. + + gen = env.storage_controller.attach_hook_issue(lazy_tenant, env.pageserver.id) + client.tenant_location_conf( + lazy_tenant, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": gen, + }, + lazy=True, + ) + + def lazy_tenant_is_attaching(): + resp = client.tenant_status(lazy_tenant) + assert resp["state"]["slug"] == "Attaching" + + # paused logical size calculation of env.initial_tenant is keeping it attaching + wait_until(10, 1, lazy_tenant_is_attaching) + + for _ in range(5): + lazy_tenant_is_attaching() + time.sleep(0.5) + + def lazy_tenant_is_active(): + resp = client.tenant_status(lazy_tenant) + assert resp["state"]["slug"] == "Active" + + if activation_method == "endpoint": + with env.endpoints.create_start("main", tenant_id=lazy_tenant): + # starting up the endpoint should make it jump the queue + wait_until(10, 1, lazy_tenant_is_active) + elif activation_method == "branch": + env.neon_cli.create_timeline("second_branch", lazy_tenant) + wait_until(10, 1, lazy_tenant_is_active) + elif activation_method == "delete": + delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) + else: + raise RuntimeError(activation_method) + + client.configure_failpoints( + [ + ("timeline-calculate-logical-size-pause", "off"), + ("walreceiver-after-ingest", "off"), + ] + ) diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py index 305271c715..dd76689008 100644 --- a/test_runner/regress/test_twophase.py +++ b/test_runner/regress/test_twophase.py @@ -13,7 +13,6 @@ def test_twophase(neon_simple_env: NeonEnv): endpoint = env.endpoints.create_start( "test_twophase", config_lines=["max_prepared_transactions=5"] ) - log.info("postgres is running on 'test_twophase' branch") conn = endpoint.connect() cur = conn.cursor() diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py index 708bf0dfeb..137d28b9fa 100644 --- a/test_runner/regress/test_unlogged.py +++ b/test_runner/regress/test_unlogged.py @@ -1,4 +1,5 @@ from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn +from fixtures.pg_version import PgVersion # @@ -17,7 +18,8 @@ def test_unlogged(neon_simple_env: 
NeonEnv):
     cur.execute("CREATE UNLOGGED TABLE iut (id int);")
     # create index to test unlogged index relation as well
     cur.execute("CREATE UNIQUE INDEX iut_idx ON iut (id);")
-    cur.execute("INSERT INTO iut values (42);")
+    cur.execute("ALTER TABLE iut ADD COLUMN seq int GENERATED ALWAYS AS IDENTITY;")
+    cur.execute("INSERT INTO iut (id) values (42);")
 
     # create another compute to fetch initial empty contents from pageserver
     fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "test_unlogged")
@@ -26,7 +28,15 @@ def test_unlogged(neon_simple_env: NeonEnv):
     conn2 = endpoint2.connect()
     cur2 = conn2.cursor()
     # after restart table should be empty but valid
-    cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut VALUES ($1)")
+    cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut (id) VALUES ($1)")
     cur2.execute("EXECUTE iut_plan (43);")
     cur2.execute("SELECT * FROM iut")
-    assert cur2.fetchall() == [(43,)]
+    results = cur2.fetchall()
+    # Unlogged sequences were introduced in v15. On <= v14, the sequence created
+    # for the GENERATED ALWAYS AS IDENTITY column is logged, and hence it keeps
+    # the old value (2) on restart. On v15 and above, it's unlogged, so it
+    # gets reset to 1.
+    if env.pg_version <= PgVersion.V14:
+        assert results == [(43, 2)]
+    else:
+        assert results == [(43, 1)]
diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py
index 415f086bd3..7272979c4a 100644
--- a/test_runner/regress/test_vm_bits.py
+++ b/test_runner/regress/test_vm_bits.py
@@ -1,6 +1,9 @@
-import pytest
+import time
+from contextlib import closing
+
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, fork_at_current_lsn
+from fixtures.utils import query_scalar
 
 
 #
@@ -13,7 +16,6 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
     env.neon_cli.create_branch("test_vm_bit_clear", "empty")
     endpoint = env.endpoints.create_start("test_vm_bit_clear")
 
-    log.info("postgres is running on 'test_vm_bit_clear' branch")
     pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 
@@ -60,7 +62,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
 
     # Clear the buffer cache, to force the VM page to be re-fetched from
     # the page server
-    cur.execute("SELECT clear_buffer_cache()")
+    endpoint.clear_shared_buffers(cursor=cur)
 
     # Check that an index-only scan doesn't see the deleted row. If the
     # clearing of the VM bit was not replayed correctly, this would incorrectly
@@ -92,7 +94,6 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
     # server at the right point-in-time avoids that full-page image.
 
     endpoint_new = env.endpoints.create_start("test_vm_bit_clear_new")
-    log.info("postgres is running on 'test_vm_bit_clear_new' branch")
 
     pg_new_conn = endpoint_new.connect()
     cur_new = pg_new_conn.cursor()
 
@@ -114,18 +115,103 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
     assert cur_new.fetchall() == []
 
 
-#
-# Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK
-# record.
-#
-# FIXME: This test is broken
-@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/6412#issuecomment-1902072541")
-def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv):
-    env = neon_simple_env
+def test_vm_bit_clear_on_heap_lock_whitebox(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK record.
 
-    env.neon_cli.create_branch("test_vm_bit_clear_on_heap_lock", "empty")
+    This is a repro for the bug fixed in commit 66fa176cc8.
+    """
+    env = neon_env_builder.init_start()
 
     endpoint = env.endpoints.create_start(
-        "test_vm_bit_clear_on_heap_lock",
+        "main",
+        config_lines=[
+            # If auto-analyze runs at the same time that we run VACUUM FREEZE, it
+            # can hold a snapshot that prevents the tuples from being frozen.
+            "autovacuum=off",
+            "log_checkpoints=on",
+        ],
+    )
+
+    # Run the tests in a dedicated database, because the activity monitor
+    # periodically runs some queries against the 'postgres' database. If that
+    # happens at the same time that we're trying to freeze, the activity
+    # monitor's queries can hold back the xmin horizon and prevent freezing.
+    with closing(endpoint.connect()) as pg_conn:
+        pg_conn.cursor().execute("CREATE DATABASE vmbitsdb")
+    pg_conn = endpoint.connect(dbname="vmbitsdb")
+    cur = pg_conn.cursor()
+
+    # Install extension containing function needed for test
+    cur.execute("CREATE EXTENSION neon_test_utils")
+    cur.execute("CREATE EXTENSION pageinspect")
+
+    # Create a test table and freeze it to set the all-frozen VM bit on all pages.
+    cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)")
+    cur.execute("BEGIN")
+    cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g")
+    xid = int(query_scalar(cur, "SELECT txid_current()"))
+    cur.execute("COMMIT")
+    cur.execute("VACUUM (FREEZE, DISABLE_PAGE_SKIPPING true, VERBOSE) vmtest_lock")
+    for notice in pg_conn.notices:
+        log.info(f"{notice}")
+
+    # This test has been flaky in the past, because background activity like
+    # auto-analyze and compute_ctl's activity monitor queries have prevented the
+    # tuples from being frozen. Check that they were frozen.
+    relfrozenxid = int(
+        query_scalar(cur, "SELECT relfrozenxid FROM pg_class WHERE relname='vmtest_lock'")
+    )
+    assert (
+        relfrozenxid > xid
+    ), f"Inserted rows were not frozen. This can be caused by concurrent activity in the database. (XID {xid}, relfrozenxid {relfrozenxid})"
+
+    # Lock a row. This clears the all-frozen VM bit for that page.
+    cur.execute("BEGIN")
+    cur.execute("SELECT * FROM vmtest_lock WHERE id = 40000 FOR UPDATE")
+    cur.execute("COMMIT")
+
+    # The VM page in the shared buffer cache, and the same page as reconstructed
+    # by the pageserver, should be equal. Except for the LSN: clearing a bit in
+    # the VM doesn't bump the LSN in PostgreSQL, but the pageserver updates the
+    # LSN when it replays the VM-bit clearing record (since commit 387a36874c).
+    #
+    # This is a bit fragile; we've had a lot of flakiness in this test before.
+    # For example, the VM bits were not all set because concurrent autoanalyze
+    # prevented the VACUUM FREEZE from freezing the tuples. Or autovacuum kicked
+    # in and re-froze the page between the get_raw_page() and
+    # get_raw_page_at_lsn() calls. We disable autovacuum now, which should make
+    # this deterministic.
+    cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )")
+    vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex()
+    cur.execute(
+        "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )"
+    )
+    vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex()
+
+    assert vm_page_at_pageserver == vm_page_in_cache
+
+
+def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder):
+    """
+    The previous test is enough to verify the bug that was fixed in
+    commit 66fa176cc8. But for good measure, we also reproduce the
+    original problem that the missing VM page update caused.
+ """ + tenant_conf = { + "checkpoint_distance": f"{128 * 1024}", + "compaction_target_size": f"{128 * 1024}", + "compaction_threshold": "1", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", + # set PITR interval to be small, so we can do GC + "pitr_interval": "0 s", + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + endpoint = env.endpoints.create_start( + "main", config_lines=[ "log_autovacuum_min_duration = 0", # Perform anti-wraparound vacuuming aggressively @@ -140,71 +226,61 @@ def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv): # Install extension containing function needed for test cur.execute("CREATE EXTENSION neon_test_utils") - cur.execute("SELECT pg_switch_wal()") - # Create a test table and freeze it to set the all-frozen VM bit on all pages. cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)") cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g") - cur.execute("VACUUM FREEZE vmtest_lock") + cur.execute("VACUUM (FREEZE, DISABLE_PAGE_SKIPPING true) vmtest_lock") # Lock a row. This clears the all-frozen VM bit for that page. + cur.execute("BEGIN") cur.execute("SELECT * FROM vmtest_lock WHERE id = 40000 FOR UPDATE") # Remember the XID. We will use it later to verify that we have consumed a lot of # XIDs after this. cur.execute("select pg_current_xact_id()") - locking_xid = cur.fetchall()[0][0] + locking_xid = int(cur.fetchall()[0][0]) - # Stop and restart postgres, to clear the buffer cache. + cur.execute("COMMIT") + + # Kill and restart postgres, to clear the buffer cache. # # NOTE: clear_buffer_cache() will not do, because it evicts the dirty pages # in a "clean" way. Our neon extension will write a full-page image of the VM - # page, and we want to avoid that. - endpoint.stop() + # page, and we want to avoid that. A clean shutdown will also not do, for the + # same reason. + endpoint.stop(mode="immediate") + endpoint.start() pg_conn = endpoint.connect() cur = pg_conn.cursor() - cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 ") - tup = cur.fetchall() - xmax_before = tup[0][1] - # Consume a lot of XIDs, so that anti-wraparound autovacuum kicks # in and the clog gets truncated. We set autovacuum_freeze_max_age to a very # low value, so it doesn't take all that many XIDs for autovacuum to kick in. - for i in range(1000): - cur.execute( - """ - CREATE TEMP TABLE othertable (i int) ON COMMIT DROP; - do $$ - begin - for i in 1..100000 loop - -- Use a begin-exception block to generate a new subtransaction on each iteration - begin - insert into othertable values (i); - exception when others then - raise 'not expected %', sqlerrm; - end; - end loop; - end; - $$; - """ - ) - cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 ") - tup = cur.fetchall() - log.info(f"tuple = {tup}") - xmax = tup[0][1] - assert xmax == xmax_before - - if i % 50 == 0: - cur.execute("select datfrozenxid from pg_database where datname='postgres'") - datfrozenxid = cur.fetchall()[0][0] - if datfrozenxid > locking_xid: - break + # + # We could use test_consume_xids() to consume XIDs much faster, + # but it wouldn't speed up the overall test, because we'd still + # need to wait for autovacuum to run. 
+ for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + for _ in range(1000): + cur.execute("select min(datfrozenxid::text::int) from pg_database") + datfrozenxid = int(cur.fetchall()[0][0]) + log.info(f"datfrozenxid {datfrozenxid} locking_xid: {locking_xid}") + if datfrozenxid > locking_xid + 3000000: + break + time.sleep(0.5) cur.execute("select pg_current_xact_id()") - curr_xid = cur.fetchall()[0][0] - assert int(curr_xid) - int(locking_xid) >= 100000 + curr_xid = int(cur.fetchall()[0][0]) + assert curr_xid - locking_xid >= 100000 + + # Perform GC in the pageserver. Otherwise the compute might still + # be able to download the already-deleted SLRU segment from the + # pageserver. That masks the original bug. + env.pageserver.http_client().timeline_checkpoint(tenant_id, timeline_id) + env.pageserver.http_client().timeline_compact(tenant_id, timeline_id) + env.pageserver.http_client().timeline_gc(tenant_id, timeline_id, 0) # Now, if the VM all-frozen bit was not correctly cleared on # replay, we will try to fetch the status of the XID that was @@ -214,3 +290,4 @@ def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv): cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 for update") tup = cur.fetchall() log.info(f"tuple = {tup}") + cur.execute("commit transaction") diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 2f8e69165e..3785651aed 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1,4 +1,5 @@ import filecmp +import logging import os import random import shutil @@ -17,18 +18,18 @@ import psycopg2 import psycopg2.errors import psycopg2.extras import pytest +import requests from fixtures.broker import NeonBroker +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( Endpoint, - NeonEnv, NeonEnvBuilder, NeonPageserver, PgBin, PgProtocol, Safekeeper, - SafekeeperHttpClient, SafekeeperPort, last_flush_lsn_upload, ) @@ -46,8 +47,15 @@ from fixtures.remote_storage import ( default_remote_storage, s3_storage, ) -from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import get_dir_size, query_scalar, start_in_background +from fixtures.safekeeper.http import SafekeeperHttpClient +from fixtures.safekeeper.utils import are_walreceivers_absent +from fixtures.utils import ( + PropagatingThread, + get_dir_size, + query_scalar, + start_in_background, + wait_until, +) def wait_lsn_force_checkpoint( @@ -61,6 +69,33 @@ def wait_lsn_force_checkpoint( lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver") + wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options) + + +def wait_lsn_force_checkpoint_at_sk( + safekeeper: Safekeeper, + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + pageserver_conn_options=None, +): + sk_flush_lsn = safekeeper.get_flush_lsn(tenant_id, timeline_id) + wait_lsn_force_checkpoint_at(sk_flush_lsn, tenant_id, timeline_id, ps, pageserver_conn_options) + + +def wait_lsn_force_checkpoint_at( + lsn: Lsn, + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + pageserver_conn_options=None, +): + """ + Wait until pageserver receives given lsn, force checkpoint and wait for + upload, i.e. remote_consistent_lsn advancement. 
+ """ + pageserver_conn_options = pageserver_conn_options or {} + auth_token = None if "password" in pageserver_conn_options: auth_token = pageserver_conn_options["password"] @@ -102,9 +137,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): n_timelines = 3 - branch_names = [ - "test_safekeepers_many_timelines_{}".format(tlin) for tlin in range(n_timelines) - ] + branch_names = [f"test_safekeepers_many_timelines_{tlin}" for tlin in range(n_timelines)] # pageserver, safekeeper operate timelines via their ids (can be represented in hex as 'ad50847381e248feaac9876cc71ae418') # that's not really human readable, so the branch names are introduced in Neon CLI. # Neon CLI stores its branch <-> timeline mapping in its internals, @@ -147,8 +180,8 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): last_record_lsn=Lsn(timeline_detail["last_record_lsn"]), ) for sk_m in sk_metrics: - m.flush_lsns.append(Lsn(sk_m.flush_lsn_inexact[(tenant_id, timeline_id)])) - m.commit_lsns.append(Lsn(sk_m.commit_lsn_inexact[(tenant_id, timeline_id)])) + m.flush_lsns.append(Lsn(int(sk_m.flush_lsn_inexact(tenant_id, timeline_id)))) + m.commit_lsns.append(Lsn(int(sk_m.commit_lsn_inexact(tenant_id, timeline_id)))) for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns): # Invariant. May be < when transaction is in progress. @@ -236,6 +269,10 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): assert max(init_m[2].flush_lsns) <= min(final_m[2].flush_lsns) < middle_lsn assert max(init_m[2].commit_lsns) <= min(final_m[2].commit_lsns) < middle_lsn + # Test timeline_list endpoint. + http_cli = env.safekeepers[0].http_client() + assert len(http_cli.timeline_list()) == 3 + # Check that dead minority doesn't prevent the commits: execute insert n_inserts # times, with fault_probability chance of getting a wal acceptor down or up @@ -280,11 +317,6 @@ def test_broker(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.neon_cli.create_branch("test_broker", "main") - # FIXME: Is this expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - endpoint = env.endpoints.create_start("test_broker") endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)") @@ -323,9 +355,9 @@ def test_broker(neon_env_builder: NeonEnvBuilder): time.sleep(1) # Ensure that safekeepers don't lose remote_consistent_lsn on restart. - # Control file is persisted each 5s. TODO: do that on shutdown and remove sleep. - time.sleep(6) for sk in env.safekeepers: + # force persist cfile + sk.http_client().checkpoint(tenant_id, timeline_id) sk.stop() sk.start() stat_after_restart = [cli.timeline_status(tenant_id, timeline_id) for cli in clients] @@ -342,11 +374,6 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled env = neon_env_builder.init_start() - # FIXME: Is this expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - tenant_id = env.initial_tenant timeline_id = env.neon_cli.create_branch("test_safekeepers_wal_removal") endpoint = env.endpoints.create_start("test_safekeepers_wal_removal") @@ -371,7 +398,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # We will wait for first segment removal. Make sure they exist for starter. 
first_segments = [ - os.path.join(sk.data_dir(), str(tenant_id), str(timeline_id), "000000010000000000000001") + sk.timeline_dir(tenant_id, timeline_id) / "000000010000000000000001" for sk in env.safekeepers ] assert all(os.path.exists(p) for p in first_segments) @@ -385,7 +412,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): http_cli_other = env.safekeepers[0].http_client( auth_token=env.auth_keys.generate_tenant_token(TenantId.generate()) ) - http_cli_noauth = env.safekeepers[0].http_client() + http_cli_noauth = env.safekeepers[0].http_client(gen_sk_wide_token=False) # Pretend WAL is offloaded to s3. if auth_enabled: @@ -456,7 +483,7 @@ def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: Tim def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb): http_cli = sk.http_client() tli_status = http_cli.timeline_status(tenant_id, timeline_id) - sk_wal_size = get_dir_size(os.path.join(sk.data_dir(), str(tenant_id), str(timeline_id))) + sk_wal_size = get_dir_size(sk.timeline_dir(tenant_id, timeline_id)) sk_wal_size_mb = sk_wal_size / 1024 / 1024 log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}") return sk_wal_size_mb <= target_size_mb @@ -602,10 +629,10 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): # save the last (partial) file to put it back after recreation; others will be fetched from s3 sk = env.safekeepers[0] - tli_dir = Path(sk.data_dir()) / str(tenant_id) / str(timeline_id) + tli_dir = Path(sk.data_dir) / str(tenant_id) / str(timeline_id) f_partial = Path([f for f in os.listdir(tli_dir) if f.endswith(".partial")][0]) f_partial_path = tli_dir / f_partial - f_partial_saved = Path(sk.data_dir()) / f_partial.name + f_partial_saved = Path(sk.data_dir) / f_partial.name f_partial_path.rename(f_partial_saved) pg_version = sk.http_client().timeline_status(tenant_id, timeline_id).pg_version @@ -627,7 +654,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): cli = sk.http_client() cli.timeline_create(tenant_id, timeline_id, pg_version, last_lsn) f_partial_path = ( - Path(sk.data_dir()) / str(tenant_id) / str(timeline_id) / f_partial_saved.name + Path(sk.data_dir) / str(tenant_id) / str(timeline_id) / f_partial_saved.name ) shutil.copy(f_partial_saved, f_partial_path) @@ -841,7 +868,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): auth_token=env.auth_keys.generate_tenant_token(TenantId.generate()) ) wa_http_cli_bad.check_status() - wa_http_cli_noauth = wa.http_client() + wa_http_cli_noauth = wa.http_client(gen_sk_wide_token=False) wa_http_cli_noauth.check_status() # debug endpoint requires safekeeper scope @@ -853,7 +880,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # fetch something sensible from status tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) - epoch = tli_status.acceptor_epoch + term = tli_status.term timeline_start_lsn = tli_status.timeline_start_lsn if auth_enabled: @@ -874,8 +901,8 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): endpoint.safe_psql("insert into t values(10)") tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) - epoch_after_reboot = tli_status.acceptor_epoch - assert epoch_after_reboot > epoch + term_after_reboot = tli_status.term + assert term_after_reboot > term # and timeline_start_lsn stays the same assert tli_status.timeline_start_lsn == timeline_start_lsn @@ -975,7 +1002,7 
@@ def test_sk_auth(neon_env_builder: NeonEnvBuilder): # By default, neon_local enables auth on all services if auth is configured, # so http must require the token. - sk_http_cli_noauth = sk.http_client() + sk_http_cli_noauth = sk.http_client(gen_sk_wide_token=False) sk_http_cli_auth = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) with pytest.raises(sk_http_cli_noauth.HTTPError, match="Forbidden|Unauthorized"): sk_http_cli_noauth.timeline_status(tenant_id, timeline_id) @@ -1107,12 +1134,6 @@ def is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id): return all([flush_lsns[0] == flsn for flsn in flush_lsns]) -def are_walreceivers_absent(sk_http_cli, tenant_id: TenantId, timeline_id: TimelineId): - status = sk_http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") - return len(status.walreceivers) == 0 - - # Assert by xxd that WAL on given safekeepers is identical. No compute must be # running for this to be reliable. def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId): @@ -1122,11 +1143,11 @@ def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: Timeline # First check that term / flush_lsn are the same: it is easier to # report/understand if WALs are different due to that. statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis] - term_flush_lsns = [(s.acceptor_epoch, s.flush_lsn) for s in statuses] + term_flush_lsns = [(s.last_log_term, s.flush_lsn) for s in statuses] for tfl, sk in zip(term_flush_lsns[1:], sks[1:]): assert ( term_flush_lsns[0] == tfl - ), f"(term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}" + ), f"(last_log_term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}" # check that WALs are identic. segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks] @@ -1149,15 +1170,15 @@ def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: Timeline ) for f in mismatch: - f1 = os.path.join(sk0.timeline_dir(tenant_id, timeline_id), f) - f2 = os.path.join(sk.timeline_dir(tenant_id, timeline_id), f) - stdout_filename = "{}.filediff".format(f2) + f1 = sk0.timeline_dir(tenant_id, timeline_id) / f + f2 = sk.timeline_dir(tenant_id, timeline_id) / f + stdout_filename = f"{f2}.filediff" with open(stdout_filename, "w") as stdout_f: - subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True) - subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True) + subprocess.run(f"xxd {f1} > {f1}.hex ", shell=True) + subprocess.run(f"xxd {f2} > {f2}.hex ", shell=True) - cmd = "diff {}.hex {}.hex".format(f1, f2) + cmd = f"diff {f1}.hex {f2}.hex" subprocess.run([cmd], stdout=stdout_f, shell=True) assert (mismatch, not_regular) == ( @@ -1294,6 +1315,8 @@ def test_lagging_sk(neon_env_builder: NeonEnvBuilder): # Check that WALs are the same. cmp_sk_wal([sk1, sk2, sk3], tenant_id, timeline_id) + env.stop(immediate=True) + # Smaller version of test_one_sk_down testing peer recovery in isolation: that # it works without compute at all. @@ -1357,6 +1380,36 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'") +# Test that when compute is terminated in fast (or smart) mode, walproposer is +# allowed to run and self terminate after shutdown checkpoint is written, so it +# commits it to safekeepers before exiting. 
This is not required for correctness,
+# but needed for tests using check_restored_datadir_content.
+def test_wp_graceful_shutdown(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    neon_env_builder.num_safekeepers = 1
+    env = neon_env_builder.init_start()
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.neon_cli.create_branch("test_wp_graceful_shutdown")
+    ep = env.endpoints.create_start("test_wp_graceful_shutdown")
+    ep.safe_psql("create table t(key int, value text)")
+    ep.stop()
+
+    # figure out checkpoint lsn
+    ckpt_lsn = pg_bin.get_pg_controldata_checkpoint_lsn(ep.pg_data_dir_path())
+
+    sk_http_cli = env.safekeepers[0].http_client()
+    commit_lsn = sk_http_cli.timeline_status(tenant_id, timeline_id).commit_lsn
+    # Note: this is an in-memory value. Graceful shutdown of walproposer currently
+    # doesn't guarantee a persisted value, which is ok as we need it only for
+    # tests. Persisting it without risking too many cf flushes needs a wp -> sk
+    # protocol change. (Though in reality shutdown sync-safekeepers does flush
+    # the cf, so most of the time the persisted value wouldn't lag.)
+    log.info(f"sk commit_lsn {commit_lsn}")
+    # note that ckpt_lsn is the *beginning* of the checkpoint record, so commit_lsn
+    # must actually be higher
+    assert commit_lsn > ckpt_lsn, "safekeeper must have checkpoint record"
+
+
 class SafekeeperEnv:
     def __init__(
         self,
@@ -1618,7 +1671,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
         with conn.cursor() as cur:
             cur.execute("CREATE TABLE t(key int primary key)")
     sk = env.safekeepers[0]
-    sk_data_dir = Path(sk.data_dir())
+    sk_data_dir = sk.data_dir
     if not auth_enabled:
         sk_http = sk.http_client()
         sk_http_other = sk_http
@@ -1627,7 +1680,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
         sk_http_other = sk.http_client(
             auth_token=env.auth_keys.generate_tenant_token(tenant_id_other)
         )
-    sk_http_noauth = sk.http_client()
+    sk_http_noauth = sk.http_client(gen_sk_wide_token=False)
     assert (sk_data_dir / str(tenant_id) / str(timeline_id_1)).is_dir()
     assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir()
     assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
@@ -1710,9 +1763,12 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
             cur.execute("INSERT INTO t (key) VALUES (123)")
 
 
-def test_pull_timeline(neon_env_builder: NeonEnvBuilder):
-    def safekeepers_guc(env: NeonEnv, sk_names: List[int]) -> str:
-        return ",".join([f"localhost:{sk.port.pg}" for sk in env.safekeepers if sk.id in sk_names])
+# Basic pull_timeline test.
+# When live_sk_change is False, compute is restarted to change the set of
+# safekeepers; otherwise it is a live reload.
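A sketch of the restart-versus-live-reload split the parametrization below exercises. `FakeEndpoint` is a hypothetical stand-in for the Endpoint fixture, reduced to the calls the test relies on; only the control flow is meaningful, none of this is the real fixture:

```python
import pytest


class FakeEndpoint:
    """Hypothetical stand-in for the Endpoint fixture; only the calls the
    test relies on are modeled here."""

    def __init__(self):
        self.safekeepers: list[int] = []
        self.running = False

    def start(self, safekeepers: list[int]):
        self.safekeepers, self.running = list(safekeepers), True

    def stop_and_destroy(self):
        self.running = False
        return self

    def create(self, branch: str):
        return self

    def reconfigure(self, safekeepers: list[int]):
        assert self.running, "live reload only applies to a running compute"
        self.safekeepers = list(safekeepers)


@pytest.mark.parametrize("live_sk_change", [False, True])
def test_switch_safekeeper_set(live_sk_change: bool):
    ep = FakeEndpoint()
    ep.start(safekeepers=[1, 2, 3])
    new_sks = [1, 3, 4]
    if not live_sk_change:
        # cold path: tear the compute down and start it on the new set
        ep.stop_and_destroy().create("main")
        ep.start(safekeepers=new_sks)
    else:
        # live path: reconfigure the running compute in place
        ep.reconfigure(safekeepers=new_sks)
    assert ep.safekeepers == new_sks
```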
+@pytest.mark.parametrize("live_sk_change", [False, True])
+def test_pull_timeline(neon_env_builder: NeonEnvBuilder, live_sk_change: bool):
+    neon_env_builder.auth_enabled = True
 
     def execute_payload(endpoint: Endpoint):
         with closing(endpoint.connect()) as conn:
@@ -1729,7 +1785,7 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder):
 
     def show_statuses(safekeepers: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId):
         for sk in safekeepers:
-            http_cli = sk.http_client()
+            http_cli = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id))
             try:
                 status = http_cli.timeline_status(tenant_id, timeline_id)
                 log.info(f"Safekeeper {sk.id} status: {status}")
@@ -1739,13 +1795,12 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.num_safekeepers = 4
     env = neon_env_builder.init_start()
     tenant_id = env.initial_tenant
-    timeline_id = env.neon_cli.create_branch("test_pull_timeline")
+    timeline_id = env.initial_timeline
 
     log.info("Use only first 3 safekeepers")
     env.safekeepers[3].stop()
-    endpoint = env.endpoints.create("test_pull_timeline")
-    endpoint.active_safekeepers = [1, 2, 3]
-    endpoint.start()
+    endpoint = env.endpoints.create("main")
+    endpoint.start(safekeepers=[1, 2, 3])
 
     execute_payload(endpoint)
     show_statuses(env.safekeepers, tenant_id, timeline_id)
@@ -1757,29 +1812,22 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder):
 
     log.info("Initialize new safekeeper 4, pull data from 1 & 3")
     env.safekeepers[3].start()
 
-    res = (
-        env.safekeepers[3]
-        .http_client()
-        .pull_timeline(
-            {
-                "tenant_id": str(tenant_id),
-                "timeline_id": str(timeline_id),
-                "http_hosts": [
-                    f"http://localhost:{env.safekeepers[0].port.http}",
-                    f"http://localhost:{env.safekeepers[2].port.http}",
-                ],
-            }
-        )
+    res = env.safekeepers[3].pull_timeline(
+        [env.safekeepers[0], env.safekeepers[2]], tenant_id, timeline_id
     )
 
     log.info("Finished pulling timeline")
     log.info(res)
 
     show_statuses(env.safekeepers, tenant_id, timeline_id)
 
-    log.info("Restarting compute with new config to verify that it works")
-    endpoint.stop_and_destroy().create("test_pull_timeline")
-    endpoint.active_safekeepers = [1, 3, 4]
-    endpoint.start()
+    action = "reconfiguring" if live_sk_change else "restarting"
+    log.info(f"{action} compute with new config to verify that it works")
+    new_sks = [1, 3, 4]
+    if not live_sk_change:
+        endpoint.stop_and_destroy().create("main")
+        endpoint.start(safekeepers=new_sks)
+    else:
+        endpoint.reconfigure(safekeepers=new_sks)
 
     execute_payload(endpoint)
     show_statuses(env.safekeepers, tenant_id, timeline_id)
@@ -1799,6 +1847,133 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder):
     show_statuses(env.safekeepers, tenant_id, timeline_id)
 
 
+# Test pull_timeline while concurrently gc'ing WAL on safekeeper:
+# 1) Start pull_timeline, listing files to fetch.
+# 2) Write segment, do gc.
+# 3) Finish pull_timeline.
+# 4) Do some write, verify integrity with timeline_digest.
+# Expected to fail until holding off WAL gc plus fetching the commit_lsn WAL
+# segment is implemented.
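The schedule above needs a worker thread whose failure is not swallowed: `pull_timeline` runs in the background while the donor is parked on a failpoint, and any error must surface when the test joins. A sketch of such a wrapper (these tests import `PropagatingThread` from `fixtures.utils`; this standalone version leans on CPython's private `Thread` attributes, a well-known recipe but not a stable API):

```python
import threading


class PropagatingThread(threading.Thread):
    """Thread whose join() re-raises any exception from the target.

    Illustrative sketch only: _target/_args/_kwargs are CPython
    implementation details, not a stable API.
    """

    def run(self):
        self.exc = None
        self.ret = None
        try:
            self.ret = self._target(*self._args, **self._kwargs)
        except BaseException as e:  # keep the exception for the joiner
            self.exc = e

    def join(self, timeout=None):
        super().join(timeout)
        if getattr(self, "exc", None) is not None:
            raise self.exc
        return getattr(self, "ret", None)
```

With this in hand, the schedule is: flip the failpoint to "pause", start the thread, do the racing writes and gc, flip the failpoint to "off", then join() to collect either the pull_timeline result or its propagated error.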
+def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.auth_enabled = True
+    neon_env_builder.num_safekeepers = 3
+    neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
+    env = neon_env_builder.init_start()
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2])
+
+    log.info("use only first 2 safekeepers, 3rd will be seeded")
+    endpoint = env.endpoints.create("main")
+    endpoint.active_safekeepers = [1, 2]
+    endpoint.start()
+    endpoint.safe_psql("create table t(key int, value text)")
+    endpoint.safe_psql("insert into t select generate_series(1, 1000), 'pear'")
+
+    src_flush_lsn = src_sk.get_flush_lsn(tenant_id, timeline_id)
+    log.info(f"flush_lsn on src before pull_timeline: {src_flush_lsn}")
+
+    src_http = src_sk.http_client()
+    # run pull_timeline which will halt before downloading files
+    src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "pause"))
+    pt_handle = PropagatingThread(
+        target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id)
+    )
+    pt_handle.start()
+    src_sk.wait_until_paused("sk-snapshot-after-list-pausable")
+
+    # ensure segment exists
+    endpoint.safe_psql("insert into t select generate_series(1, 180000), 'papaya'")
+    lsn = last_flush_lsn_upload(
+        env,
+        endpoint,
+        tenant_id,
+        timeline_id,
+        auth_token=env.auth_keys.generate_tenant_token(tenant_id),
+    )
+    assert lsn > Lsn("0/2000000")
+    # Checkpoint timeline beyond lsn.
+    src_sk.checkpoint_up_to(tenant_id, timeline_id, lsn, wait_wal_removal=False)
+    first_segment_p = src_sk.timeline_dir(tenant_id, timeline_id) / "000000010000000000000001"
+    log.info(f"first segment exists={os.path.exists(first_segment_p)}")
+
+    src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "off"))
+    pt_handle.join()
+
+    # after pull_timeline is finished, WAL should be removed on the donor
+    src_sk.checkpoint_up_to(tenant_id, timeline_id, lsn, wait_wal_removal=True)
+
+    timeline_start_lsn = src_sk.get_timeline_start_lsn(tenant_id, timeline_id)
+    dst_flush_lsn = dst_sk.get_flush_lsn(tenant_id, timeline_id)
+    log.info(f"flush_lsn on dst after pull_timeline: {dst_flush_lsn}")
+    assert dst_flush_lsn >= src_flush_lsn
+    digests = [
+        sk.http_client().timeline_digest(tenant_id, timeline_id, timeline_start_lsn, dst_flush_lsn)
+        for sk in [src_sk, dst_sk]
+    ]
+    assert digests[0] == digests[1], f"digest on src is {digests[0]} but on dst is {digests[1]}"
+
+
+# Test pull_timeline while concurrently changing term on the donor:
+# 1) Start pull_timeline, listing files to fetch.
+# 2) Change term on the donor.
+# 3) Finish pull_timeline.
+#
+# Currently (until the proper membership change procedure), we want pull_timeline
+# to fetch the log up to . This is unsafe if the term
+# changes during the procedure (unless the timeline is locked all the time, but we
+# don't want that): the recipient might end up with a mix of WAL from different
+# histories. Thus the schedule above is expected to fail. Later we'd allow
+# pull_timeline to only initialize the timeline to any valid state (up to
+# commit_lsn), holding the switch to the fully new configuration until it recovers
+# enough, so it won't be affected by term change anymore.
+#
+# Expected to fail until the term check is implemented.
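When the term changes mid-pull, the expectation below is that the whole operation aborts with an HTTP error at the join. The same propagation falls out of the stdlib executor already used earlier in this file; a sketch with a hypothetical stand-in for the rejected pull:

```python
import concurrent.futures

import pytest
import requests


def doomed_pull_timeline():
    # Hypothetical stand-in: assume the donor safekeeper rejects the request
    # once it notices its term changed mid-procedure.
    raise requests.exceptions.HTTPError("409 Conflict: donor term changed")


def test_term_change_surfaces_at_join():
    with concurrent.futures.ThreadPoolExecutor() as pool:
        fut = pool.submit(doomed_pull_timeline)
        # result() re-raises the worker's exception, so pytest.raises can
        # assert on the failure mode just like pt_handle.join() does below.
        with pytest.raises(requests.exceptions.HTTPError):
            fut.result(timeout=10)
```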
+def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder): + neon_env_builder.auth_enabled = True + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2]) + + log.info("use only first 2 safekeepers, 3rd will be seeded") + ep = env.endpoints.create("main") + ep.active_safekeepers = [1, 2] + ep.start() + ep.safe_psql("create table t(key int, value text)") + ep.safe_psql("insert into t select generate_series(1, 1000), 'pear'") + + src_http = src_sk.http_client() + # run pull_timeline which will halt before downloading files + src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "pause")) + pt_handle = PropagatingThread( + target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id) + ) + pt_handle.start() + src_sk.wait_until_paused("sk-snapshot-after-list-pausable") + + src_http = src_sk.http_client() + term_before = src_http.timeline_status(tenant_id, timeline_id).term + + # restart compute to bump term + ep.stop() + ep = env.endpoints.create("main") + ep.active_safekeepers = [1, 2] + ep.start() + ep.safe_psql("insert into t select generate_series(1, 100), 'pear'") + + term_after = src_http.timeline_status(tenant_id, timeline_id).term + assert term_after > term_before, f"term_after={term_after}, term_before={term_before}" + + src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "off")) + with pytest.raises(requests.exceptions.HTTPError): + pt_handle.join() + + # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries # when compute is active, but there are no writes to the timeline. 
In that case # pageserver should maintain a single connection to safekeeper and don't attempt @@ -1815,7 +1990,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_sk_auth_restart_endpoint") + timeline_id = env.neon_cli.create_branch("test_idle_reconnections") def collect_stats() -> Dict[str, float]: # we need to collect safekeeper_pg_queries_received_total metric from all safekeepers @@ -1846,7 +2021,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): collect_stats() - endpoint = env.endpoints.create_start("test_sk_auth_restart_endpoint") + endpoint = env.endpoints.create_start("test_idle_reconnections") # just write something to the timeline endpoint.safe_psql("create table t(i int)") collect_stats() @@ -1929,6 +2104,11 @@ def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int): log.info(f"Original digest: {orig_digest}") for sk in env.safekeepers: + wait( + partial(is_flush_lsn_caught_up, sk, tenant_id, timeline_id, lsn), + f"sk_id={sk.id} to flush {lsn}", + ) + sk.http_client().copy_timeline( tenant_id, timeline_id, @@ -1946,3 +2126,435 @@ def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int): assert orig_digest == new_digest # TODO: test timelines can start after copy + + +def test_patch_control_file(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + endpoint = env.endpoints.create_start("main") + # initialize safekeeper + endpoint.safe_psql("create table t(key int, value text)") + + # update control file + res = ( + env.safekeepers[0] + .http_client() + .patch_control_file( + tenant_id, + timeline_id, + { + "timeline_start_lsn": "0/1", + }, + ) + ) + + timeline_start_lsn_before = res["old_control_file"]["timeline_start_lsn"] + timeline_start_lsn_after = res["new_control_file"]["timeline_start_lsn"] + + log.info(f"patch_control_file response: {res}") + log.info( + f"updated control file timeline_start_lsn, before {timeline_start_lsn_before}, after {timeline_start_lsn_after}" + ) + + assert timeline_start_lsn_after == "0/1" + env.safekeepers[0].stop().start() + + # wait/check that safekeeper is alive + endpoint.safe_psql("insert into t values (1, 'payload')") + + # check that timeline_start_lsn is updated + res = ( + env.safekeepers[0] + .http_client() + .debug_dump({"dump_control_file": "true", "timeline_id": str(timeline_id)}) + ) + log.info(f"dump_control_file response: {res}") + assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1" + + +# Test disables periodic pushes from safekeeper to the broker and checks that +# pageserver can still discover safekeepers with discovery requests. 
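The discovery check below hinges on forcing reads through the pageserver: write some WAL, evict the compute's cached pages, then read. A sketch of that probe, assuming an `endpoint` that exposes the `safe_psql()` and `clear_shared_buffers()` helpers used by the fixtures in this file:

```python
import time


def roundtrip_probe(endpoint, iterations: int = 2) -> None:
    """Force WAL writes followed by pageserver reads (sketch only)."""
    for _ in range(iterations):
        time.sleep(1)
        # generate some data so WAL is committed on the safekeepers
        endpoint.safe_psql("insert into t select generate_series(1,100), 'action'")
        # drop cached pages so the next read must go through the pageserver
        endpoint.clear_shared_buffers()
        # reading now exercises the safekeeper -> pageserver -> compute path
        endpoint.safe_psql("select sum(i) from t")
```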
+def test_broker_discovery(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_broker_discovery") + + endpoint = env.endpoints.create_start( + "test_broker_discovery", + config_lines=["shared_buffers=1MB"], + ) + endpoint.safe_psql("create table t(i int, payload text)") + # Install extension containing function needed to clear buffer + endpoint.safe_psql("CREATE EXTENSION neon_test_utils") + + def do_something(): + time.sleep(1) + # generate some data to commit WAL on safekeepers + endpoint.safe_psql("insert into t select generate_series(1,100), 'action'") + # clear the buffers + endpoint.clear_shared_buffers() + # read data to fetch pages from pageserver + endpoint.safe_psql("select sum(i) from t") + + do_something() + do_something() + + for sk in env.safekeepers: + # Disable periodic broker push, so pageserver won't be able to discover + # safekeepers without sending a discovery request + sk.stop().start(extra_opts=["--disable-periodic-broker-push"]) + + do_something() + do_something() + + # restart pageserver and check how everything works + env.pageserver.stop().start() + + do_something() + do_something() + + +# Test creates 5 endpoints and tries to wake them up randomly. All timeouts are +# configured to be very short, so that we expect that: +# - pageserver will update remote_consistent_lsn very often +# - safekeepers will upload partial WAL segments very often +# - safekeeper will try to evict and unevict timelines +# +# Test checks that there are no critical errors while doing this. Also it checks +# that every safekeeper has at least one successful eviction. +@pytest.mark.parametrize("delete_offloaded_wal", [False, True]) +@pytest.mark.parametrize("restart_chance", [0.0, 0.2]) +def test_s3_eviction( + neon_env_builder: NeonEnvBuilder, delete_offloaded_wal: bool, restart_chance: float +): + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) + + neon_env_builder.safekeeper_extra_opts = [ + "--enable-offload", + "--partial-backup-timeout", + "50ms", + "--control-file-save-interval", + "1s", + # Safekeepers usually wait a while before evicting something: for this test we want them to + # evict things as soon as they are inactive. 
+        "--eviction-min-resident=100ms",
+    ]
+    if delete_offloaded_wal:
+        neon_env_builder.safekeeper_extra_opts.append("--delete-offloaded-wal")
+
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "checkpoint_timeout": "100ms",
+        }
+    )
+
+    n_timelines = 5
+
+    branch_names = [f"branch{tlin}" for tlin in range(n_timelines)]
+    timelines = []
+    ps_client = env.pageservers[0].http_client()
+
+    # start postgres on each timeline
+    endpoints: list[Endpoint] = []
+    for branch_name in branch_names:
+        timeline_id = env.neon_cli.create_branch(branch_name)
+        timelines.append(timeline_id)
+
+        endpoints.append(env.endpoints.create_start(branch_name))
+        endpoints[-1].safe_psql("CREATE TABLE t(i int)")
+        endpoints[-1].safe_psql("INSERT INTO t VALUES (0)")
+
+        lsn = endpoints[-1].safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]
+        log.info(f"{branch_name}: LSN={lsn}")
+
+        endpoints[-1].stop()
+
+        # update remote_consistent_lsn on pageserver
+        ps_client.timeline_checkpoint(env.initial_tenant, timelines[-1], wait_until_uploaded=True)
+
+    check_values = [0] * n_timelines
+
+    event_metrics_seen = False
+
+    n_iters = 20
+    for _ in range(n_iters):
+        if log.isEnabledFor(logging.DEBUG):
+            for j in range(n_timelines):
+                detail = ps_client.timeline_detail(env.initial_tenant, timelines[j])
+                log.debug(
+                    f'{branch_names[j]}: RCL={detail["remote_consistent_lsn"]}, LRL={detail["last_record_lsn"]}'
+                )
+
+        i = random.randint(0, n_timelines - 1)
+        log.info(f"Starting endpoint {i}")
+        endpoints[i].start()
+        check_values[i] += 1
+        res = endpoints[i].safe_psql("UPDATE t SET i = i + 1 RETURNING i")
+        assert res[0][0] == check_values[i]
+
+        lsn = endpoints[i].safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]
+        log.info(f"{branch_names[i]}: LSN={lsn}")
+
+        endpoints[i].stop()
+
+        # update remote_consistent_lsn on pageserver
+        ps_client.timeline_checkpoint(env.initial_tenant, timelines[i], wait_until_uploaded=True)
+
+        # Do metrics check before restarts, since these will reset to zero across a restart.
+        # Note the parentheses in the helper: get_metric_value() returns None until the
+        # counter is registered, and a bare `metric or 0 > 0` would parse as
+        # `metric or (0 > 0)`, effectively checking only the first counter.
+        def eviction_metric_positive(sk, name: str, kind: str) -> bool:
+            return (sk.http_client().get_metric_value(name, {"kind": kind}) or 0) > 0
+
+        event_metrics_seen |= any(
+            eviction_metric_positive(sk, "safekeeper_eviction_events_started_total", "evict")
+            and eviction_metric_positive(sk, "safekeeper_eviction_events_completed_total", "evict")
+            and eviction_metric_positive(sk, "safekeeper_eviction_events_started_total", "restore")
+            and eviction_metric_positive(sk, "safekeeper_eviction_events_completed_total", "restore")
+            for sk in env.safekeepers
+        )
+
+        # restarting random safekeepers
+        for sk in env.safekeepers:
+            if random.random() < restart_chance:
+                sk.stop().start()
+        time.sleep(0.5)
+
+    # require at least one successful eviction in at least one safekeeper
+    # TODO: require eviction in each safekeeper after https://github.com/neondatabase/neon/issues/8148 is fixed
+    assert any(
+        sk.log_contains("successfully evicted timeline")
+        and sk.log_contains("successfully restored evicted timeline")
+        for sk in env.safekeepers
+    )
+
+    assert event_metrics_seen
+
+
+# Test resetting uploaded partial segment state.
+def test_backup_partial_reset(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 1
+    neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
+    # We want to upload/evict quickly, but not too quickly, so that we can check
+    # that s3 is empty before the next round of upload happens.
+    # Note: this test fails with --delete-offloaded-wal; this is expected.
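Several tests below block on the eviction/restore counters rather than asserting them once. A sketch of a condition builder for that, assuming the `get_metric_value(name, labels)` helper used throughout this file (which returns None until the counter is first registered, hence the `(n or 0) > 0` guard, the same precedence trap fixed in the `event_metrics_seen` expression above):

```python
def eviction_event_completed(http_client, kind: str):
    """Build a wait_until condition that passes once at least one eviction
    event of `kind` ("evict" or "restore") has completed.

    Sketch only: http_client is assumed to expose get_metric_value() like
    the safekeeper HTTP client fixtures in this file.
    """

    def condition():
        n = http_client.get_metric_value(
            "safekeeper_eviction_events_completed_total", {"kind": kind}
        )
        assert (n or 0) > 0, f"no completed {kind} events yet"

    return condition


# usage: wait_until(30, 1, eviction_event_completed(sk.http_client(), "evict"))
```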
+    neon_env_builder.safekeeper_extra_opts = [
+        "--enable-offload",
+        "--partial-backup-timeout",
+        "1s",
+        "--control-file-save-interval",
+        "1s",
+        "--eviction-min-resident=1s",
+    ]
+    # XXX: pageserver currently connects to safekeeper as long as connection
+    # manager doesn't remove its entry (default lagging_wal_timeout is 10s),
+    # causing uneviction. It should be fixed to not reconnect if the last
+    # remote_consistent_lsn is communicated and there is nothing to fetch. Make
+    # the value lower to speed up the test.
+    initial_tenant_conf = {
+        "lagging_wal_timeout": "1s",
+    }
+    env = neon_env_builder.init_start(initial_tenant_conf)
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    endpoint = env.endpoints.create("main")
+    endpoint.start()
+    endpoint.safe_psql("create table t(key int, value text)")
+    endpoint.stop()
+    sk = env.safekeepers[0]
+    # eviction won't happen until remote_consistent_lsn catches up.
+    wait_lsn_force_checkpoint_at_sk(sk, tenant_id, timeline_id, env.pageserver)
+
+    http_cli = env.safekeepers[0].http_client()
+
+    # wait until eviction happens
+    def evicted():
+        eviction_state = http_cli.get_eviction_state(timeline_id)
+        log.info(f"eviction_state: {eviction_state}")
+        if isinstance(eviction_state, str) and eviction_state == "Present":
+            raise Exception("eviction didn't happen yet")
+
+    wait_until(30, 1, evicted)
+    # it must have uploaded something
+    uploaded_segs = sk.list_uploaded_segments(tenant_id, timeline_id)
+    log.info(f"uploaded segments before reset: {uploaded_segs}")
+    assert len(uploaded_segs) > 0
+
+    reset_res = http_cli.backup_partial_reset(tenant_id, timeline_id)
+    log.info(f"reset res: {reset_res}")
+
+    # backup_partial_reset must have reset the state and dropped the s3 segment.
+    #
+    # Note: if listing takes more than --partial-backup-timeout, the test becomes
+    # flaky because the file might be re-uploaded. With local fs it shouldn't be
+    # an issue, but we can add a retry if this appears.
+    uploaded_segs = sk.list_uploaded_segments(tenant_id, timeline_id)
+    log.info(f"uploaded segments after reset: {uploaded_segs}")
+    assert len(uploaded_segs) == 0
+
+    # calling it a second time should be ok
+    http_cli.backup_partial_reset(tenant_id, timeline_id)
+
+    # inserting data should be ok
+    endpoint.start()
+    endpoint.safe_psql("insert into t values(1, 'hehe')")
+
+
+def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilder):
+    """
+    Verify that pulling a timeline from a SK with an uploaded partial segment
+    does not lead to consistency issues:
+    1. Start 3 SKs - only use two
+    2. Ingest a bit of WAL
+    3. Wait for partial to be uploaded
+    4. Pull timeline to the third SK
+    5. Wait for source SK to evict timeline
+    6. Replace source with destination SK and start compute
+    7. Go back to initial compute SK config and validate that
+       source SK can unevict the timeline (S3 state is consistent)
+    """
+    neon_env_builder.auth_enabled = True
+    neon_env_builder.num_safekeepers = 3
+    neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
+
+    neon_env_builder.safekeeper_extra_opts = [
+        "--enable-offload",
+        "--delete-offloaded-wal",
+        "--partial-backup-timeout",
+        "500ms",
+        "--control-file-save-interval",
+        "500ms",
+        "--eviction-min-resident=500ms",
+    ]
+
+    # XXX: pageserver currently connects to safekeeper as long as connection
+    # manager doesn't remove its entry (default lagging_wal_timeout is 10s),
+    # causing uneviction. 
It should be fixed to not reconnect if last + # remote_consistent_lsn is communicated and there is nothing to fetch. Until + # this is fixed make value lower to speed up the test. + initial_tenant_conf = { + "lagging_wal_timeout": "1s", + "checkpoint_timeout": "100ms", + } + env = neon_env_builder.init_start(initial_tenant_conf=initial_tenant_conf) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2]) + + log.info("use only first 2 safekeepers, 3rd will be seeded") + endpoint = env.endpoints.create("main") + endpoint.active_safekeepers = [1, 2] + endpoint.start() + endpoint.safe_psql("create table t(key int, value text)") + endpoint.safe_psql("insert into t select generate_series(1, 180000), 'papaya'") + + endpoint.stop() + + def source_partial_segment_uploaded(): + first_segment_name = "000000010000000000000001" + segs = src_sk.list_uploaded_segments(tenant_id, timeline_id) + + candidate_seg = None + for seg in segs: + if "partial" in seg and "sk1" in seg and not seg.startswith(first_segment_name): + candidate_seg = seg + + if candidate_seg is not None: + # The term might change, causing the segment to be gc-ed shortly after, + # so give it a bit of time to make sure it's stable. + time.sleep(2) + + segs = src_sk.list_uploaded_segments(tenant_id, timeline_id) + assert candidate_seg in segs + return candidate_seg + + raise Exception("Partial segment not uploaded yet") + + source_partial_segment = wait_until(15, 1, source_partial_segment_uploaded) + log.info( + f"Uploaded segments before pull are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" + ) + log.info(f"Tracking source partial segment: {source_partial_segment}") + + src_flush_lsn = src_sk.get_flush_lsn(tenant_id, timeline_id) + log.info(f"flush_lsn on src before pull_timeline: {src_flush_lsn}") + + pageserver_conn_options = {"password": env.auth_keys.generate_tenant_token(tenant_id)} + wait_lsn_force_checkpoint_at( + src_flush_lsn, tenant_id, timeline_id, env.pageserver, pageserver_conn_options + ) + + dst_sk.pull_timeline([src_sk], tenant_id, timeline_id) + + def evicted(): + evictions = src_sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "evict"} + ) + + if evictions is None or evictions == 0: + raise Exception("Eviction did not happen on source safekeeper yet") + + wait_until(30, 1, evicted) + + endpoint.start(safekeepers=[2, 3]) + + def new_partial_segment_uploaded(): + segs = dst_sk.list_uploaded_segments(tenant_id, timeline_id) + for seg in segs: + if "partial" in seg and "sk3" in seg: + return seg + + raise Exception("Partial segment not uploaded yet") + + log.info( + f"Uploaded segments before post-pull ingest are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" + ) + + endpoint.safe_psql("insert into t select generate_series(1, 1000), 'pear'") + wait_until(15, 1, new_partial_segment_uploaded) + + log.info( + f"Uploaded segments after post-pull ingest are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" + ) + + # Allow for some gc iterations to happen and assert that the original + # uploaded partial segment remains in place. 
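+    # (Partial segment uploads are named per safekeeper - the source's object
+    # name contains "sk1", the destination's contains "sk3" - so gc on one
+    # safekeeper must not remove the other's upload.)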
+ time.sleep(5) + segs = src_sk.list_uploaded_segments(tenant_id, timeline_id) + assert source_partial_segment in segs + + log.info( + f"Uploaded segments at the end are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" + ) + + # Restart the endpoint in order to check that the source safekeeper + # can unevict the timeline + endpoint.stop() + endpoint.start(safekeepers=[1, 2]) + + def unevicted(): + unevictions = src_sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "restore"} + ) + + if unevictions is None or unevictions == 0: + raise Exception("Uneviction did not happen on source safekeeper yet") + + wait_until(10, 1, unevicted) diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 77d67cd63a..3f0a4a2ff8 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -8,9 +8,10 @@ from typing import List, Optional import asyncpg import pytest import toml +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import getLogger from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper -from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.remote_storage import RemoteStorageKind log = getLogger("root.safekeeper_async") @@ -76,20 +77,20 @@ class WorkerStats(object): self.counters[worker_id] += 1 def check_progress(self): - log.debug("Workers progress: {}".format(self.counters)) + log.debug(f"Workers progress: {self.counters}") # every worker should finish at least one tx assert all(cnt > 0 for cnt in self.counters) progress = sum(self.counters) - log.info("All workers made {} transactions".format(progress)) + log.info(f"All workers made {progress} transactions") async def run_random_worker( stats: WorkerStats, endpoint: Endpoint, worker_id, n_accounts, max_transfer ): pg_conn = await endpoint.connect_async() - log.debug("Started worker {}".format(worker_id)) + log.debug(f"Started worker {worker_id}") while stats.running: from_uid = random.randint(0, n_accounts - 1) @@ -99,9 +100,9 @@ async def run_random_worker( await bank_transfer(pg_conn, from_uid, to_uid, amount) stats.inc_progress(worker_id) - log.debug("Executed transfer({}) {} => {}".format(amount, from_uid, to_uid)) + log.debug(f"Executed transfer({amount}) {from_uid} => {to_uid}") - log.debug("Finished worker {}".format(worker_id)) + log.debug(f"Finished worker {worker_id}") await pg_conn.close() @@ -199,7 +200,8 @@ async def run_restarts_under_load( # assert that at least one transaction has completed in every worker stats.check_progress() - victim.start() + # testing #6530 + victim.start(extra_opts=["--partial-backup-timeout=2s"]) log.info("Iterations are finished, exiting coroutines...") stats.running = False @@ -213,6 +215,7 @@ async def run_restarts_under_load( # Restart acceptors one by one, while executing and validating bank transactions def test_restarts_under_load(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() env.neon_cli.create_branch("test_safekeepers_restarts_under_load") @@ -250,7 +253,9 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): ) -def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): +def endpoint_create_start( + env: NeonEnv, branch: str, pgdir_name: Optional[str], allow_multiple: bool = False 
+):
     endpoint = Endpoint(
         env,
         tenant_id=env.initial_tenant,
@@ -264,14 +269,23 @@ def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]):
     # embed current time in endpoint ID
     endpoint_id = pgdir_name or f"ep-{time.time()}"
     return endpoint.create_start(
-        branch_name=branch, endpoint_id=endpoint_id, config_lines=["log_statement=all"]
+        branch_name=branch,
+        endpoint_id=endpoint_id,
+        config_lines=["log_statement=all"],
+        allow_multiple=allow_multiple,
     )
 
 
 async def exec_compute_query(
-    env: NeonEnv, branch: str, query: str, pgdir_name: Optional[str] = None
+    env: NeonEnv,
+    branch: str,
+    query: str,
+    pgdir_name: Optional[str] = None,
+    allow_multiple: bool = False,
 ):
-    with endpoint_create_start(env, branch=branch, pgdir_name=pgdir_name) as endpoint:
+    with endpoint_create_start(
+        env, branch=branch, pgdir_name=pgdir_name, allow_multiple=allow_multiple
+    ) as endpoint:
         before_conn = time.time()
         conn = await endpoint.connect_async()
         res = await conn.fetch(query)
@@ -343,6 +357,7 @@ class BackgroundCompute(object):
                 self.branch,
                 f"INSERT INTO query_log(index, verify_key) VALUES ({self.index}, {verify_key}) RETURNING verify_key",
                 pgdir_name=f"bgcompute{self.index}_key{verify_key}",
+                allow_multiple=True,
             )
             log.info(f"result: {res}")
             if len(res) != 1:
@@ -515,6 +530,103 @@ def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder):
     asyncio.run(run_recovery_uncommitted(env))
 
 
+async def run_wal_truncation(env: NeonEnv):
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    (sk1, sk2, sk3) = env.safekeepers
+
+    ep = env.endpoints.create_start("main")
+    ep.safe_psql("create table t (key int, value text)")
+    ep.safe_psql("insert into t select generate_series(1, 100), 'payload'")
+
+    # insert with only sk3 up to create a tail of flushed but not committed WAL on it
+    sk1.stop()
+    sk2.stop()
+    conn = await ep.connect_async()
+    # query should hang, so execute in separate task
+    bg_query = asyncio.create_task(
+        conn.execute("insert into t select generate_series(1, 180000), 'Papaya'")
+    )
+    sleep_sec = 2
+    await asyncio.sleep(sleep_sec)
+    # it must not have finished yet
+    assert not bg_query.done()
+    # note: destroy will kill compute_ctl, preventing it from waiting for the hanging sync-safekeepers.
+    ep.stop_and_destroy()
+
+    # stop sk3 as well
+    sk3.stop()
+
+    # now start sk1 and sk2 and make them commit something
+    sk1.start()
+    sk2.start()
+    ep = env.endpoints.create_start(
+        "main",
+    )
+    ep.safe_psql("insert into t select generate_series(1, 200), 'payload'")
+
+    # start sk3 and wait for it to catch up
+    sk3.start()
+    flush_lsn = Lsn(ep.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()"))
+    await wait_for_lsn(sk3, tenant_id, timeline_id, flush_lsn)
+
+    timeline_start_lsn = sk1.get_timeline_start_lsn(tenant_id, timeline_id)
+    digests = [
+        sk.http_client().timeline_digest(tenant_id, timeline_id, timeline_start_lsn, flush_lsn)
+        for sk in [sk1, sk2]
+    ]
+    assert digests[0] == digests[1], f"digest on sk1 is {digests[0]} but on sk2 is {digests[1]}"
+
+
+# Simple deterministic test creating a tail of WAL on one safekeeper which is
+# truncated when a majority not including this sk elects a walproposer that started earlier.
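+# (sk3 flushes WAL that never becomes committed; after sk1 and sk2 elect a new
+# walproposer and commit more WAL, sk3's divergent tail is discarded when it
+# catches up, and the digest comparison above checks that sk1 and sk2 serve
+# identical WAL over the whole range.)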
+def test_wal_truncation(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + asyncio.run(run_wal_truncation(env)) + + +async def run_segment_init_failure(env: NeonEnv): + env.neon_cli.create_branch("test_segment_init_failure") + ep = env.endpoints.create_start("test_segment_init_failure") + ep.safe_psql("create table t(key int, value text)") + ep.safe_psql("insert into t select generate_series(1, 100), 'payload'") + + sk = env.safekeepers[0] + sk_http = sk.http_client() + sk_http.configure_failpoints([("sk-write-zeroes", "return")]) + conn = await ep.connect_async() + ep.safe_psql("select pg_switch_wal()") # jump to the segment boundary + # next insertion should hang until failpoint is disabled. + bg_query = asyncio.create_task( + conn.execute("insert into t select generate_series(1,1), 'payload'") + ) + sleep_sec = 2 + await asyncio.sleep(sleep_sec) + # it must still be not finished + assert not bg_query.done() + # Also restart ep at segment boundary to make test more interesting. Do it in immediate mode; + # fast will hang because it will try to gracefully finish sending WAL. + ep.stop(mode="immediate") + # Without segment rename during init (#6402) previous statement created + # partially initialized 16MB segment, so sk restart also triggers #6401. + sk.stop().start() + ep = env.endpoints.create_start("test_segment_init_failure") + ep.safe_psql("insert into t select generate_series(1,1), 'payload'") # should be ok now + + +# Test (injected) failure during WAL segment init. +# https://github.com/neondatabase/neon/issues/6401 +# https://github.com/neondatabase/neon/issues/6402 +def test_segment_init_failure(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + asyncio.run(run_segment_init_failure(env)) + + @dataclass class RaceConditionTest: iteration: int diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index 7ac6e6332c..229d3efd8e 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -1,8 +1,9 @@ import time +from typing import Any, Dict +from fixtures.common_types import Lsn, TenantId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder -from fixtures.types import Lsn, TenantId # Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout. @@ -42,10 +43,14 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder): # Kills one of the safekeepers and ensures that only the active ones are printed in the state. 
def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder): # Trigger WAL wait timeout faster - neon_env_builder.pageserver_config_override = """ - wait_lsn_timeout = "1s" - tenant_config={walreceiver_connect_timeout = "2s", lagging_wal_timeout = "2s"} - """ + def customize_pageserver_toml(ps_cfg: Dict[str, Any]): + ps_cfg["wait_lsn_timeout"] = "1s" + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config["walreceiver_connect_timeout"] = "2s" + tenant_config["lagging_wal_timeout"] = "2s" + + neon_env_builder.pageserver_config_override = customize_pageserver_toml + # Have notable SK ids to ensure we check logs for their presence, not some other random numbers neon_env_builder.safekeepers_id_start = 12345 neon_env_builder.num_safekeepers = 3 @@ -57,6 +62,12 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil elements_to_insert = 1_000_000 expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive" env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*") + # we configure wait_lsn_timeout to a shorter value than the lagging_wal_timeout / walreceiver_connect_timeout + # => after we run into a timeout and reconnect to a different SK, more time than wait_lsn_timeout has passed + # ==> we log this error + env.pageserver.allowed_errors.append( + ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*" + ) insert_test_elements(env, tenant_id, start=0, count=elements_to_insert) diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 97db857c74..01a1d5cf55 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -2,19 +2,27 @@ import sys import tarfile import tempfile from pathlib import Path +from typing import List import pytest import zstandard +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, VanillaPostgres, ) -from fixtures.pageserver.utils import timeline_delete_wait_completed +from fixtures.pageserver.utils import ( + list_prefix, + remote_storage_delete_key, + timeline_delete_wait_completed, +) from fixtures.port_distributor import PortDistributor -from fixtures.remote_storage import LocalFsStorage -from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.remote_storage import LocalFsStorage, S3Storage, s3_storage +from mypy_boto3_s3.type_defs import ( + ObjectTypeDef, +) @pytest.mark.skipif( @@ -128,7 +136,11 @@ def test_wal_restore_initdb( assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)] -def test_wal_restore_http(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("broken_tenant", [True, False]) +def test_wal_restore_http(neon_env_builder: NeonEnvBuilder, broken_tenant: bool): + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + env = neon_env_builder.init_start() endpoint = env.endpoints.create_start("main") endpoint.safe_psql("create table t as select generate_series(1,300000)") @@ -137,15 +149,36 @@ def test_wal_restore_http(neon_env_builder: NeonEnvBuilder): ps_client = env.pageserver.http_client() + if broken_tenant: + env.pageserver.allowed_errors.append( + r".* Changing Active tenant to Broken state, reason: broken from test" + ) + ps_client.tenant_break(tenant_id) + # Mark the initdb archive for preservation 
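+    # (The preserved archive is what the timeline_create restore call at the
+    # end of this test recovers the timeline from after its data is deleted.)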
     ps_client.timeline_preserve_initdb_archive(tenant_id, timeline_id)
 
     # shut down the endpoint and delete the timeline from the pageserver
     endpoint.stop()
-    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
+    assert isinstance(env.pageserver_remote_storage, S3Storage)
 
-    timeline_delete_wait_completed(ps_client, tenant_id, timeline_id)
+    if broken_tenant:
+        ps_client.tenant_detach(tenant_id)
+        objects: List[ObjectTypeDef] = list_prefix(
+            env.pageserver_remote_storage, f"tenants/{tenant_id}/timelines/{timeline_id}/"
+        ).get("Contents", [])
+        for obj in objects:
+            obj_key = obj["Key"]
+            if "initdb-preserved.tar.zst" in obj_key:
+                continue
+            log.info(f"Deleting key from remote storage: {obj_key}")
+            remote_storage_delete_key(env.pageserver_remote_storage, obj_key)
+
+        ps_client.tenant_attach(tenant_id, generation=10)
+    else:
+        timeline_delete_wait_completed(ps_client, tenant_id, timeline_id)
 
     # issue the restoration command
     ps_client.timeline_create(
diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py
index 13159efbe8..375cfcb4fe 100644
--- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py
+++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py
@@ -2,10 +2,10 @@ import time
 
 import psutil
 import pytest
+from fixtures.common_types import TenantId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
 from fixtures.pageserver.http import PageserverApiException
-from fixtures.types import TenantId
 
 
 def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_present=False):
@@ -37,7 +37,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
         expected_exception=PageserverApiException,
         match=f"NotFound: tenant {tenant_id}",
     ):
-        pageserver_http.tenant_detach(tenant_id)
+        pageserver_http.tenant_status(tenant_id)
 
     # create new tenant
     tenant_id, _ = env.neon_cli.create_tenant()
diff --git a/test_runner/sql_regress/expected/neon-test-utils.out b/test_runner/sql_regress/expected/neon-test-utils.out
new file mode 100644
index 0000000000..7d1634a6b8
--- /dev/null
+++ b/test_runner/sql_regress/expected/neon-test-utils.out
@@ -0,0 +1,28 @@
+-- Test the test utils in pgxn/neon_test_utils. We don't test that
+-- these actually consume resources like they should - that would be
+-- tricky - but at least we check that they don't crash.
+CREATE EXTENSION neon_test_utils;
+select test_consume_cpu(1);
+ test_consume_cpu
+------------------
+
+(1 row)
+
+select test_consume_memory(20); -- Allocate 20 MB
+ test_consume_memory
+---------------------
+
+(1 row)
+
+select test_release_memory(5); -- Release 5 MB
+ test_release_memory
+---------------------
+
+(1 row)
+
+select test_release_memory(); -- Release the remaining 15 MB
+ test_release_memory
+---------------------
+
+(1 row)
+
diff --git a/test_runner/sql_regress/parallel_schedule b/test_runner/sql_regress/parallel_schedule
index 569c7b5066..d9508d1c90 100644
--- a/test_runner/sql_regress/parallel_schedule
+++ b/test_runner/sql_regress/parallel_schedule
@@ -7,4 +7,5 @@ test: neon-cid
 test: neon-rel-truncate
 test: neon-clog
+test: neon-test-utils
 test: neon-vacuum-full
diff --git a/test_runner/sql_regress/sql/neon-test-utils.sql b/test_runner/sql_regress/sql/neon-test-utils.sql
new file mode 100644
index 0000000000..c5ca6c624b
--- /dev/null
+++ b/test_runner/sql_regress/sql/neon-test-utils.sql
@@ -0,0 +1,11 @@
+-- Test the test utils in pgxn/neon_test_utils. We don't test that
+-- these actually consume resources like they should - that would be
+-- tricky - but at least we check that they don't crash.
+
+CREATE EXTENSION neon_test_utils;
+
+select test_consume_cpu(1);
+
+select test_consume_memory(20); -- Allocate 20 MB
+select test_release_memory(5); -- Release 5 MB
+select test_release_memory(); -- Release the remaining 15 MB
diff --git a/trace/Cargo.toml b/trace/Cargo.toml
deleted file mode 100644
index d6eed3f49c..0000000000
--- a/trace/Cargo.toml
+++ /dev/null
@@ -1,13 +0,0 @@
-[package]
-name = "trace"
-version = "0.1.0"
-edition.workspace = true
-license.workspace = true
-
-[dependencies]
-clap.workspace = true
-anyhow.workspace = true
-
-pageserver_api.workspace = true
-utils.workspace = true
-workspace_hack.workspace = true
diff --git a/trace/src/main.rs b/trace/src/main.rs
deleted file mode 100644
index ddd970e95d..0000000000
--- a/trace/src/main.rs
+++ /dev/null
@@ -1,172 +0,0 @@
-//! A tool for working with read traces generated by the pageserver.
-use std::collections::HashMap;
-use std::path::PathBuf;
-use std::str::FromStr;
-use std::{
-    fs::{read_dir, File},
-    io::BufReader,
-};
-
-use pageserver_api::models::{PagestreamFeMessage, PagestreamGetPageRequest};
-use utils::id::{ConnectionId, TenantId, TimelineId};
-
-use clap::{Parser, Subcommand};
-
-/// Utils for working with pageserver read traces. For generating
-/// traces, see the `trace_read_requests` tenant config option.
-#[derive(Parser, Debug)]
-#[command(author, version, about, long_about = None)]
-struct Args {
-    /// Path of trace directory
-    #[arg(short, long)]
-    path: PathBuf,
-
-    #[command(subcommand)]
-    command: Command,
-}
-
-/// What to do with the read trace
-#[derive(Subcommand, Debug)]
-enum Command {
-    /// List traces in the directory
-    List,
-
-    /// Print the traces in text format
-    Dump,
-
-    /// Print stats and anomalies about the traces
-    Analyze,
-
-    /// Draw the traces in svg format
-    Draw,
-
-    /// Send the read requests to a pageserver
-    Replay,
-}
-
-// HACK This function will change and improve as we see what kind of analysis is useful.
-// Currently it collects the difference in blkno of consecutive GetPage requests,
-// and counts the frequency of each value. This information is useful in order to:
-// - see how sequential a workload is by seeing how often the delta is 1
-// - detect any prefetching anomalies by looking for negative deltas during seqscan
-fn analyze_trace<R: std::io::Read>(mut reader: R) {
-    let mut total = 0; // Total requests traced
-    let mut cross_rel = 0; // Requests that ask for different rel than previous request
-    let mut deltas = HashMap::<i32, u32>::new(); // Consecutive blkno differences
-    let mut prev: Option<PagestreamGetPageRequest> = None;
-
-    // Compute stats
-    while let Ok(msg) = PagestreamFeMessage::parse(&mut reader) {
-        match msg {
-            PagestreamFeMessage::Exists(_) => {}
-            PagestreamFeMessage::Nblocks(_) => {}
-            PagestreamFeMessage::GetPage(req) => {
-                total += 1;
-
-                if let Some(prev) = prev {
-                    if prev.rel == req.rel {
-                        let delta = (req.blkno as i32) - (prev.blkno as i32);
-                        deltas.entry(delta).and_modify(|c| *c += 1).or_insert(1);
-                    } else {
-                        cross_rel += 1;
-                    }
-                }
-                prev = Some(req);
-            }
-            PagestreamFeMessage::DbSize(_) => {}
-        };
-    }
-
-    // Print stats.
-    let mut other = deltas.len();
-    deltas.retain(|_, count| *count > 300);
-    other -= deltas.len();
-    dbg!(total);
-    dbg!(cross_rel);
-    dbg!(other);
-    dbg!(deltas);
-}
-
-fn dump_trace<R: std::io::Read>(mut reader: R) {
-    while let Ok(msg) = PagestreamFeMessage::parse(&mut reader) {
-        println!("{msg:?}");
-    }
-}
-
-#[derive(Debug)]
-struct TraceFile {
-    #[allow(dead_code)]
-    pub tenant_id: TenantId,
-
-    #[allow(dead_code)]
-    pub timeline_id: TimelineId,
-
-    #[allow(dead_code)]
-    pub connection_id: ConnectionId,
-
-    pub path: PathBuf,
-}
-
-fn get_trace_files(traces_dir: &PathBuf) -> anyhow::Result<Vec<TraceFile>> {
-    let mut trace_files = Vec::<TraceFile>::new();
-
-    // Trace files are organized as {tenant_id}/{timeline_id}/{connection_id}
-    for tenant_dir in read_dir(traces_dir)? {
-        let entry = tenant_dir?;
-        let path = entry.path();
-        let tenant_id = TenantId::from_str(path.file_name().unwrap().to_str().unwrap())?;
-
-        for timeline_dir in read_dir(path)? {
-            let entry = timeline_dir?;
-            let path = entry.path();
-            let timeline_id = TimelineId::from_str(path.file_name().unwrap().to_str().unwrap())?;
-
-            for trace_dir in read_dir(path)? {
-                let entry = trace_dir?;
-                let path = entry.path();
-                let connection_id =
-                    ConnectionId::from_str(path.file_name().unwrap().to_str().unwrap())?;
-
-                trace_files.push(TraceFile {
-                    tenant_id,
-                    timeline_id,
-                    connection_id,
-                    path,
-                });
-            }
-        }
-    }
-
-    Ok(trace_files)
-}
-
-fn main() -> anyhow::Result<()> {
-    let args = Args::parse();
-
-    match args.command {
-        Command::List => {
-            for trace_file in get_trace_files(&args.path)? {
-                println!("{trace_file:?}");
-            }
-        }
-        Command::Dump => {
-            for trace_file in get_trace_files(&args.path)? {
-                let file = File::open(trace_file.path.clone())?;
-                let reader = BufReader::new(file);
-                dump_trace(reader);
-            }
-        }
-        Command::Analyze => {
-            for trace_file in get_trace_files(&args.path)?
{ - println!("analyzing {trace_file:?}"); - let file = File::open(trace_file.path.clone())?; - let reader = BufReader::new(file); - analyze_trace(reader); - } - } - Command::Draw => todo!(), - Command::Replay => todo!(), - } - - Ok(()) -} diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 11e970fe2b..a317b9b5b9 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 11e970fe2be56804f0a786ec5fc8141ffefa4ca7 +Subproject commit a317b9b5b96978b49e78986697f3dd80d06f99a7 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 731b4d1609..49d5e576a5 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 731b4d1609d6db1c953755810a41e0e67ea3db7b +Subproject commit 49d5e576a56e4cc59cd6a6a0791b2324b9fa675e diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index cf302768b2..6e9a4ff624 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit cf302768b2890569956641e0e5ba112ae1445351 +Subproject commit 6e9a4ff6249ac02b8175054b7b3f7dfb198be48b diff --git a/vendor/revisions.json b/vendor/revisions.json index c7b33f8c8a..e52576e61f 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,14 @@ { - "postgres-v16": "cf302768b2890569956641e0e5ba112ae1445351", - "postgres-v15": "731b4d1609d6db1c953755810a41e0e67ea3db7b", - "postgres-v14": "11e970fe2be56804f0a786ec5fc8141ffefa4ca7" + "v16": [ + "16.4", + "6e9a4ff6249ac02b8175054b7b3f7dfb198be48b" + ], + "v15": [ + "15.8", + "49d5e576a56e4cc59cd6a6a0791b2324b9fa675e" + ], + "v14": [ + "14.13", + "a317b9b5b96978b49e78986697f3dd80d06f99a7" + ] } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index bbe80ceeb1..c94f95f447 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -5,6 +5,12 @@ commands: user: root sysvInitAction: sysinit shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664' + # restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for + # running it as root. + - name: chmod-resize-swap + user: root + sysvInitAction: sysinit + shell: 'chmod 711 /neonvm/bin/resize-swap' - name: pgbouncer user: postgres sysvInitAction: respawn @@ -12,14 +18,23 @@ commands: - name: postgres-exporter user: nobody sysvInitAction: respawn - shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres" /bin/postgres_exporter' + shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter' - name: sql-exporter user: nobody sysvInitAction: respawn - shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml' + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399' + - name: sql-exporter-autoscaling + user: nobody + sysvInitAction: respawn + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: + - filename: compute_ctl-resize-swap + content: | + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap + # as root without requiring entering a password (NOPASSWD), regardless of hostname (ALL) + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap - filename: pgbouncer.ini content: | [databases] @@ -78,7 +93,7 @@ files: target: # Data source name always has a URI schema that matches the driver name. In some cases (e.g. 
 MySQL)
       # the schema gets dropped or replaced to match the driver expected DSN format.
-      data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable'
+      data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter'
 
       # Collectors (referenced by name) to execute on the target.
       # Glob patterns are supported (see for syntax).
@@ -88,6 +103,41 @@ files:
       # Glob patterns are supported (see for syntax).
       collector_files:
         - "neon_collector.yml"
+  - filename: sql_exporter_autoscaling.yml
+    content: |
+      # Configuration for sql_exporter for autoscaling-agent
+      # Global defaults.
+      global:
+        # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
+        scrape_timeout: 10s
+        # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
+        scrape_timeout_offset: 500ms
+        # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
+        min_interval: 0s
+        # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
+        # as will concurrent scrapes.
+        max_connections: 1
+        # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
+        # always be the same as max_connections.
+        max_idle_connections: 1
+        # Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
+        # If 0, connections are not closed due to a connection's age.
+        max_connection_lifetime: 5m
+
+      # The target to monitor and the collectors to execute on it.
+      target:
+        # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
+        # the schema gets dropped or replaced to match the driver expected DSN format.
+        data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling'
+
+        # Collectors (referenced by name) to execute on the target.
+        # Glob patterns are supported (see for syntax).
+        collectors: [neon_collector_autoscaling]
+
+      # Collector files specifies a list of globs. One collector definition is read from each matching file.
+      # Glob patterns are supported (see for syntax).
+      collector_files:
+        - "neon_collector_autoscaling.yml"
   - filename: neon_collector.yml
     content: |
       collector_name: neon_collector
@@ -102,7 +152,7 @@ files:
 
       - metric_name: lfc_used
         type: gauge
-        help: 'lfc_used'
+        help: 'LFC chunks used (chunk = 1MB)'
        key_labels:
         values: [lfc_used]
         query: |
@@ -124,6 +174,279 @@ files:
         query: |
           select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
 
+      - metric_name: lfc_cache_size_limit
+        type: gauge
+        help: 'LFC cache size limit in bytes'
+        key_labels:
+        values: [lfc_cache_size_limit]
+        query: |
+          select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
+
+      - metric_name: connection_counts
+        type: gauge
+        help: 'Connection counts'
+        key_labels:
+          - datname
+          - state
+        values: [count]
+        query: |
+          select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state;
+
+      - metric_name: pg_stats_userdb
+        type: gauge
+        help: 'Stats for several oldest non-system dbs'
+        key_labels:
+          - datname
+        value_label: kind
+        values:
+          - db_size
+          - deadlocks
+          # Rows
+          - inserted
+          - updated
+          - deleted
+        # We export stats for 10 non-system databases. Without this limit
+        # it is too easy to abuse the system by creating lots of databases.
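+        # (With value_label, each column listed under `values` is exported as
+        # its own sample, distinguished by kind="db_size", kind="deadlocks",
+        # and so on.)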
+        query: |
+          select pg_database_size(datname) as db_size, deadlocks,
+            tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
+            datname
+          from pg_stat_database
+          where datname IN (
+            select datname
+            from pg_database
+            where datname <> 'postgres' and not datistemplate
+            order by oid
+            limit 10
+          );
+
+      - metric_name: max_cluster_size
+        type: gauge
+        help: 'neon.max_cluster_size setting'
+        key_labels:
+        values: [max_cluster_size]
+        query: |
+          select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size';
+
+      - metric_name: db_total_size
+        type: gauge
+        help: 'Size of all databases'
+        key_labels:
+        values: [total]
+        query: |
+          select sum(pg_database_size(datname)) as total from pg_database;
+
+      # DEPRECATED
+      - metric_name: lfc_approximate_working_set_size
+        type: gauge
+        help: 'Approximate working set size in pages of 8192 bytes'
+        key_labels:
+        values: [approximate_working_set_size]
+        query: |
+          select neon.approximate_working_set_size(false) as approximate_working_set_size;
+
+      - metric_name: lfc_approximate_working_set_size_windows
+        type: gauge
+        help: 'Approximate working set size in pages of 8192 bytes'
+        key_labels: [duration]
+        values: [size]
+        # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection
+        # of durations in a pretty-printed form.
+        query: |
+          select
+            x as duration,
+            neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size
+          from
+            (values ('5m'),('15m'),('1h')) as t (x);
+
+      - metric_name: compute_current_lsn
+        type: gauge
+        help: 'Current LSN of the database'
+        key_labels:
+        values: [lsn]
+        query: |
+          select
+            case
+              when pg_catalog.pg_is_in_recovery()
+              then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
+              else (pg_current_wal_lsn() - '0/0')::FLOAT8
+            end as lsn;
+
+      - metric_name: compute_receive_lsn
+        type: gauge
+        help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication'
+        key_labels:
+        values: [lsn]
+        query: |
+          SELECT
+            CASE
+              WHEN pg_catalog.pg_is_in_recovery()
+              THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
+              ELSE 0
+            END AS lsn;
+
+      - metric_name: replication_delay_bytes
+        type: gauge
+        help: 'Bytes between received and replayed LSN'
+        key_labels:
+        values: [replication_delay_bytes]
+        # We use a GREATEST call here because this calculation can be negative.
+        # The calculation is not atomic, meaning after we've gotten the receive
+        # LSN, the replay LSN may have advanced past the receive LSN we
+        # are using for the calculation.
+ query: | + SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; + + - metric_name: replication_delay_seconds + type: gauge + help: 'Time since last LSN was replayed' + key_labels: + values: [replication_delay_seconds] + query: | + SELECT + CASE + WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 + ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) + END AS replication_delay_seconds; + + - metric_name: checkpoints_req + type: gauge + help: 'Number of requested checkpoints' + key_labels: + values: [checkpoints_req] + query: | + SELECT checkpoints_req FROM pg_stat_bgwriter; + + - metric_name: checkpoints_timed + type: gauge + help: 'Number of scheduled checkpoints' + key_labels: + values: [checkpoints_timed] + query: | + SELECT checkpoints_timed FROM pg_stat_bgwriter; + + - metric_name: compute_logical_snapshot_files + type: gauge + help: 'Number of snapshot files in pg_logical/snapshot' + key_labels: + - timeline_id + values: [num_logical_snapshot_files] + query: | + SELECT + (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These + -- temporary snapshot files are renamed to the actual snapshot files after they are + -- completely built. We only WAL-log the completely built snapshot files. + (SELECT COUNT(*) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS num_logical_snapshot_files; + + # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats. + # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly. + + # Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad. 
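+      # For example, `(restart_lsn - '0/0')::FLOAT8` below exports an LSN as
+      # its byte offset from the beginning of the WAL stream.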
+      - metric_name: logical_slot_restart_lsn
+        type: gauge
+        help: 'restart_lsn of logical slots'
+        key_labels:
+          - slot_name
+        values: [restart_lsn]
+        query: |
+          select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
+          from pg_replication_slots
+          where slot_type = 'logical';
+
+      - metric_name: compute_subscriptions_count
+        type: gauge
+        help: 'Number of logical replication subscriptions grouped by enabled/disabled'
+        key_labels:
+          - enabled
+        values: [subscriptions_count]
+        query: |
+          select subenabled::text as enabled, count(*) as subscriptions_count
+          from pg_subscription
+          group by subenabled;
+
+      - metric_name: retained_wal
+        type: gauge
+        help: 'Retained WAL in inactive replication slots'
+        key_labels:
+          - slot_name
+        values: [retained_wal]
+        query: |
+          SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
+          FROM pg_replication_slots
+          WHERE active = false;
+
+      - metric_name: wal_is_lost
+        type: gauge
+        help: 'Whether or not the replication slot wal_status is lost'
+        key_labels:
+          - slot_name
+        values: [wal_is_lost]
+        query: |
+          SELECT slot_name,
+                 CASE
+                   WHEN wal_status = 'lost' THEN 1
+                   ELSE 0
+                 END AS wal_is_lost
+          FROM pg_replication_slots;
+
+  - filename: neon_collector_autoscaling.yml
+    content: |
+      collector_name: neon_collector_autoscaling
+      metrics:
+      - metric_name: lfc_misses
+        type: gauge
+        help: 'lfc_misses'
+        key_labels:
+        values: [lfc_misses]
+        query: |
+          select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
+
+      - metric_name: lfc_used
+        type: gauge
+        help: 'LFC chunks used (chunk = 1MB)'
+        key_labels:
+        values: [lfc_used]
+        query: |
+          select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
+
+      - metric_name: lfc_hits
+        type: gauge
+        help: 'lfc_hits'
+        key_labels:
+        values: [lfc_hits]
+        query: |
+          select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
+
+      - metric_name: lfc_writes
+        type: gauge
+        help: 'lfc_writes'
+        key_labels:
+        values: [lfc_writes]
+        query: |
+          select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
+
+      - metric_name: lfc_cache_size_limit
+        type: gauge
+        help: 'LFC cache size limit in bytes'
+        key_labels:
+        values: [lfc_cache_size_limit]
+        query: |
+          select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
+
+      - metric_name: lfc_approximate_working_set_size_windows
+        type: gauge
+        help: 'Approximate working set size in pages of 8192 bytes'
+        key_labels: [duration_seconds]
+        values: [size]
+        # NOTE: This is the "internal" / "machine-readable" version. It outputs the working set
+        # size looking back over the last 1..60 minutes, labeled with the window length in seconds.
+        query: |
+          select
+            x::text as duration_seconds,
+            neon.approximate_working_set_size_seconds(x) as size
+          from
+            (select generate_series * 60 as x from generate_series(1, 60)) as t (x);
 build: |
   # Build cgroup-tools
   #
   # At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
   # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor
   # requires cgroup v2, so we'll build cgroup-tools ourselves.
   FROM debian:bullseye-slim as libcgroup-builder
-  ENV LIBCGROUP_VERSION v2.0.3
+  ENV LIBCGROUP_VERSION=v2.0.3
 
   RUN set -exu \
       && apt update \
@@ -158,7 +481,7 @@ build: |
       # actually build the thing...
&& make install - FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.0 AS postgres-exporter + FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter FROM burningalchemist/sql_exporter:0.13 AS sql-exporter @@ -174,11 +497,10 @@ build: | libtool \ pkg-config - # Note, we use pgbouncer from neondatabase/pgbouncer fork, which could contain extra commits. # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) - ENV PGBOUNCER_TAG pgbouncer_1_21_0-neon-1 + ENV PGBOUNCER_TAG=pgbouncer_1_22_1 RUN set -e \ - && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/neondatabase/pgbouncer.git pgbouncer \ + && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \ && ./autogen.sh \ && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \ @@ -193,17 +515,32 @@ merge: | && echo 'root - nofile 1048576' >>/etc/security/limits.conf \ ) + # Allow postgres user (compute_ctl) to run swap resizer. + # Need to install sudo in order to allow this. + # + # Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe. + RUN set -e \ + && apt update \ + && apt install --no-install-recommends -y \ + sudo \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap + COPY cgconfig.conf /etc/cgconfig.conf COPY pgbouncer.ini /etc/pgbouncer.ini COPY sql_exporter.yml /etc/sql_exporter.yml COPY neon_collector.yml /etc/neon_collector.yml + COPY sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml + COPY neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml RUN set -e \ && chown postgres:postgres /etc/pgbouncer.ini \ && chmod 0666 /etc/pgbouncer.ini \ && chmod 0644 /etc/cgconfig.conf \ && chmod 0644 /etc/sql_exporter.yml \ - && chmod 0644 /etc/neon_collector.yml + && chmod 0644 /etc/neon_collector.yml \ + && chmod 0644 /etc/sql_exporter_autoscaling.yml \ + && chmod 0644 /etc/neon_collector_autoscaling.yml COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index b72e0f3c26..3d2fa8c214 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -13,102 +13,116 @@ publish = false ### BEGIN HAKARI SECTION [dependencies] +ahash = { version = "0.8" } anyhow = { version = "1", features = ["backtrace"] } aws-config = { version = "1", default-features = false, features = ["rustls", "sso"] } -aws-runtime = { version = "1", default-features = false, features = ["event-stream", "sigv4a"] } +aws-runtime = { version = "1", default-features = false, features = ["event-stream", "http-02x", "sigv4a"] } aws-sigv4 = { version = "1", features = ["http0-compat", "sign-eventstream", "sigv4a"] } aws-smithy-async = { version = "1", default-features = false, features = ["rt-tokio"] } aws-smithy-http = { version = "0.60", default-features = false, features = ["event-stream"] } -aws-smithy-runtime-api = { version = "1", features = ["client", "http-02x", "http-auth"] } -aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "rt-tokio"] } +aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "http-body-1-x", "rt-tokio", "test-util"] } axum = { version = 
"0.6", features = ["ws"] } base64 = { version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } bytes = { version = "1", features = ["serde"] } +camino = { version = "1", default-features = false, features = ["serde1"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } clap = { version = "4", features = ["derive", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } +crypto-bigint = { version = "0.5", features = ["generic-array", "zeroize"] } +der = { version = "0.7", default-features = false, features = ["oid", "pem", "std"] } +deranged = { version = "0.3", default-features = false, features = ["powerfmt", "serde", "std"] } +digest = { version = "0.10", features = ["mac", "oid", "std"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } futures-channel = { version = "0.3", features = ["sink"] } -futures-core = { version = "0.3" } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } -futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } +generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown = { version = "0.14", default-features = false, features = ["raw"] } +hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } indexmap = { version = "1", default-features = false, features = ["std"] } -itertools = { version = "0.10" } -libc = { version = "0.2", features = ["extra_traits"] } +itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] } +itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } +lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } +libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } -num-traits = { version = "0.2", features = ["i128"] } +num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } -parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] } +parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } -reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] } -ring = { version = "0.16" } +reqwest-5ef9efb8ec2df382 = { package = "reqwest", version = "0.12", default-features = false, features = ["blocking", "json", 
"rustls-tls", "stream"] } +reqwest-a6292c17cd707f01 = { package = "reqwest", version = "0.11", default-features = false, features = ["blocking", "rustls-tls", "stream"] } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } -smallvec = { version = "1", default-features = false, features = ["write"] } +sha2 = { version = "0.10", features = ["asm", "oid"] } +signature = { version = "2", default-features = false, features = ["digest", "rand_core", "std"] } +smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } +spki = { version = "0.7", default-features = false, features = ["pem", "std"] } subtle = { version = "2" } -time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } +sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } +tikv-jemalloc-sys = { version = "0.5" } +time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } -toml_datetime = { version = "0.6", default-features = false, features = ["serde"] } -toml_edit = { version = "0.19", features = ["serde"] } tonic = { version = "0.9", features = ["tls-roots"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } -tungstenite = { version = "0.20" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } +zeroize = { version = "1", features = ["derive", "serde"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } [build-dependencies] +ahash = { version = "0.8" } anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown = { version = "0.14", default-features = false, features = ["raw"] } +hashbrown = { version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } -itertools = { version = "0.10" } -libc = { version = "0.2", features = ["extra_traits"] } +itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] } +itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } +lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } +libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } -num-traits = { version = "0.2", features = ["i128"] } 
+num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } -parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] } +parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } +proc-macro2 = { version = "1" } prost = { version = "0.11" } +quote = { version = "1" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } serde = { version = "1", features = ["alloc", "derive"] } syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] } -syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] } +syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] }